nlib
AhoCorasick.h
Go to the documentation of this file.
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SUCCINCT_AHOCORASICK_H_
4 #define INCLUDE_NN_NLIB_SUCCINCT_AHOCORASICK_H_
5 
6 #include "nn/nlib/Swap.h"
7 #include "nn/nlib/succinct/Bp.h"
8 #include "nn/nlib/succinct/Sbv.h"
9 
10 #if defined(_MSC_VER) && defined(nx_succinct_EXPORTS)
11 #undef NLIB_VIS_PUBLIC
12 #define NLIB_VIS_PUBLIC NLIB_WINEXPORT
13 #endif
14 
15 NLIB_NAMESPACE_BEGIN
16 namespace succinct {
17 namespace detail {
18 
19 class GotoArcHolder NLIB_FINAL {
20  public:
21  uint64_t GetPos(uint32_t nodeid, unsigned char c) NLIB_NOEXCEPT {
22  uint64_t rval = m_NumNodeId;
23  rval = rval * c + nodeid;
24  return rval;
25  }
26 
27  public:
28  GotoArcHolder() NLIB_NOEXCEPT;
29  bool Init(uint32_t num_nodeid) NLIB_NOEXCEPT;
30  bool TurnOn(uint32_t nodeid, unsigned char c) NLIB_NOEXCEPT {
31  uint64_t idx = this->GetPos(nodeid, c);
32  return m_Set.TurnOn(idx);
33  }
34  bool Build() NLIB_NOEXCEPT { return m_Set.Build(); }
35  int32_t Goto(uint32_t nodeid, unsigned char c) NLIB_NOEXCEPT;
36  void Reset() NLIB_NOEXCEPT;
37  size_t MemSize() const NLIB_NOEXCEPT { return sizeof(m_NumNodeId) + m_Set.MemSize(); }
38  bool Export(BinaryWriter* w) const NLIB_NOEXCEPT;
39  bool Import(BinaryReader* r) NLIB_NOEXCEPT;
40  void swap(GotoArcHolder& rhs) NLIB_NOEXCEPT { // NOLINT
41  using std::swap;
42  swap(m_Set, rhs.m_Set);
43  swap(m_NumNodeId, rhs.m_NumNodeId);
44  }
45 
46  private:
47  SparseSet m_Set;
48  uint32_t m_NumNodeId;
49  NLIB_DISALLOW_COPY_AND_ASSIGN(GotoArcHolder);
50 };
51 
52 } // namespace detail
53 
55  public:
56  static const size_t LABEL_WIDTH = 8; // 1,2,4,8
57  static const int LABEL_MASK = (1 << LABEL_WIDTH) - 1;
58  static const int NUMLABEL_PER_BYTE = 8 / LABEL_WIDTH;
59 
60  public:
63  NLIB_MOVE_MEMBER_HELPER_4(AhoCorasick, m_LenHolder, m_ReportTreeHolder, m_GotoArcHolder,
64  m_FailureTreeHolder);
66  typedef bool (*MatchCallback)(const char* first, const char* last, uint32_t nodeid,
67  void* user_obj);
68  void Match(const char* doc, MatchCallback callback, void* userobj = NULL) NLIB_NOEXCEPT {
69  this->Match(doc, nlib_strlen(doc), callback, userobj);
70  }
71  NLIB_VIS_PUBLIC void Match(const void* data, size_t n, MatchCallback callback,
72  void* userobj = NULL) NLIB_NOEXCEPT;
73  NLIB_VIS_PUBLIC size_t MemSize() const NLIB_NOEXCEPT;
74 
75  NLIB_VIS_PUBLIC void MemSize(size_t* len_store, size_t* goto_arc, size_t* report_arc,
76  size_t* failure_arc) const NLIB_NOEXCEPT;
77  NLIB_VIS_PUBLIC void Reset() NLIB_NOEXCEPT;
78  NLIB_VIS_PUBLIC bool Export(BinaryWriter* w) const NLIB_NOEXCEPT;
79  NLIB_VIS_PUBLIC bool Import(BinaryReader* r) NLIB_NOEXCEPT;
80 
81  protected:
82  Map m_LenHolder; // holds len
83  Map m_ReportTreeHolder; // holds nodeid
84  detail::GotoArcHolder m_GotoArcHolder;
85  Bp m_FailureTreeHolder;
86 
87  private:
88  friend class AhoCorasickBuilder;
90 };
91 
92 } // namespace succinct
93 NLIB_NAMESPACE_END
94 
95 NLIB_DEFINE_STD_SWAP(::nlib_ns::succinct::AhoCorasick)
96 
97 #if defined(_MSC_VER) && defined(nx_succinct_EXPORTS)
98 #undef NLIB_VIS_PUBLIC
99 #define NLIB_VIS_PUBLIC NLIB_WINIMPORT
100 #endif
101 
102 #endif // INCLUDE_NN_NLIB_SUCCINCT_AHOCORASICK_H_
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Platform.h:2151
Defines the basic classes that form the basis to Rank and Select operations.
#define NLIB_FINAL
Defines final if it is available for use. If not, holds an empty string.
#define NLIB_DISALLOW_COPY_AND_ASSIGN(TypeName)
Prohibits use of the copy constructor and assignment operator for the class specified by TypeName...
Definition: Config.h:126
void Match(const char *doc, MatchCallback callback, void *userobj=NULL) noexcept
Inspects the string to detect the target string.
Definition: AhoCorasick.h:68
Uses the Aho-Corasick algorithm to detect language and patterns.
Definition: AhoCorasick.h:54
Provides a compact tree structure that can provide various operations in constant time...
Definition: Bp.h:20
Implements common features and features that are highly platform-dependent. Also refer to nlib Platfo...
Succinct data structure to hold a nowhere dense set of 64-bit unsigned integers.
Definition: Sbv.h:402
size_t nlib_strlen(const char *s)
Internally calls strlen(). In some cases, it may operate as an independent implementation.
The class for writing binary to streams (to OutputStream).
Definition: BinaryWriter.h:13
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:51
A compact integer to integer read-only associative array.
Definition: Sbv.h:569
The class for reading binary from streams (from InputStream).
Definition: BinaryReader.h:13
Defines the class for constructing and accessing a parentheses representation of a tree...
Creates the index (automaton) used in the Aho-Corasick algorithm.