nlib
AhoCorasickBuilder.h
Go to the documentation of this file.
1 
2 #pragma once
3 #ifndef INCLUDE_NN_NLIB_SUCCINCT_AHOCORASICKBUILDER_H_
4 #define INCLUDE_NN_NLIB_SUCCINCT_AHOCORASICKBUILDER_H_
5 
6 #include <string.h>
7 #include <utility>
8 
10 #include "nn/nlib/ReallocVec.h"
12 
13 #if defined(_MSC_VER) && defined(nx_succinct_EXPORTS)
14 #undef NLIB_VIS_PUBLIC
15 #define NLIB_VIS_PUBLIC NLIB_WINEXPORT
16 #endif
17 
18 NLIB_NAMESPACE_BEGIN
19 namespace succinct {
20 
21 class AhoCorasickBuilder;
22 
23 namespace detail {
24 
25 class StringHolder NLIB_FINAL {
26  public:
27  NLIB_VIS_PUBLIC StringHolder() NLIB_NOEXCEPT : m_Cur(kBufSize) {}
28  NLIB_VIS_PUBLIC ~StringHolder() NLIB_NOEXCEPT;
29  NLIB_VIS_HIDDEN char* MakeString(const void* ptr, size_t len) NLIB_NOEXCEPT;
30  private:
31  static const size_t kBufSize = 1024 * 1024 * 4U;
32 
33  ReallocVec<char*> m_Data;
34  ReallocVec<char*> m_Buf;
35  size_t m_Cur;
36  NLIB_DISALLOW_COPY_AND_ASSIGN(StringHolder);
37 };
38 
39 class NLIB_VIS_PUBLIC PrefixString NLIB_FINAL {
40  public:
41  NLIB_VIS_HIDDEN PrefixString() NLIB_NOEXCEPT : m_Rbegin(NULL) {}
42  NLIB_VIS_HIDDEN ~PrefixString() NLIB_NOEXCEPT {}
44  PrefixString(const unsigned char* first, const unsigned char* last) NLIB_NOEXCEPT {
45  NLIB_UNUSED(first);
46  NLIB_ASSERT(last);
47  m_Rbegin = last - 1;
48  }
49  NLIB_VIS_HIDDEN bool ReverseLexOrderLess(const PrefixString& rhs) const NLIB_NOEXCEPT {
50  NLIB_ASSERT(m_Rbegin && rhs.m_Rbegin);
51  // NOTE:
52  // lexicographical_compare between reversed strings
53  // lexicographical_compare is very slow in DEBUG build of MSVC
54  const unsigned char* rbeg = m_Rbegin;
55  const unsigned char* rbeg_rhs = rhs.m_Rbegin;
56  for (;;) {
57  unsigned char c = *rbeg;
58  unsigned char c_rhs = *rbeg_rhs;
59  if (c < c_rhs) return true;
60  if (c > c_rhs) return false;
61  if (c == 0) return false;
62  --rbeg;
63  --rbeg_rhs;
64  }
65  }
66  NLIB_VIS_HIDDEN const unsigned char* begin() const NLIB_NOEXCEPT {
67  NLIB_ASSERT(m_Rbegin);
68  return m_Rbegin - length() + 1;
69  }
70  NLIB_VIS_HIDDEN const unsigned char* end() const NLIB_NOEXCEPT {
71  NLIB_ASSERT(m_Rbegin);
72  return m_Rbegin + 1;
73  }
74  NLIB_VIS_HIDDEN bool empty() const NLIB_NOEXCEPT {
75  NLIB_ASSERT(m_Rbegin);
76  return *m_Rbegin == 0;
77  }
78  NLIB_VIS_HIDDEN size_t length() const NLIB_NOEXCEPT {
79  NLIB_ASSERT(m_Rbegin);
80  const unsigned char* p;
81  const unsigned char* rbeg = m_Rbegin;
82  for (p = rbeg; *p; --p) {
83  }
84  return rbeg - p;
85  }
86  NLIB_VIS_HIDDEN bool IsTerminal() const NLIB_NOEXCEPT {
87  NLIB_ASSERT(m_Rbegin);
88  return !*(m_Rbegin + 1);
89  }
90 
91  private:
92  const unsigned char* m_Rbegin;
93 };
94 
95 class NLIB_VIS_PUBLIC AcNode NLIB_FINAL {
96  public:
97  struct EdgeList {
98  unsigned char label;
99  AcNode* dest;
100  EdgeList* next;
101  };
102 
103  public:
104  NLIB_VIS_HIDDEN AcNode() NLIB_NOEXCEPT : m_NodeId(0), m_Failure(NULL), m_Edges(NULL) {}
105  NLIB_VIS_HIDDEN ~AcNode() NLIB_NOEXCEPT {}
106  NLIB_VIS_HIDDEN bool operator<(const AcNode& rhs) const NLIB_NOEXCEPT {
107  return m_Prefix.ReverseLexOrderLess(rhs.m_Prefix);
108  }
109  NLIB_VIS_HIDDEN unsigned char GetLevelValue(size_t level) const NLIB_NOEXCEPT {
110  return level < m_NodeId ? *(m_Prefix.end() - 1 - level) : 0;
111  }
112  NLIB_VIS_HIDDEN uint32_t GetNodeId() const NLIB_NOEXCEPT { return m_NodeId; }
113  NLIB_VIS_HIDDEN bool IsTerminal() const NLIB_NOEXCEPT {
114  return m_NodeId != 0 && m_Prefix.IsTerminal();
115  }
116 
117  NLIB_VIS_HIDDEN bool AddGoto(unsigned char c, AcNode* next,
118  Nlist<AcNode::EdgeList>* holder) NLIB_NOEXCEPT;
119  NLIB_VIS_HIDDEN void print() NLIB_NOEXCEPT;
120 
121  NLIB_VIS_HIDDEN const AcNode* Goto(const unsigned char c) const NLIB_NOEXCEPT;
122  NLIB_VIS_HIDDEN AcNode* Find(const unsigned char* first,
123  const unsigned char* last) NLIB_NOEXCEPT;
124  NLIB_VIS_HIDDEN AcNode* GetReportArc() const NLIB_NOEXCEPT {
125  AcNode* failure;
126  for (failure = m_Failure;
127  failure != NULL && !failure->IsTerminal();
128  failure = failure->m_Failure) {
129  }
130  return failure;
131  }
132  NLIB_VIS_HIDDEN AcNode* GetFailureArc() const NLIB_NOEXCEPT { return m_Failure; }
133  NLIB_VIS_HIDDEN const EdgeList* GetEdgeList() const NLIB_NOEXCEPT { return m_Edges; }
134  NLIB_VIS_HIDDEN AcNode* FindArc(unsigned char c) const NLIB_NOEXCEPT {
135  for (EdgeList* e = m_Edges; e != NULL; e = e->next) {
136  if (e->label == c) return e->dest;
137  }
138  return NULL;
139  }
140  NLIB_VIS_HIDDEN PrefixString GetPrefix() const NLIB_NOEXCEPT { return m_Prefix; }
141 
142  private:
143  NLIB_VIS_HIDDEN void SetNodeId(uint32_t nodeid) NLIB_NOEXCEPT { m_NodeId = nodeid; }
144  NLIB_VIS_HIDDEN void SetFailureArc(AcNode* failure) NLIB_NOEXCEPT { m_Failure = failure; }
145  NLIB_VIS_HIDDEN void SetPrefix(PrefixString prefix) NLIB_NOEXCEPT {
146  m_Prefix = prefix;
147  m_NodeId = static_cast<uint32_t>(m_Prefix.length());
148  }
149 
150  private:
151  uint32_t m_NodeId; // suffix-lexicographic order, m_NodeId == 0 if root
152  PrefixString m_Prefix;
153  AcNode* m_Failure;
154  EdgeList* m_Edges;
155 
156  friend class ::nlib_ns::succinct::AhoCorasickBuilder;
158 };
159 
160 } // namespace detail
161 
162 class AhoCorasickBuilder NLIB_FINAL {
164 
165  public:
166  AhoCorasickBuilder() NLIB_NOEXCEPT : m_NumWords(0), m_NumBytes(0) {}
167  NLIB_VIS_PUBLIC ~AhoCorasickBuilder() NLIB_NOEXCEPT;
168  NLIB_VIS_PUBLIC bool Init() NLIB_NOEXCEPT;
169  NLIB_VIS_PUBLIC bool AddWord(const char* str) NLIB_NOEXCEPT;
170  NLIB_VIS_PUBLIC bool AddPattern(const void* p, size_t n) NLIB_NOEXCEPT;
171  NLIB_VIS_PUBLIC bool AddWords(const char* str, size_t len) NLIB_NOEXCEPT;
172  bool AddWords(const char* str) NLIB_NOEXCEPT {
173  return AddWords(str, strlen(str));
174  }
175  NLIB_VIS_PUBLIC AhoCorasick* Build() NLIB_NOEXCEPT;
176  typedef bool (*MatchCallback)(const char* first, const char* last, uint32_t nodeid,
177  void* user_obj);
178  NLIB_VIS_PUBLIC void MatchByBuilder(const char* doc, MatchCallback callback,
179  void* user_obj = NULL) NLIB_NOEXCEPT;
180  NLIB_VIS_PUBLIC void print() NLIB_NOEXCEPT;
181  size_t GetNumWords() const NLIB_NOEXCEPT { return m_NumWords; }
182  size_t GetNumBytes() const NLIB_NOEXCEPT { return m_NumBytes; }
183  size_t GetNumNodes() const NLIB_NOEXCEPT { return m_AcNodeHolder.size(); }
184 
185  private:
186  NLIB_VIS_HIDDEN bool SortNodes() NLIB_NOEXCEPT;
187  struct NLIB_VIS_HIDDEN BuildFailureArcTh {
188  bool operator()() NLIB_NOEXCEPT;
189  AhoCorasickBuilder* This;
190  };
191  struct NLIB_VIS_HIDDEN BuildReportTreeHolderTh {
192  bool operator()() NLIB_NOEXCEPT;
193  AhoCorasickBuilder* This;
194  AhoCorasick* obj;
195  };
196  struct NLIB_VIS_HIDDEN BuildFailureTreeHolderTh {
197  bool operator()() NLIB_NOEXCEPT;
198  AhoCorasickBuilder* This;
199  AhoCorasick* obj;
200  };
201  struct NLIB_VIS_HIDDEN BuildGotoArcHolderTh {
202  bool operator()() NLIB_NOEXCEPT;
203  AhoCorasickBuilder* This;
204  AhoCorasick* obj;
205  };
206  struct NLIB_VIS_HIDDEN BuildLenHolderTh {
207  bool operator()() NLIB_NOEXCEPT;
208  AhoCorasickBuilder* This;
209  AhoCorasick* obj;
210  };
211  struct NLIB_VIS_HIDDEN SortNodesTh {
212  void operator()() NLIB_NOEXCEPT;
213  AhoCorasickBuilder* This;
214  int from;
215  int to;
216  const int* count;
217  };
218 
219  private:
220  size_t m_NumWords;
221  size_t m_NumBytes;
222  AcNodeHolder m_AcNodeHolder;
223  Nlist<detail::AcNode> m_AcNodeMem;
224  Nlist<detail::AcNode::EdgeList> m_EdgeMem;
225  detail::StringHolder m_StringHolder;
226  nlib_ns::threading::ThreadPool m_ThreadPool;
227 
229 };
230 
231 } // namespace succinct
232 NLIB_NAMESPACE_END
233 
234 #if defined(_MSC_VER) && defined(nx_succinct_EXPORTS)
235 #undef NLIB_VIS_PUBLIC
236 #define NLIB_VIS_PUBLIC NLIB_WINIMPORT
237 #endif
238 
239 #endif // INCLUDE_NN_NLIB_SUCCINCT_AHOCORASICKBUILDER_H_
bool operator<(const TimeValue &lhs, const TimeValue &rhs) noexcept
A relational operator.
Definition: DateTime.h:62
#define NLIB_NOEXCEPT
Defines noexcept geared to the environment, or the equivalent.
Definition: Platform.h:2151
#define NLIB_FINAL
Defines final if it is available for use. If not, holds an empty string.
#define NLIB_DISALLOW_COPY_AND_ASSIGN(TypeName)
Prohibits use of the copy constructor and assignment operator for the class specified by TypeName...
Definition: Config.h:126
#define NLIB_VIS_HIDDEN
Symbols for functions and classes are not made available outside of the library.
Definition: Platform_unix.h:50
size_t GetNumBytes() const noexcept
Gets the sum size of registered strings or patterns in bytes.
Uses the Aho-Corasick algorithm to detect language and patterns.
Definition: AhoCorasick.h:54
AhoCorasickBuilder() noexcept
Instantiates the object with default parameters (default constructor).
Defines the class for searching text strings using the Aho-Corasick string-matching algorithm...
Implements common features and features that are highly platform-dependent. Also refer to nlib Platfo...
A container-like class similar to std::vector that can store objects that do not have copy constructo...
Definition: Nlist.h:19
Defines a thread pool.
#define NLIB_VIS_PUBLIC
Symbols for functions and classes are made available outside of the library.
Definition: Platform_unix.h:51
size_t GetNumNodes() const noexcept
Gets the number of created automaton nodes.
Creates the index (automaton) used in the Aho-Corasick algorithm.