nlib
succinct/detection/detection.cpp
青空文庫に収録されている「斜陽」(太宰治)のテキストから形容動詞を抜き出すプログラムです。
ExecAhoCorasick関数を実行することで形容動詞が抜き出され標準出力に出力されます。
単純な方法と性能を比較できるようにするために、コメントを外して ExecBruteForce関数を実行するようにするとキーワードを1つずつ探しだすようになります。
#include <string>
#include <vector>
using std::vector;
using std::string;
using nlib_ns::succinct::AhoCorasick;
vector<char> g_Text;
AhoCorasick g_AhoCorasick;
vector<string> g_Words;
NLIB_PATHMAPPER_FORSAMPLE
bool MyMatchCallback(const char* first, const char* last, uint32_t nodeId, void*) {
// The time complexity of Aho Corasick algorithm is not proportional with the number of keywords.
// the increase of the keywords does not slow down the speed of detection.
#if 1
char str[4096];
int i = 0;
while (first != last) str[i++] = *first++;
str[i] = '\0';
nlib_printf("%" PRIu32 ", \"%s\"\n", nodeId, str);
#else
// Comment out nlib_printf when you check the speed.
// nlib_printf(".");
#endif
return true;
}
void ExecAhoCorasick() {
nlib_printf("AhoCorasick begin\n");
g_AhoCorasick.Match(&g_Text[0], MyMatchCallback);
// g_AhoCorasick.Match(&g_Text[0], NULL);
nlib_printf("AhoCorasick end\n");
}
void ExecBruteForce() {
// The naive implementation for comparision
// The time complexity is O(WN) where:
// W: the count of the words found, which is g_Words.size()
// N: the length of the string, which is g_Text.size()
nlib_printf("Bruteforce begin\n");
for (size_t i = 0; i < g_Words.size(); ++i) {
const string& s = g_Words[i];
size_t beg_pos = 0;
size_t cur = 0;
for (size_t j = 0; j < g_Text.size(); ++j) {
if (g_Text[j] == s[cur]) {
if (cur == 0) beg_pos = j;
++cur;
if (cur == s.length()) {
#if 1
nlib_printf("'%s' detected at g_Text[%" PRIuS " to %" PRIuS "]\n", s.c_str(),
beg_pos, j);
#else
// Comment out nlib_printf when you check the speed.
// nlib_printf(".");
#endif
// you have to go backward.....
j = j - cur + 1;
cur = 0;
beg_pos = 0;
}
} else {
// you have to go backward.....
j = j - cur;
cur = 0;
}
}
}
nlib_printf("Bruteforce end\n");
}
bool ReadTextFile(const char* name, vector<char>* vec) {
FileInputStream stream;
if (stream.Init() != 0) return false;
if (stream.Open(name) != 0) {
nlib_printf("cannot open(r) %s\n", name);
return false;
}
while (!stream.IsEos()) {
int c = stream.Read();
if (c < 0) return false;
vec->push_back(static_cast<char>(c));
}
return true;
}
bool ReadFiles() {
char txtfile[1024];
g_PathMapper.ResolvePath(NULL, txtfile, "nlibpath:///readonly/shayo.txt");
g_Text.reserve(300000);
if (!ReadTextFile(txtfile, &g_Text)) return false;
char acfile[1024];
g_PathMapper.ResolvePath(NULL, acfile, "nlibpath:///readonly/adjv.ac");
FileInputStream stream;
if (stream.Init() != 0) return false;
if (stream.Open(acfile) != 0) {
nlib_printf("cannot open(r) %s\n", acfile);
return false;
}
BinaryReader reader;
reader.Init(&stream, BinaryReader::ENDIAN_LITTLE);
if (!g_AhoCorasick.Import(&reader)) return false;
char patternfile[1024];
g_PathMapper.ResolvePath(NULL, patternfile, "nlibpath:///readonly/adjv.txt");
vector<char> tmp;
tmp.reserve(40000);
if (!ReadTextFile(patternfile, &tmp)) return false;
g_Words.reserve(3500);
string str;
for (size_t i = 0; i < tmp.size(); ++i) {
char c = tmp[i];
if (c == 0x0D) continue;
if (c == 0x0A) {
g_Words.push_back(str);
str.clear();
} else {
str += static_cast<char>(c);
}
}
g_Words.push_back(str);
return true;
}
bool SampleMain(int, char**) {
InitPathMapperForSample();
if (!ReadFiles()) return false;
// ExecBruteForce();
ExecAhoCorasick();
return true;
}
NLIB_MAINFUNC