nlib
succinct/detection/detection.cpp
This program extracts adverbs from Osamu Dazai's "Shayo" from the Aozora Bunko online library.
Running ExecAhoCorasick extracts and outputs adverbs to stdout.
It can be compared against a simpler implementation by removing the comment tags to run ExecBruteForce and search the keywords one by one.
/*--------------------------------------------------------------------------------*
Project: CrossRoad
Copyright (C)Nintendo All rights reserved.
These coded instructions, statements, and computer programs contain proprietary
information of Nintendo and/or its licensed developers and are protected by
national and international copyright laws. They may not be disclosed to third
parties or copied or duplicated in any form, in whole or in part, without the
prior written consent of Nintendo.
The content herein is highly confidential and should be handled accordingly.
*--------------------------------------------------------------------------------*/
#include <string>
#include <vector>
using std::vector;
using std::string;
using nlib_ns::succinct::AhoCorasick;
vector<char> g_text;
AhoCorasick g_ahocorasick;
vector<string> g_words;
NLIB_PATHMAPPER_FORSAMPLE
bool MyMatchCallback(const char* first, const char* last, uint32_t nodeId, void*) {
// The time complexity of Aho Corasick algorithm is not proportional with the number of keywords.
// the increase of the keywords does not slow down the speed of detection.
#if 1
char str[4096];
int i = 0;
while (first != last) str[i++] = *first++;
str[i] = '\0';
nlib_printf("%" PRIu32 ", \"%s\"\n", nodeId, str);
#else
// Comment out nlib_printf when you check the speed.
// nlib_printf(".");
#endif
return true;
}
void ExecAhoCorasick() {
nlib_printf("AhoCorasick begin\n");
g_ahocorasick.Match(&g_text[0], MyMatchCallback);
// g_ahocorasick.Match(&g_text[0], NULL);
nlib_printf("AhoCorasick end\n");
}
void ExecBruteForce() {
// The naive implementation for comparision
// The time complexity is O(WN) where:
// W: the count of the words found, which is g_words.size()
// N: the length of the string, which is g_text.size()
nlib_printf("Bruteforce begin\n");
for (size_t i = 0; i < g_words.size(); ++i) {
const string& s = g_words[i];
size_t beg_pos = 0;
size_t cur = 0;
for (size_t j = 0; j < g_text.size(); ++j) {
if (g_text[j] == s[cur]) {
if (cur == 0) beg_pos = j;
++cur;
if (cur == s.length()) {
#if 1
nlib_printf("'%s' detected at g_text[%" PRIuS " to %" PRIuS "]\n", s.c_str(),
beg_pos, j);
#else
// Comment out nlib_printf when you check the speed.
// nlib_printf(".");
#endif
// you have to go backward.....
j = j - cur + 1;
cur = 0;
beg_pos = 0;
}
} else {
// you have to go backward.....
j = j - cur;
cur = 0;
}
}
}
nlib_printf("Bruteforce end\n");
}
bool ReadTextFile(const char* name, vector<char>* vec) {
FileInputStream stream;
if (nlib_is_error(stream.Init())) return false;
if (nlib_is_error(stream.Open(name))) {
nlib_printf("cannot open(r) %s\n", name);
return false;
}
while (!stream.IsEos()) {
int c = stream.Read();
if (c < 0) return false;
vec->push_back(static_cast<char>(c));
}
return true;
}
bool ReadFiles() {
char txtfile[1024];
g_pathmapper.ResolvePath(NULL, txtfile, "nlibpath:///readonly/shayo.txt");
g_text.reserve(300000);
if (!ReadTextFile(txtfile, &g_text)) return false;
char acfile[1024];
g_pathmapper.ResolvePath(NULL, acfile, "nlibpath:///readonly/adjv.ac");
FileInputStream stream;
if (nlib_is_error(stream.Init())) return false;
if (nlib_is_error(stream.Open(acfile))) {
nlib_printf("cannot open(r) %s\n", acfile);
return false;
}
BinaryReader reader;
reader.Init(BinaryReader::ENDIAN_LITTLE);
reader.Open(&stream);
if (nlib_is_error(g_ahocorasick.Import(&reader))) return false;
char patternfile[1024];
g_pathmapper.ResolvePath(NULL, patternfile, "nlibpath:///readonly/adjv.txt");
vector<char> tmp;
tmp.reserve(40000);
if (!ReadTextFile(patternfile, &tmp)) return false;
g_words.reserve(3500);
string str;
for (size_t i = 0; i < tmp.size(); ++i) {
char c = tmp[i];
if (c == 0x0D) continue;
if (c == 0x0A) {
g_words.push_back(str);
str.clear();
} else {
str += static_cast<char>(c);
}
}
g_words.push_back(str);
return true;
}
bool SampleMain(int, char**) {
InitPathMapperForSample();
if (!ReadFiles()) return false;
// ExecBruteForce();
ExecAhoCorasick();
return true;
}
NLIB_MAINFUNC