nlib
succinct/kwlink/kwlink.cpp
青空文庫に収録されている「こころ」(夏目漱石)のテキストからWikipediaのタイトルを検出し、Wikipediaへのリンクを生成したHTMLテキストを出力します。
このサンプルで利用しているWikipediaのタイトルは123万個ありますが、AC法を利用することにより高速に検出することができます。
#include <stdio.h>
#include <string.h>
#include <map>
#include <string>
#include <vector>
using std::vector;
using std::string;
using std::map;
using nlib_ns::succinct::AhoCorasick;
// Raw bytes of the input text file, followed by a terminating '\0' (filled in by ReadFiles()).
vector<char> g_Text;
// Automaton imported from the wikipedia.ac file by ReadFiles() and run over g_Text.
AhoCorasick g_AhoCorasick;
// Byte offset in g_Text -> length in bytes of the longest match recorded at that offset
// (written by MyMatchCallback(), read by MakeLinks()).
map<int, int> g_Map;
#ifdef _MSC_VER
#pragma warning(disable : 4996)
#endif
NLIB_PATHMAPPER_FORSAMPLE
// Match callback passed to AhoCorasick::Match(): remembers, per start offset in g_Text, the
// longest keyword found there.
//   first, last : begin/end of the matched bytes; they are expected to point into g_Text.
//   The remaining two arguments are not used.
// Always returns true.
bool MyMatchCallback(const char* first, const char* last, uint32_t, void*) {
    // Matches shorter than 6 bytes are never turned into links.
    if (last - first < 6) return true;

    const int length = static_cast<int>(last - first);
    if (length == 6) {
        // Does not make a link for a 2-character word that starts with Hiragana.
        // NOTE(review): the byte range E3 81..83 also covers the Katakana block in UTF-8.
        const unsigned char lead = first[0];
        const unsigned char second = first[1];
        const bool kana_first = (lead == 0xE3) && (second >= 0x81) && (second <= 0x83);
        if (kana_first) return true;
    }

    // Keep only the longest match starting at this offset.
    const int offset = static_cast<int>(first - &g_Text[0]);
    int& longest = g_Map[offset];
    if (longest < length) longest = length;
    return true;
}
void MakeLinks() {
// Converts a text file into HTML, and make the links to the entries in Wikipedia.
size_t cnt;
nlib_dprintf(2, &cnt, "Generating HTML begin\n");
nlib_printf("<html>\n");
int i = 0;
int size = static_cast<int>(g_Text.size());
while (i < size) {
char text[1024];
if (g_Text[i] == 0x0D) {
++i;
continue;
}
if (g_Text[i] == 0x0A) {
nlib_printf("<br/>\n");
++i;
continue;
}
if (g_Map.find(i) == g_Map.end()) {
nlib_utf32_t utf32;
int nbytes = nlib_utf8_to_utf32char(&utf32, &g_Text[i]);
char utf8[5] = {0};
nlib_memcpy(utf8, 5, &g_Text[i], nbytes);
nlib_printf("%s", utf8);
i += nbytes;
continue;
}
int j = g_Map[i];
memcpy(text, &g_Text[i], j);
text[j] = '\0';
nlib_printf("<a href='https://secure.wikimedia.org/wikipedia/ja/wiki/%s'>%s</a>", text,
text);
i += j;
}
nlib_printf("</html>\n");
nlib_dprintf(2, &cnt, "Generating HTML end\n");
}
void ExecAhoCorasick() {
size_t cnt;
nlib_dprintf(2, &cnt, "AhoCorasick begin\n");
g_AhoCorasick.Match(&g_Text[0], MyMatchCallback);
nlib_dprintf(2, &cnt, "AhoCorasick end\n");
}
// Loads the two inputs of the sample.
//   txtfile : path of the text file to convert; its bytes are appended to g_Text and a
//             terminating '\0' is added.
// The automaton is imported into g_AhoCorasick from the file that
// "nlibpath:///readonly/wikipedia.ac" resolves to.
// Returns true on success; false if the text file cannot be opened or read, or if the
// automaton file cannot be opened or imported.
bool ReadFiles(const char* txtfile) {
    size_t cnt;
    nlib_dprintf(2, &cnt, "reading files begin\n");
    {
        FILE* fp = fopen(txtfile, "rb");
        if (!fp) return false;
        // Read in chunks rather than one fgetc() call per byte.
        char chunk[4096];
        size_t n;
        while ((n = fread(chunk, 1, sizeof(chunk), fp)) > 0) {
            g_Text.insert(g_Text.end(), chunk, chunk + n);
        }
        // A short read can mean an I/O error as well as end of file; do not go on with a
        // silently truncated text.
        const bool read_failed = ferror(fp) != 0;
        fclose(fp);
        if (read_failed) return false;
        g_Text.push_back('\0');
    }
    {
        char acfile[1024];
        g_PathMapper.ResolvePath(NULL, acfile, "nlibpath:///readonly/wikipedia.ac");
        FileInputStream stream;
        if (stream.Init() != 0) return false;
        if (stream.Open(acfile) != 0) return false;
        BinaryReader reader;
        reader.Init(&stream);
        if (!g_AhoCorasick.Import(&reader)) return false;
    }
    nlib_dprintf(2, &cnt, "reading files end\n");
    return true;
}
// Entry point of the sample: reads the inputs, runs the keyword search and prints the HTML.
//   argv[1] (optional) : text file to convert; when absent, the file that
//                        "nlibpath:///readonly/kokoro.txt" resolves to is used.
// Returns true on success; prints "ERROR!" and returns false when the inputs cannot be read.
bool SampleMain(int argc, char** argv) {
    InitPathMapperForSample();

    char default_txt[1024];
    g_PathMapper.ResolvePath(NULL, default_txt, "nlibpath:///readonly/kokoro.txt");

    // The text file must be in UTF-8
    const char* input = (argc > 1) ? argv[1] : default_txt;
    if (!ReadFiles(input)) {
        nlib_printf("ERROR!\n");
        return false;
    }

    ExecAhoCorasick();
    MakeLinks();
    return true;
}
NLIB_MAINFUNC