nlib
succinct/kwlink/kwlink.cpp
青空文庫に収録されている「こころ」(夏目漱石)のテキストからWikipediaのタイトルを検出し、Wikipediaへのリンクを生成したHTMLテキストを出力します。
このサンプルで利用しているWikipediaのタイトルは123万個ありますが、AC法を利用することにより高速に検出することができます。
/*---------------------------------------------------------------------------*
Project: CrossRoad
Copyright (C)2012-2016 Nintendo. All rights reserved.
These coded instructions, statements, and computer programs contain
proprietary information of Nintendo of America Inc. and/or Nintendo
Company Ltd., and are protected by Federal copyright law. They may
not be disclosed to third parties or copied or duplicated in any form,
in whole or in part, without the prior written consent of Nintendo.
*---------------------------------------------------------------------------*/
#include <stdio.h>
#include <string.h>
#include <map>
#include <string>
#include <vector>
using std::vector;
using std::string;
using std::map;
using nlib_ns::succinct::AhoCorasick;
// Raw bytes of the input text file followed by a terminating '\0'
// (filled in by ReadFiles).
vector<char> g_Text;
// Aho-Corasick automaton; ReadFiles imports it from the wikipedia.ac file and
// ExecAhoCorasick runs it over g_Text.
AhoCorasick g_AhoCorasick;
// Byte offset into g_Text -> length in bytes of the longest match recorded at
// that offset by MyMatchCallback (only matches of at least 6 bytes are stored).
map<int, int> g_Map;
// C4996 is MSVC's deprecation warning; presumably silenced here because of the
// fopen() call in ReadFiles.
#ifdef _MSC_VER
#pragma warning(disable : 4996)
#endif
// NOTE(review): presumably provides g_PathMapper and InitPathMapperForSample(),
// which are used below but not declared in this file -- confirm against the
// nlib sample headers.
NLIB_PATHMAPPER_FORSAMPLE
// Match callback handed to AhoCorasick::Match().
//
// [first, last) is the matched byte range inside g_Text. For every match that is
// at least 6 bytes long, the longest length seen so far for its start offset is
// stored in g_Map. Exactly-6-byte matches whose leading bytes are E3 81..83
// (UTF-8 for U+3040..U+30FF, i.e. the Hiragana and Katakana blocks) are ignored
// so that short kana words do not become links.
//
// The third and fourth parameters are unused. Always returns true.
bool MyMatchCallback(const char* first, const char* last, uint32_t, void*) {
    // Too short to be worth linking.
    if (last - first < 6) return true;

    if (last - first == 6) {
        const unsigned char lead = first[0];
        const unsigned char second = first[1];
        const bool starts_with_kana = (lead == 0xE3) && (second >= 0x81) && (second <= 0x83);
        if (starts_with_kana) return true;
    }

    const char* text_begin = &g_Text[0];
    const int length = static_cast<int>(last - first);
    const int offset = static_cast<int>(first - text_begin);

    // operator[] creates a zero entry on first use, so the first match always wins.
    int& longest = g_Map[offset];
    if (longest < length) longest = length;
    return true;
}
// Converts the loaded text into HTML on standard output and turns every span
// recorded in g_Map into a link to the Japanese Wikipedia entry of that name.
//
//  - CR bytes are dropped and every LF is emitted as "<br/>".
//  - At an offset that has an entry in g_Map, the recorded number of bytes is
//    printed both as the link target and as the link text, and scanning resumes
//    right behind the span (matches that start inside it are therefore skipped).
//  - All other text is copied through one UTF-8 character at a time.
//
// NOTE(review): neither the plain text nor the keyword is HTML/URL-escaped
// before being printed, so characters such as '<', '&' or '\'' in the input
// end up verbatim in the markup.
void MakeLinks() {
    size_t cnt;
    nlib_dprintf(2, &cnt, "Generating HTML begin\n");
    nlib_printf("<html>\n");
    const int size = static_cast<int>(g_Text.size());
    int i = 0;
    while (i < size) {
        const char ch = g_Text[i];
        if (ch == 0x0D) {
            ++i;
            continue;
        }
        if (ch == 0x0A) {
            nlib_printf("<br/>\n");
            ++i;
            continue;
        }

        // Single lookup; the iterator is reused for the match length below.
        const std::map<int, int>::const_iterator hit = g_Map.find(i);
        if (hit == g_Map.end()) {
            nlib_utf32_t utf32;
            int nbytes = static_cast<int>(nlib_utf8_to_utf32char(&utf32, &g_Text[i]));
            // Guarantee forward progress: if the decoder consumed nothing
            // (NOTE(review): presumably its failure value for malformed UTF-8 --
            // confirm in the nlib docs), pass a single raw byte through instead
            // of spinning on the same offset forever.
            if (nbytes < 1) nbytes = 1;
            // Never copy more than the scratch buffer or the remaining text holds.
            if (nbytes > 4) nbytes = 4;
            if (nbytes > size - i) nbytes = size - i;
            char utf8[5] = {0};
            nlib_memcpy(utf8, 5, &g_Text[i], nbytes);
            nlib_printf("%s", utf8);
            i += nbytes;
            continue;
        }

        // A std::string sized to the match replaces the former fixed 1024-byte
        // stack buffer, which overflowed for matches of 1024 bytes or more.
        const int length = hit->second;
        const std::string keyword(&g_Text[i], static_cast<size_t>(length));
        nlib_printf("<a href='https://secure.wikimedia.org/wikipedia/ja/wiki/%s'>%s</a>",
                    keyword.c_str(), keyword.c_str());
        i += length;
    }
    nlib_printf("</html>\n");
    nlib_dprintf(2, &cnt, "Generating HTML end\n");
}
void ExecAhoCorasick() {
size_t cnt;
nlib_dprintf(2, &cnt, "AhoCorasick begin\n");
g_AhoCorasick.Match(&g_Text[0], MyMatchCallback);
nlib_dprintf(2, &cnt, "AhoCorasick end\n");
}
// Loads the two inputs of the sample.
//
//  1. Reads the whole of `txtfile` (binary mode) into g_Text and appends a
//     terminating '\0'.
//  2. Resolves "nlibpath:///readonly/wikipedia.ac" through g_PathMapper and
//     imports the automaton stored there into g_AhoCorasick.
//
// Returns true when both steps succeed. Returns false if `txtfile` is NULL or
// cannot be opened, if reading it fails, or if the automaton file cannot be
// opened or imported.
bool ReadFiles(const char* txtfile) {
    size_t cnt;
    nlib_dprintf(2, &cnt, "reading files begin\n");
    {
        if (!txtfile) return false;
        FILE* fp = fopen(txtfile, "rb");
        if (!fp) return false;

        // Start from a clean buffer so that a repeated call does not append
        // behind the previous text and its terminator.
        g_Text.clear();

        // Read in chunks instead of one fgetc() per byte.
        char chunk[4096];
        size_t nread;
        while ((nread = fread(chunk, 1, sizeof(chunk), fp)) > 0) {
            g_Text.insert(g_Text.end(), chunk, chunk + nread);
        }

        // fread() returns 0 both at end-of-file and on error; only the former
        // is acceptable, otherwise a truncated text would be processed silently.
        const bool read_failed = (ferror(fp) != 0);
        fclose(fp);
        if (read_failed) return false;

        g_Text.push_back('\0');
    }
    {
        char acfile[1024];
        g_PathMapper.ResolvePath(NULL, acfile, "nlibpath:///readonly/wikipedia.ac");
        FileInputStream stream;
        if (stream.Init() != 0) return false;
        if (stream.Open(acfile) != 0) return false;
        BinaryReader reader;
        reader.Init();
        reader.Open(&stream);
        if (!g_AhoCorasick.Import(&reader)) return false;
    }
    nlib_dprintf(2, &cnt, "reading files end\n");
    return true;
}
// Entry point of the sample.
//
// Loads the text named by argv[1] if given, otherwise the bundled
// "nlibpath:///readonly/kokoro.txt", searches it with the automaton and prints
// the resulting HTML. Prints "ERROR!" and returns false if loading fails;
// returns true otherwise.
bool SampleMain(int argc, char** argv) {
    InitPathMapperForSample();

    char default_path[1024];
    g_PathMapper.ResolvePath(NULL, default_path, "nlibpath:///readonly/kokoro.txt");

    // The text file must be in UTF-8
    const char* input = (argc > 1) ? argv[1] : default_path;
    if (ReadFiles(input)) {
        ExecAhoCorasick();
        MakeLinks();
        return true;
    }

    nlib_printf("ERROR!\n");
    return false;
}
// NOTE(review): presumably expands to the program entry point that invokes
// SampleMain() -- confirm against the nlib sample headers.
NLIB_MAINFUNC