nlib
succinct/kwlink/kwlink.cpp
This sample detects Wikipedia titles in the text of "Kokoro" by Souseki Natsume, taken from the Aozora Bunko online library, and outputs the text as HTML in which each detected title links to its Wikipedia entry.
1.23 million Wikipedia titles are used in this sample. The use of the Aho-Corasick algorithm allows for high speed detection.
/*--------------------------------------------------------------------------------*
Project: CrossRoad
Copyright (C)Nintendo All rights reserved.
These coded instructions, statements, and computer programs contain proprietary
information of Nintendo and/or its licensed developers and are protected by
national and international copyright laws. They may not be disclosed to third
parties or copied or duplicated in any form, in whole or in part, without the
prior written consent of Nintendo.
The content herein is highly confidential and should be handled accordingly.
*--------------------------------------------------------------------------------*/
#include <stdio.h>
#include <string.h>
#include <map>
#include <string>
#include <vector>
using std::vector;
using std::string;
using std::map;
using nlib_ns::succinct::AhoCorasick;
// Raw bytes of the input text. ReadFiles() appends a terminating '\0' after the file contents.
vector<char> g_Text;
// Keyword matcher. ReadFiles() imports its data from the wikipedia.ac resource.
AhoCorasick g_AhoCorasick;
// Byte offset into g_Text -> byte length of the longest match recorded at that offset.
// Filled by MyMatchCallback() and consumed by MakeLinks().
map<int, int> g_Map;
#ifdef _MSC_VER
#pragma warning(disable : 4996)
#endif
NLIB_PATHMAPPER_FORSAMPLE
// Match callback handed to AhoCorasick::Match().
//
// For every accepted match [first, last) it records in g_Map, keyed by the
// match's byte offset within g_Text, the largest match length seen so far at
// that offset.
//
// Filtering rules:
//   - matches shorter than 6 bytes are ignored;
//   - matches of exactly 6 bytes are ignored when they begin with the bytes
//     E3 81..83, which in UTF-8 is the U+3040-U+30FF range (hiragana and
//     katakana). For 3-byte characters, 6 bytes is a two-character word.
//
// The third and fourth arguments are unused. The function always returns
// true (presumably "keep scanning" -- confirm against the AhoCorasick docs).
bool MyMatchCallback(const char* first, const char* last, uint32_t, void*) {
    // Too short to be worth a link.
    if (last - first < 6) return true;

    const int length = static_cast<int>(last - first);
    if (length == 6) {
        const unsigned char lead = static_cast<unsigned char>(first[0]);
        const unsigned char next = static_cast<unsigned char>(first[1]);
        const bool kana_first = (lead == 0xE3) && !(next < 0x81 || next > 0x83);
        if (kana_first) return true;
    }

    const char* text_begin = &g_Text[0];
    const int offset = static_cast<int>(first - text_begin);
    // operator[] creates a zero entry on first use, so the first match at an
    // offset always wins against it; later ones only replace a shorter match.
    int& longest = g_Map[offset];
    if (longest < length) longest = length;
    return true;
}
// Converts the text in g_Text into HTML on standard output, turning every
// match recorded in g_Map into a link to the corresponding Japanese
// Wikipedia entry.
//
// Walks g_Text byte offset by byte offset:
//   - CR (0x0D) is dropped, LF (0x0A) becomes "<br/>";
//   - if g_Map holds a usable match length for the current offset, those
//     bytes are emitted as a link and skipped;
//   - otherwise one UTF-8 character is copied through unchanged.
//
// Progress messages are written with nlib_dprintf to descriptor 2.
//
// NOTE(review): neither the body text nor the link target is HTML/URL
// escaped, so '<', '&' or an apostrophe in the input ends up verbatim in the
// markup. Left as-is to keep the output format unchanged.
void MakeLinks() {
    size_t cnt;
    nlib_dprintf(2, &cnt, "Generating HTML begin\n");
    nlib_printf("<html>\n");

    const int size = static_cast<int>(g_Text.size());
    int i = 0;
    while (i < size) {
        const char c = g_Text[i];
        if (c == 0x0D) {
            ++i;
            continue;
        }
        if (c == 0x0A) {
            nlib_printf("<br/>\n");
            ++i;
            continue;
        }

        // Single lookup; a recorded length is only trusted when it is
        // positive and stays inside the buffer.
        const std::map<int, int>::const_iterator hit = g_Map.find(i);
        const int len = (hit != g_Map.end()) ? hit->second : 0;
        if (len <= 0 || len > size - i) {
            nlib_utf32_t utf32;
            const int nbytes = nlib_utf8_to_utf32char(&utf32, &g_Text[i]);
            if (nbytes <= 0 || nbytes > 4 || nbytes > size - i) {
                // Not a decodable UTF-8 sequence: skip one byte so the loop
                // always advances instead of spinning on the same offset.
                ++i;
                continue;
            }
            char utf8[5] = {0};
            nlib_memcpy(utf8, 5, &g_Text[i], nbytes);
            nlib_printf("%s", utf8);
            i += nbytes;
            continue;
        }

        // std::string sizes itself to the match, so a long keyword cannot
        // overflow a fixed-size stack buffer.
        const std::string title(&g_Text[i], static_cast<size_t>(len));
        nlib_printf("<a href='https://secure.wikimedia.org/wikipedia/ja/wiki/%s'>%s</a>",
                    title.c_str(), title.c_str());
        i += len;
    }

    nlib_printf("</html>\n");
    nlib_dprintf(2, &cnt, "Generating HTML end\n");
}
// Runs the keyword matcher over the loaded text.
//
// Passes the start of g_Text to g_AhoCorasick.Match() with MyMatchCallback
// as the per-match callback, and brackets the call with begin/end progress
// messages written through nlib_dprintf to descriptor 2.
// Expects g_Text to be non-empty; ReadFiles() leaves it terminated by '\0'.
void ExecAhoCorasick() {
    size_t written;
    nlib_dprintf(2, &written, "AhoCorasick begin\n");
    const char* haystack = &g_Text[0];
    g_AhoCorasick.Match(haystack, MyMatchCallback);
    nlib_dprintf(2, &written, "AhoCorasick end\n");
}
// Loads the two inputs the sample needs.
//
// 1. Appends the bytes of `txtfile` (opened in binary mode) to g_Text,
//    followed by a terminating '\0'.
// 2. Resolves "nlibpath:///readonly/wikipedia.ac" and imports it into
//    g_AhoCorasick.
//
// Parameters:
//   txtfile - path of the text file to read.
// Returns:
//   true when both inputs were loaded; false if the text file cannot be
//   opened or read, or if the stream setup or the import fails.
//
// Progress messages are written through nlib_dprintf to descriptor 2; the
// "end" message is only emitted on success.
bool ReadFiles(const char* txtfile) {
    size_t cnt;
    nlib_dprintf(2, &cnt, "reading files begin\n");
    {
        if (!txtfile) return false;
        FILE* fp = fopen(txtfile, "rb");
        if (!fp) return false;

        // Read in chunks instead of one fgetc() call per byte.
        char chunk[4096];
        size_t got;
        while ((got = fread(chunk, 1, sizeof(chunk), fp)) > 0) {
            g_Text.insert(g_Text.end(), chunk, chunk + got);
        }
        // A short read can mean EOF or an I/O error; only EOF is acceptable,
        // otherwise the text would be silently truncated.
        const bool read_failed = ferror(fp) != 0;
        fclose(fp);
        if (read_failed) return false;

        g_Text.push_back('\0');
    }
    {
        // Zero-initialised so that a failed path resolution yields an empty
        // path (which Open rejects) rather than indeterminate bytes.
        char acfile[1024] = {0};
        g_PathMapper.ResolvePath(NULL, acfile, "nlibpath:///readonly/wikipedia.ac");
        FileInputStream stream;
        if (stream.Init() != 0) return false;
        if (stream.Open(acfile) != 0) return false;
        // NOTE(review): the results of reader.Init()/Open() are not checked;
        // a failure there is expected to surface through Import() -- confirm.
        BinaryReader reader;
        reader.Init();
        reader.Open(&stream);
        if (!g_AhoCorasick.Import(&reader)) return false;
    }
    nlib_dprintf(2, &cnt, "reading files end\n");
    return true;
}
// Entry point of the sample.
//
// Initialises the sample path mapper, loads the input text and the keyword
// data, runs the matcher and prints the linked HTML.
//
// Parameters:
//   argc, argv - when argc > 1, argv[1] is used as the text file path;
//                otherwise "nlibpath:///readonly/kokoro.txt" is resolved and
//                used.
// Returns:
//   true on success; false (after printing "ERROR!") when ReadFiles() fails.
bool SampleMain(int argc, char** argv) {
    InitPathMapperForSample();
    // Zero-initialised so that a failed resolution leaves an empty path
    // instead of indeterminate bytes that would then be handed to fopen().
    char txtfile[1024] = {0};
    g_PathMapper.ResolvePath(NULL, txtfile, "nlibpath:///readonly/kokoro.txt");
    // The text file must be in UTF-8
    const char* input = (argc > 1) ? argv[1] : txtfile;
    if (!ReadFiles(input)) {
        nlib_printf("ERROR!\n");
        return false;
    }
    ExecAhoCorasick();
    MakeLinks();
    return true;
}
NLIB_MAINFUNC