add word_gen

This commit is contained in:
xianjimli 2018-06-26 18:19:25 +08:00
parent 52bce8d332
commit dd8d81d09c
8 changed files with 220 additions and 13 deletions

1
.gitignore vendored
View File

@ -24,3 +24,4 @@ bin/resgen
bin/runLua
bin/runTest
bin/strgen
.vscode

View File

@ -137,6 +137,9 @@ bin\demoui
[TODO.md](TODO.md)
## 最新动态
* 2018/06/26
* 更新aworks工程并测试。
* 增加小工具用于生成输入法联想词库。
* 2018/06/25
* 完善候选字控件。

View File

@ -237,7 +237,7 @@ enum { NAME_LEN = 15 };
typedef float float_t;
#define TK_DEFAULT_FONT "default"
#define TK_DEFAULT_FONT_SIZE 20
#define TK_DEFAULT_FONT_SIZE 18
#define TK_MAX_FPS 100
#define TK_OPACITY_ALPHA 0xfa
#define TK_TRANSPARENT_ALPHA 0x05

1
tools/.gitignore vendored
View File

@ -1 +1,2 @@
sdldemo
.vscode

1
tools/word_gen/README.md Normal file
View File

@ -0,0 +1 @@
## 抓取网页,生成输入法联想词库。

View File

@ -1,18 +1,197 @@
var Crawler = require("crawler");
var Segment = require('segment');
const fs = require('fs');
const URL = require('url')
const Crawler = require("crawler");
const Segment = require('segment');
var segment = new Segment()
const segment = new Segment()
segment.useDefault();
var c = new Crawler({
rateLimit: 1000, // `maxConnections` will be forced to 1
callback: function(err, res, done){
const text = res.$("body").text();
const words = segment.doSegment(text);
console.log(words);
done();
// Word statistics: first character -> { remainder of word -> occurrence count }
// (filled by addWord, consumed by tidyResult).
let allWords = {};
// URLs that have already been handed to the crawler queue (see addUrls).
let doneURLS = {};
// Remaining budget of URLs that addUrls may still queue; decremented there.
let maxURLS = 10;
// Remaining number of pages to analyze; decremented in the crawler callback.
let maxPages = maxURLS;
// Upper bound on how many words tidyResult keeps per leading character.
const maxWordsPerChar = 10;
// Start URL list passed to c.queue() to seed the crawl.
let rootURL = ['https://blog.csdn.net/'];
/**
 * Write `word` into `buff` as little-endian 16-bit code units followed by a
 * 16-bit zero terminator.
 *
 * @param {Buffer} buff - Destination buffer.
 * @param {string} word - Text to encode, one UTF-16 code unit per 2 bytes.
 * @param {number} start - Byte offset in `buff` at which to begin writing.
 * @returns {number} Number of bytes written, terminator included.
 */
function bufferWriteWord(buff, word, start) {
  let cursor = start;
  for (let idx = 0; idx < word.length; idx += 1) {
    // writeUInt16LE returns the offset just past the bytes it wrote.
    cursor = buff.writeUInt16LE(word.charCodeAt(idx), cursor);
  }
  // Trailing 0x0000 marks the end of the word.
  cursor = buff.writeUInt16LE(0, cursor);
  return cursor - start;
}
/**
 * Serialize the tidied word table to the binary file "words.bin" in the
 * current working directory.
 *
 * Layout (all integers little-endian):
 *   header  : uint32 0 (meaning not visible here - presumably a version or
 *             reserved field, TODO confirm), uint32 entry count
 *   index   : per entry, uint32 code of the leading character and uint32
 *             byte offset of its record, relative to the content section
 *   content : per entry, uint32 word count followed by each word as
 *             zero-terminated 16-bit code units (see bufferWriteWord)
 *
 * @param {Array<{c: string, words: Array<{w: string}>}>} arr - Entries as
 *   produced by tidyResult.
 * @returns {void}
 */
function outputWords(arr) {
  const nr = arr.length;

  // Size the content section exactly instead of assuming it fits in a fixed
  // 10 MiB scratch buffer: 4 bytes for the count plus (length + 1) code
  // units of 2 bytes for every word.
  const contentSize = arr.reduce(
    (total, iter) =>
      iter.words.reduce((sum, w) => sum + (w.w.length + 1) * 2, total + 4),
    0
  );

  const headerBuffer = Buffer.alloc(8);
  const indexBuffer = Buffer.alloc(nr * 8);
  const contentBuffer = Buffer.alloc(contentSize);

  headerBuffer.writeUInt32LE(0, 0);
  headerBuffer.writeUInt32LE(nr, 4);

  let offsetIndex = 0;
  let offsetData = 0;
  arr.forEach(iter => {
    indexBuffer.writeUInt32LE(iter.c.charCodeAt(0), offsetIndex);
    indexBuffer.writeUInt32LE(offsetData, offsetIndex + 4);
    offsetIndex += 8;

    contentBuffer.writeUInt32LE(iter.words.length, offsetData);
    offsetData += 4;
    iter.words.forEach(w => {
      offsetData += bufferWriteWord(contentBuffer, w.w, offsetData);
    });
  });

  // Open the file only after serialization succeeded, so a failure above does
  // not leave a truncated file behind, and always release the descriptor.
  const f = fs.openSync("words.bin", "w+");
  try {
    fs.writeSync(f, headerBuffer);
    fs.writeSync(f, indexBuffer);
    fs.writeSync(f, contentBuffer, 0, offsetData);
  } finally {
    fs.closeSync(f);
  }
}
/**
 * Turn the raw `allWords` statistics into a sorted array ready for output.
 *
 * For every leading character the follow-up words are ordered by descending
 * frequency and cut to at most `maxWordsPerChar` items. The entries
 * themselves are ordered by the code of the leading character.
 *
 * The result is also dumped to stdout as tab-indented JSON.
 *
 * @returns {Array<{c: string, words: Array<{w: string, f: number}>}>}
 */
function tidyResult() {
  const arr = Object.keys(allWords).map(c => {
    const counts = allWords[c];
    const ranked = Object.keys(counts)
      .map(w => ({ w: w, f: counts[w] }))
      .sort((a, b) => b.f - a.f);
    return { c: c, words: ranked.slice(0, maxWordsPerChar) };
  });

  // `c` is a one-character string; subtracting the strings directly yields
  // NaN (which sort treats as "equal"), so compare the character codes.
  arr.sort((a, b) => a.c.charCodeAt(0) - b.c.charCodeAt(0));

  console.log(JSON.stringify(arr, null, '\t'));
  return arr;
}
/**
 * Finish the run: tidy the collected statistics, write them out via
 * outputWords, then terminate the process with exit code 0.
 */
function outputAndQuit() {
  const result = tidyResult();
  outputWords(result);
  process.exit(0);
}
/**
 * Decide whether a segmented token should be counted.
 *
 * A token qualifies when it is 2 to 8 code units long and none of its code
 * units is below 0x80 (i.e. it contains no ASCII characters).
 *
 * @param {string} w - Candidate token.
 * @returns {boolean} true when the token passes both checks.
 */
function isValidWord(w) {
  const size = w.length;
  if (size < 2 || size > 8) {
    return false;
  }

  const codes = Array.from({ length: size }, (_, idx) => w.charCodeAt(idx));
  return codes.every(code => code >= 0x80);
}
/**
 * Record one occurrence of `w` in the global `allWords` table.
 *
 * Words rejected by isValidWord are ignored. Accepted words are stored under
 * their first code unit, keyed by the rest of the word, with a running count.
 *
 * @param {string} w - Word produced by the segmenter.
 * @returns {void}
 */
function addWord(w) {
  if (!isValidWord(w)) {
    return;
  }

  const head = w.substring(0, 1);
  const tail = w.substring(1);

  if (!allWords[head]) {
    allWords[head] = {};
  }
  const bucket = allWords[head];
  bucket[tail] = bucket[tail] ? bucket[tail] + 1 : 1;
}
/**
 * Queue the links found on a page for crawling, within the global URL budget.
 *
 * Each href is resolved against `requestUrl`. A link is skipped when it was
 * already queued or when the resolved URL contains '#', 'css' or 'ico'.
 * Every remaining link consumes one unit of `maxURLS`; it is only queued
 * (and remembered in `doneURLS`) while the budget has not gone negative.
 *
 * @param {string} requestUrl - URL of the page the links came from.
 * @param {ArrayLike<{attribs: {href: string}}>} urls - Elements carrying an
 *   href attribute.
 * @param {{queue: function(string): *}} c - Crawler that receives new URLs.
 * @returns {void}
 */
function addUrls(requestUrl, urls, c) {
  const skipMarkers = ['#', 'css', 'ico'];

  for (let idx = 0; idx < urls.length; idx += 1) {
    const target = URL.resolve(requestUrl, urls[idx].attribs.href);

    const alreadyQueued = Boolean(doneURLS[target]);
    const isExcluded = skipMarkers.some(marker => target.indexOf(marker) >= 0);
    if (alreadyQueued || isExcluded) {
      continue;
    }

    // The budget is consumed even once it is exhausted, as before.
    maxURLS -= 1;
    if (maxURLS < 0) {
      continue;
    }

    console.log(`fetching: ${maxURLS} ${target}`);
    doneURLS[target] = true;
    c.queue(target);
  }
}
// Crawler instance: fetches queued pages, segments their body text into
// words, counts them via addWord and queues further links via addUrls.
var c = new Crawler({
  // Passed through to the crawler; per its documentation this is the minimum
  // gap in milliseconds between two requests.
  rateLimit: 100,
  callback: function (err, res, done) {
    // Page budget used up: write the result and exit.
    if (maxPages <= 0) {
      outputAndQuit();
      return;
    }

    if (err) {
      console.log(err);
      done();
      return;
    }

    // Only HTML responses are analyzed.
    const contentType = res.headers['content-type'];
    if (!contentType || contentType.indexOf('html') < 0) {
      done();
      return;
    }

    const urls = res.$("[href]");
    const text = res.$("body").text();
    const requestUrl = res.request.uri.href;
    const words = segment.doSegment(text);

    console.log(`${maxPages} ${requestUrl}`);
    words.forEach(element => {
      addWord(element.w);
    });

    maxPages--;
    addUrls(requestUrl, urls, c);
    done();
  }
});

// Failed requests and non-HTML responses do not decrement maxPages, so the
// queue can run dry before the budget check in the callback ever fires.
// Flush the collected words in that case as well instead of exiting silently
// without writing any output.
c.on('drain', function () {
  outputAndQuit();
});
// Seed the crawl queue with the starting URL(s).
c.queue("https://blog.csdn.net/")
c.queue(rootURL);

View File

@ -592,6 +592,11 @@
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
"integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
},
"querystring": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz",
"integrity": "sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA="
},
"readable-stream": {
"version": "2.3.6",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
@ -736,6 +741,22 @@
"mime-types": "~2.1.18"
}
},
"url": {
"version": "0.11.0",
"resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz",
"integrity": "sha1-ODjpfPxgUh63PFJajlW/3Z4uKPE=",
"requires": {
"punycode": "1.3.2",
"querystring": "0.2.0"
},
"dependencies": {
"punycode": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz",
"integrity": "sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0="
}
}
},
"util-deprecate": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",

View File

@ -10,6 +10,7 @@
"license": "ISC",
"dependencies": {
"crawler": "^1.1.4",
"segment": "^0.1.3"
"segment": "^0.1.3",
"url": "^0.11.0"
}
}