mirror of
https://gitee.com/zlgopen/awtk.git
synced 2024-11-29 10:38:47 +08:00
add word_gen
This commit is contained in:
parent
52bce8d332
commit
dd8d81d09c
1
.gitignore
vendored
1
.gitignore
vendored
@ -24,3 +24,4 @@ bin/resgen
|
||||
bin/runLua
|
||||
bin/runTest
|
||||
bin/strgen
|
||||
.vscode
|
||||
|
@ -137,6 +137,9 @@ bin\demoui
|
||||
[TODO.md](TODO.md)
|
||||
|
||||
## 最新动态
|
||||
* 2018/06/26
|
||||
* 更新aworks工程并测试。
|
||||
* 增加小工具用于生成输入法联想词库。
|
||||
|
||||
* 2018/06/25
|
||||
* 完善候选字控件。
|
||||
|
@ -237,7 +237,7 @@ enum { NAME_LEN = 15 };
|
||||
typedef float float_t;
|
||||
|
||||
#define TK_DEFAULT_FONT "default"
|
||||
#define TK_DEFAULT_FONT_SIZE 20
|
||||
#define TK_DEFAULT_FONT_SIZE 18
|
||||
#define TK_MAX_FPS 100
|
||||
#define TK_OPACITY_ALPHA 0xfa
|
||||
#define TK_TRANSPARENT_ALPHA 0x05
|
||||
|
1
tools/.gitignore
vendored
1
tools/.gitignore
vendored
@ -1 +1,2 @@
|
||||
sdldemo
|
||||
.vscode
|
||||
|
1
tools/word_gen/README.md
Normal file
1
tools/word_gen/README.md
Normal file
@ -0,0 +1 @@
|
||||
## 抓取网页,生成输入法联想词库。
|
@ -1,18 +1,197 @@
|
||||
var Crawler = require("crawler");
|
||||
var Segment = require('segment');
|
||||
const fs = require('fs');
|
||||
const URL = require('url')
|
||||
const Crawler = require("crawler");
|
||||
const Segment = require('segment');
|
||||
|
||||
var segment = new Segment()
|
||||
const segment = new Segment()
|
||||
|
||||
segment.useDefault();
|
||||
|
||||
var c = new Crawler({
|
||||
rateLimit: 1000, // `maxConnections` will be forced to 1
|
||||
callback: function(err, res, done){
|
||||
const text = res.$("body").text();
|
||||
const words = segment.doSegment(text);
|
||||
console.log(words);
|
||||
done();
|
||||
// Crawl state shared by the functions in this script.

// Leading character -> { remainder of the word -> occurrence count }.
const allWords = {};

// Absolute URL -> true once the URL has been handed to the crawler queue.
const doneURLS = {};

// Remaining budget of links that may still be queued (decremented in addUrls).
let maxURLS = 10;

// Remaining budget of pages to segment before the result is written out.
let maxPages = maxURLS;

// Upper bound on the number of words kept for each leading character.
const maxWordsPerChar = 10;

// Seed URLs the crawl starts from.
const rootURL = ['https://blog.csdn.net/'];
|
||||
|
||||
/**
 * Serialises `word` into `buff` as a sequence of 16-bit little-endian code
 * units followed by a 16-bit zero terminator.
 *
 * @param {Buffer} buff - destination buffer
 * @param {string} word - text to encode, one `charCodeAt` unit per slot
 * @param {number} start - byte offset in `buff` where the first unit goes
 * @returns {number} number of bytes written, terminator included
 */
function bufferWriteWord(buff, word, start) {
  // Collect the code units first, then append the terminator as a final unit.
  const units = Array.from({ length: word.length }, (_, idx) => word.charCodeAt(idx));
  units.push(0);

  let cursor = start;
  for (const unit of units) {
    buff.writeUInt16LE(unit, cursor);
    cursor += 2;
  }

  return cursor - start;
}
|
||||
|
||||
/**
 * Writes the suggestion table to `words.bin` in the current directory.
 *
 * File layout (all integers little-endian):
 *   header  : uint32 0 (meaning not visible here - presumably a version or
 *             reserved field; confirm against the reader), uint32 entry count
 *   index   : per entry, uint32 char code of `c` + uint32 byte offset of the
 *             entry's record, relative to the start of the content section
 *   content : per entry, uint32 word count followed by each word written by
 *             bufferWriteWord (16-bit LE code units, zero-terminated)
 *
 * @param {Array<{c: string, words: Array<{w: string}>}>} arr - entries to
 *   serialise, in the order they should appear in the index
 */
function outputWords(arr) {
  const nr = arr.length;

  // Size the content section exactly instead of relying on a fixed-size
  // scratch buffer that overflows once the data outgrows it:
  // 4 bytes for the count plus (length + terminator) * 2 bytes per word.
  const contentSize = arr.reduce(
    (total, entry) => entry.words.reduce((sum, w) => sum + (w.w.length + 1) * 2, total + 4),
    0
  );

  const headerBuffer = Buffer.alloc(8);
  const indexBuffer = Buffer.alloc(nr * 8);
  const contentBuffer = Buffer.alloc(contentSize);

  headerBuffer.writeUInt32LE(0, 0);
  headerBuffer.writeUInt32LE(nr, 4);

  let offsetData = 0;
  let offsetIndex = 0;
  arr.forEach((entry) => {
    indexBuffer.writeUInt32LE(entry.c.charCodeAt(0), offsetIndex);
    indexBuffer.writeUInt32LE(offsetData, offsetIndex + 4);
    offsetIndex += 8;

    contentBuffer.writeUInt32LE(entry.words.length, offsetData);
    offsetData += 4;
    entry.words.forEach((w) => {
      offsetData += bufferWriteWord(contentBuffer, w.w, offsetData);
    });
  });

  // Open only after everything is encoded, and always release the descriptor.
  const f = fs.openSync("words.bin", "w+");
  try {
    fs.writeSync(f, headerBuffer);
    fs.writeSync(f, indexBuffer);
    fs.writeSync(f, contentBuffer, 0, offsetData);
  } finally {
    fs.closeSync(f);
  }
}
|
||||
|
||||
/**
 * Turns the `allWords` tally into the array that gets serialised.
 *
 * For every leading character the collected remainders are ordered by
 * descending occurrence count and cut to at most `maxWordsPerChar` items.
 * The resulting entries are ordered by ascending char code of the leading
 * character. The result is also dumped to stdout as tab-indented JSON.
 *
 * @returns {Array<{c: string, words: Array<{w: string, f: number}>}>}
 */
function tidyResult() {
  const arr = Object.entries(allWords).map(([c, tally]) => {
    const ranked = Object.entries(tally)
      .map(([w, f]) => ({ w, f }))
      .sort((a, b) => b.f - a.f);

    return {
      c,
      words: ranked.slice(0, maxWordsPerChar),
    };
  });

  // `c` is a string: subtracting the strings themselves yields NaN, which the
  // sort treats as "equal" and therefore leaves the entries unsorted.
  arr.sort((a, b) => a.c.charCodeAt(0) - b.c.charCodeAt(0));

  console.log(JSON.stringify(arr, null, '\t'));

  return arr;
}
|
||||
|
||||
/**
 * Finalises the run: tidies the collected words, writes them out through
 * outputWords and terminates the process with exit code 0.
 */
function outputAndQuit() {
  const tidied = tidyResult();
  outputWords(tidied);

  process.exit(0);
}
|
||||
|
||||
/**
 * Decides whether a segmented token is kept as a suggestion candidate.
 *
 * A token qualifies when it is 2 to 8 code units long and none of its code
 * units is below 0x80 (i.e. it contains no ASCII character).
 *
 * @param {string} w - token to check
 * @returns {boolean} true when the token qualifies
 */
function isValidWord(w) {
  const size = w.length;
  const badLength = size > 8 || size < 2;
  if (badLength) {
    return false;
  }

  // Advance while every unit seen so far is non-ASCII.
  let pos = 0;
  while (pos < size && w.charCodeAt(pos) >= 0x80) {
    pos++;
  }

  return !(pos < size);
}
|
||||
|
||||
/**
 * Records one occurrence of `w` in the `allWords` tally.
 *
 * The word is filed under its first character, keyed by the remaining
 * characters; tokens rejected by isValidWord are ignored.
 *
 * @param {string} w - segmented token
 */
function addWord(w) {
  const head = w.substring(0, 1);
  const tail = w.substring(1);

  if (isValidWord(w)) {
    // Create the per-character bucket on first use.
    const bucket = allWords[head] || (allWords[head] = {});
    bucket[tail] = bucket[tail] ? bucket[tail] + 1 : 1;
  }
}
|
||||
|
||||
/**
 * Queues the links found on a page, as long as the `maxURLS` budget lasts.
 *
 * Each href is resolved against `requestUrl`. A resolved URL is ignored when
 * it was queued before or contains '#', 'css' or 'ico'. Every remaining URL
 * consumes one unit of `maxURLS`; it is logged, remembered in `doneURLS` and
 * queued only while the budget has not dropped below zero.
 *
 * @param {string} requestUrl - URL of the page the links were found on
 * @param {ArrayLike<{attribs: {href: string}}>} urls - elements with an href
 * @param {{queue: function(string): void}} c - crawler to queue URLs on
 */
function addUrls(requestUrl, urls, c) {
  const skipMarkers = ['#', 'css', 'ico'];

  let index = 0;
  while (index < urls.length) {
    const node = urls[index];
    index += 1;

    const target = URL.resolve(requestUrl, node.attribs.href);
    const unwanted = doneURLS[target] || skipMarkers.some((marker) => target.includes(marker));

    if (!unwanted) {
      maxURLS -= 1;
      if (maxURLS >= 0) {
        console.log(`fetching: ${maxURLS} ${target}`);
        doneURLS[target] = true;
        c.queue(target);
      }
    }
  }
}
|
||||
|
||||
/**
 * Crawler callback: processes one fetched page.
 *
 * Once the `maxPages` budget is used up the result is written and the process
 * exits. Failed requests and responses whose content type does not mention
 * 'html' are skipped. For an HTML page the body text is segmented, every
 * token is passed to addWord, and the page's links are offered to addUrls.
 *
 * @param {Error|null} err - request error, if any
 * @param {object} res - crawler response (headers, `$`, request)
 * @param {function(): void} done - must be called when the page is handled
 */
function handlePage(err, res, done) {
  if (maxPages <= 0) {
    // outputAndQuit ends the process; the return only documents that.
    outputAndQuit();
    return;
  }

  if (err) {
    console.log(err);
    done();
    return;
  }

  const contentType = res.headers['content-type'];
  if (!contentType || contentType.indexOf('html') < 0) {
    done();
    return;
  }

  const urls = res.$("[href]");
  const text = res.$("body").text();
  const requestUrl = res.request.uri.href;
  const words = segment.doSegment(text);

  console.log(`${maxPages} ${requestUrl}`);

  words.forEach((element) => {
    addWord(element.w);
  });

  maxPages--;
  addUrls(requestUrl, urls, c);

  done();
}

const c = new Crawler({
  rateLimit: 100,
  callback: handlePage,
});

// Errors and non-HTML responses do not consume `maxPages`, so the queue can
// run dry before the page budget is reached; write the result in that case
// too instead of ending the run without any output.
c.on('drain', outputAndQuit);

// Remember the seeds as already queued so that pages linking back to them do
// not spend link budget on fetching them again.
rootURL.forEach((seed) => {
  doneURLS[seed] = true;
});

c.queue(rootURL);
|
21
tools/word_gen/package-lock.json
generated
21
tools/word_gen/package-lock.json
generated
@ -592,6 +592,11 @@
|
||||
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
|
||||
"integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
|
||||
},
|
||||
"querystring": {
|
||||
"version": "0.2.0",
|
||||
"resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz",
|
||||
"integrity": "sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA="
|
||||
},
|
||||
"readable-stream": {
|
||||
"version": "2.3.6",
|
||||
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz",
|
||||
@ -736,6 +741,22 @@
|
||||
"mime-types": "~2.1.18"
|
||||
}
|
||||
},
|
||||
"url": {
|
||||
"version": "0.11.0",
|
||||
"resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz",
|
||||
"integrity": "sha1-ODjpfPxgUh63PFJajlW/3Z4uKPE=",
|
||||
"requires": {
|
||||
"punycode": "1.3.2",
|
||||
"querystring": "0.2.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"punycode": {
|
||||
"version": "1.3.2",
|
||||
"resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz",
|
||||
"integrity": "sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0="
|
||||
}
|
||||
}
|
||||
},
|
||||
"util-deprecate": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
|
||||
|
@ -10,6 +10,7 @@
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"crawler": "^1.1.4",
|
||||
"segment": "^0.1.3"
|
||||
"segment": "^0.1.3",
|
||||
"url": "^0.11.0"
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user