2018-06-26 18:19:25 +08:00
|
|
|
|
## 抓取网页,生成输入法联想词库。
|
2018-06-27 15:47:30 +08:00
|
|
|
|
|
|
|
|
|
### 生成数据
|
|
|
|
|
|
|
|
|
|
在当前目录下运行:
|
|
|
|
|
|
2020-02-06 08:47:28 +08:00
|
|
|
|
* 准备
|
|
|
|
|
|
2018-06-27 15:47:30 +08:00
|
|
|
|
```
|
|
|
|
|
npm install
|
2020-02-06 08:47:28 +08:00
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
* 抓取网页,生成words.json
|
|
|
|
|
|
|
|
|
|
> 可以修改maxURLS改变最大网页数量。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
node gen_words_json.js
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
* 生成二进制的words.bin文件
|
|
|
|
|
|
|
|
|
|
> 可以根据自己的需要编辑words.json。
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
node to_words_bin.js
|
2018-06-27 15:47:30 +08:00
|
|
|
|
```
|
|
|
|
|
|
2020-04-21 10:22:05 +08:00
|
|
|
|
### 使用现有数据
|
|
|
|
|
|
|
|
|
|
chinese\_with\_freq.txt是从 https://github.com/ling0322/webdict 下载的。
|
|
|
|
|
|
|
|
|
|
如果不想自己生成,可以直接使用该文件:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
node to_json.js
|
|
|
|
|
```
|
|
|
|
|
|
2018-06-27 15:47:30 +08:00
|
|
|
|
### 更新数据
|
|
|
|
|
|
2020-02-06 08:47:28 +08:00
|
|
|
|
在awtk根目录下运行:
|
2018-06-27 15:47:30 +08:00
|
|
|
|
|
|
|
|
|
```
|
2020-05-11 11:07:48 +08:00
|
|
|
|
cp tools/word_gen/words.bin demos/assets/default/raw/data/suggest_words_zh_cn.dat
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
如果不支持文件系统,还需要运行更新资源的脚本:
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
python scripts/update_res.py all
|
2018-06-27 15:47:30 +08:00
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
### 注意:
|
|
|
|
|
node\_modules/segment/lib/module/DictTokenizer.js#getChunks 可能导致OOM。
|
|
|
|
|
|
|
|
|
|
如果遇到问题,可以限制chunks.length的大小和getChunks的递归深度,如下面分别限制为5000和150左右。
|
|
|
|
|
|
|
|
|
|
```
|
2018-06-30 18:07:28 +08:00
|
|
|
|
// Deepest nesting of getChunks calls tolerated before the input is rejected.
const GET_CHUNKS_MAX_DEPTH = 150;
// At most this many continuation chunks are combined with each candidate word.
const GET_CHUNKS_MAX_PER_WORD = 5000;
// Number of getChunks calls currently on the call stack.
let getChunksCallsNr = 0;

/**
 * Enumerate the candidate word sequences ("chunks") that start at `pos`.
 *
 * For every word listed at `wordpos[pos]`, the function recurses at the index
 * `word.c + word.w.length` and prefixes the word to each sequence found there.
 * A word with no entry at that index forms a one-word chunk on its own.
 *
 * Two limits keep memory use bounded on pathological input:
 *  - recursion is rejected once more than GET_CHUNKS_MAX_DEPTH calls are active;
 *  - only the first GET_CHUNKS_MAX_PER_WORD continuations of each word are kept.
 *
 * @param {Object|Array} wordpos - lookup from a position to the list of words
 *   at that position; each word exposes `c` and `w` (presumably start offset
 *   and word text - confirm against the tokenizer that builds it).
 * @param {number} pos - position to start from.
 * @param {string} [text] - not used by this function; forwarded to recursive calls.
 * @returns {Array<Array<Object>>} list of chunks, each an array of word objects.
 * @throws {string} "get Chunks error" when the nesting limit is exceeded. The
 *   thrown value is deliberately left as a string so existing callers that
 *   catch or compare it keep working.
 */
var getChunks = function (wordpos, pos, text) {
  const words = wordpos[pos] || [];
  const ret = [];

  // Checked before the counter is bumped, so a rejected call leaves it untouched.
  if (getChunksCallsNr > GET_CHUNKS_MAX_DEPTH) {
    throw "get Chunks error";
  }

  getChunksCallsNr++;
  try {
    for (const word of words) {
      const nextcur = word.c + word.w.length;

      if (!wordpos[nextcur]) {
        // Nothing follows this word: it is a complete chunk by itself.
        ret.push([word]);
        continue;
      }

      const chunks = getChunks(wordpos, nextcur, text);
      const limit = Math.min(chunks.length, GET_CHUNKS_MAX_PER_WORD);
      for (let j = 0; j < limit; j++) {
        ret.push([word].concat(chunks[j]));
      }
    }
  } finally {
    // Always restore the depth counter, including when a nested call throws;
    // otherwise one failure would leave the counter above the limit and make
    // every later call fail immediately.
    getChunksCallsNr--;
  }

  return ret;
};
|
|
|
|
|
```
|
|
|
|
|
|