awtk/tools/word_gen/README.md
2018-07-01 10:28:15 +08:00

57 lines
1.1 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

## 抓取网页,生成输入法联想词库。
### 生成数据
在当前目录下运行:
```
npm install
node gen.js
```
### 更新数据
在 awtk 目录下运行:
```
./bin/resgen tools/word_gen/words.bin src/input_methods/suggest_words.inc
```
### 注意:
node\_modules/segment/lib/module/DictTokenizer.js 中的 getChunks 函数可能导致内存溢出(OOM)。
如果遇到该问题,可以按下面的示例修改该函数:将 chunks.length 的大小限制为 5000,并限制递归深度。
```
// Number of getChunks calls currently on the stack (i.e. the recursion depth).
let getChunksCallsNr = 0;

/**
 * Enumerate the word chains ("chunks") that start at position `pos`.
 *
 * For every word listed in `wordpos[pos]`, the position that follows it is
 * computed as `word.c + word.w.length`. If `wordpos` has no entry for that
 * position the chain ends with that word; otherwise the word is prepended to
 * each chain found recursively from the following position.
 *
 * Two limits keep memory use bounded:
 *   - at most 5000 continuations are kept per word;
 *   - recursion is aborted once more than 150 calls are nested.
 *
 * @param {Object} wordpos - Maps a position to the array of words found there.
 *   Each word must provide `c` and `w` (presumably the start offset and the
 *   word text; confirm against the tokenizer that builds this table).
 * @param {number} pos - Position at which the chains start.
 * @param {string} [text] - Unused; kept so the signature matches existing callers.
 * @returns {Array<Array<Object>>} One array of words per chain; empty when
 *   `wordpos[pos]` is missing or empty.
 * @throws {string} The string "get Chunks error" when the nesting limit is hit.
 *   A string (not an Error) is thrown on purpose to stay compatible with
 *   callers that already handle this value.
 */
var getChunks = function (wordpos, pos, text) {
  const words = wordpos[pos] || [];
  const ret = [];

  if (getChunksCallsNr > 150) {
    throw "get Chunks error";
  }

  getChunksCallsNr++;
  try {
    for (const word of words) {
      const nextcur = word.c + word.w.length;
      if (!wordpos[nextcur]) {
        // Nothing follows this word: it is a complete chain on its own.
        ret.push([word]);
      } else {
        const chunks = getChunks(wordpos, nextcur);
        // Cap the continuations kept per word so the result cannot blow up.
        for (let j = 0; j < chunks.length && j < 5000; j++) {
          ret.push([word].concat(chunks[j]));
        }
      }
    }
  } finally {
    // Always unwind the depth counter. Without this, an exception raised in a
    // nested call would leave the counter above the limit and every later
    // call would throw immediately.
    getChunksCallsNr--;
  }
  return ret;
};
```