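// awtk/tools/word_gen/gen_words_json.js
//
// The lines below are residue from the web viewer this file was copied from;
// they are kept as comments so the script parses.
// 194 lines | 3.6 KiB | JavaScript
// Raw / Normal View / History
// 2018-06-26 18:19:25 +08:00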
const fs = require('fs');
const URL = require('url')
const Crawler = require("crawler");
// 2020-02-13 10:40:32 +08:00
const Segment = require('novel-segment');
// 2018-06-26 14:47:03 +08:00
// 2018-06-26 18:19:25 +08:00
// ---- Shared crawl state (read and updated by the functions below) ----

// Word statistics: leading character -> { rest of the word -> occurrence count }.
const allWords = {};
// URLs that have already been handed to the crawler queue.
const doneURLS = {};
// Remaining number of URLs that may still be queued; decremented in addUrls.
let maxURLS = 100000;
// Number of pages whose text failed to segment; incremented in addWords.
let errorPages = 0;
// Remaining number of pages to process; decremented in onTaskDone.
let reservedPages = maxURLS;
// Maximum number of words kept per leading character in the output.
const maxWordsPerChar = 15;
// Seed URLs; isValidURL only accepts links that contain one of these strings.
const rootURL = ['https://www.qisuu.la/du/', 'http://blog.sina.com.cn/', 'https://blog.csdn.net/'];
// 2018-06-30 13:17:54 +08:00
/**
 * Decide whether a link should be queued for crawling.
 *
 * A URL is rejected when it:
 *  - contains 'javascript:' or 'css', or its first ':' sits after index 8;
 *  - is already recorded in doneURLS, or contains '#' or 'ico';
 *  - contains 'api.' or 'download'.
 * Anything that survives is accepted only if it contains one of the rootURL
 * entries as a substring.
 *
 * @param {string} url - Absolute URL to test.
 * @returns {boolean} true when the URL should be crawled.
 */
function isValidURL(url) {
  // Substring test shared by all of the filters below.
  const contains = (needle) => url.indexOf(needle) >= 0;

  const isScriptOrStyle = contains('javascript:') || contains('css') || url.indexOf(':') > 8;
  if (isScriptOrStyle) {
    return false;
  }

  const isSeenOrFragmentOrIcon = doneURLS[url] || contains('#') || contains('ico');
  if (isSeenOrFragmentOrIcon) {
    return false;
  }

  const isApiOrDownload = contains('api.') || contains('download');
  if (isApiOrDownload) {
    return false;
  }

  // Only stay on the sites listed in rootURL.
  return rootURL.some((root) => contains(root));
}
// 2018-06-26 18:19:25 +08:00
/**
 * Convert the allWords statistics into the sorted array written to disk.
 *
 * For every leading character an item `{ c, words }` is produced, where
 * `words` is a list of `{ w, f }` entries (w: rest of the word, f: count)
 * sorted by descending count and truncated to maxWordsPerChar entries.
 * Items are ordered by the UTF-16 code of the leading character.
 *
 * The resulting array is also printed to stdout as JSON.
 *
 * @returns {Array<{c: string, words: Array<{w: string, f: number}>}>}
 */
function tidyResult() {
  const arr = Object.keys(allWords).map((c) => {
    const counts = allWords[c];
    const words = Object.keys(counts)
      .map((w) => ({ w: w, f: counts[w] }))
      // Most frequent words first.
      .sort((a, b) => b.f - a.f)
      // Keep only the top entries for this character.
      .slice(0, maxWordsPerChar);

    return { c: c, words: words };
  });

  arr.sort((a, b) => a.c.charCodeAt(0) - b.c.charCodeAt(0));

  console.log(JSON.stringify(arr, null, ' '));

  return arr;
}
/**
 * Write the collected word statistics to "words.json" and end the process.
 *
 * The result of tidyResult() is serialized as tab-indented JSON, written
 * synchronously to the current working directory, a short summary is
 * logged, and the process exits with status 0. Nothing after the
 * process.exit() call runs.
 */
function outputAndQuit() {
  const result = tidyResult();
  const json = JSON.stringify(result, null, '\t');

  fs.writeFileSync("words.json", json);

  console.log(`errorPages=${errorPages}`);
  console.log(`Output: words.json`);

  process.exit(0);
}
function isValidWord(w) {
if (w.length > 8 || w.length < 2) {
return false;
}
for (let i = 0; i < w.length; i++) {
const c = w.charCodeAt(i);
if (c < 0x80) {
return false;
}
}
return true;
}
/**
 * Record one occurrence of a word in allWords.
 *
 * The word is split into its first UTF-16 unit (the lookup key) and the
 * remainder; the count stored under allWords[first][remainder] is
 * incremented. Words rejected by isValidWord() are ignored.
 *
 * @param {string} w - Word produced by the segmenter.
 */
function addWord(w) {
  const head = w.substring(0, 1);
  const tail = w.substring(1);

  if (!isValidWord(w)) {
    return;
  }

  // Create the per-character bucket on first use.
  const bucket = allWords[head] || (allWords[head] = {});
  bucket[tail] = bucket[tail] ? bucket[tail] + 1 : 1;
}
/**
 * Queue the links found on a page that pass isValidURL().
 *
 * Each link's href is resolved against the URL of the page it came from.
 * Every accepted link decrements the maxURLS budget; it is only recorded
 * in doneURLS and queued while the budget has not gone below zero.
 * Rejected links are logged as skipped.
 *
 * @param {string} requestUrl - URL of the page the links were found on.
 * @param {ArrayLike<{attribs: {href: string}}>} urls - Elements carrying an
 *   href attribute.
 * @param {{queue: function(string): void}} c - Crawler that receives new URLs.
 */
function addUrls(requestUrl, urls, c) {
  for (let i = 0; i < urls.length; i++) {
    const href = urls[i].attribs.href;
    const url = URL.resolve(requestUrl, href);

    if (!isValidURL(url)) {
      console.log(`skip: ${url}`);
      continue;
    }

    // The budget is decremented for every accepted link, even once it is
    // exhausted, so it can end up negative.
    maxURLS--;
    if (maxURLS >= 0) {
      console.log(`fetching: ${maxURLS} ${url}`);
      doneURLS[url] = true;
      c.queue(url);
    }
  }
}
// 2018-06-27 15:47:30 +08:00
/**
 * Segment a page's text into words and count each of them via addWord().
 *
 * A new Segment instance with its default configuration is created for
 * every call. If segmentation (or counting) throws, the error is logged and
 * errorPages is incremented; the error is not rethrown, so the crawl goes on.
 *
 * @param {string} text - Plain text extracted from a page.
 */
function addWords(text) {
  // NOTE(review): a fresh segmenter is built per page; whether one shared
  // instance could be reused safely is not visible from this file.
  const segment = new Segment();
  segment.useDefault();

  try {
    const words = segment.doSegment(text);
    words.forEach((element) => {
      addWord(element.w);
    });
  } catch (e) {
    console.log(e);
    errorPages++;
  }
}
// 2018-06-26 18:19:25 +08:00
// 2018-06-27 15:47:30 +08:00
/**
 * Crawler callback invoked once for every finished request.
 *
 * - When the request failed, or the reservedPages budget is used up,
 *   outputAndQuit() is called.
 * - Pages whose body mentions neither "UTF-8" nor "utf-8", or whose
 *   content-type header is missing or does not contain "html", are skipped.
 * - Otherwise the text of <body> is counted via addWords(), the page budget
 *   is decremented, and every element with an href attribute is passed to
 *   addUrls() for queueing on the crawler `c`.
 *
 * done() is called on every path that returns.
 *
 * @param {?Error} err - Request error, if any.
 * @param {Object} res - Crawler response; this function reads body, headers,
 *   request.uri.href and the `$` selector function.
 * @param {function(): void} done - Signals the crawler that this task is finished.
 */
function onTaskDone(err, res, done) {
  if (reservedPages <= 0 || err) {
    outputAndQuit();
    done();
    return;
  }

  // Only pages that declare UTF-8 somewhere in their body are processed.
  if (res.body.indexOf("UTF-8") < 0 && res.body.indexOf("utf-8") < 0) {
    done();
    return;
  }

  const contentType = res.headers['content-type'];
  if (!contentType || contentType.indexOf('html') < 0) {
    done();
    return;
  }

  addWords(res.$("body").text());

  reservedPages--;
  addUrls(res.request.uri.href, res.$("[href]"), c);

  console.log(`${reservedPages} ${res.request.uri.href}`);

  done();
}
// Crawler instance shared by the whole script; every finished request is
// reported to onTaskDone, which also uses `c` to queue newly found links.
// NOTE(review): option meanings (e.g. the unit of `timeout`) come from the
// crawler package and are not visible here; confirm against its docs.
const c = new Crawler({
  retries: 1,
  forceUTF8: false,
  timeout: 5000,
  skipDuplicates: true,
  callback: onTaskDone,
});

// Start the crawl from the seed URLs.
c.queue(rootURL);