update tools

This commit is contained in:
xianjimli 2018-07-01 10:28:15 +08:00
parent 9dd2ab1302
commit 6e8c9ba39c
3 changed files with 8 additions and 5 deletions

View File

@ -30,10 +30,11 @@ var getChunks = function (wordpos, pos, text) {
// debug(words);
// throw new Error();
var ret = [];
getChunksCallsNr++;
if(getChunksCallsNr > 100) {
if(getChunksCallsNr > 150) {
throw "get Chunks error";
}
getChunksCallsNr++;
for (var i = 0; i < words.length; i++) {
var word = words[i];
//debug(word);

View File

@ -6,9 +6,10 @@ const Segment = require('segment');
let allWords = {};
let doneURLS = {};
let maxURLS = 10000;
let errorPages = 0;
let reservedPages = maxURLS;
const maxWordsPerChar = 15;
let rootURL = ['http://blog.sina.com.cn/', 'https://blog.csdn.net/'];
let rootURL = ['https://www.qisuu.la/du/', 'http://blog.sina.com.cn/', 'https://blog.csdn.net/'];
function isValidURL(url) {
if (url.indexOf('javascript:') >= 0 || url.indexOf('css') >= 0 || url.indexOf(':') > 8) {
@ -119,7 +120,7 @@ function tidyResult() {
/**
 * Emit the final results and terminate the process.
 *
 * Passes the value returned by tidyResult() to outputWords(), logs the
 * current errorPages counter, then exits with status code 0.
 */
function outputAndQuit() {
const tidied = tidyResult();
outputWords(tidied);

// Report the error counter before the process goes away.
console.log(`errorPages=${errorPages}`);

process.exit(0);
}
@ -188,6 +189,7 @@ function addWords(text) {
});
} catch (e) {
console.log(e);
errorPages++;
}
}
@ -230,4 +232,4 @@ var c = new Crawler({
callback: onTaskDone
});
c.queue(rootURL);
c.queue(rootURL);

Binary file not shown.