update tools

This commit is contained in:
xianjimli 2018-07-01 10:28:15 +08:00
parent 9dd2ab1302
commit 6e8c9ba39c
3 changed files with 8 additions and 5 deletions

View File

@@ -30,10 +30,11 @@ var getChunks = function (wordpos, pos, text) {
// debug(words); // debug(words);
// throw new Error(); // throw new Error();
var ret = []; var ret = [];
getChunksCallsNr++; if(getChunksCallsNr > 150) {
if(getChunksCallsNr > 100) {
throw "get Chunks error"; throw "get Chunks error";
} }
getChunksCallsNr++;
for (var i = 0; i < words.length; i++) { for (var i = 0; i < words.length; i++) {
var word = words[i]; var word = words[i];
//debug(word); //debug(word);

View File

@@ -6,9 +6,10 @@ const Segment = require('segment');
let allWords = {}; let allWords = {};
let doneURLS = {}; let doneURLS = {};
let maxURLS = 10000; let maxURLS = 10000;
let errorPages = 0;
let reservedPages = maxURLS; let reservedPages = maxURLS;
const maxWordsPerChar = 15; const maxWordsPerChar = 15;
let rootURL = ['http://blog.sina.com.cn/', 'https://blog.csdn.net/']; let rootURL = ['https://www.qisuu.la/du/', 'http://blog.sina.com.cn/', 'https://blog.csdn.net/'];
function isValidURL(url) { function isValidURL(url) {
if (url.indexOf('javascript:') >= 0 || url.indexOf('css') >= 0 || url.indexOf(':') > 8) { if (url.indexOf('javascript:') >= 0 || url.indexOf('css') >= 0 || url.indexOf(':') > 8) {
@@ -119,7 +120,7 @@ function tidyResult() {
function outputAndQuit() { function outputAndQuit() {
outputWords(tidyResult()); outputWords(tidyResult());
console.log(`errorPages=${errorPages}`);
process.exit(0); process.exit(0);
} }
@@ -188,6 +189,7 @@ function addWords(text) {
}); });
} catch (e) { } catch (e) {
console.log(e); console.log(e);
errorPages++;
} }
} }
@@ -230,4 +232,4 @@ var c = new Crawler({
callback: onTaskDone callback: onTaskDone
}); });
c.queue(rootURL); c.queue(rootURL);

Binary file not shown.