// awtk/tools/word_gen/gen.js

const fs = require('fs');
const URL = require('url')
const Crawler = require("crawler");
const Segment = require('segment');

let allWords = {};
let doneURLS = {};
let maxURLS = 10000;
let errorPages = 0;
let reservedPages = maxURLS;
const maxWordsPerChar = 15;
let rootURL = ['https://www.qisuu.la/du/', 'http://blog.sina.com.cn/', 'https://blog.csdn.net/'];

function isValidURL(url) {
  if (url.indexOf('javascript:') >= 0 || url.indexOf('css') >= 0 || url.indexOf(':') > 8) {
    return false;
  }

  if (doneURLS[url] || url.indexOf('#') >= 0 || url.indexOf('ico') >= 0) {
    return false;
  }

  if (url.indexOf('api.') >= 0 || url.indexOf('download') >= 0) {
    return false;
  }

  for (let i = 0; i < rootURL.length; i++) {
    let iter = rootURL[i];
    if (url.indexOf(iter) >= 0) {
      return true;
    }
  }

  return false;
}

/**
 * Writes `word` into `buff` as little-endian 16-bit code units followed by a
 * 16-bit zero terminator.
 *
 * @param {Buffer} buff - Destination buffer.
 * @param {string} word - Text to encode, one `charCodeAt` unit per 2 bytes.
 * @param {number} start - Byte offset in `buff` at which writing begins.
 * @returns {number} Number of bytes written, terminator included.
 */
function bufferWriteWord(buff, word, start) {
  const unitCount = word.length;
  let cursor = start;

  // writeUInt16LE returns the offset just past the bytes it wrote, so it can
  // be used directly as the running cursor.
  for (let idx = 0; idx < unitCount; idx += 1) {
    cursor = buff.writeUInt16LE(word.charCodeAt(idx), cursor);
  }

  // Zero terminator.
  cursor = buff.writeUInt16LE(0, cursor);

  return cursor - start;
}

/**
 * Serializes the word table into a binary file.
 *
 * File layout (all integers little-endian):
 *   header  : uint32 0, uint32 number of entries
 *   index   : per entry, uint32 code of the leading character and uint32
 *             byte offset of the entry inside the content section
 *   content : per entry, uint32 word count followed by each word as written
 *             by bufferWriteWord (16-bit code units plus a zero terminator)
 *
 * @param {Array<{c: string, words: Array<{w: string}>}>} arr - Entries to
 *   write, in the order they should appear in the index.
 * @param {string} [filename="words.bin"] - Path of the file to create.
 */
function outputWords(arr, filename = "words.bin") {
  const nr = arr.length;
  const headerBuffer = Buffer.alloc(8);
  const indexBuffer = Buffer.alloc(nr * 8);

  // Size the content section exactly instead of assuming a fixed capacity,
  // so a large table cannot overflow the buffer.
  let contentSize = 0;
  arr.forEach(iter => {
    contentSize += 4;
    iter.words.forEach(w => {
      contentSize += (w.w.length + 1) * 2;
    });
  });
  const contentBuffer = Buffer.alloc(contentSize);

  headerBuffer.writeUInt32LE(0, 0);
  headerBuffer.writeUInt32LE(nr, 4);

  let offsetData = 0;
  let offsetIndex = 0;
  arr.forEach(iter => {
    const code = iter.c.charCodeAt(0);
    indexBuffer.writeUInt32LE(code, offsetIndex);
    indexBuffer.writeUInt32LE(offsetData, offsetIndex + 4);
    offsetIndex += 8;

    contentBuffer.writeUInt32LE(iter.words.length, offsetData);
    offsetData += 4;
    iter.words.forEach(w => {
      offsetData += bufferWriteWord(contentBuffer, w.w, offsetData);
    });
  });

  // Open the file only once everything is serialized, so a failure above does
  // not truncate an existing file; always release the descriptor.
  const f = fs.openSync(filename, "w+");
  try {
    fs.writeSync(f, headerBuffer);
    fs.writeSync(f, indexBuffer);
    fs.writeSync(f, contentBuffer, 0, offsetData);
  } finally {
    fs.closeSync(f);
  }
}

/**
 * Converts the `allWords` counters into a sorted array.
 *
 * Each leading character becomes `{ c, words }`, where `words` lists the
 * follow-up strings as `{ w, f }` ordered by descending count and capped at
 * `maxWordsPerChar` entries. The entries themselves are ordered by the code
 * of the leading character. The result is also printed as JSON.
 *
 * @returns {Array<{c: string, words: Array<{w: string, f: number}>}>}
 */
function tidyResult() {
  const arr = Object.keys(allWords).map((lead) => {
    const counters = allWords[lead];
    const ranked = Object.keys(counters)
      .map((tail) => ({ w: tail, f: counters[tail] }))
      .sort((left, right) => right.f - left.f);

    // Keep only the most frequent follow-ups.
    return { c: lead, words: ranked.slice(0, maxWordsPerChar) };
  });

  arr.sort((left, right) => left.c.charCodeAt(0) - right.c.charCodeAt(0));

  console.log(JSON.stringify(arr, null, ' '));

  return arr;
}

/**
 * Writes the collected words to disk, reports how many pages failed to
 * segment, and terminates the process with exit code 0.
 */
function outputAndQuit() {
  const table = tidyResult();
  outputWords(table);

  console.log(`errorPages=${errorPages}`);

  process.exit(0);
}

/**
 * Tells whether a segmented token should be counted.
 *
 * @param {string} w - Candidate word.
 * @returns {boolean} true when the word has 2 to 8 code units and none of
 *   them is below 0x80.
 */
function isValidWord(w) {
  const size = w.length;
  if (size < 2 || size > 8) {
    return false;
  }

  let pos = 0;
  while (pos < size) {
    // Any code unit in the 0x00-0x7f range disqualifies the word.
    if (w.charCodeAt(pos) <= 0x7f) {
      return false;
    }
    pos += 1;
  }

  return true;
}

/**
 * Counts one occurrence of `w` in `allWords`, keyed first by the word's
 * leading code unit and then by the remainder of the word. Words rejected by
 * isValidWord are ignored.
 *
 * @param {string} w - Word to record.
 */
function addWord(w) {
  if (!isValidWord(w)) {
    return;
  }

  const head = w.substring(0, 1);
  const tail = w.substring(1);

  // Create the per-character bucket on first use.
  const bucket = allWords[head] || (allWords[head] = {});

  bucket[tail] = bucket[tail] ? bucket[tail] + 1 : 1;
}

/**
 * Resolves every link found on a page against the page URL and queues the
 * acceptable ones on the crawler while the URL budget (`maxURLS`) lasts.
 *
 * @param {string} requestUrl - URL of the page the links were found on.
 * @param {ArrayLike<{attribs: {href: string}}>} urls - Elements carrying an
 *   `href` attribute.
 * @param {{queue: function(string): void}} c - Crawler to queue URLs on.
 */
function addUrls(requestUrl, urls, c) {
  for (let idx = 0; idx < urls.length; idx += 1) {
    const target = URL.resolve(requestUrl, urls[idx].attribs.href);

    if (!isValidURL(target)) {
      console.log(`skip: ${target}`);
      continue;
    }

    // The budget is consumed by every valid link, even once it is exhausted.
    maxURLS -= 1;
    if (maxURLS < 0) {
      continue;
    }

    console.log(`fetching: ${maxURLS} ${target}`);
    doneURLS[target] = true;
    c.queue(target);
  }
}

// Segmenter shared by every addWords() call. It is created on first use so
// the default dictionaries are loaded once instead of once per page.
let sharedSegment = null;

/**
 * Splits `text` into words and records each of them through addWord.
 *
 * A failure while segmenting is logged and counted in `errorPages`; it does
 * not propagate to the caller.
 *
 * @param {string} text - Plain text of a crawled page.
 */
function addWords(text) {
  if (sharedSegment === null) {
    const segment = new Segment();
    segment.useDefault();
    // Cache only after the dictionaries loaded successfully.
    sharedSegment = segment;
  }

  try {
    const words = sharedSegment.doSegment(text);
    words.forEach(element => {
      addWord(element.w);
    });
  } catch (e) {
    console.log(e);
    errorPages++;
  }
}

/**
 * Crawler callback invoked once per fetched page.
 *
 * Pages whose body does not contain "UTF-8"/"utf-8", or whose content type
 * is missing or does not mention "html", are skipped. Accepted pages have
 * their body text counted and their links queued, and consume one unit of
 * `reservedPages`.
 *
 * NOTE(review): any request error makes the whole run write its results and
 * exit, not just skip the failing page - confirm this is intended.
 *
 * @param {?Error} err - Error reported by the crawler, if any.
 * @param {object} res - Crawler response (body, headers, request, `$`).
 * @param {function(): void} done - Must be called when processing finishes.
 */
function onTaskDone(err, res, done) {
  const budgetSpent = reservedPages <= 0;
  if (budgetSpent || err) {
    outputAndQuit();

    done();
    return;
  }

  const body = res.body;
  const mentionsUtf8 = body.indexOf("UTF-8") >= 0 || body.indexOf("utf-8") >= 0;
  if (!mentionsUtf8) {
    done();
    return;
  }

  const contentType = res.headers['content-type'];
  const looksLikeHtml = Boolean(contentType) && contentType.indexOf('html') >= 0;
  if (!looksLikeHtml) {
    done();
    return;
  }

  addWords(res.$("body").text());

  reservedPages -= 1;

  const pageUrl = res.request.uri.href;
  addUrls(pageUrl, res.$("[href]"), c);

  console.log(`${reservedPages} ${res.request.uri.href}`);

  done();
}

// Crawler shared by onTaskDone/addUrls; every fetched page is handed to
// onTaskDone.
const c = new Crawler({
  retries: 1,
  forceUTF8: false,
  timeout: 5000,
  skipDuplicates: true,
  callback: onTaskDone
});

// Skipped pages (non UTF-8 / non HTML) never decrement `reservedPages`, so the
// queue can run dry before the page budget is used up. Write the results in
// that case too instead of exiting silently without producing words.bin.
c.on('drain', outputAndQuit);

// Start crawling from the seed URLs.
c.queue(rootURL);