mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-11-30 11:17:50 +08:00
7514be6f30
- restructure: move reproduction outside - add evaluate in tester
51 lines
1.4 KiB
Python
51 lines
1.4 KiB
Python
''''
|
|
Tokenize yelp dataset's documents using stanford core nlp
|
|
'''
|
|
|
|
import json
|
|
import os
|
|
import pickle
|
|
|
|
import nltk
|
|
from nltk.tokenize import stanford
|
|
|
|
input_filename = 'review.json'
|
|
|
|
# config for stanford core nlp
|
|
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
|
|
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
|
|
tokenizer = stanford.CoreNLPTokenizer()
|
|
|
|
in_dirname = 'review'
|
|
out_dirname = 'reviews'
|
|
|
|
f = open(input_filename, encoding='utf-8')
|
|
samples = []
|
|
j = 0
|
|
for i, line in enumerate(f.readlines()):
|
|
review = json.loads(line)
|
|
samples.append((review['stars'], review['text']))
|
|
if (i + 1) % 5000 == 0:
|
|
print(i)
|
|
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
|
|
j += 1
|
|
samples = []
|
|
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
|
|
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
|
|
# print(samples[0])
|
|
|
|
|
|
for fn in os.listdir(in_dirname):
|
|
print(fn)
|
|
precessed = []
|
|
for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
|
|
tokens = []
|
|
sents = nltk.tokenize.sent_tokenize(text)
|
|
for s in sents:
|
|
tokens.append(tokenizer.tokenize(s))
|
|
precessed.append((stars, tokens))
|
|
# print(tokens)
|
|
if len(precessed) % 100 == 0:
|
|
print(len(precessed))
|
|
pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb'))
|