tokenize data

parent 819914b6b8
commit 544ca8631b
BIN  model_inplement/code/__pycache__/model.cpython-36.pyc (new file)
Binary file not shown.
@@ -22,19 +22,16 @@ class HAN(nn.Module):
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
         self.softmax = nn.Softmax()

-    def forward(self, x, level='w'):
-        # input is a sequence of vector
-        # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        if level == 's':
-            v = self.sent_layer(x)
-            output = self.softmax(self.output_layer(v))
-            return output
-        elif level == 'w':
-            s = self.word_layer(x)
-            return s
-        else:
-            print('unknow level in Parameter!')
+    def forward(self, doc):
+        s_list = []
+        for sent in doc:
+            s_list.append(self.word_layer(sent))
+        s_vec = torch.cat(s_list, dim=1).t()
+        doc_vec = self.sent_layer(s_vec)
+        output = self.softmax(self.output_layer(doc_vec))
+        return output


 class AttentionNet(nn.Module):
     def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
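The new forward drops the level switch and encodes a whole document in one pass: word_layer turns each sentence into a column vector, the columns are concatenated and transposed, and sent_layer attends over that sequence of sentence vectors before classification. A minimal standalone sketch of just that concatenation step, with made-up tensors standing in for word_layer outputs (word_hidden_size=50 is taken from the HAN instantiation later in this commit, not from this hunk):

import torch

# Standalone illustration (not part of the commit): word_layer yields one
# (2*word_hidden_size, 1) column per sentence; cat(..., dim=1).t() stacks them
# into the (n_sents, 2*word_hidden_size) matrix that sent_layer consumes.
word_hidden_size = 50
s_list = [torch.randn(2 * word_hidden_size, 1) for _ in range(3)]  # 3 sentence vectors
s_vec = torch.cat(s_list, dim=1).t()
print(s_vec.shape)  # torch.Size([3, 100])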
@@ -60,11 +57,53 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)

     def forward(self, inputs):
-        # inputs's dim seq_len*word_dim
+        # inputs's dim (seq_len, word_dim)
         inputs = torch.unsqueeze(inputs, 1)
         h_t, hidden = self.gru(inputs)
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
         output = torch.mm(h_t.t(), alpha)
+        # output's dim (2*hidden_size, 1)
         return output
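For readers tracing the shapes: with a bidirectional GRU (which the 2*hidden_size output dimension suggests), h_t is (seq_len, 2*hidden_size), u = tanh(fc(h_t)) is (seq_len, context_vec_size), alpha holds one softmax weight per time step, and output is the attention-weighted sum of the GRU states. Note that nn.Softmax() with no dim argument raises a deprecation warning on recent PyTorch; the intended axis here appears to be the time dimension. A small sketch of the pooling arithmetic with random tensors standing in for the GRU and fc (sizes are assumptions matching the hidden sizes used later in this commit):

import torch
import torch.nn.functional as F

seq_len, hidden2, context_size = 7, 100, 100        # 2*hidden_size = 100 assumed
h_t = torch.randn(seq_len, hidden2)                 # stand-in for the GRU outputs
u = torch.tanh(torch.randn(seq_len, context_size))  # stand-in for tanh(fc(h_t))
context_vec = torch.randn(context_size, 1)
alpha = F.softmax(torch.mm(u, context_vec), dim=0)  # one weight per time step
output = torch.mm(h_t.t(), alpha)                   # weighted sum of states, (100, 1)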
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+
+optimizer = torch.optim.SGD(lr=0.01)
+criterion = nn.NLLLoss()
+epoch = 1
+batch_size = 10
+
+net = HAN(input_size=100, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+def dataloader(filename):
+    samples = pickle.load(open(filename, 'rb'))
+    return samples
+
+def gen_doc(text):
+    pass
+
+class SampleDoc:
+    def __init__(self, doc, label):
+        self.doc = doc
+        self.label = label
+
+    def __iter__(self):
+        for sent in self.doc:
+            for word in sent:
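The training scaffolding added above is still a stub: torch.optim.SGD requires an iterable of parameters as its first positional argument, and nn.NLLLoss expects log-probabilities, whereas HAN currently ends in a plain Softmax. A hedged sketch of how the setup could be completed (it reuses the exact HAN arguments from the commit; feeding log-probabilities to the loss is an assumed fix, not something this commit does):

import torch
import torch.nn as nn

# Sketch only: assumes HAN from the hunk above is importable; sizes copied from the commit.
net = HAN(input_size=100, output_size=5,
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)  # parameters are required
criterion = nn.NLLLoss()  # expects log-probs, e.g. torch.log(net(doc)) or a LogSoftmax head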
42  model_inplement/code/preprocess.py (new file)
@@ -0,0 +1,42 @@
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+
+# f = open('dataset/review.json', encoding='utf-8')
+# samples = []
+# j = 0
+# for i, line in enumerate(f.readlines()):
+#     review = json.loads(line)
+#     samples.append((review['stars'], review['text']))
+#     if (i+1) % 5000 == 0:
+#         print(i)
+#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+#         j += 1
+#         samples = []
+# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+samples = pickle.load(open('review/samples0.pkl', 'rb'))
+# print(samples[0])
+
+import os
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+dirname = 'review'
+dirname1 = 'reviews'
+
+for fn in os.listdir(dirname):
+    print(fn)
+    precessed = []
+    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        precessed.append((stars, tokens))
+        # print(tokens)
+        if len(precessed) % 100 == 0:
+            print(len(precessed))
+    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
+
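preprocess.py converts each pickled (stars, text) review into (stars, [[token, ...], ...]) — one token list per sentence — which is the document/sentence/word nesting the HAN expects once tokens are embedded. If the Stanford CoreNLP jar is not available, a hedged alternative using NLTK's own tokenizers yields the same structure (the path below is the one the commit itself reads; NLTK's 'punkt' models must be downloaded first):

import pickle
import nltk  # requires nltk.download('punkt') for the sentence/word tokenizers

def tokenize_samples(path):
    # Same (stars, [[token, ...], ...]) layout as the CoreNLP-based loop above.
    processed = []
    for stars, text in pickle.load(open(path, 'rb')):
        sents = nltk.tokenize.sent_tokenize(text)
        processed.append((stars, [nltk.word_tokenize(s) for s in sents]))
    return processed

# processed = tokenize_samples('review/samples0.pkl')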
0  model_inplement/code/train.py (new, empty file)