Merge pull request #6 from henryL7/master
prototype and self-attention model
This commit is contained in: commit 96c65993bd
fastNLP/modules/prototype/README.md (new file, 41 lines)
@@ -0,0 +1,41 @@
# Prototype

## Word2Idx.py

A mapping model between words and indices.

## embedding.py

Embedding modules.

Contains a simple encapsulation of torch.nn.Embedding.

## encoder.py

Encoder modules.

Contains a simple encapsulation of torch.nn.LSTM.

## aggregation.py

Aggregation modules.

Contains a self-attention model, following the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130

## predict.py

Prediction modules.

Contains a two-layer perceptron for classification.

## example.py

An example showing how to use the above modules to build a model.

Contains a model for sentiment analysis on the Yelp dataset, together with its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.

## prepare.py

An example of using Word2Idx to build the Yelp datasets.

## dataloader.py

A dataloader for the Yelp dataset.

It is an iterable object that returns a zero-padded batch on every iteration.
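Taken together, these modules compose into the pipeline implemented in example.py. Below is a minimal wiring sketch (not part of the committed files; the vocabulary size, dimensions, and batch shape are illustrative only):

import torch
import embedding, encoder, aggregation, predict

# Illustrative sizes only; example.py uses the real Yelp vocabulary and hyper-parameters.
emb = embedding.Lookuptable(nums=1000, dims=100)
lstm = encoder.Lstm(input_size=100, hidden_size=300, num_layers=1, dropout=0.5, bidirectional=True)
attn = aggregation.Selfattention(input_size=600, d_a=350, r=10)   # 600 = 2 * hidden_size
mlp = predict.MLP(input_size=10 * 600, hidden_size=2000, output_size=5)

x = torch.randint(0, 1000, (4, 25))   # a batch of 4 index sequences of length 25
h = lstm(emb(x))                      # (4, 25, 600)
m, penalty = attn(h)                  # (4, 6000), (4,)
logits = mlp(m)                       # (4, 5) unnormalized class scores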
fastNLP/modules/prototype/Word2Idx.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import collections
import pickle


class Word2Idx():
    """
    Build a word index according to word frequency.

    If "min_freq" is given, only words with a frequency no less than min_freq are kept.
    If "max_num" is given, at most the max_num most frequent words are kept.
    "words" should be a list [w_1, w_2, ..., w_i, ..., w_n] where each w_i is a string representing a word.
    num is the size of the lookup table.
    w2i is a lookup table assigning each word an index.
    i2w is a list which serves as the inverse mapping of w2i.
    Note that index 0 is the token "<PAD>" for padding and
    index 1 is the token "<UNK>" for unregistered words,
    e.g. i2w[w2i["word"]] == "word"
    """
    def __init__(self):
        self.__w2i = dict()
        self.__i2w = []
        self.num = 0

    def build(self, words, min_freq=0, max_num=None):
        """Build the index from a list of words."""
        counter = collections.Counter(words)
        word_set = set(words)
        if max_num is not None:
            most_common = counter.most_common(min(len(word_set), max_num - 1))
        else:
            most_common = counter.most_common()
        # indices 0 and 1 are reserved for "<PAD>" and "<UNK>", so real words start at index 2
        self.__w2i = dict((w[0], i + 2) for i, w in enumerate(most_common) if w[1] >= min_freq)
        self.__w2i["<PAD>"] = 0
        self.__w2i["<UNK>"] = 1
        self.__i2w = ["<PAD>", "<UNK>"] + [w[0] for w in most_common if w[1] >= min_freq]
        self.num = len(self.__i2w)

    def w2i(self, word):
        """word to index"""
        if word in self.__w2i:
            return self.__w2i[word]
        # unregistered words map to "<UNK>"
        return self.__w2i["<UNK>"]

    def i2w(self, idx):
        """index to word"""
        if idx >= self.num:
            raise Exception("out of range\n")
        return self.__i2w[idx]

    def save(self, addr):
        """Save the model to the file at address "addr"."""
        with open(addr, "wb") as f:
            pickle.dump([self.__i2w, self.__w2i, self.num], f)

    def load(self, addr):
        """Load a model from the file at address "addr"."""
        with open(addr, "rb") as f:
            paras = pickle.load(f)
        self.__i2w, self.__w2i, self.num = paras[0], paras[1], paras[2]
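A short usage sketch for Word2Idx (the word list and the file name "vocab.pkl" are made up for illustration):

import Word2Idx

words = ["the", "food", "was", "great", "the", "service", "was", "slow"]
vocab = Word2Idx.Word2Idx()
vocab.build(words)

print(vocab.num)                      # vocabulary size, including "<PAD>" and "<UNK>"
print(vocab.w2i("the"))               # index of a registered word (>= 2)
print(vocab.i2w(vocab.w2i("the")))    # round-trips back to "the"
print(vocab.w2i("pizza"))             # unregistered word -> index of "<UNK>" (1)

vocab.save("vocab.pkl")               # persist; reload later with vocab.load("vocab.pkl")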
fastNLP/modules/prototype/aggregation.py (new file, 40 lines)
@@ -0,0 +1,40 @@
import torch
import torch.nn as nn
from torch.autograd import Variable


class Selfattention(nn.Module):
    """
    Self-attention module.

    Args:
        input_size : the size of the input vectors
        d_a : the width of the weight matrix
        r : the number of encoded vectors
    """
    def __init__(self, input_size, d_a, r):
        super(Selfattention, self).__init__()
        self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
        self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
        self.softmax = nn.Softmax(dim=2)
        self.tanh = nn.Tanh()

    def penalization(self, A):
        """
        Compute the penalization term for the attention matrix.
        """
        if self.W_s1.is_cuda:
            I = Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
        else:
            I = Variable(torch.eye(A.size(1)), requires_grad=False)
        M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
        M = M.view(M.size(0), -1)
        return torch.sum(M ** 2, dim=1)

    def forward(self, x):
        inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
        A = self.softmax(torch.matmul(self.W_s2, inter))
        out = torch.matmul(A, x)
        out = out.view(out.size(0), -1)
        penalty = self.penalization(A)
        return out, penalty
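A minimal shape check for Selfattention (the batch size, sequence length, and hyper-parameters below are illustrative; input_size matches the 2 * HIDDEN_SIZE output of the bidirectional LSTM used in example.py):

import torch
from aggregation import Selfattention

attn = Selfattention(input_size=600, d_a=350, r=10)
x = torch.randn(4, 25, 600)   # (batch, seq_len, input_size), e.g. BiLSTM output
out, penalty = attn(x)
print(out.size())             # (4, 6000): r = 10 attention vectors concatenated
print(penalty.size())         # (4,): one penalization value per example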
fastNLP/modules/prototype/dataloader.py (new file, 81 lines)
@@ -0,0 +1,81 @@
import random
import pickle
import torch
import numpy as np
from torch.autograd import Variable


def float_wrapper(x, requires_grad=True, using_cuda=True):
    """
    Transform a list of floats into a pytorch Variable.
    """
    if using_cuda:
        return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
    else:
        return Variable(torch.FloatTensor(x), requires_grad=requires_grad)


def long_wrapper(x, requires_grad=True, using_cuda=True):
    """
    Transform a list of ints into a pytorch Variable of type long.
    """
    if using_cuda:
        return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
    else:
        return Variable(torch.LongTensor(x), requires_grad=requires_grad)


def pad(X, using_cuda):
    """
    Zero-pad sequences to the same length, then stack them together.
    """
    maxlen = max([x.size(0) for x in X])
    Y = []
    for x in X:
        padlen = maxlen - x.size(0)
        if padlen > 0:
            if using_cuda:
                paddings = Variable(torch.zeros(padlen).long()).cuda()
            else:
                paddings = Variable(torch.zeros(padlen).long())
            x_ = torch.cat((x, paddings), 0)
            Y.append(x_)
        else:
            Y.append(x)
    return torch.stack(Y)


class DataLoader(object):
    """
    Load data with the form {"feature", "class"}.

    Args:
        fdir : data file address
        batch_size : batch size
        shuffle : if True, shuffle the dataset every epoch
        using_cuda : if True, return tensors on GPU
    """
    def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
        with open(fdir, "rb") as f:
            self.data = pickle.load(f)
        self.batch_size = batch_size
        self.num = len(self.data)
        self.count = 0
        self.iters = int(self.num / batch_size)
        self.shuffle = shuffle
        self.using_cuda = using_cuda

    def __iter__(self):
        return self

    def __next__(self):
        if self.count == self.iters:
            self.count = 0
            if self.shuffle:
                random.shuffle(self.data)
            raise StopIteration()
        else:
            batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
            self.count += 1
            X = [long_wrapper(x["sent"], using_cuda=self.using_cuda, requires_grad=False) for x in batch]
            X = pad(X, self.using_cuda)
            y = long_wrapper([x["class"] for x in batch], using_cuda=self.using_cuda, requires_grad=False)
            return {"feature": X, "class": y}
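Illustrative usage of DataLoader (assumes "train_set.pkl" has already been produced by prepare.py; pass using_cuda=False on a CPU-only machine):

import dataloader

loader = dataloader.DataLoader("train_set.pkl", batch_size=32, shuffle=True, using_cuda=False)
for batch in loader:
    X = batch["feature"]   # LongTensor of shape (32, max_len_in_batch), zero-padded
    y = batch["class"]     # LongTensor of shape (32,)
    break                  # inspect a single batch for demonstration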
fastNLP/modules/prototype/embedding.py (new file, 23 lines)
@@ -0,0 +1,23 @@
import torch
import torch.nn as nn


class Lookuptable(nn.Module):
    """
    A simple lookup table.

    Args:
        nums : the size of the lookup table
        dims : the size of each vector
        padding_idx : pads the output with zeros whenever it encounters this index
        sparse : if True, the gradient matrix will be a sparse tensor; in this case
                 only optim.SGD (CUDA and CPU) and optim.Adagrad (CPU) can be used
    """
    def __init__(self, nums, dims, padding_idx=0, sparse=False):
        super(Lookuptable, self).__init__()
        self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)

    def forward(self, x):
        return self.embed(x)


if __name__ == "__main__":
    model = Lookuptable(10, 20)
fastNLP/modules/prototype/encoder.py (new file, 22 lines)
@@ -0,0 +1,22 @@
import torch
import torch.nn as nn


class Lstm(nn.Module):
    """
    LSTM module.

    Args:
        input_size : input size
        hidden_size : hidden size
        num_layers : number of hidden layers
        dropout : dropout rate
        bidirectional : if True, becomes a bidirectional RNN
    """
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(Lstm, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,
                            dropout=dropout, bidirectional=bidirectional)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x
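A quick shape check for the Lstm wrapper (illustrative sizes): with bidirectional=True the output feature dimension is 2 * hidden_size, which is why example.py feeds 2 * HIDDEN_SIZE into Selfattention.

import torch
from encoder import Lstm

lstm = Lstm(input_size=100, hidden_size=300, num_layers=1, dropout=0.5, bidirectional=True)
x = torch.randn(4, 25, 100)   # (batch, seq_len, input_size)
out = lstm(x)
print(out.size())             # (4, 25, 600): forward and backward states concatenated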
fastNLP/modules/prototype/example.py (new file, 129 lines)
@@ -0,0 +1,129 @@
import time

import torch
import torch.nn as nn
import torch.optim as optim

import encoder
import aggregation
import embedding
import predict
import dataloader

WORD_NUM = 357361
WORD_SIZE = 100
HIDDEN_SIZE = 300
D_A = 350
R = 10
MLP_HIDDEN = 2000
CLASSES_NUM = 5


class Net(nn.Module):
    """
    A model for sentiment analysis using an LSTM encoder and self-attention.
    """
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)

    def forward(self, x):
        x = self.embedding(x)
        x = self.encoder(x)
        x, penalty = self.aggregation(x)
        x = self.predict(x)
        return x, penalty


def train(model_dict=None, using_cuda=True, learning_rate=0.06,
          momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
    """
    Training procedure.

    Args:
        model_dict : if given (a file address), training continues from the saved model;
                     otherwise a new model is trained from scratch.
        using_cuda : if True, training is conducted on GPU.
        learning_rate, momentum : parameters for the SGD optimizer.
        coef : the coefficient between the cross-entropy loss and the penalization term.
        interval : the reporting frequency, in iterations.

    The result is saved as "model_dict_<current time>.dict", which can be used for further training.
    """
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()

    if model_dict is not None:
        net.load_state_dict(torch.load(model_dict))

    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)

    # statistics
    loss_count = 0
    prepare_time = 0
    run_time = 0
    count = 0

    for epoch in range(epochs):
        print("epoch: %d" % (epoch))
        for i, batch in enumerate(dataset):
            t1 = time.time()
            X = batch["feature"]
            y = batch["class"]

            t2 = time.time()
            y_pred, y_penl = net(X)
            loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm(net.parameters(), 0.5)
            optimizer.step()
            t3 = time.time()

            loss_count += torch.sum(y_penl).data[0]
            prepare_time += (t2 - t1)
            run_time += (t3 - t2)
            p, idx = torch.max(y_pred.data, dim=1)
            count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))

            if (i + 1) % interval == 0:
                print("epoch: %d, iters: %d" % (epoch, i + 1))
                print("loss count:" + str(loss_count / (interval * batch_size)))
                print("accuracy:" + str(count / (interval * batch_size)))
                print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
                print("prepare time:" + str(prepare_time))
                print("run time:" + str(run_time))
                prepare_time = 0
                run_time = 0
                loss_count = 0
                count = 0
    string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    torch.save(net.state_dict(), "model_dict_%s.dict" % (string))


def test(model_dict, using_cuda=True):
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()
    net.load_state_dict(torch.load(model_dict))
    dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
    count = 0
    for i, batch in enumerate(dataset):
        X = batch["feature"]
        y = batch["class"]
        y_pred, _ = net(X)
        p, idx = torch.max(y_pred.data, dim=1)
        count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
    print("accuracy: %f" % (count / dataset.num))


if __name__ == "__main__":
    train(using_cuda=torch.cuda.is_available())
fastNLP/modules/prototype/predict.py (new file, 25 lines)
@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class MLP(nn.Module):
    """
    A two-layer perceptron for classification.

    Output : unnormalized probability distribution over classes
    Args:
        input_size : the size of the input
        hidden_size : the size of the hidden layer
        output_size : the size of the output
    """
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.L1 = nn.Linear(input_size, hidden_size)
        self.L2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.L2(F.relu(self.L1(x)))
        return out


if __name__ == "__main__":
    MLP(20, 30, 20)
fastNLP/modules/prototype/prepare.py (new file, 50 lines)
@@ -0,0 +1,50 @@
import pickle
import Word2Idx


def get_sets(m, n):
    """
    Get a train set containing m samples and a test set containing n samples.
    """
    samples = pickle.load(open("tuples.pkl", "rb"))
    if m + n > len(samples):
        print("asking for too many tuples\n")
        return
    train_samples = samples[:m]
    test_samples = samples[m:m + n]
    return train_samples, test_samples


def build_wordidx():
    """
    Build the word index using Word2Idx.
    """
    train, test = get_sets(500000, 2000)
    words = []
    for x in train:
        words += x[0]
    wordidx = Word2Idx.Word2Idx()
    wordidx.build(words)
    print(wordidx.num)
    print(wordidx.i2w(0))
    wordidx.save("wordidx.pkl")


def build_sets():
    """
    Build the train set and the test set, transforming words into indices.
    """
    train, test = get_sets(500000, 2000)
    wordidx = Word2Idx.Word2Idx()
    wordidx.load("wordidx.pkl")
    train_set = []
    for x in train:
        sent = [wordidx.w2i(w) for w in x[0]]
        train_set.append({"sent": sent, "class": x[1]})
    test_set = []
    for x in test:
        sent = [wordidx.w2i(w) for w in x[0]]
        test_set.append({"sent": sent, "class": x[1]})
    pickle.dump(train_set, open("train_set.pkl", "wb"))
    pickle.dump(test_set, open("test_set.pkl", "wb"))


if __name__ == "__main__":
    build_wordidx()
    build_sets()
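prepare.py reads its raw samples from "tuples.pkl", which get_sets() treats as a pickled list of (token_list, label) pairs (x[0] holds the tokens, x[1] the class). A hedged sketch of producing such a file, using made-up data in place of the real Yelp reviews:

import pickle

# Made-up samples purely for illustration; real data would come from the Yelp reviews.
samples = [
    (["the", "food", "was", "great"], 4),
    (["terrible", "service", "and", "cold", "fries"], 0),
]
with open("tuples.pkl", "wb") as f:
    pickle.dump(samples, f)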