mirror of https://gitee.com/fastnlp/fastNLP.git (synced 2024-11-30 03:07:59 +08:00)

Add files via upload

parent d26d5b56ac
commit 32256c6e51
Char-aware_NLM/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2017

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Char-aware_NLM/README.md (new file, 40 lines)
@@ -0,0 +1,40 @@
# PyTorch-Character-Aware-Neural-Language-Model

This is a PyTorch implementation of the character-aware neural language model proposed by Yoon Kim in this [paper](https://arxiv.org/abs/1508.06615).

## Requirements

The code was run and tested with **Python 3.5.2** and **PyTorch 0.3.1**.

## Hyperparameters

| Hyperparameter | Value |
| ------ | :------- |
| LSTM batch size | 20 |
| LSTM sequence length | 35 |
| LSTM hidden units | 300 |
| epochs | 35 |
| initial learning rate | 1.0 |
| character embedding dimension | 15 |

## Demo

Train the model on the pre-split train/valid/test data:

`python train.py`

The trained model will be saved to `cache/net.pkl`.

Test the model:

`python test.py`

Best result on the test set:

PPL = 127.2163

cross entropy loss = 4.8459

## Acknowledgement

This implementation borrowed ideas from

https://github.com/jarfo/kchar

https://github.com/cronos123/Character-Aware-Neural-Language-Models
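For reference, the two reported numbers are consistent with each other: perplexity is the exponential of the average cross-entropy loss. A minimal sketch of that relationship (illustrative only, not part of the repository):

```python
import math

cross_entropy = 4.8459             # reported average test loss (nats per word)
perplexity = math.exp(cross_entropy)
print(round(perplexity, 2))        # ~127.22, consistent with the reported PPL
```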
Char-aware_NLM/model.py (new file, 148 lines)
@@ -0,0 +1,148 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
    """Highway network"""
    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        # transform gate t mixes the nonlinear path with the identity path
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """CNN + highway network + LSTM
    # Input:
        3D tensor with shape [num_seq, seq_len, max_word_len+2]
    # Output:
        2D tensor with shape [num_seq*seq_len, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """
    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes
        self.convolutions = []

        # list of tuples: (the number of filters, width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_seq*seq_len, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size*lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
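To make the shape contract above concrete, here is a minimal smoke-test sketch for `charLM`, using made-up vocabulary sizes and the PyTorch 0.3-style `Variable` wrapping used throughout this repository (the sizes and tensors below are illustrative only):

```python
import torch
from torch.autograd import Variable
from model import charLM

# made-up sizes for illustration only
vocab_size, num_char = 1000, 60
max_word_len, batch, seq_len = 8, 2, 3

net = charLM(char_emb_dim=15, word_emb_dim=300,
             vocab_size=vocab_size, num_char=num_char, use_gpu=False)

# random character indices, shape [num_seq, seq_len, max_word_len+2]
x = Variable(torch.LongTensor(batch, seq_len, max_word_len + 2).random_(0, num_char))
hidden = (Variable(torch.zeros(2, batch, 300)),
          Variable(torch.zeros(2, batch, 300)))

out, hidden = net(x, hidden)
print(out.size())  # expected: [batch*seq_len, vocab_size] == [6, 1000]
```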
Char-aware_NLM/test.py (new file, 123 lines)
@@ -0,0 +1,123 @@
import os
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from model import charLM
from utilities import *
from collections import namedtuple


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def test(net, data, opt):
    net.eval()

    test_input = torch.from_numpy(data.test_input)
    test_label = torch.from_numpy(data.test_label)

    num_seq = test_input.size()[0] // opt.lstm_seq_len
    test_input = test_input[:num_seq*opt.lstm_seq_len, :]
    # [num_seq, seq_len, max_word_len+2]
    test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    criterion = nn.CrossEntropyLoss()

    loss_list = []
    num_hits = 0
    total = 0
    iterations = test_input.size()[0] // opt.lstm_batch_size
    test_generator = batch_generator(test_input, opt.lstm_batch_size)
    label_generator = batch_generator(test_label, opt.lstm_batch_size*opt.lstm_seq_len)

    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    add_loss = 0.0
    for t in range(iterations):
        batch_input = test_generator.__next__()
        batch_label = label_generator.__next__()

        net.zero_grad()
        hidden = [state.detach() for state in hidden]
        test_output, hidden = net(to_var(batch_input), hidden)

        test_loss = criterion(test_output, to_var(batch_label)).data
        loss_list.append(test_loss)
        add_loss += test_loss

    print("Test Loss={0:.4f}".format(float(add_loss) / iterations))
    print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations))))


#############################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        print("Cannot find prep.pt")

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:
        test_text = read_data("./test.txt")
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are next-word indices in word_dict, with the same length as the inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"test": test_set, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        test_set = data_sets["test"]
        test_label = data_sets["tlabel"]
        train_set = data_sets["tdata"]
        train_label = data_sets["trlabel"]

    DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label")
    data = DataTuple(test_input=test_set,
                     test_label=test_label, train_label=train_label, train_input=train_set)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20

    net = torch.load("cache/net.pkl")

    Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len",
                                     "max_word_len", "lstm_batch_size", "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start testing.")

    test(net, data, opt)
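The next-word labels built in `test.py` (and again in `train.py`) simply shift the token sequence by one position; a toy illustration with a made-up three-word corpus:

```python
# made-up toy corpus, for illustration only
text = ["the", "cat", "sat"]
word_dict = {"the": 0, "cat": 1, "sat": 2}

# same construction as in test.py / train.py
label = [word_dict[w] for w in text[1:]] + [word_dict[text[-1]]]
# label == [1, 2, 2]: position i is trained to predict word i+1,
# and the final position just repeats the last word's own index
```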
Char-aware_NLM/test.txt (new file, 3761 lines)
File diff suppressed because it is too large
Char-aware_NLM/train.py (new file, 268 lines)
@@ -0,0 +1,268 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
from model import charLM
from utilities import *
from collections import namedtuple
from test import test


def preprocess():

    word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt")
    num_words = len(word_dict)
    num_char = len(char_dict)
    char_dict["BOW"] = num_char+1
    char_dict["EOW"] = num_char+2
    char_dict["PAD"] = 0

    # dict of (int, string)
    reverse_word_dict = {value: key for key, value in word_dict.items()}
    max_word_len = max([len(word) for word in word_dict])

    objects = {
        "word_dict": word_dict,
        "char_dict": char_dict,
        "reverse_word_dict": reverse_word_dict,
        "max_word_len": max_word_len
    }

    torch.save(objects, "cache/prep.pt")
    print("Preprocess done.")


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def train(net, data, opt):

    torch.manual_seed(1024)

    train_input = torch.from_numpy(data.train_input)
    train_label = torch.from_numpy(data.train_label)
    valid_input = torch.from_numpy(data.valid_input)
    valid_label = torch.from_numpy(data.valid_label)

    # [num_seq, seq_len, max_word_len+2]
    num_seq = train_input.size()[0] // opt.lstm_seq_len
    train_input = train_input[:num_seq*opt.lstm_seq_len, :]
    train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    num_seq = valid_input.size()[0] // opt.lstm_seq_len
    valid_input = valid_input[:num_seq*opt.lstm_seq_len, :]
    valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    num_epoch = opt.epochs
    num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size

    learning_rate = opt.init_lr
    old_PPL = 100000
    best_PPL = 100000

    # CrossEntropyLoss combines LogSoftmax and NLLLoss
    criterion = nn.CrossEntropyLoss()

    # word_emb_dim == hidden_size / num of hidden units
    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    for epoch in range(num_epoch):

        ################ Validation ####################
        net.eval()
        loss_batch = []
        PPL_batch = []
        iterations = valid_input.size()[0] // opt.lstm_batch_size

        valid_generator = batch_generator(valid_input, opt.lstm_batch_size)
        vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size*opt.lstm_seq_len)

        for t in range(iterations):
            batch_input = valid_generator.__next__()
            batch_label = vlabel_generator.__next__()

            hidden = [state.detach() for state in hidden]
            valid_output, hidden = net(to_var(batch_input), hidden)

            length = valid_output.size()[0]

            # [num_sample-1, len(word_dict)] vs [num_sample-1]
            valid_loss = criterion(valid_output, to_var(batch_label))

            PPL = torch.exp(valid_loss.data)

            loss_batch.append(float(valid_loss))
            PPL_batch.append(float(PPL))

        PPL = np.mean(PPL_batch)
        print("[epoch {}] valid PPL={}".format(epoch, PPL))
        print("valid loss={}".format(np.mean(loss_batch)))
        print("PPL decrease={}".format(float(old_PPL - PPL)))

        # Preserve the best model
        if best_PPL > PPL:
            best_PPL = PPL
            torch.save(net.state_dict(), "cache/model.pt")
            torch.save(net, "cache/net.pkl")

        # Halve the learning rate if the validation PPL stops improving
        if float(old_PPL - PPL) <= 1.0:
            learning_rate /= 2
            print("halved lr:{}".format(learning_rate))

        old_PPL = PPL

        ##################################################
        #################### Training ####################
        net.train()
        optimizer = optim.SGD(net.parameters(),
                              lr=learning_rate,
                              momentum=0.85)

        # split the first dim
        input_generator = batch_generator(train_input, opt.lstm_batch_size)
        label_generator = batch_generator(train_label, opt.lstm_batch_size*opt.lstm_seq_len)

        for t in range(num_iter_per_epoch):
            batch_input = input_generator.__next__()
            batch_label = label_generator.__next__()

            # detach hidden state of LSTM from last batch
            hidden = [state.detach() for state in hidden]

            output, hidden = net(to_var(batch_input), hidden)
            # [num_word, vocab_size]

            loss = criterion(output, to_var(batch_label))

            net.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2)
            optimizer.step()

            if (t+1) % 100 == 0:
                print("[epoch {} step {}] train loss={}, Perplexity={}".format(
                    epoch+1, t+1, float(loss.data), float(np.exp(loss.data))))

    torch.save(net.state_dict(), "cache/model.pt")
    print("Training finished.")


################################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        preprocess()

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:
        train_text = read_data("./train.txt")
        valid_text = read_data("./valid.txt")
        test_text = read_data("./test.txt")

        train_set = np.array(text2vec(train_text, char_dict, max_word_len))
        valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are next-word indices in word_dict, with the same length as the inputs
        train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]])
        valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"tdata": train_set, "vdata": valid_set, "test": test_set,
                    "trlabel": train_label, "vlabel": valid_label, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        train_set = data_sets["tdata"]
        valid_set = data_sets["vdata"]
        test_set = data_sets["test"]
        train_label = data_sets["trlabel"]
        valid_label = data_sets["vlabel"]
        test_label = data_sets["tlabel"]

    DataTuple = namedtuple("DataTuple",
                           "train_input train_label valid_input valid_label test_input test_label")
    data = DataTuple(train_input=train_set,
                     train_label=train_label,
                     valid_input=valid_set,
                     valid_label=valid_label,
                     test_input=test_set,
                     test_label=test_label)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20
    # cnn_batch_size == lstm_seq_len * lstm_batch_size

    net = charLM(char_embedding_dim,
                 word_embed_dim,
                 num_words,
                 len(char_dict),
                 use_gpu=USE_GPU)

    for param in net.parameters():
        nn.init.uniform(param.data, -0.05, 0.05)

    Options = namedtuple("Options", [
        "cnn_batch_size", "init_lr", "lstm_seq_len",
        "max_word_len", "lstm_batch_size", "epochs",
        "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
                  init_lr=1.0,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  epochs=35,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start training.")

    # Training can be stopped at any time with Ctrl+C
    try:
        train(net, data, opt)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(net, "cache/net.pkl")
    print("save net")

    test(net, data, opt)
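Since `train()` saves the best weights via `state_dict()` (`cache/model.pt`) in addition to the fully pickled module (`cache/net.pkl`), a checkpoint can also be restored by rebuilding the model first. A minimal sketch, assuming the preprocessing cache already exists and the same hyperparameters are used as when the checkpoint was written:

```python
import torch
from model import charLM

# hypothetical restore of the best checkpoint written by train() above
objects = torch.load("cache/prep.pt")
num_words = len(objects["word_dict"])
num_char = len(objects["char_dict"])

net = charLM(15, 300, num_words, num_char, use_gpu=False)
net.load_state_dict(torch.load("cache/model.pt"))
net.eval()
```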
Char-aware_NLM/train.txt (new file, 42068 lines)
File diff suppressed because it is too large
Char-aware_NLM/utilities.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import re

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t*batch_size:(t+1)*batch_size]


def text2vec(words, char_dict, max_word_len):
    """ Return list of list of int """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def seq2vec(input_words, char_embedding, char_embedding_dim, char_table):
    """ Convert the input strings into character embeddings """
    # input_words == list of string
    # char_embedding == torch.nn.Embedding
    # char_embedding_dim == int
    # char_table == list of unique chars
    # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2]
    # NOTE: relies on a char_embedding_lookup helper that is not defined in this file
    max_word_len = max([len(word) for word in input_words])
    print("max_word_len={}".format(max_word_len))
    tensor_list = []

    start_column = torch.ones(char_embedding_dim, 1)
    end_column = torch.ones(char_embedding_dim, 1)

    for word in input_words:
        # convert string to word embedding
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
        # add start and end columns
        word_encoding = torch.cat([start_column, word_encoding, end_column], 1)
        # zero-pad right columns
        word_encoding = F.pad(word_encoding, (0, max_word_len-word_encoding.size()[1]+2)).data
        # create dimension
        word_encoding = word_encoding.unsqueeze(0)

        tensor_list.append(word_encoding)

    return torch.cat(tensor_list, 0)


def read_data(file_name):
    # Return: list of strings
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    # vocabulary == dict of (word, int)
    # Return: dict of (char, int), starting from 1
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
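A small usage sketch for the two helpers that drive the data pipeline, using a made-up character dictionary in the same format that `preprocess()` produces (illustrative values only):

```python
import torch
from utilities import text2vec, batch_generator

# toy character dictionary: real characters start at 1,
# plus the special BOW/EOW/PAD entries added by preprocess()
char_dict = {"t": 1, "h": 2, "e": 3, "c": 4, "a": 5,
             "BOW": 6, "EOW": 7, "PAD": 0}
max_word_len = 3

vecs = text2vec(["the", "cat"], char_dict, max_word_len)
# each word becomes BOW + char ids (padded to max_word_len) + EOW,
# i.e. rows of length max_word_len + 2:
# [[6, 1, 2, 3, 7], [6, 4, 5, 1, 7]]

batches = batch_generator(torch.LongTensor(vecs), batch_size=1)
print(next(batches).size())  # torch.Size([1, 5])
```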
Char-aware_NLM/valid.txt (new file, 3370 lines)
File diff suppressed because it is too large