diff --git a/fastNLP/action/tester.py b/fastNLP/action/tester.py
index 7f660bb0..9d32ec40 100644
--- a/fastNLP/action/tester.py
+++ b/fastNLP/action/tester.py
@@ -1,87 +1,154 @@
-from collections import namedtuple
+import _pickle
 
 import numpy as np
+import torch
 
 from fastNLP.action.action import Action
+from fastNLP.action.action import RandomSampler, Batchifier
+from fastNLP.modules.utils import seq_mask
 
 
-class Tester(Action):
+class BaseTester(Action):
     """docstring for Tester"""
-    TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
-                                       "save_loss", "batch_size"])
 
     def __init__(self, test_args):
        """
        :param test_args: named tuple
        """
-        super(Tester, self).__init__()
-        self.validate_in_training = test_args.validate_in_training
-        self.save_dev_input = test_args.save_dev_input
-        self.valid_x = None
-        self.valid_y = None
-        self.save_output = test_args.save_output
+        super(BaseTester, self).__init__()
+        self.validate_in_training = test_args["validate_in_training"]
+        self.save_dev_data = None
+        self.save_output = test_args["save_output"]
         self.output = None
-        self.save_loss = test_args.save_loss
+        self.save_loss = test_args["save_loss"]
         self.mean_loss = None
-        self.batch_size = test_args.batch_size
+        self.batch_size = test_args["batch_size"]
+        self.pickle_path = test_args["pickle_path"]
+        self.iterator = None
 
-    def test(self, network, data):
-        print("testing")
-        network.mode(test=True)  # turn on the testing mode
-        if self.save_dev_input:
-            if self.valid_x is None:
-                valid_x, valid_y = network.prepare_input(data)
-                self.valid_x = valid_x
-                self.valid_y = valid_y
-            else:
-                valid_x = self.valid_x
-                valid_y = self.valid_y
-        else:
-            valid_x, valid_y = network.prepare_input(data)
+        self.model = None
+        self.eval_history = []
 
-        # split into batches by self.batch_size
-        iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)
+    def test(self, network):
+        # print("--------------testing----------------")
+        self.model = network
+
+        # turn on the testing mode; clean up the history
+        self.mode(network, test=True)
+
+        dev_data = self.prepare_input(self.pickle_path)
+
+        self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
 
         batch_output = list()
-        loss_history = list()
-        # turn on the testing mode of the network
-        network.mode(test=True)
+        num_iter = len(dev_data) // self.batch_size
 
-        for step in range(iterations):
-            batch_x, batch_y = test_batch_generator.__next__()
+        for step in range(num_iter):
+            batch_x, batch_y = self.batchify(dev_data)
 
-            # forward pass from test input to predicted output
-            prediction = network.data_forward(batch_x)
-
-            loss = network.get_loss(prediction, batch_y)
+            prediction = self.data_forward(network, batch_x)
+            eval_results = self.evaluate(prediction, batch_y)
 
             if self.save_output:
-                batch_output.append(prediction.data)
+                batch_output.append(prediction)
             if self.save_loss:
-                loss_history.append(loss)
-                self.log(self.make_log(step, loss))
+                self.eval_history.append(eval_results)
 
-        if self.save_loss:
-            self.mean_loss = np.mean(np.array(loss_history))
-        if self.save_output:
-            self.output = self.make_output(batch_output)
+    def prepare_input(self, data_path):
+        """
+        Save the dev data once it is loaded. Can return directly next time.
+        :param data_path: str, the path to the pickle data for dev
+        :return save_dev_data: list. Each entry is a sample, which is also a list of features and label(s).
+        """
+        if self.save_dev_data is None:
+            data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
+            self.save_dev_data = data_dev
+        return self.save_dev_data
 
-    @property
-    def loss(self):
-        return self.mean_loss
-
-    @property
-    def result(self):
-        return self.output
+    def batchify(self, data):
+        """
+        1. Perform batching from data and produce a batch of training data.
+        2. Add padding.
+        :param data: list. Each entry is a sample, which is also a list of features and label(s).
+            E.g.
+                [
+                    [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
+                    [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
+                    ...
+                ]
+        :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
+                batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
+        """
+        indices = next(self.iterator)
+        batch = [data[idx] for idx in indices]
+        batch_x = [sample[0] for sample in batch]
+        batch_y = [sample[1] for sample in batch]
+        batch_x = self.pad(batch_x)
+        return batch_x, batch_y
 
     @staticmethod
-    def make_output(batch_outputs):
-        # construct full prediction with batch outputs
-        return np.concatenate(batch_outputs, axis=0)
+    def pad(batch, fill=0):
+        """
+        Pad a batch of samples to maximum length.
+        :param batch: list of list
+        :param fill: word index to pad with, default 0.
+        :return: a padded batch
+        """
+        max_length = max([len(x) for x in batch])
+        for idx, sample in enumerate(batch):
+            if len(sample) < max_length:
+                batch[idx] = sample + [fill] * (max_length - len(sample))
+        return batch
 
-    def load_config(self, args):
+    def data_forward(self, network, data):
         raise NotImplementedError
 
-    def load_dataset(self, args):
+    def evaluate(self, predict, truth):
         raise NotImplementedError
+
+    @property
+    def matrices(self):
+        raise NotImplementedError
+
+    def mode(self, model, test=True):
+        """To do: combine this function with Trainer ?? """
+        if test:
+            model.eval()
+        else:
+            model.train()
+        self.eval_history.clear()
+
+
+class POSTester(BaseTester):
+    """
+    Tester for sequence labeling.
+    """
+
+    def __init__(self, test_args):
+        super(POSTester, self).__init__(test_args)
+        self.max_len = None
+        self.mask = None
+        self.batch_result = None
+
+    def data_forward(self, network, x):
+        """To do: combine with Trainer
+        :param network: the PyTorch model
+        :param x: list of list, [batch_size, max_len]
+        :return y: [batch_size, max_len, num_classes]
+        """
+        seq_len = [len(seq) for seq in x]
+        x = torch.Tensor(x).long()
+        self.batch_size = x.size(0)
+        self.max_len = x.size(1)
+        self.mask = seq_mask(seq_len, self.max_len)
+        y = network(x)
+        return y
+
+    def evaluate(self, predict, truth):
+        truth = torch.Tensor(truth)
+        loss, prediction = self.model.loss(predict, truth, self.mask, self.batch_size, self.max_len)
+        return loss.data
+
+    def matrices(self):
+        return np.mean(self.eval_history)
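The padding contract used by `BaseTester.batchify` is simple enough to check in isolation. A minimal self-contained sketch, assuming `0` is the reserved padding index (as in the preprocessing dictionaries):

```python
# Standalone sketch of the padding step in BaseTester.batchify:
# every sample is right-padded with `fill` up to the batch maximum length.
def pad(batch, fill=0):
    max_length = max(len(sample) for sample in batch)
    return [sample + [fill] * (max_length - len(sample)) for sample in batch]

print(pad([[4, 5, 6], [7, 8]]))  # [[4, 5, 6], [7, 8, 0]]
```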
diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py
index 437ab7d2..0ab9fee7 100644
--- a/fastNLP/action/trainer.py
+++ b/fastNLP/action/trainer.py
@@ -1,12 +1,12 @@
 import _pickle
-from collections import namedtuple
 
 import numpy as np
 import torch
 
 from fastNLP.action.action import Action
 from fastNLP.action.action import RandomSampler, Batchifier
-from fastNLP.action.tester import Tester
+from fastNLP.action.tester import POSTester
+from fastNLP.modules.utils import seq_mask
 
 
 class BaseTrainer(Action):
@@ -21,23 +21,29 @@ class BaseTrainer(Action):
         - grad_backward
         - get_loss
     """
-    TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"])
 
     def __init__(self, train_args):
         """
-        training parameters
+        :param train_args: dict of (key, value)
+
+        The base trainer requires the following keys:
+            - epochs: int, the number of epochs in training
+            - validate: bool, whether or not to validate on the dev set
+            - batch_size: int
+            - pickle_path: str, the path to pickle files for pre-processing
         """
         super(BaseTrainer, self).__init__()
-        self.n_epochs = train_args.epochs
-        self.validate = train_args.validate
-        self.batch_size = train_args.batch_size
-        self.pickle_path = train_args.pickle_path
+        self.n_epochs = train_args["epochs"]
+        self.validate = train_args["validate"]
+        self.batch_size = train_args["batch_size"]
+        self.pickle_path = train_args["pickle_path"]
         self.model = None
         self.iterator = None
         self.loss_func = None
+        self.optimizer = None
 
     def train(self, network):
-        """General training loop.
+        """General training steps.
         :param network: a model
 
         The method is framework independent.
@@ -51,22 +57,27 @@ class BaseTrainer(Action):
            - update
         Subclasses must implement these methods with a specific framework.
         """
+        # prepare model and data
         self.model = network
         data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path)
 
-        test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
-                                      save_dev_input=True, save_loss=True, batch_size=self.batch_size)
-        evaluator = Tester(test_args)
+        # define a tester over dev data
+        valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
+                      "save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path}
+        validator = POSTester(valid_args)
 
-        best_loss = 1e10
+        # main training epochs
         iterations = len(data_train) // self.batch_size
 
         for epoch in range(self.n_epochs):
-            self.mode(test=False)
 
+            # turn on network training mode; define optimizer; prepare batch iterator
+            self.mode(test=False)
             self.define_optimizer()
+            self.iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=True))
+
+            # training iterations in one epoch
             for step in range(iterations):
-                batch_x, batch_y = self.batchify(self.batch_size, data_train)
+                batch_x, batch_y = self.batchify(data_train)
 
                 prediction = self.data_forward(network, batch_x)
 
@@ -77,9 +88,8 @@ class BaseTrainer(Action):
             if self.validate:
                 if data_dev is None:
                     raise RuntimeError("No validation data provided.")
-                evaluator.test(network, data_dev)
-                if evaluator.loss < best_loss:
-                    best_loss = evaluator.loss
+                validator.test(network)
+                print("[epoch {}] dev loss={:.2f}".format(epoch, validator.matrices()))
 
         # finish training
 
@@ -155,23 +165,20 @@ class BaseTrainer(Action):
         """
         raise NotImplementedError
 
-    def batchify(self, batch_size, data):
+    def batchify(self, data):
         """
         1. Perform batching from data and produce a batch of training data.
         2. Add padding.
-        :param batch_size: int, the size of a batch
         :param data: list. Each entry is a sample, which is also a list of features and label(s).
             E.g.
                 [
-                    [[feature_1, feature_2, feature_3], [label_1. label_2]],  # sample 1
-                    [[feature_1, feature_2, feature_3], [label_1. label_2]],  # sample 2
+                    [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
+                    [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
                     ...
                 ]
-        :return batch_x: list. Each entry is a list of features of a sample.
-                batch_y: list. Each entry is a list of labels of a sample.
+        :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
+                batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
         """
-        if self.iterator is None:
-            self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
         indices = next(self.iterator)
         batch = [data[idx] for idx in indices]
         batch_x = [sample[0] for sample in batch]
@@ -195,7 +202,9 @@ class BaseTrainer(Action):
 
 
 class ToyTrainer(BaseTrainer):
-    """A simple trainer for a PyTorch model."""
+    """
+    deprecated
+    """
 
     def __init__(self, train_args):
         super(ToyTrainer, self).__init__(train_args)
@@ -230,7 +239,7 @@ class ToyTrainer(BaseTrainer):
 
 class WordSegTrainer(BaseTrainer):
     """
-        reserve for changes
+        deprecated
     """
 
     def __init__(self, train_args):
@@ -301,6 +310,7 @@ class WordSegTrainer(BaseTrainer):
         self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)
 
     def get_loss(self, predict, truth):
+        truth = torch.Tensor(truth)
         self._loss = torch.nn.CrossEntropyLoss(predict, truth)
         return self._loss
 
@@ -313,8 +323,76 @@ class WordSegTrainer(BaseTrainer):
         self.optimizer.step()
 
 
+class POSTrainer(BaseTrainer):
+    """
+    Trainer for sequence modeling.
+    """
+
+    def __init__(self, train_args):
+        super(POSTrainer, self).__init__(train_args)
+        self.vocab_size = train_args["vocab_size"]
+        self.num_classes = train_args["num_classes"]
+        self.max_len = None
+        self.mask = None
+
+    def prepare_input(self, data_path):
+        """
+        To do: load pkl files of train/dev/test and embedding
+        """
+        data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
+        data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
+        return data_train, data_dev, 0, 1
+
+    def data_forward(self, network, x):
+        """
+        :param network: the PyTorch model
+        :param x: list of list, [batch_size, max_len]
+        :return y: [batch_size, max_len, num_classes]
+        """
+        seq_len = [len(seq) for seq in x]
+        x = torch.Tensor(x).long()
+        self.batch_size = x.size(0)
+        self.max_len = x.size(1)
+        self.mask = seq_mask(seq_len, self.max_len)
+        y = network(x)
+        return y
+
+    def mode(self, test=False):
+        if test:
+            self.model.eval()
+        else:
+            self.model.train()
+
+    def define_optimizer(self):
+        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
+
+    def grad_backward(self, loss):
+        self.model.zero_grad()
+        loss.backward()
+
+    def update(self):
+        self.optimizer.step()
+
+    def get_loss(self, predict, truth):
+        """
+        Compute loss given prediction and ground truth.
+        :param predict: prediction label vector, [batch_size, max_len, num_classes]
+        :param truth: ground truth label vector, [batch_size, max_len]
+        :return: a scalar
+        """
+        truth = torch.Tensor(truth)
+        if self.loss_func is None:
+            if hasattr(self.model, "loss"):
+                self.loss_func = self.model.loss
+            else:
+                self.define_loss()
+        loss, prediction = self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len)
+        # print("loss={:.2f}".format(loss.data))
+        return loss
+
+
 if __name__ == "__main__":
-    train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./")
+    train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"}
     trainer = BaseTrainer(train_args)
     data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10]
-    trainer.batchify(batch_size=3, data=data_train)
+    trainer.batchify(data=data_train)
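For reviewers trying the new configuration style: the trainer now takes a plain dict instead of the old `TrainConfig` named tuple, and `batchify` draws index lists from a `Batchifier` iterator built once per epoch in `train`. A rough usage sketch on toy data, assuming `Batchifier`/`RandomSampler` yield index lists as used in `train` above:

```python
from fastNLP.action.action import Batchifier, RandomSampler

train_args = {"epochs": 1, "validate": False, "batch_size": 2, "pickle_path": "./"}

# toy dataset: each sample is [features, labels]
data_train = [[[1, 2, 3, 4], [0, 0, 0, 0]],
              [[1, 3, 5, 2], [1, 1, 1, 1]],
              [[2, 4], [0, 0]],
              [[5], [1]]]

iterator = iter(Batchifier(RandomSampler(data_train), train_args["batch_size"], drop_last=True))
indices = next(iterator)                    # e.g. [2, 0]
batch = [data_train[i] for i in indices]
batch_x = [sample[0] for sample in batch]   # features; padded to max_len before the forward pass
batch_y = [sample[1] for sample in batch]   # labels
```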
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 7132eb3b..d57a48db 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -15,7 +15,6 @@ class POSDatasetLoader(DatasetLoader):
 
     def __init__(self, data_name, data_path):
         super(POSDatasetLoader, self).__init__(data_name, data_path)
-        #self.data_set = self.load()
 
     def load(self):
         assert os.path.exists(self.data_path)
@@ -24,7 +23,7 @@ class POSDatasetLoader(DatasetLoader):
         return line
 
     def load_lines(self):
-        assert os.path.exists(self.data_path)
+        assert (os.path.exists(self.data_path))
         with open(self.data_path, "r", encoding="utf-8") as f:
             lines = f.readlines()
         return lines
diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py
index b8d88c35..8b9c6d88 100644
--- a/fastNLP/loader/preprocess.py
+++ b/fastNLP/loader/preprocess.py
@@ -46,19 +46,17 @@ class BasePreprocess(object):
 
 
 class POSPreprocess(BasePreprocess):
-
     """
     This class are used to preprocess the pos datasets.
-    In these datasets, each line is divided by '\t'
-    The first Col is the vocabulary.
-    The second Col is the labels.
+    In these datasets, each line is divided by '\t',
+    while the first Col is the vocabulary and the second
+    Col is the label.
     Different sentence are divided by an empty line.
     e.g:
     Tom label1
     and label2
     Jerry   label1
     .   label3
-    Hello   label4
     world   label5
     !   label3
@@ -71,11 +69,13 @@ class POSPreprocess(BasePreprocess):
         super(POSPreprocess, self).__init__(data, pickle_path)
         self.word_dict = None
         self.label_dict = None
+        self.data = data
+        self.pickle_path = pickle_path
         self.build_dict()
         self.word2id()
-        self.id2word()
+        self.vocab_size = self.id2word()
         self.class2id()
-        self.id2class()
+        self.num_classes = self.id2class()
         self.embedding()
         self.data_train()
         self.data_dev()
@@ -87,7 +87,8 @@ class POSPreprocess(BasePreprocess):
                           DEFAULT_RESERVED_LABEL[2]: 4}
         self.label_dict = {}
         for w in self.data:
-            if len(w) == 0:
+            w = w.strip()
+            if len(w) <= 1:
                 continue
 
             word = w.split('\t')
@@ -95,10 +96,11 @@ class POSPreprocess(BasePreprocess):
                 index = len(self.word_dict)
                 self.word_dict[word[0]] = index
 
-            for label in word[1: ]:
-                if label not in self.label_dict:
-                    index = len(self.label_dict)
-                    self.label_dict[label] = index
+            # for label in word[1: ]:
+            label = word[1]
+            if label not in self.label_dict:
+                index = len(self.label_dict)
+                self.label_dict[label] = index
 
     def pickle_exist(self, pickle_name):
         """
@@ -107,7 +109,7 @@ class POSPreprocess(BasePreprocess):
         """
         if not os.path.exists(self.pickle_path):
             os.makedirs(self.pickle_path)
-        file_name = self.pickle_path + pickle_name
+        file_name = os.path.join(self.pickle_path, pickle_name)
         if os.path.exists(file_name):
             return True
         else:
@@ -118,42 +120,48 @@ class POSPreprocess(BasePreprocess):
             return
             # nothing will be done if word2id.pkl exists
 
-        file_name = self.pickle_path + "word2id.pkl"
-        with open(file_name, "wb", encoding='utf-8') as f:
+        file_name = os.path.join(self.pickle_path, "word2id.pkl")
+        with open(file_name, "wb") as f:
             _pickle.dump(self.word_dict, f)
 
     def id2word(self):
         if self.pickle_exist("id2word.pkl"):
-            return
+            file_name = os.path.join(self.pickle_path, "id2word.pkl")
+            id2word_dict = _pickle.load(open(file_name, "rb"))
+            return len(id2word_dict)
             # nothing will be done if id2word.pkl exists
 
         id2word_dict = {}
         for word in self.word_dict:
             id2word_dict[self.word_dict[word]] = word
-        file_name = self.pickle_path + "id2word.pkl"
-        with open(file_name, "wb", encoding='utf-8') as f:
+        file_name = os.path.join(self.pickle_path, "id2word.pkl")
+        with open(file_name, "wb") as f:
             _pickle.dump(id2word_dict, f)
+        return len(id2word_dict)
 
     def class2id(self):
         if self.pickle_exist("class2id.pkl"):
             return
             # nothing will be done if class2id.pkl exists
 
-        file_name = self.pickle_path + "class2id.pkl"
-        with open(file_name, "wb", encoding='utf-8') as f:
+        file_name = os.path.join(self.pickle_path, "class2id.pkl")
+        with open(file_name, "wb") as f:
             _pickle.dump(self.label_dict, f)
 
     def id2class(self):
         if self.pickle_exist("id2class.pkl"):
-            return
+            file_name = os.path.join(self.pickle_path, "id2class.pkl")
+            id2class_dict = _pickle.load(open(file_name, "rb"))
+            return len(id2class_dict)
             # nothing will be done if id2class.pkl exists
 
         id2class_dict = {}
         for label in self.label_dict:
             id2class_dict[self.label_dict[label]] = label
-        file_name = self.pickle_path + "id2class.pkl"
-        with open(file_name, "wb", encoding='utf-8') as f:
+        file_name = os.path.join(self.pickle_path, "id2class.pkl")
+        with open(file_name, "wb") as f:
             _pickle.dump(id2class_dict, f)
+        return len(id2class_dict)
 
     def embedding(self):
         if self.pickle_exist("embedding.pkl"):
@@ -168,22 +176,26 @@ class POSPreprocess(BasePreprocess):
         data_train = []
         sentence = []
         for w in self.data:
-            if len(w) == 0:
+            w = w.strip()
+            if len(w) <= 1:
                 wid = []
                 lid = []
                 for i in range(len(sentence)):
                     wid.append(self.word_dict[sentence[i][0]])
                     lid.append(self.label_dict[sentence[i][1]])
                 data_train.append((wid, lid))
                 sentence = []
+                continue
             sentence.append(w.split('\t'))
 
-        file_name = self.pickle_path + "data_train.pkl"
-        with open(file_name, "wb", encoding='utf-8') as f:
+        file_name = os.path.join(self.pickle_path, "data_train.pkl")
+        with open(file_name, "wb") as f:
             _pickle.dump(data_train, f)
 
     def data_dev(self):
         pass
 
     def data_test(self):
-        pass
+        pass
\ No newline at end of file
diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py
index 9249e2e3..24dfdb85 100644
--- a/fastNLP/models/base_model.py
+++ b/fastNLP/models/base_model.py
@@ -3,32 +3,12 @@ import torch
 
 class BaseModel(torch.nn.Module):
     """Base PyTorch model for all models.
-    Three network modules presented:
-    - embedding module
-    - aggregation module
-    - output module
-    Subclasses must implement these three modules with "components".
+    To do: add some useful common features
     """
 
     def __init__(self):
         super(BaseModel, self).__init__()
 
-    def forward(self, *inputs):
-        x = self.encode(*inputs)
-        x = self.aggregation(x)
-        x = self.output(x)
-        return x
-
-    def encode(self, x):
-        raise NotImplementedError
-
-    def aggregation(self, x):
-        raise NotImplementedError
-
-    def output(self, x):
-        raise NotImplementedError
-
 
 class Vocabulary(object):
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
@@ -93,3 +73,4 @@ class Token(object):
         self.doc = doc
         self.token = doc[offset]
         self.i = offset
+
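The preprocessing contract that `POSTrainer.prepare_input` relies on is easiest to see on a tiny input. A self-contained sketch of what `POSPreprocess.build_dict` and `data_train` produce for the people.txt format (one token/tag pair per tab-separated line, sentences separated by blank lines; the reserved DEFAULT_* entries of the real dictionaries are omitted here):

```python
lines = ["迈\tB-v", "向\tE-v", "", "中\tB-nt", "共\tM-nt", ""]

word_dict, label_dict = {}, {}
data_train, sentence = [], []
for w in lines:
    w = w.strip()
    if len(w) <= 1:                      # blank line closes the current sentence
        wid = [word_dict[tok] for tok, _ in sentence]
        lid = [label_dict[tag] for _, tag in sentence]
        data_train.append((wid, lid))
        sentence = []
        continue
    tok, tag = w.split('\t')
    word_dict.setdefault(tok, len(word_dict))
    label_dict.setdefault(tag, len(label_dict))
    sentence.append((tok, tag))

print(data_train)   # [([0, 1], [0, 1]), ([2, 3], [2, 3])]
```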
diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py
new file mode 100644
index 00000000..80d13cf3
--- /dev/null
+++ b/fastNLP/models/sequence_modeling.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from fastNLP.models.base_model import BaseModel
+from fastNLP.modules.CRF import ContionalRandomField
+
+
+class SeqLabeling(BaseModel):
+    """
+    PyTorch network for sequence labeling
+    """
+
+    def __init__(self, hidden_dim,
+                 rnn_num_layer,
+                 num_classes,
+                 vocab_size,
+                 word_emb_dim=100,
+                 init_emb=None,
+                 rnn_mode="gru",
+                 bi_direction=False,
+                 dropout=0.5,
+                 use_crf=True):
+        super(SeqLabeling, self).__init__()
+
+        self.Emb = nn.Embedding(vocab_size, word_emb_dim)
+        if init_emb is not None:
+            self.Emb.weight = nn.Parameter(init_emb)
+
+        self.num_classes = num_classes
+        self.input_dim = word_emb_dim
+        self.layers = rnn_num_layer
+        self.hidden_dim = hidden_dim
+        self.bi_direction = bi_direction
+        self.dropout = dropout
+        self.mode = rnn_mode
+
+        if self.mode == "lstm":
+            self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
+                               bidirectional=self.bi_direction, dropout=self.dropout)
+        elif self.mode == "gru":
+            self.rnn = nn.GRU(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
+                              bidirectional=self.bi_direction, dropout=self.dropout)
+        elif self.mode == "rnn":
+            self.rnn = nn.RNN(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
+                              bidirectional=self.bi_direction, dropout=self.dropout)
+        else:
+            raise ValueError("invalid rnn_mode: {}".format(rnn_mode))
+        if bi_direction:
+            self.linear = nn.Linear(self.hidden_dim * 2, self.num_classes)
+        else:
+            self.linear = nn.Linear(self.hidden_dim, self.num_classes)
+        self.use_crf = use_crf
+        if self.use_crf:
+            self.crf = ContionalRandomField(num_classes)
+
+    def forward(self, x):
+        """
+        :param x: LongTensor, [batch_size, max_len]
+        :return y: [batch_size, max_len, num_classes]
+        """
+        x = self.Emb(x)
+        # [batch_size, max_len, word_emb_dim]
+        x, hidden = self.rnn(x)
+        # [batch_size, max_len, hidden_size * direction]
+        y = self.linear(x)
+        # [batch_size, max_len, num_classes]
+        return y
+
+    def loss(self, x, y, mask, batch_size, max_len):
+        """
+        Negative log likelihood loss.
+        :param x: FloatTensor, [batch_size, max_len, tag_size]
+        :param y: LongTensor, [batch_size, max_len]
+        :param mask: ByteTensor, [batch_size, max_len]
+        :param batch_size: int
+        :param max_len: int
+        :return loss: a scalar
+                prediction: [batch_size, max_len]
+        """
+        x = x.float()
+        y = y.long()
+        mask = mask.byte()
+
+        if self.use_crf:
+            total_loss = self.crf(x, y, mask)
+            tag_seq = self.crf.viterbi_decode(x, mask)
+        else:
+            loss_function = nn.NLLLoss(ignore_index=0, size_average=False)
+            x = x.view(batch_size * max_len, -1)
+            score = F.log_softmax(x, dim=-1)
+            total_loss = loss_function(score, y.view(batch_size * max_len))
+            _, tag_seq = torch.max(score, dim=1)
+            tag_seq = tag_seq.view(batch_size, max_len)
+        return torch.mean(total_loss), tag_seq
diff --git a/fastNLP/modules/CRF.py b/fastNLP/modules/CRF.py
index 6361b93d..96c84dca 100644
--- a/fastNLP/modules/CRF.py
+++ b/fastNLP/modules/CRF.py
@@ -82,7 +82,7 @@ class ContionalRandomField(nn.Module):
     def _glod_score(self, feats, tags, masks):
         """
         Compute the score for the gold path.
-        :param feats: FloatTensor, batch_size x tag_size x tag_size
+        :param feats: FloatTensor, batch_size x max_len x tag_size
         :param tags: LongTensor, batch_size x max_len
         :param masks: ByteTensor, batch_size x max_len
         :return:FloatTensor, batch_size
@@ -118,7 +118,7 @@ class ContionalRandomField(nn.Module):
     def forward(self, feats, tags, masks):
         """
         Calculate the neg log likelihood
-        :param feats:FloatTensor, batch_size x tag_size x tag_size
+        :param feats:FloatTensor, batch_size x max_len x tag_size
         :param tags:LongTensor, batch_size x max_len
         :param masks:ByteTensor batch_size x max_len
         :return:FloatTensor, batch_size
diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
index a19898c6..d23a0ec2 100644
--- a/fastNLP/modules/prototype/example.py
+++ b/fastNLP/modules/prototype/example.py
@@ -1,12 +1,13 @@
+import time
+
+import aggregation
+import dataloader
+import embedding
+import encoder
+import predict
 import torch
 import torch.nn as nn
-import encoder
-import aggregation
-import embedding
-import predict
 import torch.optim as optim
-import time
-import dataloader
 
 WORD_NUM = 357361
 WORD_SIZE = 100
@@ -16,6 +17,30 @@
 R = 10
 MLP_HIDDEN = 2000
 CLASSES_NUM = 5
 
+from fastNLP.models.base_model import BaseModel
+from fastNLP.action.trainer import BaseTrainer
+
+
+class MyNet(BaseModel):
+    def __init__(self):
+        super(MyNet, self).__init__()
+        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
+        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
+        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
+        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)
+        self.penalty = None
+
+    def encode(self, x):
+        return self.encoder(self.embedding(x))
+
+    def aggregate(self, x):
+        x, self.penalty = self.aggregation(x)
+        return x
+
+    def decode(self, x):
+        return [self.predict(x), self.penalty]
+
+
 class Net(nn.Module):
     """
     A model for sentiment analysis using lstm and self-attention
@@ -34,6 +59,19 @@ class Net(nn.Module):
         x = self.predict(x)
         return x, penalty
 
+
+class MyTrainer(BaseTrainer):
+    def __init__(self, args):
+        super(MyTrainer, self).__init__(args)
+        self.optimizer = None
+
+    def define_optimizer(self):
+        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
+
+    def define_loss(self):
+        self.loss_func = nn.CrossEntropyLoss()
+
+
 def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
     momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
     """
diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py
index 15afe883..a6b31a20 100644
--- a/fastNLP/modules/utils.py
+++ b/fastNLP/modules/utils.py
@@ -7,3 +7,9 @@ def mask_softmax(matrix, mask):
     else:
         raise NotImplementedError
     return result
+
+
+def seq_mask(seq_len, max_len):
+    mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
+    mask = torch.stack(mask, 1)
+    return mask
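`seq_mask` is the glue between padding and the loss: given per-sample lengths, it builds a `[batch_size, max_len]` byte mask that is 1 inside each sequence and 0 over the padding. A quick demonstration (output shown as a comment, formatting may differ slightly by torch version):

```python
from fastNLP.modules.utils import seq_mask

print(seq_mask([3, 1], 4))
# tensor([[1, 1, 1, 0],
#         [1, 0, 0, 0]], dtype=torch.uint8)
```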
diff --git a/fastNLP/reproduction/__init__.py b/fastNLP/reproduction/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/fastNLP/reproduction/CNN-sentence_classification/.gitignore b/reproduction/CNN-sentence_classification/.gitignore
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/.gitignore
rename to reproduction/CNN-sentence_classification/.gitignore
diff --git a/fastNLP/reproduction/CNN-sentence_classification/README.md b/reproduction/CNN-sentence_classification/README.md
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/README.md
rename to reproduction/CNN-sentence_classification/README.md
diff --git a/fastNLP/reproduction/CNN-sentence_classification/__init__.py b/reproduction/CNN-sentence_classification/__init__.py
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/__init__.py
rename to reproduction/CNN-sentence_classification/__init__.py
diff --git a/fastNLP/reproduction/CNN-sentence_classification/dataset.py b/reproduction/CNN-sentence_classification/dataset.py
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/dataset.py
rename to reproduction/CNN-sentence_classification/dataset.py
diff --git a/fastNLP/reproduction/CNN-sentence_classification/model.py b/reproduction/CNN-sentence_classification/model.py
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/model.py
rename to reproduction/CNN-sentence_classification/model.py
diff --git a/fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg b/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg
rename to reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg
diff --git a/fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos b/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos
rename to reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos
diff --git a/fastNLP/reproduction/CNN-sentence_classification/train.py b/reproduction/CNN-sentence_classification/train.py
similarity index 100%
rename from fastNLP/reproduction/CNN-sentence_classification/train.py
rename to reproduction/CNN-sentence_classification/train.py
diff --git a/fastNLP/reproduction/Char-aware_NLM/LICENSE b/reproduction/Char-aware_NLM/LICENSE
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/LICENSE
rename to reproduction/Char-aware_NLM/LICENSE
diff --git a/fastNLP/reproduction/Char-aware_NLM/README.md b/reproduction/Char-aware_NLM/README.md
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/README.md
rename to reproduction/Char-aware_NLM/README.md
diff --git a/fastNLP/reproduction/Char-aware_NLM/__init__.py b/reproduction/Char-aware_NLM/__init__.py
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/__init__.py
rename to reproduction/Char-aware_NLM/__init__.py
diff --git a/fastNLP/reproduction/Char-aware_NLM/model.py b/reproduction/Char-aware_NLM/model.py
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/model.py
rename to reproduction/Char-aware_NLM/model.py
diff --git a/fastNLP/reproduction/Char-aware_NLM/test.py b/reproduction/Char-aware_NLM/test.py
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/test.py
rename to reproduction/Char-aware_NLM/test.py
diff --git a/fastNLP/reproduction/Char-aware_NLM/test.txt b/reproduction/Char-aware_NLM/test.txt
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/test.txt
rename to reproduction/Char-aware_NLM/test.txt
diff --git a/fastNLP/reproduction/Char-aware_NLM/train.py b/reproduction/Char-aware_NLM/train.py
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/train.py
rename to reproduction/Char-aware_NLM/train.py
diff --git a/fastNLP/reproduction/Char-aware_NLM/train.txt b/reproduction/Char-aware_NLM/train.txt
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/train.txt
rename to reproduction/Char-aware_NLM/train.txt
diff --git a/fastNLP/reproduction/Char-aware_NLM/utilities.py b/reproduction/Char-aware_NLM/utilities.py
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/utilities.py
rename to reproduction/Char-aware_NLM/utilities.py
diff --git a/fastNLP/reproduction/Char-aware_NLM/valid.txt b/reproduction/Char-aware_NLM/valid.txt
similarity index 100%
rename from fastNLP/reproduction/Char-aware_NLM/valid.txt
rename to reproduction/Char-aware_NLM/valid.txt
diff --git a/fastNLP/reproduction/HAN-document_classification/README.md b/reproduction/HAN-document_classification/README.md
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/README.md
rename to reproduction/HAN-document_classification/README.md
diff --git a/fastNLP/reproduction/HAN-document_classification/__init__.py b/reproduction/HAN-document_classification/__init__.py
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/__init__.py
rename to reproduction/HAN-document_classification/__init__.py
diff --git a/fastNLP/reproduction/HAN-document_classification/data/test_samples.pkl b/reproduction/HAN-document_classification/data/test_samples.pkl
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/data/test_samples.pkl
rename to reproduction/HAN-document_classification/data/test_samples.pkl
diff --git a/fastNLP/reproduction/HAN-document_classification/data/train_samples.pkl b/reproduction/HAN-document_classification/data/train_samples.pkl
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/data/train_samples.pkl
rename to reproduction/HAN-document_classification/data/train_samples.pkl
diff --git a/fastNLP/reproduction/HAN-document_classification/data/yelp.word2vec b/reproduction/HAN-document_classification/data/yelp.word2vec
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/data/yelp.word2vec
rename to reproduction/HAN-document_classification/data/yelp.word2vec
diff --git a/fastNLP/reproduction/HAN-document_classification/evaluate.py b/reproduction/HAN-document_classification/evaluate.py
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/evaluate.py
rename to reproduction/HAN-document_classification/evaluate.py
diff --git a/fastNLP/reproduction/HAN-document_classification/model.py b/reproduction/HAN-document_classification/model.py
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/model.py
rename to reproduction/HAN-document_classification/model.py
diff --git a/fastNLP/reproduction/HAN-document_classification/preprocess.py b/reproduction/HAN-document_classification/preprocess.py
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/preprocess.py
rename to reproduction/HAN-document_classification/preprocess.py
diff --git a/fastNLP/reproduction/HAN-document_classification/train.py b/reproduction/HAN-document_classification/train.py
similarity index 100%
rename from fastNLP/reproduction/HAN-document_classification/train.py
rename to reproduction/HAN-document_classification/train.py
diff --git a/requirements.txt b/requirements.txt
index 0fc94538..d961dd92 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-numpy==1.14.2
+numpy>=1.14.2
 torch==0.4.0
-torchvision==0.1.8
+torchvision>=0.1.8
diff --git a/test/data_for_tests/people.txt b/test/data_for_tests/people.txt
new file mode 100644
index 00000000..f34c85cb
--- /dev/null
+++ b/test/data_for_tests/people.txt
@@ -0,0 +1,67 @@
+迈 B-v
+向 E-v
+充 B-v
+满 E-v
+希 B-n
+望 E-n
+的 S-u
+新 S-a
+世 B-n
+纪 E-n
+— B-w
+— E-w
+一 B-t
+九 M-t
+九 M-t
+八 M-t
+年 E-t
+新 B-t
+年 E-t
+讲 B-n
+话 E-n
+( S-w
+附 S-v
+图 B-n
+片 E-n
+1 S-m
+张 S-q
+) S-w
+
+中 B-nt
+共 M-nt
+中 M-nt
+央 E-nt
+总 B-n
+书 M-n
+记 E-n
+、 S-w
+国 B-n
+家 E-n
+主 B-n
+席 E-n
+江 B-nr
+泽 M-nr
+民 E-nr
+
+( S-w
+一 B-t
+九 M-t
+九 M-t
+七 M-t
+年 E-t
+十 B-t
+二 M-t
+月 E-t
+三 B-t
+十 M-t
+一 M-t
+日 E-t
+) S-w
+
+1 B-t
+2 M-t
+月 E-t
+3 B-t
+1 M-t
+日 E-t
+, S-w
\ No newline at end of file
diff --git a/test/test_POS_pipeline.py b/test/test_POS_pipeline.py
new file mode 100644
index 00000000..af22e3b9
--- /dev/null
+++ b/test/test_POS_pipeline.py
@@ -0,0 +1,35 @@
+import sys
+
+sys.path.append("..")
+
+from fastNLP.action.trainer import POSTrainer
+from fastNLP.loader.dataset_loader import POSDatasetLoader
+from fastNLP.loader.preprocess import POSPreprocess
+from fastNLP.models.sequence_modeling import SeqLabeling
+
+data_name = "people.txt"
+data_path = "data_for_tests/people.txt"
+pickle_path = "data_for_tests"
+
+if __name__ == "__main__":
+    # Data Loader
+    pos = POSDatasetLoader(data_name, data_path)
+    train_data = pos.load_lines()
+
+    # Preprocessor
+    p = POSPreprocess(train_data, pickle_path)
+    vocab_size = p.vocab_size
+    num_classes = p.num_classes
+
+    # Trainer
+    train_args = {"epochs": 20, "batch_size": 1, "num_classes": num_classes,
+                  "vocab_size": vocab_size, "pickle_path": pickle_path, "validate": True}
+    trainer = POSTrainer(train_args)
+
+    # Model
+    model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True)
+
+    # Start training
+    trainer.train(model)
+
+    print("Training finished!")
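As a quick shape-level sanity check independent of the pipeline above (the dimensions below are illustrative, not the test's settings), `SeqLabeling` can be exercised on its own. A sketch, assuming the constructor signature introduced in this patch:

```python
import torch

from fastNLP.models.sequence_modeling import SeqLabeling

# use_crf=False keeps the check to embedding -> RNN -> linear only
model = SeqLabeling(hidden_dim=16, rnn_num_layer=1, num_classes=5,
                    vocab_size=20, bi_direction=True, use_crf=False)
x = torch.LongTensor([[1, 2, 3, 0], [4, 5, 0, 0]])  # [batch_size=2, max_len=4], 0 = padding
y = model(x)
print(y.size())  # torch.Size([2, 4, 5]) -> [batch_size, max_len, num_classes]
```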