Merge pull request #1 from fastnlp/master

update
lyhuang18 2018-09-02 17:48:56 +08:00 committed by GitHub
commit c80ae39fb3
35 changed files with 911 additions and 11094 deletions


@ -2,6 +2,9 @@
[![Build Status](https://travis-ci.org/fastnlp/fastNLP.svg?branch=master)](https://travis-ci.org/fastnlp/fastNLP)
[![codecov](https://codecov.io/gh/fastnlp/fastNLP/branch/master/graph/badge.svg)](https://codecov.io/gh/fastnlp/fastNLP)
[![PyPI version](https://badge.fury.io/py/fastNLP.svg)](https://badge.fury.io/py/fastNLP)
![Hex.pm](https://img.shields.io/hexpm/l/plug.svg)
[![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest)
fastNLP is a modular Natural Language Processing system based on PyTorch, built for the fast development of NLP tools. It divides a deep-learning NLP model into modules that fall into four categories: encoder, interaction, aggregation and decoder, with each category containing several implemented modules. Encoder modules encode the input into an abstract representation, interaction modules let the information within that representation interact, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models can be built from these modules, which greatly simplifies the development of NLP models. The architecture of fastNLP is shown in the figure below:
@ -30,6 +33,7 @@ A typical fastNLP routine is composed of four phases: loading dataset, pre-proce
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import encoder
from fastNLP.modules import aggregation
from fastNLP.modules import decoder
from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.loader.preprocess import ClassPreprocess
@ -42,20 +46,20 @@ class ClassificationModel(BaseModel):
Simple text classification model based on CNN.
"""
def __init__(self, class_num, vocab_size):
def __init__(self, num_classes, vocab_size):
super(ClassificationModel, self).__init__()
self.embed = encoder.Embedding(nums=vocab_size, dims=300)
self.conv = encoder.Conv(
self.emb = encoder.Embedding(nums=vocab_size, dims=300)
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.pool = aggregation.MaxPool()
self.output = encoder.Linear(input_size=100, output_size=class_num)
self.agg = aggregation.MaxPool()
self.dec = decoder.MLP(100, num_classes=num_classes)
def forward(self, x):
x = self.embed(x) # [N,L] -> [N,L,C]
x = self.conv(x) # [N,L,C_in] -> [N,L,C_out]
x = self.pool(x) # [N,L,C] -> [N,C]
x = self.output(x) # [N,C] -> [N, N_class]
x = self.emb(x) # [N,L] -> [N,L,C]
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out]
x = self.agg(x) # [N,L,C] -> [N,C]
x = self.dec(x) # [N,C] -> [N, N_class]
return x
@ -75,7 +79,7 @@ model_args = {
'num_classes': n_classes,
'vocab_size': vocab_size
}
model = ClassificationModel(class_num=n_classes, vocab_size=vocab_size)
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
# train model
train_args = {

fastNLP/core/loss.py Normal file

@ -0,0 +1,27 @@
import torch
class Loss(object):
"""Loss function of the algorithm,
either the wrapper of a loss function from framework, or a user-defined loss (need pytorch auto_grad support)
"""
def __init__(self, args):
if args is None:
# this is useful when
self._loss = None
elif isinstance(args, str):
self._loss = self._borrow_from_pytorch(args)
else:
raise NotImplementedError
def get(self):
return self._loss
@staticmethod
def _borrow_from_pytorch(loss_name):
if loss_name == "cross_entropy":
return torch.nn.CrossEntropyLoss()
else:
raise NotImplementedError
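
For orientation, a minimal usage sketch of this wrapper (not part of this commit), assuming only what the class above defines; "cross_entropy" is the one name handled by _borrow_from_pytorch:

# Illustrative sketch only.
from fastNLP.core.loss import Loss

loss_func = Loss("cross_entropy").get()   # an instance of torch.nn.CrossEntropyLoss
no_loss = Loss(None).get()                # None; the Trainer then falls back to the model's own loss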


@ -1,3 +1,54 @@
"""
use optimizer from Pytorch
"""
import torch
class Optimizer(object):
"""Wrapper of optimizer from framework
names: arguments (type)
1. Adam: lr (float), weight_decay (float)
2. AdaGrad
3. RMSProp
4. SGD: lr (float), momentum (float)
"""
def __init__(self, optimizer_name, **kwargs):
"""
:param optimizer_name: str, the name of the optimizer
:param kwargs: the arguments
"""
self.optim_name = optimizer_name
self.kwargs = kwargs
@property
def name(self):
return self.optim_name
@property
def params(self):
return self.kwargs
def construct_from_pytorch(self, model_params):
"""construct a optimizer from framework over given model parameters"""
if self.optim_name in ["SGD", "sgd"]:
if "lr" in self.kwargs:
if "momentum" not in self.kwargs:
self.kwargs["momentum"] = 0
optimizer = torch.optim.SGD(model_params, lr=self.kwargs["lr"], momentum=self.kwargs["momentum"])
else:
raise ValueError("requires learning rate for SGD optimizer")
elif self.optim_name in ["adam", "Adam"]:
if "lr" in self.kwargs:
if "weight_decay" not in self.kwargs:
self.kwargs["weight_decay"] = 0
optimizer = torch.optim.Adam(model_params, lr=self.kwargs["lr"],
weight_decay=self.kwargs["weight_decay"])
else:
raise ValueError("requires learning rate for Adam optimizer")
else:
raise NotImplementedError
return optimizer
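
A short usage sketch of the wrapper, assuming a small torch.nn module as a stand-in model (not part of this commit):

# Illustrative sketch: the wrapper only records the name and keyword arguments;
# construct_from_pytorch() builds the actual torch.optim object.
import torch
from fastNLP.core.optimizer import Optimizer

model = torch.nn.Linear(10, 2)                       # stand-in model for illustration
opt_proto = Optimizer("SGD", lr=0.01, momentum=0.9)
optimizer = opt_proto.construct_from_pytorch(model.parameters())   # a torch.optim.SGD instance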


@ -19,13 +19,13 @@ DEFAULT_WORD_TO_INDEX = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
def save_pickle(obj, pickle_path, file_name):
with open(os.path.join(pickle_path, file_name), "wb") as f:
_pickle.dump(obj, f)
print("{} saved. ".format(file_name))
print("{} saved in {}".format(file_name, pickle_path))
def load_pickle(pickle_path, file_name):
with open(os.path.join(pickle_path, file_name), "rb") as f:
obj = _pickle.load(f)
print("{} loaded. ".format(file_name))
print("{} loaded from {}".format(file_name, pickle_path))
return obj
@ -59,7 +59,6 @@ class BasePreprocess(object):
def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
"""Main preprocessing pipeline.
:param train_dev_data: three-level list, with either single label or multiple labels in a sample.
:param test_data: three-level list, with either single label or multiple labels in a sample. (optional)
:param pickle_path: str, the path to save the pickle files.
@ -98,6 +97,8 @@ class BasePreprocess(object):
save_pickle(data_train, pickle_path, "data_train.pkl")
else:
data_train = load_pickle(pickle_path, "data_train.pkl")
if pickle_exist(pickle_path, "data_dev.pkl"):
data_dev = load_pickle(pickle_path, "data_dev.pkl")
else:
# cross_val is True
if not pickle_exist(pickle_path, "data_train_0.pkl"):


@ -1,5 +1,3 @@
import _pickle
import numpy as np
import torch
@ -14,43 +12,78 @@ logger = create_logger(__name__, "./train_test.log")
class BaseTester(object):
"""An collection of model inference and evaluation of performance, used over validation/dev set and test set. """
def __init__(self, test_args):
def __init__(self, **kwargs):
"""
:param test_args: a dict-like object that has __getitem__ method, can be accessed by "test_args["key_str"]"
:param kwargs: a dict-like object that has __getitem__ method, can be accessed by kwargs["key_str"]
"""
super(BaseTester, self).__init__()
self.validate_in_training = test_args["validate_in_training"]
self.save_dev_data = None
self.save_output = test_args["save_output"]
self.output = None
self.save_loss = test_args["save_loss"]
self.mean_loss = None
self.batch_size = test_args["batch_size"]
self.pickle_path = test_args["pickle_path"]
self.iterator = None
self.use_cuda = test_args["use_cuda"]
"""
"default_args" provides default value for important settings.
The initialization arguments "kwargs" with the same key (name) will override the default value.
"kwargs" must have the same type as "default_args" on corresponding keys.
Otherwise, error will raise.
"""
default_args = {"save_output": False, # collect outputs of validation set
"save_loss": False, # collect losses in validation
"save_best_dev": False, # save best model during validation
"batch_size": 8,
"use_cuda": True,
"pickle_path": "./save/",
"model_name": "dev_best_model.pkl",
"print_every_step": 1,
}
"""
"required_args" is the collection of arguments that users must pass to Trainer explicitly.
This is used to warn users of essential settings in the training.
Obviously, "required_args" is the subset of "default_args".
The value in "default_args" to the keys in "required_args" is simply for type check.
"""
# TODO: required arguments
required_args = {}
self.model = None
for req_key in required_args:
if req_key not in kwargs:
logger.error("Tester lacks argument {}".format(req_key))
raise ValueError("Tester lacks argument {}".format(req_key))
for key in default_args:
if key in kwargs:
if isinstance(kwargs[key], type(default_args[key])):
default_args[key] = kwargs[key]
else:
msg = "Argument %s type mismatch: expected %s while get %s" % (
key, type(default_args[key]), type(kwargs[key]))
logger.error(msg)
raise ValueError(msg)
else:
# BaseTester doesn't care about extra arguments
pass
print(default_args)
self.save_output = default_args["save_output"]
self.save_best_dev = default_args["save_best_dev"]
self.save_loss = default_args["save_loss"]
self.batch_size = default_args["batch_size"]
self.pickle_path = default_args["pickle_path"]
self.use_cuda = default_args["use_cuda"]
self.print_every_step = default_args["print_every_step"]
self._model = None
self.eval_history = []
self.batch_output = []
def test(self, network, dev_data):
if torch.cuda.is_available() and self.use_cuda:
self.model = network.cuda()
self._model = network.cuda()
else:
self.model = network
self._model = network
# turn on the testing mode; clean up the history
self.mode(network, test=True)
self.eval_history.clear()
self.batch_output.clear()
# dev_data = self.prepare_input(self.pickle_path)
# logger.info("validation data loaded")
iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
n_batches = len(dev_data) // self.batch_size
print_every_step = 1
step = 0
for batch_x, batch_y in self.make_batch(iterator, dev_data):
@ -65,21 +98,10 @@ class BaseTester(object):
print_output = "[test step {}] {}".format(step, eval_results)
logger.info(print_output)
if step % print_every_step == 0:
if self.print_every_step > 0 and step % self.print_every_step == 0:
print(print_output)
step += 1
def prepare_input(self, data_path):
"""Save the dev data once it is loaded. Can return directly next time.
:param data_path: str, the path to the pickle data for dev
:return save_dev_data: list. Each entry is a sample, which is also a list of features and label(s).
"""
if self.save_dev_data is None:
data_dev = _pickle.load(open(data_path + "data_dev.pkl", "rb"))
self.save_dev_data = data_dev
return self.save_dev_data
def mode(self, model, test):
"""Train mode or Test mode. This is for PyTorch currently.
@ -117,15 +139,14 @@ class SeqLabelTester(BaseTester):
Tester for sequence labeling.
"""
def __init__(self, test_args):
def __init__(self, **test_args):
"""
:param test_args: a dict-like object that has __getitem__ method, can be accessed by "test_args["key_str"]"
"""
super(SeqLabelTester, self).__init__(test_args)
super(SeqLabelTester, self).__init__(**test_args)
self.max_len = None
self.mask = None
self.seq_len = None
self.batch_result = None
def data_forward(self, network, inputs):
"""This is only for sequence labeling with CRF decoder.
@ -159,14 +180,14 @@ class SeqLabelTester(BaseTester):
:return:
"""
batch_size, max_len = predict.size(0), predict.size(1)
loss = self.model.loss(predict, truth, self.mask) / batch_size
loss = self._model.loss(predict, truth, self.mask) / batch_size
prediction = self.model.prediction(predict, self.mask)
results = torch.Tensor(prediction).view(-1,)
prediction = self._model.prediction(predict, self.mask)
results = torch.Tensor(prediction).view(-1, )
# make sure "results" is in the same device as "truth"
results = results.to(truth)
accuracy = torch.sum(results == truth.view((-1,))).to(torch.float) / results.shape[0]
return [loss.data, accuracy.data]
return [float(loss), float(accuracy)]
def metrics(self):
batch_loss = np.mean([x[0] for x in self.eval_history])
@ -184,21 +205,16 @@ class SeqLabelTester(BaseTester):
def make_batch(self, iterator, data):
return Action.make_batch(iterator, use_cuda=self.use_cuda, output_length=True)
class ClassificationTester(BaseTester):
"""Tester for classification."""
def __init__(self, test_args):
def __init__(self, **test_args):
"""
:param test_args: a dict-like object that has __getitem__ method, \
can be accessed by "test_args["key_str"]"
"""
super(ClassificationTester, self).__init__(test_args)
self.pickle_path = test_args["pickle_path"]
self.save_dev_data = None
self.output = None
self.mean_loss = None
self.iterator = None
super(ClassificationTester, self).__init__(**test_args)
def make_batch(self, iterator, data, max_len=None):
return Action.make_batch(iterator, use_cuda=self.use_cuda, max_len=max_len)
@ -221,4 +237,3 @@ class ClassificationTester(BaseTester):
y_true = torch.cat(y_true, dim=0)
acc = float(torch.sum(y_pred == y_true)) / len(y_true)
return y_true.cpu().numpy(), y_prob.cpu().numpy(), acc
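
To illustrate the new keyword-argument constructors, a minimal sketch of building a tester (argument values are illustrative; unknown keys are ignored and mismatched types raise ValueError, as shown above):

# Illustrative sketch only.
from fastNLP.core.tester import SeqLabelTester

tester = SeqLabelTester(batch_size=4,
                        use_cuda=False,
                        pickle_path="./save/",
                        print_every_step=10)
# tester.test(model, dev_data)   # model and dev_data prepared elsewhere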


@ -4,12 +4,12 @@ import os
import time
from datetime import timedelta
import numpy as np
import torch
import torch.nn as nn
from fastNLP.core.action import Action
from fastNLP.core.action import RandomSampler, Batchifier
from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.tester import SeqLabelTester, ClassificationTester
from fastNLP.modules import utils
from fastNLP.saver.logger import create_logger
@ -23,14 +23,13 @@ class BaseTrainer(object):
"""Operations to train a model, including data loading, SGD, and validation.
Subclasses must implement the following abstract methods:
- define_optimizer
- grad_backward
- get_loss
"""
def __init__(self, train_args):
def __init__(self, **kwargs):
"""
:param train_args: dict of (key, value), or dict-like object. key is str.
:param kwargs: dict of (key, value), or dict-like object. key is str.
The base trainer requires the following keys:
- epochs: int, the number of epochs in training
@ -39,64 +38,90 @@ class BaseTrainer(object):
- pickle_path: str, the path to pickle files for pre-processing
"""
super(BaseTrainer, self).__init__()
self.n_epochs = train_args["epochs"]
self.batch_size = train_args["batch_size"]
self.pickle_path = train_args["pickle_path"]
self.validate = train_args["validate"]
self.save_best_dev = train_args["save_best_dev"]
self.model_saved_path = train_args["model_saved_path"]
self.use_cuda = train_args["use_cuda"]
"""
"default_args" provides default value for important settings.
The initialization arguments "kwargs" with the same key (name) will override the default value.
"kwargs" must have the same type as "default_args" on corresponding keys.
Otherwise, error will raise.
"""
default_args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
"save_best_dev": True, "model_name": "default_model_name.pkl", "print_every_step": 1,
"loss": Loss(None),
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0)
}
"""
"required_args" is the collection of arguments that users must pass to Trainer explicitly.
This is used to warn users of essential settings in the training.
Obviously, "required_args" is the subset of "default_args".
The value in "default_args" to the keys in "required_args" is simply for type check.
"""
# TODO: required arguments
required_args = {}
self.model = None
self.iterator = None
self.loss_func = None
self.optimizer = None
for req_key in required_args:
if req_key not in kwargs:
logger.error("Trainer lacks argument {}".format(req_key))
raise ValueError("Trainer lacks argument {}".format(req_key))
for key in default_args:
if key in kwargs:
if isinstance(kwargs[key], type(default_args[key])):
default_args[key] = kwargs[key]
else:
msg = "Argument %s type mismatch: expected %s while get %s" % (
key, type(default_args[key]), type(kwargs[key]))
logger.error(msg)
raise ValueError(msg)
else:
# BaseTrainer doesn't care about extra arguments
pass
print(default_args)
self.n_epochs = default_args["epochs"]
self.batch_size = default_args["batch_size"]
self.pickle_path = default_args["pickle_path"]
self.validate = default_args["validate"]
self.save_best_dev = default_args["save_best_dev"]
self.use_cuda = default_args["use_cuda"]
self.model_name = default_args["model_name"]
self.print_every_step = default_args["print_every_step"]
self._model = None
self._loss_func = default_args["loss"].get() # return a pytorch loss function or None
self._optimizer = None
self._optimizer_proto = default_args["optimizer"]
def train(self, network, train_data, dev_data=None):
"""General Training Steps
"""General Training Procedure
:param network: a model
:param train_data: three-level list, the training set.
:param dev_data: three-level list, the validation data (optional)
The method is framework-independent.
It works by calling the following methods:
- prepare_input
- mode
- define_optimizer
- data_forward
- get_loss
- grad_backward
- update
Subclasses must implement these methods with a specific framework.
"""
# prepare model and data, transfer model to gpu if available
# transfer model to gpu if available
if torch.cuda.is_available() and self.use_cuda:
self.model = network.cuda()
self._model = network.cuda()
# self._model is used to access model-specific loss
else:
self.model = network
self._model = network
# train_data = self.load_train_data(self.pickle_path)
# logger.info("training data loaded")
# define tester over dev data
# define Tester over dev data
if self.validate:
default_valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
"save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path,
"use_cuda": self.use_cuda}
"use_cuda": self.use_cuda, "print_every_step": 0}
validator = self._create_validator(default_valid_args)
logger.info("validator defined as {}".format(str(validator)))
# optimizer and loss
self.define_optimizer()
logger.info("optimizer defined as {}".format(str(self.optimizer)))
logger.info("optimizer defined as {}".format(str(self._optimizer)))
self.define_loss()
logger.info("loss function defined as {}".format(str(self._loss_func)))
# main training epochs
n_samples = len(train_data)
n_batches = n_samples // self.batch_size
n_print = 1
# main training procedure
start = time.time()
logger.info("training epochs started")
for epoch in range(1, self.n_epochs + 1):
logger.info("training epoch {}".format(epoch))
@ -106,23 +131,30 @@ class BaseTrainer(object):
data_iterator = iter(Batchifier(RandomSampler(train_data), self.batch_size, drop_last=False))
logger.info("prepared data iterator")
self._train_step(data_iterator, network, start=start, n_print=n_print, epoch=epoch)
# one forward and backward pass
self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)
# validation
if self.validate:
logger.info("validation started")
validator.test(network, dev_data)
if self.save_best_dev and self.best_eval_result(validator):
self.save_model(network)
print("saved better model selected by dev")
logger.info("saved better model selected by dev")
self.save_model(network, self.model_name)
print("Saved better model selected by validation.")
logger.info("Saved better model selected by validation.")
valid_results = validator.show_matrices()
print("[epoch {}] {}".format(epoch, valid_results))
logger.info("[epoch {}] {}".format(epoch, valid_results))
def _train_step(self, data_iterator, network, **kwargs):
"""Training process in one epoch."""
"""Training process in one epoch.
kwargs should contain:
- n_print: int, print training information every n steps.
- start: time.time(), the time at which training started.
- epoch: int, the current epoch number.
"""
step = 0
for batch_x, batch_y in self.make_batch(data_iterator):
@ -132,7 +164,7 @@ class BaseTrainer(object):
self.grad_backward(loss)
self.update()
if step % kwargs["n_print"] == 0:
if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
end = time.time()
diff = timedelta(seconds=round(end - kwargs["start"]))
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format(
@ -153,6 +185,11 @@ class BaseTrainer(object):
logger.error("the number of folds in train and dev data unequals {}!={}".format(len(train_data_cv),
len(dev_data_cv)))
raise RuntimeError("the number of folds in train and dev data unequals")
if self.validate is False:
logger.warn("Cross validation requires self.validate to be True. Please turn it on. ")
print("[warning] Cross validation requires self.validate to be True. Please turn it on. ")
self.validate = True
n_fold = len(train_data_cv)
logger.info("perform {} folds cross validation.".format(n_fold))
for i in range(n_fold):
@ -186,7 +223,7 @@ class BaseTrainer(object):
"""
Define framework-specific optimizer specified by the models.
"""
raise NotImplementedError
self._optimizer = self._optimizer_proto.construct_from_pytorch(self._model.parameters())
def update(self):
"""
@ -194,7 +231,7 @@ class BaseTrainer(object):
For PyTorch, just call optimizer to update.
"""
raise NotImplementedError
self._optimizer.step()
def data_forward(self, network, x):
raise NotImplementedError
@ -206,7 +243,8 @@ class BaseTrainer(object):
For PyTorch, just do "loss.backward()"
"""
raise NotImplementedError
self._model.zero_grad()
loss.backward()
def get_loss(self, predict, truth):
"""
@ -215,21 +253,25 @@ class BaseTrainer(object):
:param truth: ground truth label vector
:return: a scalar
"""
if self.loss_func is None:
if hasattr(self.model, "loss"):
self.loss_func = self.model.loss
logger.info("The model has a loss function, use it.")
else:
logger.info("The model didn't define loss, use Trainer's loss.")
self.define_loss()
return self.loss_func(predict, truth)
return self._loss_func(predict, truth)
def define_loss(self):
"""
Assign an instance of loss function to self.loss_func
E.g. self.loss_func = nn.CrossEntropyLoss()
If the model defines a loss function, use the model's loss.
Otherwise, the Trainer must have been given a loss argument, which is then used as the loss.
The two cannot be defined at the same time; the Trainer neither defines a loss itself nor chooses a default one.
"""
raise NotImplementedError
if hasattr(self._model, "loss") and self._loss_func is not None:
raise ValueError("Both the model and Trainer define loss. Please take out your loss.")
if hasattr(self._model, "loss"):
self._loss_func = self._model.loss
logger.info("The model has a loss function, use it.")
else:
if self._loss_func is None:
raise ValueError("Please specify a loss function.")
logger.info("The model didn't define loss, use Trainer's loss.")
def best_eval_result(self, validator):
"""
@ -238,71 +280,35 @@ class BaseTrainer(object):
"""
raise NotImplementedError
def save_model(self, network):
"""
def save_model(self, network, model_name):
"""Save this model with such a name.
This method may be called multiple times by Trainer to overwritten a better model.
:param network: the PyTorch model
model_best_dev.pkl may be overwritten by a better model in future epochs.
:param model_name: str
"""
ModelSaver(self.model_saved_path + "model_best_dev.pkl").save_pytorch(network)
if model_name[-4:] != ".pkl":
model_name += ".pkl"
ModelSaver(self.pickle_path + model_name).save_pytorch(network)
def _create_validator(self, valid_args):
raise NotImplementedError
class ToyTrainer(BaseTrainer):
"""
An example to show the definition of Trainer.
"""
def __init__(self, training_args):
super(ToyTrainer, self).__init__(training_args)
def load_train_data(self, data_path):
data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
return data_train, data_dev, 0, 1
def data_forward(self, network, x):
return network(x)
def grad_backward(self, loss):
self.model.zero_grad()
loss.backward()
def get_loss(self, pred, truth):
return np.mean(np.square(pred - truth))
def define_optimizer(self):
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)
def update(self):
self.optimizer.step()
class SeqLabelTrainer(BaseTrainer):
"""
Trainer for Sequence Modeling
Trainer for Sequence Labeling
"""
def __init__(self, train_args):
super(SeqLabelTrainer, self).__init__(train_args)
self.vocab_size = train_args["vocab_size"]
self.num_classes = train_args["num_classes"]
def __init__(self, **kwargs):
super(SeqLabelTrainer, self).__init__(**kwargs)
# self.vocab_size = kwargs["vocab_size"]
# self.num_classes = kwargs["num_classes"]
self.max_len = None
self.mask = None
self.best_accuracy = 0.0
def define_optimizer(self):
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
def grad_backward(self, loss):
self.model.zero_grad()
loss.backward()
def update(self):
self.optimizer.step()
def data_forward(self, network, inputs):
if not isinstance(inputs, tuple):
raise RuntimeError("output_length must be true for sequence modeling. Receive {}".format(type(inputs[0])))
@ -330,7 +336,7 @@ class SeqLabelTrainer(BaseTrainer):
batch_size, max_len = predict.size(0), predict.size(1)
assert truth.shape == (batch_size, max_len)
loss = self.model.loss(predict, truth, self.mask)
loss = self._model.loss(predict, truth, self.mask)
return loss
def best_eval_result(self, validator):
@ -345,48 +351,25 @@ class SeqLabelTrainer(BaseTrainer):
return Action.make_batch(iterator, output_length=True, use_cuda=self.use_cuda)
def _create_validator(self, valid_args):
return SeqLabelTester(valid_args)
return SeqLabelTester(**valid_args)
class ClassificationTrainer(BaseTrainer):
"""Trainer for classification."""
"""Trainer for text classification."""
def __init__(self, train_args):
super(ClassificationTrainer, self).__init__(train_args)
self.learn_rate = train_args["learn_rate"]
self.momentum = train_args["momentum"]
def __init__(self, **train_args):
super(ClassificationTrainer, self).__init__(**train_args)
self.iterator = None
self.loss_func = None
self.optimizer = None
self.best_accuracy = 0
def define_loss(self):
self.loss_func = nn.CrossEntropyLoss()
def define_optimizer(self):
"""
Define framework-specific optimizer specified by the models.
"""
self.optimizer = torch.optim.SGD(
self.model.parameters(),
lr=self.learn_rate,
momentum=self.momentum)
def data_forward(self, network, x):
"""Forward through network."""
logits = network(x)
return logits
def grad_backward(self, loss):
"""Compute gradient backward."""
self.model.zero_grad()
loss.backward()
def update(self):
"""Apply gradient."""
self.optimizer.step()
def make_batch(self, iterator):
return Action.make_batch(iterator, output_length=False, use_cuda=self.use_cuda)
@ -404,4 +387,4 @@ class ClassificationTrainer(BaseTrainer):
return False
def _create_validator(self, valid_args):
return ClassificationTester(valid_args)
return ClassificationTester(**valid_args)
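
The keyword-style construction together with the new Loss and Optimizer wrappers might look like the following sketch (hyper-parameter values and the model name are illustrative; omitted keys fall back to default_args, unknown keys are ignored):

# Illustrative sketch only.
from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.trainer import SeqLabelTrainer

trainer = SeqLabelTrainer(epochs=3,
                          batch_size=8,
                          validate=True,
                          use_cuda=False,
                          pickle_path="./save/",
                          save_best_dev=True,
                          model_name="best_model.pkl",
                          loss=Loss(None),                                  # use the model's own loss
                          optimizer=Optimizer("Adam", lr=1e-3, weight_decay=0))
# trainer.train(model, data_train, data_dev)   # data produced by the preprocessor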


@ -1,4 +1,5 @@
from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer
from fastNLP.core.preprocess import load_pickle
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader
@ -7,14 +8,13 @@ mapping from model name to [URL, file_name.class_name, model_pickle_name]
Notice that the class of the model should be in "models" directory.
Example:
"zh_pos_tag_model": ["www.fudan.edu.cn", "sequence_modeling.SeqLabeling", "saved_model.pkl"]
"""
FastNLP_MODEL_COLLECTION = {
"seq_label_model": {
"url": "www.fudan.edu.cn",
"class": "sequence_modeling.SeqLabeling",
"class": "sequence_modeling.SeqLabeling", # file_name.class_name in models/
"pickle": "seq_label_model.pkl",
"type": "seq_label"
"type": "seq_label",
"config_file_name": "config", # the name of the config file which stores model initialization parameters
"config_section_name": "text_class_model" # the name of the section in the config file which stores model init params
},
"text_class_model": {
"url": "www.fudan.edu.cn",
@ -22,11 +22,18 @@ FastNLP_MODEL_COLLECTION = {
"pickle": "text_class_model.pkl",
"type": "text_class"
}
"""
FastNLP_MODEL_COLLECTION = {
"cws_basic_model": {
"url": "",
"class": "sequence_modeling.AdvSeqLabel",
"pickle": "cws_basic_model_v_0.pkl",
"type": "seq_label",
"config_file_name": "config",
"config_section_name": "text_class_model"
}
}
CONFIG_FILE_NAME = "config"
SECTION_NAME = "text_class_model"
class FastNLP(object):
"""
@ -51,10 +58,13 @@ class FastNLP(object):
self.model = None
self.infer_type = None # "seq_label"/"text_class"
def load(self, model_name):
def load(self, model_name, config_file="config", section_name="model"):
"""
Load a pre-trained FastNLP model together with additional data.
:param model_name: str, the name of a FastNLP model.
:param config_file: str, the name of the config file which stores the initialization information of the model.
(default: "config")
:param section_name: str, the name of the corresponding section in the config file. (default: model)
"""
assert type(model_name) is str
if model_name not in FastNLP_MODEL_COLLECTION:
@ -64,37 +74,47 @@ class FastNLP(object):
self._download(model_name, FastNLP_MODEL_COLLECTION[model_name]["url"])
model_class = self._get_model_class(FastNLP_MODEL_COLLECTION[model_name]["class"])
print("Restore model class {}".format(str(model_class)))
model_args = ConfigSection()
ConfigLoader.load_config(self.model_dir + CONFIG_FILE_NAME, {SECTION_NAME: model_args})
ConfigLoader.load_config(self.model_dir + config_file, {section_name: model_args})
print("Restore model hyper-parameters {}".format(str(model_args.data)))
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(self.model_dir, "word2id.pkl")
model_args["vocab_size"] = len(word2index)
index2label = load_pickle(self.model_dir, "id2class.pkl")
model_args["num_classes"] = len(index2label)
# Construct the model
model = model_class(model_args)
print("Model constructed.")
# To do: framework independent
ModelLoader.load_pytorch(model, self.model_dir + FastNLP_MODEL_COLLECTION[model_name]["pickle"])
print("Model weights loaded.")
self.model = model
self.infer_type = FastNLP_MODEL_COLLECTION[model_name]["type"]
print("Model loaded. ")
print("Inference ready.")
def run(self, raw_input):
"""
Perform inference over given input using the loaded model.
:param raw_input: str, raw text
:param raw_input: list of strings. Each element is an input query.
:return results:
"""
infer = self._create_inference(self.model_dir)
# string ---> 2-D list of string
infer_input = self.string_to_list(raw_input)
# tokenize: list of string ---> 2-D list of string
infer_input = self.tokenize(raw_input, language="zh")
# 2-D list of string ---> list of strings
# 2-D list of string ---> 2-D list of tags
results = infer.predict(self.model, infer_input)
# list of strings ---> final answers
# 2-D list of tags ---> list of final answers
outputs = self._make_output(results, infer_input)
return outputs
@ -142,81 +162,100 @@ class FastNLP(object):
"""
return True
def string_to_list(self, text, delimiter="\n"):
"""
This function is used to transform raw input to lists, which is done by DatasetLoader in training.
Split text string into three-level lists.
[
[word_11, word_12, ...],
[word_21, word_22, ...],
...
]
:param text: string
:param delimiter: str, character used to split text into sentences.
:return data: two-level lists
def tokenize(self, text, language):
"""Extract tokens from strings.
For English, extract words separated by space.
For Chinese, extract characters.
TODO: more complex tokenization methods
:param text: list of string
:param language: str, one of ('zh', 'en'), Chinese or English.
:return data: list of list of string, each string is a token.
"""
assert language in ("zh", "en")
data = []
sents = text.strip().split(delimiter)
for sent in sents:
characters = []
for ch in sent:
characters.append(ch)
data.append(characters)
for sent in text:
if language == "en":
tokens = sent.strip().split()
elif language == "zh":
tokens = [char for char in sent]
else:
raise RuntimeError("Unknown language {}".format(language))
data.append(tokens)
return data
def _make_output(self, results, infer_input):
"""Transform the infer output into user-friendly output.
:param results: 1 or 2-D list of strings.
If self.infer_type == "seq_label", it is of shape [num_examples, tag_seq_length]
If self.infer_type == "text_class", it is of shape [num_examples]
:param infer_input: 2-D list of string, the input query before inference.
:return outputs: list. Each entry is a prediction.
"""
if self.infer_type == "seq_label":
outputs = make_seq_label_output(results, infer_input)
elif self.infer_type == "text_class":
outputs = make_class_output(results, infer_input)
else:
raise ValueError("fail to make outputs with infer type {}".format(self.infer_type))
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type))
return outputs
def make_seq_label_output(result, infer_input):
"""
Transform model output into user-friendly contents.
:param result: 1-D list of strings. (model output)
:param infer_input: 2-D list of string (model input)
:return outputs:
"""
return result
"""Transform model output into user-friendly contents.
:param result: 2-D list of strings. (model output)
:param infer_input: 2-D list of string (model input)
:return ret: list of list of tuples
[
[(word_11, label_11), (word_12, label_12), ...],
[(word_21, label_21), (word_22, label_22), ...],
...
]
"""
ret = []
for example_x, example_y in zip(infer_input, result):
ret.append([(x, y) for x, y in zip(example_x, example_y)])
return ret
def make_class_output(result, infer_input):
"""Transform model output into user-friendly contents.
:param result: 2-D list of strings. (model output)
:param infer_input: 1-D list of string (model input)
:return ret: the same as result, [label_1, label_2, ...]
"""
return result
def interpret_word_seg_results(infer_input, results):
"""
Transform model output into user-friendly contents.
def interpret_word_seg_results(char_seq, label_seq):
"""Transform model output into user-friendly contents.
Example: In CWS, convert <BMES> labeling into segmented text.
:param results: list of strings. (model output)
:param infer_input: 2-D list of string (model input)
:return output: list of strings
:param char_seq: list of string,
:param label_seq: list of string, the same length as char_seq
Each entry is one of ('B', 'M', 'E', 'S').
:return output: list of words
"""
outputs = []
for sent_char, sent_label in zip(infer_input, results):
words = []
word = ""
for char, label in zip(sent_char, sent_label):
if label[0] == "B":
if word != "":
words.append(word)
word = char
elif label[0] == "M":
word += char
elif label[0] == "E":
word += char
words = []
word = ""
for char, label in zip(char_seq, label_seq):
if label[0] == "B":
if word != "":
words.append(word)
word = ""
elif label[0] == "S":
if word != "":
words.append(word)
word = ""
words.append(char)
else:
raise ValueError("invalid label")
outputs.append(" ".join(words))
return outputs
word = char
elif label[0] == "M":
word += char
elif label[0] == "E":
word += char
words.append(word)
word = ""
elif label[0] == "S":
if word != "":
words.append(word)
word = ""
words.append(char)
else:
raise ValueError("invalid label {}".format(label[0]))
return words
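
A tiny worked example of the BMES decoding above (the character sequence is made up for illustration):

# Illustrative: "B"/"M"/"E" accumulate a multi-character word, "S" emits a single-character word.
from fastNLP.fastnlp import interpret_word_seg_results

chars = ["我", "爱", "自", "然", "语", "言"]
labels = ["S", "S", "B", "E", "B", "E"]
print(interpret_word_seg_results(chars, labels))   # ['我', '爱', '自然', '语言']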


@ -94,6 +94,10 @@ class ConfigSection(object):
def __contains__(self, item):
return item in self.__dict__.keys()
@property
def data(self):
return self.__dict__
if __name__ == "__main__":
config = ConfigLoader('configLoader', 'there is no data')
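
The data property added above is what allows a whole config section to be unpacked into keyword arguments, e.g. SeqLabelTrainer(**train_args.data) in the reproduction script later in this commit. A minimal sketch, assuming the test config shipped with this commit:

# Illustrative sketch only.
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

train_args = ConfigSection()
ConfigLoader("config", "").load_config("./data_for_tests/config", {"text_class": train_args})
print(train_args.data)                     # e.g. {'epochs': 1, 'batch_size': 10, ...}
# trainer = ClassificationTrainer(**train_args.data)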


@ -142,6 +142,8 @@ class CharLM(BaseModel):
"char_dict": char_dict,
"reverse_word_dict": reverse_word_dict,
}
if not os.path.exists("cache"):
os.mkdir("cache")
torch.save(objects, "cache/prep.pt")
print("Preprocess done.")


@ -0,0 +1,56 @@
import torch
import torch.nn as nn
class MLP(nn.Module):
def __init__(self, size_layer, num_class=2, activation='relu'):
"""Multilayer Perceptrons as a decoder
Args:
size_layer: list of int, the sizes of the MLP layers
num_class: int, the number of output classes; should be 2 or equal to the last layer's size
activation: str or function, the activation function for the hidden layers
"""
super(MLP, self).__init__()
self.hiddens = nn.ModuleList()
self.output = None
for i in range(1, len(size_layer)):
if i + 1 == len(size_layer):
self.output = nn.Linear(size_layer[i-1], size_layer[i])
else:
self.hiddens.append(nn.Linear(size_layer[i-1], size_layer[i]))
if num_class == 2:
self.out_active = nn.LogSigmoid()
elif num_class == size_layer[-1]:
self.out_active = nn.LogSoftmax(dim=1)
else:
raise ValueError("should set output num_class correctly: {}".format(num_class))
actives = {
'relu': nn.ReLU(),
'tanh': nn.Tanh()
}
if activation in actives:
self.hidden_active = actives[activation]
elif callable(activation):
self.hidden_active = activation
else:
raise ValueError("should set activation correctly: {}".format(activation))
def forward(self, x):
for layer in self.hiddens:
x = self.hidden_active(layer(x))
x = self.out_active(self.output(x))
return x
if __name__ == '__main__':
net1 = MLP([5,10,5])
net2 = MLP([5,10,5], 5)
for net in [net1, net2]:
x = torch.randn(5, 5)
y = net(x)
print(x)
print(y)


@ -15,7 +15,7 @@ class Embedding(nn.Module):
def __init__(self, nums, dims, padding_idx=0, sparse=False, init_emb=None, dropout=0.0):
super(Embedding, self).__init__()
self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
if init_emb:
if init_emb is not None:
self.embed.weight = nn.Parameter(init_emb)
self.dropout = nn.Dropout(dropout)


@ -273,7 +273,7 @@ class MaskedRNNBase(nn.Module):
hx = (hx, hx)
func = AutogradMaskedStep(num_layers=self.num_layers,
dropout=self.dropout,
dropout=self.step_dropout,
train=self.training,
lstm=lstm)

Binary file not shown.



@ -18,7 +18,6 @@ MLP_HIDDEN = 2000
CLASSES_NUM = 5
from fastNLP.models.base_model import BaseModel
from fastNLP.core.trainer import BaseTrainer
class MyNet(BaseModel):
@ -60,18 +59,6 @@ class Net(nn.Module):
return x, penalty
class MyTrainer(BaseTrainer):
def __init__(self, args):
super(MyTrainer, self).__init__(args)
self.optimizer = None
def define_optimizer(self):
self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
def define_loss(self):
self.loss_func = nn.CrossEntropyLoss()
def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
"""


@ -1,26 +1,26 @@
import sys, os
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, BaseLoader
from fastNLP.loader.preprocess import POSPreprocess, load_pickle
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.inference import SeqLabelInfer
from fastNLP.core.optimizer import SGD
from fastNLP.core.predictor import SeqLabelInfer
# change to this file's directory if not already there
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = 'icwb2-data'
cfgfile = 'cws.cfg'
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'
data_name = "pku_training.utf8"
cws_data_path = os.path.join(datadir, "training/pku_training.utf8")
cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")
@ -70,12 +70,13 @@ def train():
train_data = loader.load_pku()
# Preprocessor
p = POSPreprocess(train_data, pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes
preprocessor = SeqLabelPreprocess()
data_train, data_dev = preprocessor.run(train_data, pickle_path=pickle_path, train_dev_split=0.3)
train_args["vocab_size"] = preprocessor.vocab_size
train_args["num_classes"] = preprocessor.num_classes
# Trainer
trainer = SeqLabelTrainer(train_args)
trainer = SeqLabelTrainer(**train_args.data)
# Model
model = AdvSeqLabel(train_args)
@ -83,10 +84,11 @@ def train():
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as e:
print("No saved model. Continue.")
pass
# Start training
trainer.train(model)
trainer.train(model, data_train, data_dev)
print("Training finished!")
# Saver
@ -106,6 +108,9 @@ def test():
index2label = load_pickle(pickle_path, "id2class.pkl")
test_args["num_classes"] = len(index2label)
# load dev data
dev_data = load_pickle(pickle_path, "data_dev.pkl")
# Define the same model
model = AdvSeqLabel(test_args)
@ -114,10 +119,10 @@ def test():
print("model loaded!")
# Tester
tester = SeqLabelTester(test_args)
tester = SeqLabelTester(**test_args.data)
# Start testing
tester.test(model)
tester.test(model, dev_data)
# print test results
print(tester.show_matrices())

test/core/test_action.py Normal file

@ -0,0 +1,18 @@
import os
import unittest
from fastNLP.core.action import Action, Batchifier, SequentialSampler
class TestAction(unittest.TestCase):
def test_case_1(self):
x = [1, 2, 3, 4, 5, 6, 7, 8]
y = [1, 1, 1, 1, 2, 2, 2, 2]
data = []
for i in range(len(x)):
data.append([[x[i]], [y[i]]])
data = Batchifier(SequentialSampler(data), batch_size=2, drop_last=False)
action = Action()
for batch_x in action.make_batch(data, use_cuda=False, output_length=True, max_len=None):
print(batch_x)


@ -0,0 +1,43 @@
import os
import unittest
from fastNLP.core.preprocess import SeqLabelPreprocess
class TestSeqLabelPreprocess(unittest.TestCase):
def test_case_1(self):
data = [
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
]
if os.path.exists("./save"):
for root, dirs, files in os.walk("./save", topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4,
pickle_path="./save")
result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4,
pickle_path="./save")
if os.path.exists("./save"):
for root, dirs, files in os.walk("./save", topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data,
pickle_path="./save", train_dev_split=0.4,
cross_val=True)
result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data,
pickle_path="./save", train_dev_split=0.4,
cross_val=True)

test/core/test_trainer.py Normal file

@ -0,0 +1,33 @@
import os
import torch.nn as nn
import unittest
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.core.loss import Loss
from fastNLP.core.optimizer import Optimizer
from fastNLP.models.sequence_modeling import SeqLabeling
class TestTrainer(unittest.TestCase):
def test_case_1(self):
args = {"epochs": 3, "batch_size": 8, "validate": True, "use_cuda": True, "pickle_path": "./save/",
"save_best_dev": True, "model_name": "default_model_name.pkl",
"loss": Loss(None),
"optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
"vocab_size": 20,
"word_emb_dim": 100,
"rnn_hidden_units": 100,
"num_classes": 3
}
trainer = SeqLabelTrainer()
train_data = [
[[1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 2]],
[[2, 3, 4, 5, 1, 6], [0, 1, 0, 1, 0, 2]],
[[1, 4, 1, 4, 1, 6], [1, 0, 1, 0, 1, 2]],
[[1, 2, 3, 4, 5, 6], [1, 0, 1, 0, 1, 2]],
[[2, 3, 4, 5, 1, 6], [0, 1, 0, 1, 0, 2]],
[[1, 4, 1, 4, 1, 6], [1, 0, 1, 0, 1, 2]],
]
dev_data = train_data
model = SeqLabeling(args)
trainer.train(network=model, train_data=train_data, dev_data=dev_data)


@ -1,65 +1,11 @@
[General]
revision = "first"
datapath = "./data/smallset/imdb/"
embed_path = "./data/smallset/imdb/embedding.txt"
optimizer = "adam"
attn_mode = "rout"
seq_encoder = "bilstm"
out_caps_num = 5
rout_iter = 3
max_snt_num = 40
max_wd_num = 40
max_epochs = 50
pre_trained = true
batch_sz = 32
batch_sz_min = 32
bucket_sz = 5000
partial_update_until_epoch = 2
embed_size = 300
hidden_size = 200
dense_hidden = [300, 10]
lr = 0.0002
decay_steps = 1000
decay_rate = 0.9
dropout = 0.2
early_stopping = 7
reg = 1e-06
[My]
datapath = "./data/smallset/imdb/"
embed_path = "./data/smallset/imdb/embedding.txt"
optimizer = "adam"
attn_mode = "rout"
seq_encoder = "bilstm"
out_caps_num = 5
rout_iter = 3
max_snt_num = 40
max_wd_num = 40
max_epochs = 50
pre_trained = true
batch_sz = 32
batch_sz_min = 32
bucket_sz = 5000
partial_update_until_epoch = 2
embed_size = 300
hidden_size = 200
dense_hidden = [300, 10]
lr = 0.0002
decay_steps = 1000
decay_rate = 0.9
dropout = 0.2
early_stopping = 70
reg = 1e-05
test = 5
new_attr = 40
[POS]
[test_seq_label_trainer]
epochs = 1
batch_size = 32
pickle_path = "./data_for_tests/"
validate = true
save_best_dev = true
model_saved_path = "./"
use_cuda = true
[test_seq_label_model]
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
@ -68,13 +14,12 @@ dropout = 0.5
use_crf = true
use_cuda = true
[POS_test]
[test_seq_label_tester]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 1
pickle_path = "./data_for_tests/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
@ -84,7 +29,6 @@ use_crf = true
use_cuda = true
[POS_infer]
pickle_path = "./data_for_tests/"
rnn_hidden_units = 100
rnn_layers = 1
rnn_bi_direction = true
@ -95,14 +39,9 @@ num_classes = 27
[text_class]
epochs = 1
batch_size = 10
pickle_path = "./save_path/"
validate = false
save_best_dev = false
model_saved_path = "./save_path/"
use_cuda = true
learn_rate = 1e-3
momentum = 0.9
[text_class_model]
vocab_size = 867
num_classes = 18
model_name = "class_model.pkl"

test/loader/config Normal file

@ -0,0 +1,7 @@
[test]
x = 1
y = 2
z = 3
input = [1,2,3]
text = "this is text"
doubles = 0.5


@ -0,0 +1,75 @@
import os
import configparser
import json
import unittest
from fastNLP.loader.config_loader import ConfigSection, ConfigLoader
from fastNLP.loader.dataset_loader import TokenizeDatasetLoader, POSDatasetLoader, LMDatasetLoader
class TestConfigLoader(unittest.TestCase):
def test_case_ConfigLoader(self):
def read_section_from_config(config_path, section_name):
dict = {}
if not os.path.exists(config_path):
raise FileNotFoundError("config file {} NOT found.".format(config_path))
cfg = configparser.ConfigParser()
cfg.read(config_path)
if section_name not in cfg:
raise AttributeError("config file {} do NOT have section {}".format(
config_path, section_name
))
gen_sec = cfg[section_name]
for s in gen_sec.keys():
try:
val = json.loads(gen_sec[s])
dict[s] = val
except Exception as e:
raise AttributeError("json can NOT load {} in section {}, config file {}".format(
s, section_name, config_path
))
return dict
test_arg = ConfigSection()
ConfigLoader("config", "").load_config(os.path.join("./test/loader", "config"), {"test": test_arg})
#ConfigLoader("config", "").load_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config",
# {"test": test_arg})
#dict = read_section_from_config("/home/ygxu/github/fastNLP_testing/fastNLP/test/loader/config", "test")
dict = read_section_from_config(os.path.join("./test/loader", "config"), "test")
for sec in dict:
if (sec not in test_arg) or (dict[sec] != test_arg[sec]):
raise AttributeError("ERROR")
for sec in test_arg.__dict__.keys():
if (sec not in dict) or (dict[sec] != test_arg[sec]):
raise AttributeError("ERROR")
try:
not_exist = test_arg["NOT EXIST"]
except Exception as e:
pass
print("pass config test!")
class TestDatasetLoader(unittest.TestCase):
def test_case_TokenizeDatasetLoader(self):
loader = TokenizeDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
data = loader.load_pku(max_seq_len=32)
print("pass TokenizeDatasetLoader test!")
def test_case_POSDatasetLoader(self):
loader = POSDatasetLoader("people", "./test/data_for_tests/people.txt")
data = loader.load()
datas = loader.load_lines()
print("pass POSDatasetLoader test!")
def test_case_LMDatasetLoader(self):
loader = LMDatasetLoader("cws_pku_utf_8", "./test/data_for_tests/cws_pku_utf_8")
data = loader.load()
datas = loader.load_lines()
print("pass TokenizeDatasetLoader test!")


@ -0,0 +1,27 @@
import torch
import unittest
from fastNLP.modules.encoder.masked_rnn import MaskedRNN
class TestMaskedRnn(unittest.TestCase):
def test_case_1(self):
masked_rnn = MaskedRNN(input_size=1, hidden_size=1, bidirectional=True, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
mask = torch.tensor([[[1], [0]]])
y = masked_rnn(x, mask=mask)
def test_case_2(self):
masked_rnn = MaskedRNN(input_size=1, hidden_size=1, bidirectional=False, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
xx = torch.tensor([[[1.0]]])
y = masked_rnn.step(xx)
y = masked_rnn.step(xx, mask=mask)


@ -0,0 +1,30 @@
import torch
import unittest
from fastNLP.modules.other_modules import GroupNorm, LayerNormalization, BiLinear
class TestGroupNorm(unittest.TestCase):
def test_case_1(self):
gn = GroupNorm(num_features=1, num_groups=10, eps=1.5e-5)
x = torch.randn((20, 50, 10))
y = gn(x)
class TestLayerNormalization(unittest.TestCase):
def test_case_1(self):
ln = LayerNormalization(d_hid=5, eps=2e-3)
x = torch.randn((20, 50, 5))
y = ln(x)
class TestBiLinear(unittest.TestCase):
def test_case_1(self):
bl = BiLinear(n_left=5, n_right=5, n_out=10, bias=True)
x_left = torch.randn((7, 10, 20, 5))
x_right = torch.randn((7, 10, 20, 5))
y = bl(x_left, x_right)
print(bl)
bl2 = BiLinear(n_left=15, n_right=15, n_out=10, bias=True)


@ -0,0 +1,18 @@
import torch
import numpy as np
import unittest
import fastNLP.modules.utils as utils
class TestUtils(unittest.TestCase):
def test_case_1(self):
a = torch.tensor([
[1, 2, 3, 4, 5], [2, 3, 4, 5, 6]
])
utils.orthogonal(a)
def test_case_2(self):
a = np.random.rand(100, 100)
utils.mst(a)


@ -0,0 +1,28 @@
import torch
import unittest
from fastNLP.modules.encoder.variational_rnn import VarMaskedFastLSTM
class TestMaskedRnn(unittest.TestCase):
def test_case_1(self):
masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
mask = torch.tensor([[[1], [0]]])
y = masked_rnn(x, mask=mask)
def test_case_2(self):
masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=False, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
xx = torch.tensor([[[1.0]]])
#y, hidden = masked_rnn.step(xx)
#step() still has a bug
#y, hidden = masked_rnn.step(xx, mask=mask)


@ -20,7 +20,7 @@ class MyNERTrainer(SeqLabelTrainer):
override
:return:
"""
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
self.optimizer = torch.optim.Adam(self._model.parameters(), lr=0.001)
self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)
def update(self):


@ -13,6 +13,7 @@ from fastNLP.loader.dataset_loader import ClassDatasetLoader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules import aggregation
from fastNLP.modules import encoder
from fastNLP.modules import decoder
class ClassificationModel(BaseModel):
@ -20,20 +21,20 @@ class ClassificationModel(BaseModel):
Simple text classification model based on CNN.
"""
def __init__(self, class_num, vocab_size):
def __init__(self, num_classes, vocab_size):
super(ClassificationModel, self).__init__()
self.embed = encoder.Embedding(nums=vocab_size, dims=300)
self.conv = encoder.Conv(
self.emb = encoder.Embedding(nums=vocab_size, dims=300)
self.enc = encoder.Conv(
in_channels=300, out_channels=100, kernel_size=3)
self.pool = aggregation.MaxPool()
self.output = encoder.Linear(input_size=100, output_size=class_num)
self.agg = aggregation.MaxPool()
self.dec = decoder.MLP(100, num_classes=num_classes)
def forward(self, x):
x = self.embed(x) # [N,L] -> [N,L,C]
x = self.conv(x) # [N,L,C_in] -> [N,L,C_out]
x = self.pool(x) # [N,L,C] -> [N,C]
x = self.output(x) # [N,C] -> [N, N_class]
x = self.emb(x) # [N,L] -> [N,L,C]
x = self.enc(x) # [N,L,C_in] -> [N,L,C_out]
x = self.agg(x) # [N,L,C] -> [N,C]
x = self.dec(x) # [N,C] -> [N, N_class]
return x
@ -55,7 +56,7 @@ model_args = {
'num_classes': n_classes,
'vocab_size': vocab_size
}
model = ClassificationModel(class_num=n_classes, vocab_size=vocab_size)
model = ClassificationModel(num_classes=n_classes, vocab_size=vocab_size)
# train model
train_args = {
@ -75,4 +76,4 @@ trainer.cross_validate(model)
# predict using model
data_infer = [x[0] for x in data]
infer = ClassificationInfer(data_dir)
labels_pred = infer.predict(model, data_infer)
labels_pred = infer.predict(model, data_infer)


@ -1,7 +1,7 @@
import os
import sys
sys.path.append("..")
import argparse
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader, BaseLoader
@ -11,17 +11,29 @@ from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.optimizer import Optimizer
data_name = "people.txt"
data_path = "data_for_tests/people.txt"
pickle_path = "seq_label/"
data_infer_path = "data_for_tests/people_infer.txt"
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--save", type=str, default="./seq_label/", help="path to save pickle files")
parser.add_argument("-t", "--train", type=str, default="./data_for_tests/people.txt",
help="path to the training data")
parser.add_argument("-c", "--config", type=str, default="./data_for_tests/config", help="path to the config file")
parser.add_argument("-m", "--model_name", type=str, default="seq_label_model.pkl", help="the name of the model")
parser.add_argument("-i", "--infer", type=str, default="data_for_tests/people_infer.txt",
help="data used for inference")
args = parser.parse_args()
pickle_path = args.save
model_name = args.model_name
config_dir = args.config
data_path = args.train
data_infer_path = args.infer
def infer():
# Load infer configuration, the same as test
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg", "").load_config(config_dir, {"POS_infer": test_args})
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
@ -33,11 +45,11 @@ def infer():
model = SeqLabeling(test_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl")
ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
print("model loaded!")
# Data Loader
raw_data_loader = BaseLoader(data_name, data_infer_path)
raw_data_loader = BaseLoader("xxx", data_infer_path)
infer_data = raw_data_loader.load_lines()
# Inference interface
@ -51,49 +63,72 @@ def infer():
def train_and_test():
# Config Loader
train_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})
trainer_args = ConfigSection()
model_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {
"test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})
# Data Loader
pos_loader = POSDatasetLoader(data_name, data_path)
pos_loader = POSDatasetLoader("xxx", data_path)
train_data = pos_loader.load_lines()
# Preprocessor
p = SeqLabelPreprocess()
data_train, data_dev = p.run(train_data, pickle_path=pickle_path, train_dev_split=0.5)
train_args["vocab_size"] = p.vocab_size
train_args["num_classes"] = p.num_classes
model_args["vocab_size"] = p.vocab_size
model_args["num_classes"] = p.num_classes
# Trainer
trainer = SeqLabelTrainer(train_args)
# Trainer: two definition styles
# 1
# trainer = SeqLabelTrainer(trainer_args.data)
# 2
trainer = SeqLabelTrainer(
epochs=trainer_args["epochs"],
batch_size=trainer_args["batch_size"],
validate=trainer_args["validate"],
use_cuda=trainer_args["use_cuda"],
pickle_path=pickle_path,
save_best_dev=trainer_args["save_best_dev"],
model_name=model_name,
optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
)
# Model
model = SeqLabeling(train_args)
model = SeqLabeling(model_args)
# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")
# Saver
saver = ModelSaver(pickle_path + "saved_model.pkl")
saver = ModelSaver(os.path.join(pickle_path, model_name))
saver.save_pytorch(model)
print("Model saved!")
del model, trainer, pos_loader
# Define the same model
model = SeqLabeling(train_args)
model = SeqLabeling(model_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, pickle_path + "saved_model.pkl")
ModelLoader.load_pytorch(model, os.path.join(pickle_path, model_name))
print("model loaded!")
# Load test configuration
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
tester_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config(config_dir, {"test_seq_label_tester": tester_args})
# Tester
tester = SeqLabelTester(test_args)
tester = SeqLabelTester(save_output=False,
save_loss=False,
save_best_dev=False,
batch_size=4,
use_cuda=False,
pickle_path=pickle_path,
model_name="seq_label_in_test.pkl",
print_every_step=1
)
# Start testing with validation data
tester.test(model, data_dev)


@ -1,13 +1,24 @@
import sys
sys.path.append("..")
from fastNLP.fastnlp import FastNLP
from fastNLP.fastnlp import interpret_word_seg_results
PATH_TO_CWS_PICKLE_FILES = "/home/zyfeng/fastNLP/reproduction/chinese_word_segment/save/"
def word_seg():
nlp = FastNLP("./data_for_tests/")
nlp.load("seq_label_model")
text = "这是最好的基于深度学习的中文分词系统。"
result = nlp.run(text)
print(result)
print("FastNLP finished!")
nlp = FastNLP(model_dir=PATH_TO_CWS_PICKLE_FILES)
nlp.load("cws_basic_model", config_file="cws.cfg", section_name="POS_test")
text = ["这是最好的基于深度学习的中文分词系统。",
"大王叫我来巡山。",
"我党多年来致力于改善人民生活水平。"]
results = nlp.run(text)
print(results)
for example in results:
words, labels = [], []
for res in example:
words.append(res[0])
labels.append(res[1])
print(interpret_word_seg_results(words, labels))
def text_class():
@ -19,5 +30,14 @@ def text_class():
print("FastNLP finished!")
def test_word_seg_interpret():
foo = [[('', 'S'), ('', 'S'), ('', 'S'), ('', 'S'), ('', 'S'), ('', 'B'), ('', 'E'), ('', 'B'), ('', 'E'),
('', 'B'), ('', 'E'), ('', 'S'), ('', 'B'), ('', 'E'), ('', 'B'), ('', 'E'), ('', 'B'), ('', 'E'),
('', 'S')]]
chars = [x[0] for x in foo[0]]
labels = [x[1] for x in foo[0]]
print(interpret_word_seg_results(chars, labels))
if __name__ == "__main__":
text_class()
word_seg()


@ -1,6 +1,7 @@
# Python: 3.5
# encoding: utf-8
import argparse
import os
import sys
@ -13,75 +14,105 @@ from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.preprocess import ClassPreprocess
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.loss import Loss
save_path = "./test_classification/"
data_dir = "./data_for_tests/"
train_file = 'text_classify.txt'
model_name = "model_class.pkl"
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--save", type=str, default="./test_classification/", help="path to save pickle files")
parser.add_argument("-t", "--train", type=str, default="./data_for_tests/text_classify.txt",
help="path to the training data")
parser.add_argument("-c", "--config", type=str, default="./data_for_tests/config", help="path to the config file")
parser.add_argument("-m", "--model_name", type=str, default="classify_model.pkl", help="the name of the model")
args = parser.parse_args()
save_dir = args.save
train_data_dir = args.train
model_name = args.model_name
config_dir = args.config
def infer():
# load dataset
print("Loading data...")
ds_loader = ClassDatasetLoader("train", os.path.join(data_dir, train_file))
ds_loader = ClassDatasetLoader("train", train_data_dir)
data = ds_loader.load()
unlabeled_data = [x[0] for x in data]
# pre-process data
pre = ClassPreprocess()
vocab_size, n_classes = pre.run(data, pickle_path=save_path)
print("vocabulary size:", vocab_size)
print("number of classes:", n_classes)
data = pre.run(data, pickle_path=save_dir)
print("vocabulary size:", pre.vocab_size)
print("number of classes:", pre.num_classes)
model_args = ConfigSection()
ConfigLoader.load_config("data_for_tests/config", {"text_class_model": model_args})
# TODO: load from config file
model_args["vocab_size"] = pre.vocab_size
model_args["num_classes"] = pre.num_classes
# ConfigLoader.load_config(config_dir, {"text_class_model": model_args})
# construct model
print("Building model...")
cnn = CNNText(model_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(cnn, "./data_for_tests/saved_model.pkl")
ModelLoader.load_pytorch(cnn, os.path.join(save_dir, model_name))
print("model loaded!")
infer = ClassificationInfer(data_dir)
infer = ClassificationInfer(pickle_path=save_dir)
results = infer.predict(cnn, unlabeled_data)
print(results)
def train():
train_args, model_args = ConfigSection(), ConfigSection()
ConfigLoader.load_config("data_for_tests/config", {"text_class": train_args, "text_class_model": model_args})
ConfigLoader.load_config(config_dir, {"text_class": train_args})
# load dataset
print("Loading data...")
ds_loader = ClassDatasetLoader("train", os.path.join(data_dir, train_file))
ds_loader = ClassDatasetLoader("train", train_data_dir)
data = ds_loader.load()
print(data[0])
# pre-process data
pre = ClassPreprocess()
data_train = pre.run(data, pickle_path=save_path)
data_train = pre.run(data, pickle_path=save_dir)
print("vocabulary size:", pre.vocab_size)
print("number of classes:", pre.num_classes)
model_args["num_classes"] = pre.num_classes
model_args["vocab_size"] = pre.vocab_size
# construct model
print("Building model...")
model = CNNText(model_args)
# ConfigSaver().save_config(config_dir, {"text_class_model": model_args})
# train
print("Training...")
trainer = ClassificationTrainer(train_args)
# 1
# trainer = ClassificationTrainer(train_args)
# 2
trainer = ClassificationTrainer(epochs=train_args["epochs"],
batch_size=train_args["batch_size"],
validate=train_args["validate"],
use_cuda=train_args["use_cuda"],
pickle_path=save_dir,
save_best_dev=train_args["save_best_dev"],
model_name=model_name,
loss=Loss("cross_entropy"),
optimizer=Optimizer("SGD", lr=0.001, momentum=0.9))
trainer.train(model, data_train)
print("Training finished!")
saver = ModelSaver("./data_for_tests/saved_model.pkl")
saver = ModelSaver(os.path.join(save_dir, model_name))
saver.save_pytorch(model)
print("Model saved!")
if __name__ == "__main__":
train()
# infer()
infer()