mirror of https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-02 04:07:35 +08:00
commit 7ff29877cd
@@ -1,87 +1,154 @@
from collections import namedtuple
import _pickle

import numpy as np
import torch

from fastNLP.action.action import Action
from fastNLP.action.action import RandomSampler, Batchifier
from fastNLP.modules.utils import seq_mask


class Tester(Action):
class BaseTester(Action):
    """docstring for Tester"""

    TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
                                       "save_loss", "batch_size"])

    def __init__(self, test_args):
        """
        :param test_args: named tuple
        """
        super(Tester, self).__init__()
        self.validate_in_training = test_args.validate_in_training
        self.save_dev_input = test_args.save_dev_input
        self.valid_x = None
        self.valid_y = None
        self.save_output = test_args.save_output
        super(BaseTester, self).__init__()
        self.validate_in_training = test_args["validate_in_training"]
        self.save_dev_data = None
        self.save_output = test_args["save_output"]
        self.output = None
        self.save_loss = test_args.save_loss
        self.save_loss = test_args["save_loss"]
        self.mean_loss = None
        self.batch_size = test_args.batch_size
        self.batch_size = test_args["batch_size"]
        self.pickle_path = test_args["pickle_path"]
        self.iterator = None

    def test(self, network, data):
        print("testing")
        network.mode(test=True)  # turn on the testing mode
        if self.save_dev_input:
            if self.valid_x is None:
                valid_x, valid_y = network.prepare_input(data)
                self.valid_x = valid_x
                self.valid_y = valid_y
            else:
                valid_x = self.valid_x
                valid_y = self.valid_y
        else:
            valid_x, valid_y = network.prepare_input(data)
        self.model = None
        self.eval_history = []

        # split into batches by self.batch_size
        iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)
    def test(self, network):
        # print("--------------testing----------------")
        self.model = network

        # turn on the testing mode; clean up the history
        self.mode(network, test=True)

        dev_data = self.prepare_input(self.pickle_path)

        self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))

        batch_output = list()
        loss_history = list()
        # turn on the testing mode of the network
        network.mode(test=True)
        num_iter = len(dev_data) // self.batch_size

        for step in range(iterations):
            batch_x, batch_y = test_batch_generator.__next__()
        for step in range(num_iter):
            batch_x, batch_y = self.batchify(dev_data)

            # forward pass from test input to predicted output
            prediction = network.data_forward(batch_x)

            loss = network.get_loss(prediction, batch_y)
            prediction = self.data_forward(network, batch_x)
            eval_results = self.evaluate(prediction, batch_y)

            if self.save_output:
                batch_output.append(prediction.data)
                batch_output.append(prediction)
            if self.save_loss:
                loss_history.append(loss)
                self.log(self.make_log(step, loss))
                self.eval_history.append(eval_results)

        if self.save_loss:
            self.mean_loss = np.mean(np.array(loss_history))
        if self.save_output:
            self.output = self.make_output(batch_output)

    def prepare_input(self, data_path):
        """
        Save the dev data once it is loaded. Can return it directly next time.
        :param data_path: str, the path to the pickle data for dev
        :return save_dev_data: list. Each entry is a sample, which is also a list of features and label(s).
        """
        if self.save_dev_data is None:
            data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
            self.save_dev_data = data_dev
        return self.save_dev_data

    @property
    def loss(self):
        return self.mean_loss

    @property
    def result(self):
        return self.output
    def batchify(self, data):
        """
        1. Perform batching from data and produce a batch of training data.
        2. Add padding.
        :param data: list. Each entry is a sample, which is also a list of features and label(s).
            E.g.
                [
                    [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
                    [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
                    ...
                ]
        :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
                batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
        """
        indices = next(self.iterator)
        batch = [data[idx] for idx in indices]
        batch_x = [sample[0] for sample in batch]
        batch_y = [sample[1] for sample in batch]
        batch_x = self.pad(batch_x)
        return batch_x, batch_y

    @staticmethod
    def make_output(batch_outputs):
        # construct full prediction with batch outputs
        return np.concatenate(batch_outputs, axis=0)

    @staticmethod
    def pad(batch, fill=0):
        """
        Pad a batch of samples to maximum length.
        :param batch: list of list
        :param fill: word index to pad, default 0.
        :return: a padded batch
        """
        max_length = max([len(x) for x in batch])
        for idx, sample in enumerate(batch):
            if len(sample) < max_length:
                batch[idx] = sample + [fill] * (max_length - len(sample))
        return batch

    def load_config(self, args):
    def data_forward(self, network, data):
        raise NotImplementedError

    def load_dataset(self, args):
    def evaluate(self, predict, truth):
        raise NotImplementedError

    @property
    def matrices(self):
        raise NotImplementedError

    def mode(self, model, test=True):
        """To do: combine this function with Trainer ?? """
        if test:
            model.eval()
        else:
            model.train()
        self.eval_history.clear()


class POSTester(BaseTester):
    """
    Tester for sequence labeling.
    """

    def __init__(self, test_args):
        super(POSTester, self).__init__(test_args)
        self.max_len = None
        self.mask = None
        self.batch_result = None

    def data_forward(self, network, x):
        """To Do: combine with Trainer

        :param network: the PyTorch model
        :param x: list of list, [batch_size, max_len]
        :return y: [batch_size, num_classes]
        """
        seq_len = [len(seq) for seq in x]
        x = torch.Tensor(x).long()
        self.batch_size = x.size(0)
        self.max_len = x.size(1)
        self.mask = seq_mask(seq_len, self.max_len)
        y = network(x)
        return y

    def evaluate(self, predict, truth):
        truth = torch.Tensor(truth)
        loss, prediction = self.model.loss(predict, truth, self.mask, self.batch_size, self.max_len)
        return loss.data

    def matrices(self):
        return np.mean(self.eval_history)
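For reference, a minimal sketch of the padding step above on a made-up toy batch (illustration only, not part of the commit): each sample is a list of word indices and shorter samples are extended with fill=0, as in pad() above.

# toy batch of word-index lists, padded with fill=0 as in BaseTester.pad()
batch_x = [[4, 2, 7], [5, 1], [9]]
max_length = max(len(x) for x in batch_x)                     # 3
padded = [x + [0] * (max_length - len(x)) for x in batch_x]
print(padded)                                                 # [[4, 2, 7], [5, 1, 0], [9, 0, 0]]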
@@ -1,12 +1,12 @@
import _pickle
from collections import namedtuple

import numpy as np
import torch

from fastNLP.action.action import Action
from fastNLP.action.action import RandomSampler, Batchifier
from fastNLP.action.tester import Tester
from fastNLP.action.tester import POSTester
from fastNLP.modules.utils import seq_mask


class BaseTrainer(Action):
@@ -21,23 +21,29 @@ class BaseTrainer(Action):
    - grad_backward
    - get_loss
    """
    TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"])

    def __init__(self, train_args):
        """
        training parameters
        :param train_args: dict of (key, value)

        The base trainer requires the following keys:
        - epochs: int, the number of epochs in training
        - validate: bool, whether or not to validate on dev set
        - batch_size: int
        - pickle_path: str, the path to pickle files for pre-processing
        """
        super(BaseTrainer, self).__init__()
        self.n_epochs = train_args.epochs
        self.validate = train_args.validate
        self.batch_size = train_args.batch_size
        self.pickle_path = train_args.pickle_path
        self.n_epochs = train_args["epochs"]
        self.validate = train_args["validate"]
        self.batch_size = train_args["batch_size"]
        self.pickle_path = train_args["pickle_path"]
        self.model = None
        self.iterator = None
        self.loss_func = None
        self.optimizer = None

    def train(self, network):
        """General training loop.
        """General Training Steps
        :param network: a model

        The method is framework independent.
@@ -51,22 +57,27 @@ class BaseTrainer(Action):
        - update
        Subclasses must implement these methods with a specific framework.
        """
        # prepare model and data
        self.model = network
        data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path)

        test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                      save_dev_input=True, save_loss=True, batch_size=self.batch_size)
        evaluator = Tester(test_args)
        # define tester over dev data
        valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
                      "save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path}
        validator = POSTester(valid_args)

        best_loss = 1e10
        # main training epochs
        iterations = len(data_train) // self.batch_size

        for epoch in range(self.n_epochs):
            self.mode(test=False)

            # turn on network training mode; define optimizer; prepare batch iterator
            self.mode(test=False)
            self.define_optimizer()
            self.iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=True))

            # training iterations in one epoch
            for step in range(iterations):
                batch_x, batch_y = self.batchify(self.batch_size, data_train)
                batch_x, batch_y = self.batchify(data_train)

                prediction = self.data_forward(network, batch_x)

@@ -77,9 +88,8 @@ class BaseTrainer(Action):
            if self.validate:
                if data_dev is None:
                    raise RuntimeError("No validation data provided.")
                evaluator.test(network, data_dev)
                if evaluator.loss < best_loss:
                    best_loss = evaluator.loss
                validator.test(network)
                print("[epoch {}] dev loss={:.2f}".format(epoch, validator.matrices()))

        # finish training

@@ -155,23 +165,20 @@ class BaseTrainer(Action):
        """
        raise NotImplementedError

    def batchify(self, batch_size, data):
    def batchify(self, data):
        """
        1. Perform batching from data and produce a batch of training data.
        2. Add padding.
        :param batch_size: int, the size of a batch
        :param data: list. Each entry is a sample, which is also a list of features and label(s).
            E.g.
                [
                    [[feature_1, feature_2, feature_3], [label_1, label_2]],  # sample 1
                    [[feature_1, feature_2, feature_3], [label_1, label_2]],  # sample 2
                    [[word_11, word_12, word_13], [label_11, label_12]],  # sample 1
                    [[word_21, word_22, word_23], [label_21, label_22]],  # sample 2
                    ...
                ]
        :return batch_x: list. Each entry is a list of features of a sample.
                batch_y: list. Each entry is a list of labels of a sample.
        :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
                batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
        """
        if self.iterator is None:
            self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
        indices = next(self.iterator)
        batch = [data[idx] for idx in indices]
        batch_x = [sample[0] for sample in batch]
@@ -195,7 +202,9 @@ class BaseTrainer(Action):


class ToyTrainer(BaseTrainer):
    """A simple trainer for a PyTorch model."""
    """
    deprecated
    """

    def __init__(self, train_args):
        super(ToyTrainer, self).__init__(train_args)
@@ -230,7 +239,7 @@ class ToyTrainer(BaseTrainer):

class WordSegTrainer(BaseTrainer):
    """
    reserve for changes
    deprecated
    """

    def __init__(self, train_args):
@@ -301,6 +310,7 @@ class WordSegTrainer(BaseTrainer):
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

    def get_loss(self, predict, truth):
        truth = torch.Tensor(truth)
        self._loss = torch.nn.CrossEntropyLoss()(predict, truth)
        return self._loss

@@ -313,8 +323,76 @@ class WordSegTrainer(BaseTrainer):
        self.optimizer.step()


class POSTrainer(BaseTrainer):
    """
    Trainer for Sequence Modeling

    """
    def __init__(self, train_args):
        super(POSTrainer, self).__init__(train_args)
        self.vocab_size = train_args["vocab_size"]
        self.num_classes = train_args["num_classes"]
        self.max_len = None
        self.mask = None

    def prepare_input(self, data_path):
        """
        To do: Load pkl files of train/dev/test and embedding
        """
        data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
        data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
        return data_train, data_dev, 0, 1

    def data_forward(self, network, x):
        """
        :param network: the PyTorch model
        :param x: list of list, [batch_size, max_len]
        :return y: [batch_size, num_classes]
        """
        seq_len = [len(seq) for seq in x]
        x = torch.Tensor(x).long()
        self.batch_size = x.size(0)
        self.max_len = x.size(1)
        self.mask = seq_mask(seq_len, self.max_len)
        y = network(x)
        return y

    def mode(self, test=False):
        if test:
            self.model.eval()
        else:
            self.model.train()

    def define_optimizer(self):
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)

    def grad_backward(self, loss):
        self.model.zero_grad()
        loss.backward()

    def update(self):
        self.optimizer.step()

    def get_loss(self, predict, truth):
        """
        Compute loss given prediction and ground truth.
        :param predict: prediction label vector, [batch_size, num_classes]
        :param truth: ground truth label vector, [batch_size, max_len]
        :return: a scalar
        """
        truth = torch.Tensor(truth)
        if self.loss_func is None:
            if hasattr(self.model, "loss"):
                self.loss_func = self.model.loss
            else:
                self.define_loss()
        loss, prediction = self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len)
        # print("loss={:.2f}".format(loss.data))
        return loss


if __name__ == "__main__":
    train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./")
    train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"}
    trainer = BaseTrainer(train_args)
    data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10]
    trainer.batchify(batch_size=3, data=data_train)
    trainer.batchify(data=data_train)
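For reference, a minimal sketch of what one batchify() step above returns, with made-up samples and hard-coded indices standing in for RandomSampler/Batchifier (illustration only, not part of the commit):

# toy samples: [word indices, labels]; indices pretend to come from the sampler
data_train = [[[1, 2, 3, 4], [0]], [[1, 3, 5, 2], [1]], [[7, 8], [0]]]
indices = [2, 0]
batch = [data_train[i] for i in indices]
batch_x = [sample[0] for sample in batch]   # [[7, 8], [1, 2, 3, 4]]
batch_y = [sample[1] for sample in batch]   # [[0], [0]]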
@@ -15,7 +15,6 @@ class POSDatasetLoader(DatasetLoader):

    def __init__(self, data_name, data_path):
        super(POSDatasetLoader, self).__init__(data_name, data_path)
        #self.data_set = self.load()

    def load(self):
        assert os.path.exists(self.data_path)
@@ -24,7 +23,7 @@ class POSDatasetLoader(DatasetLoader):
        return line

    def load_lines(self):
        assert os.path.exists(self.data_path)
        assert (os.path.exists(self.data_path))
        with open(self.data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
            return lines
@@ -46,19 +46,17 @@ class BasePreprocess(object):


class POSPreprocess(BasePreprocess):

    """
    This class is used to preprocess the POS datasets.
    In these datasets, each line is divided by '\t'.
    The first column is the vocabulary.
    The second column is the labels.
    In these datasets, each line is divided by '\t',
    where the first column is the vocabulary and the second
    column is the label.
    Different sentences are divided by an empty line.
    e.g:
    Tom label1
    and label2
    Jerry   label1
    .   label3

    Hello   label4
    world   label5
    !   label3
@@ -71,11 +69,13 @@ class POSPreprocess(BasePreprocess):
        super(POSPreprocess, self).__init__(data, pickle_path)
        self.word_dict = None
        self.label_dict = None
        self.data = data
        self.pickle_path = pickle_path
        self.build_dict()
        self.word2id()
        self.id2word()
        self.vocab_size = self.id2word()
        self.class2id()
        self.id2class()
        self.num_classes = self.id2class()
        self.embedding()
        self.data_train()
        self.data_dev()
@@ -87,7 +87,8 @@ class POSPreprocess(BasePreprocess):
                              DEFAULT_RESERVED_LABEL[2]: 4}
        self.label_dict = {}
        for w in self.data:
            if len(w) == 0:
            w = w.strip()
            if len(w) <= 1:
                continue
            word = w.split('\t')

@@ -95,10 +96,11 @@ class POSPreprocess(BasePreprocess):
                index = len(self.word_dict)
                self.word_dict[word[0]] = index

            for label in word[1: ]:
                if label not in self.label_dict:
                    index = len(self.label_dict)
                    self.label_dict[label] = index
            # for label in word[1: ]:
            label = word[1]
            if label not in self.label_dict:
                index = len(self.label_dict)
                self.label_dict[label] = index

    def pickle_exist(self, pickle_name):
        """
@@ -107,7 +109,7 @@ class POSPreprocess(BasePreprocess):
        """
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        file_name = self.pickle_path + pickle_name
        file_name = os.path.join(self.pickle_path, pickle_name)
        if os.path.exists(file_name):
            return True
        else:
@@ -118,42 +120,48 @@ class POSPreprocess(BasePreprocess):
            return
        # nothing will be done if word2id.pkl exists

        file_name = self.pickle_path + "word2id.pkl"
        with open(file_name, "wb", encoding='utf-8') as f:
        file_name = os.path.join(self.pickle_path, "word2id.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(self.word_dict, f)

    def id2word(self):
        if self.pickle_exist("id2word.pkl"):
            return
            file_name = os.path.join(self.pickle_path, "id2word.pkl")
            id2word_dict = _pickle.load(open(file_name, "rb"))
            return len(id2word_dict)
        # nothing will be done if id2word.pkl exists

        id2word_dict = {}
        for word in self.word_dict:
            id2word_dict[self.word_dict[word]] = word
        file_name = self.pickle_path + "id2word.pkl"
        with open(file_name, "wb", encoding='utf-8') as f:
        file_name = os.path.join(self.pickle_path, "id2word.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(id2word_dict, f)
        return len(id2word_dict)

    def class2id(self):
        if self.pickle_exist("class2id.pkl"):
            return
        # nothing will be done if class2id.pkl exists

        file_name = self.pickle_path + "class2id.pkl"
        with open(file_name, "wb", encoding='utf-8') as f:
        file_name = os.path.join(self.pickle_path, "class2id.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(self.label_dict, f)

    def id2class(self):
        if self.pickle_exist("id2class.pkl"):
            return
            file_name = os.path.join(self.pickle_path, "id2class.pkl")
            id2class_dict = _pickle.load(open(file_name, "rb"))
            return len(id2class_dict)
        # nothing will be done if id2class.pkl exists

        id2class_dict = {}
        for label in self.label_dict:
            id2class_dict[self.label_dict[label]] = label
        file_name = self.pickle_path + "id2class.pkl"
        with open(file_name, "wb", encoding='utf-8') as f:
        file_name = os.path.join(self.pickle_path, "id2class.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(id2class_dict, f)
        return len(id2class_dict)

    def embedding(self):
        if self.pickle_exist("embedding.pkl"):
@@ -168,22 +176,26 @@ class POSPreprocess(BasePreprocess):
        data_train = []
        sentence = []
        for w in self.data:
            if len(w) == 0:
            w = w.strip()
            if len(w) <= 1:
                wid = []
                lid = []
                for i in range(len(sentence)):
                    # if sentence[i][0]=="":
                    #     print("")
                    wid.append(self.word_dict[sentence[i][0]])
                    lid.append(self.label_dict[sentence[i][1]])
                data_train.append((wid, lid))
                sentence = []
                continue
            sentence.append(w.split('\t'))

        file_name = self.pickle_path + "data_train.pkl"
        with open(file_name, "wb", encoding='utf-8') as f:
        file_name = os.path.join(self.pickle_path, "data_train.pkl")
        with open(file_name, "wb") as f:
            _pickle.dump(data_train, f)

    def data_dev(self):
        pass

    def data_test(self):
        pass
        pass
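For reference, a small sketch of reading the tab-separated, blank-line-delimited format that POSPreprocess expects into (words, labels) sentence pairs. The helper name read_pos_file and its exact behaviour are illustrative assumptions, not part of the commit; it assumes exactly one word and one label per non-empty line.

# hypothetical helper: parse a people.txt-style file into (words, labels) pairs
def read_pos_file(path):
    sentences, words, labels = [], [], []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:                       # a blank line ends the current sentence
                if words:
                    sentences.append((words, labels))
                    words, labels = [], []
                continue
            w, tag = line.split("\t")          # one word and one label per line
            words.append(w)
            labels.append(tag)
    if words:
        sentences.append((words, labels))
    return sentences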
@@ -3,32 +3,12 @@ import torch

class BaseModel(torch.nn.Module):
    """Base PyTorch model for all models.
    Three network modules presented:
    - embedding module
    - aggregation module
    - output module
    Subclasses must implement these three modules with "components".
    To do: add some useful common features
    """

    def __init__(self):
        super(BaseModel, self).__init__()

    def forward(self, *inputs):
        x = self.encode(*inputs)
        x = self.aggregation(x)
        x = self.output(x)
        return x

    def encode(self, x):
        raise NotImplementedError

    def aggregation(self, x):
        raise NotImplementedError

    def output(self, x):
        raise NotImplementedError



class Vocabulary(object):
    """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
@@ -93,3 +73,4 @@ class Token(object):
        self.doc = doc
        self.token = doc[offset]
        self.i = offset

fastNLP/models/sequence_modeling.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import torch
import torch.nn as nn
from torch.nn import functional as F

from fastNLP.models.base_model import BaseModel
from fastNLP.modules.CRF import ContionalRandomField


class SeqLabeling(BaseModel):
    """
    PyTorch Network for sequence labeling
    """

    def __init__(self, hidden_dim,
                 rnn_num_layer,
                 num_classes,
                 vocab_size,
                 word_emb_dim=100,
                 init_emb=None,
                 rnn_mode="gru",
                 bi_direction=False,
                 dropout=0.5,
                 use_crf=True):
        super(SeqLabeling, self).__init__()

        self.Emb = nn.Embedding(vocab_size, word_emb_dim)
        if init_emb:
            self.Emb.weight = nn.Parameter(init_emb)

        self.num_classes = num_classes
        self.input_dim = word_emb_dim
        self.layers = rnn_num_layer
        self.hidden_dim = hidden_dim
        self.bi_direction = bi_direction
        self.dropout = dropout
        self.mode = rnn_mode

        if self.mode == "lstm":
            self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
                               bidirectional=self.bi_direction, dropout=self.dropout)
        elif self.mode == "gru":
            self.rnn = nn.GRU(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
                              bidirectional=self.bi_direction, dropout=self.dropout)
        elif self.mode == "rnn":
            self.rnn = nn.RNN(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
                              bidirectional=self.bi_direction, dropout=self.dropout)
        else:
            raise Exception
        if bi_direction:
            self.linear = nn.Linear(self.hidden_dim * 2, self.num_classes)
        else:
            self.linear = nn.Linear(self.hidden_dim, self.num_classes)
        self.use_crf = use_crf
        if self.use_crf:
            self.crf = ContionalRandomField(num_classes)

    def forward(self, x):
        """
        :param x: LongTensor, [batch_size, max_len]
        :return y: FloatTensor, [batch_size, max_len, num_classes]
        """
        x = self.Emb(x)
        # [batch_size, max_len, word_emb_dim]
        x, hidden = self.rnn(x)
        # [batch_size, max_len, hidden_size * direction]
        y = self.linear(x)
        # [batch_size, max_len, num_classes]
        return y

    def loss(self, x, y, mask, batch_size, max_len):
        """
        Negative log likelihood loss.
        :param x: FloatTensor, [batch_size, max_len, num_classes]
        :param y: LongTensor, [batch_size, max_len]
        :param mask: ByteTensor, [batch_size, max_len]
        :param batch_size: int
        :param max_len: int
        :return loss:
                prediction:
        """
        x = x.float()
        y = y.long()
        mask = mask.byte()
        # print(x.shape, y.shape, mask.shape)

        if self.use_crf:
            total_loss = self.crf(x, y, mask)
            tag_seq = self.crf.viterbi_decode(x, mask)
        else:
            # error
            loss_function = nn.NLLLoss(ignore_index=0, size_average=False)
            x = x.view(batch_size * max_len, -1)
            score = F.log_softmax(x)
            total_loss = loss_function(score, y.view(batch_size * max_len))
            _, tag_seq = torch.max(score)
            tag_seq = tag_seq.view(batch_size, max_len)
        return torch.mean(total_loss), tag_seq
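For reference, a minimal usage sketch of the SeqLabeling network above with made-up toy sizes (illustration only, not part of the commit); the constructor arguments follow the signature shown above and the output shape follows the code's own comments.

import torch
from fastNLP.models.sequence_modeling import SeqLabeling

# toy sizes chosen only for illustration
model = SeqLabeling(hidden_dim=100, rnn_num_layer=1, num_classes=5,
                    vocab_size=20, bi_direction=True, use_crf=False)
x = torch.randint(0, 20, (2, 7)).long()   # [batch_size=2, max_len=7] word indices
y = model(x)                              # shape [2, 7, 5] = [batch_size, max_len, num_classes]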
@@ -82,7 +82,7 @@ class ContionalRandomField(nn.Module):
    def _glod_score(self, feats, tags, masks):
        """
        Compute the score for the gold path.
        :param feats: FloatTensor, batch_size x tag_size x tag_size
        :param feats: FloatTensor, batch_size x max_len x tag_size
        :param tags: LongTensor, batch_size x max_len
        :param masks: ByteTensor, batch_size x max_len
        :return:FloatTensor, batch_size
@@ -118,7 +118,7 @@ class ContionalRandomField(nn.Module):
    def forward(self, feats, tags, masks):
        """
        Calculate the neg log likelihood
        :param feats:FloatTensor, batch_size x tag_size x tag_size
        :param feats:FloatTensor, batch_size x max_len x tag_size
        :param tags:LongTensor, batch_size x max_len
        :param masks:ByteTensor batch_size x max_len
        :return:FloatTensor, batch_size

@@ -1,12 +1,13 @@
import time

import aggregation
import dataloader
import embedding
import encoder
import predict
import torch
import torch.nn as nn
import encoder
import aggregation
import embedding
import predict
import torch.optim as optim
import time
import dataloader

WORD_NUM = 357361
WORD_SIZE = 100
@@ -16,6 +17,30 @@ R = 10
MLP_HIDDEN = 2000
CLASSES_NUM = 5

from fastNLP.models.base_model import BaseModel
from fastNLP.action.trainer import BaseTrainer


class MyNet(BaseModel):
    def __init__(self):
        super(MyNet, self).__init__()
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)
        self.penalty = None

    def encode(self, x):
        return self.encoder(self.embedding(x))

    def aggregate(self, x):
        x, self.penalty = self.aggregation(x)
        return x

    def decode(self, x):
        return [self.predict(x), self.penalty]


class Net(nn.Module):
    """
    A model for sentiment analysis using LSTM and self-attention
@@ -34,6 +59,19 @@ class Net(nn.Module):
        x = self.predict(x)
        return x, penalty


class MyTrainer(BaseTrainer):
    def __init__(self, args):
        super(MyTrainer, self).__init__(args)
        self.optimizer = None

    def define_optimizer(self):
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)

    def define_loss(self):
        self.loss_func = nn.CrossEntropyLoss()


def train(model_dict=None, using_cuda=True, learning_rate=0.06,
          momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
    """

@@ -7,3 +7,9 @@ def mask_softmax(matrix, mask):
    else:
        raise NotImplementedError
    return result


def seq_mask(seq_len, max_len):
    mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
    mask = torch.stack(mask, 1)
    return mask
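For reference, a small check of what seq_mask above returns for a toy batch (illustration only, not part of the commit):

from fastNLP.modules.utils import seq_mask

mask = seq_mask([3, 1, 2], 4)   # true lengths 3, 1, 2; max_len 4
# mask has shape [3, 4]; row i is 1 for the first seq_len[i] positions and 0 afterwards:
# [[1, 1, 1, 0],
#  [1, 0, 0, 0],
#  [1, 1, 0, 0]]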
@@ -1,3 +1,3 @@
numpy==1.14.2
numpy>=1.14.2
torch==0.4.0
torchvision==0.1.8
torchvision>=0.1.8
test/data_for_tests/people.txt (new file, 67 lines)
@@ -0,0 +1,67 @@
迈 B-v
向 E-v
充 B-v
满 E-v
希 B-n
望 E-n
的 S-u
新 S-a
世 B-n
纪 E-n
— B-w
— E-w
一 B-t
九 M-t
九 M-t
八 M-t
年 E-t
新 B-t
年 E-t
讲 B-n
话 E-n
( S-w
附 S-v
图 B-n
片 E-n
1 S-m
张 S-q
) S-w

中 B-nt
共 M-nt
中 M-nt
央 E-nt
总 B-n
书 M-n
记 E-n
、 S-w
国 B-n
家 E-n
主 B-n
席 E-n
江 B-nr
泽 M-nr
民 E-nr

( S-w
一 B-t
九 M-t
九 M-t
七 M-t
年 E-t
十 B-t
二 M-t
月 E-t
三 B-t
十 M-t
一 M-t
日 E-t
) S-w

1 B-t
2 M-t
月 E-t
3 B-t
1 M-t
日 E-t
, S-w
test/test_POS_pipeline.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import sys

sys.path.append("..")

from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "people.txt"
data_path = "data_for_tests/people.txt"
pickle_path = "data_for_tests"

if __name__ == "__main__":
    # Data Loader
    pos = POSDatasetLoader(data_name, data_path)
    train_data = pos.load_lines()

    # Preprocessor
    p = POSPreprocess(train_data, pickle_path)
    vocab_size = p.vocab_size
    num_classes = p.num_classes

    # Trainer
    train_args = {"epochs": 20, "batch_size": 1, "num_classes": num_classes,
                  "vocab_size": vocab_size, "pickle_path": pickle_path, "validate": True}
    trainer = POSTrainer(train_args)

    # Model
    model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True)

    # Start training
    trainer.train(model)

    print("Training finished!")