mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-03 20:57:37 +08:00
Merge branch 'dataset' of github.com:yhcc/fastNLP into dataset
This commit is contained in:
commit 7df33b23ea
@@ -1,14 +1,18 @@
 import torch

+from fastNLP.core.dataset import DataSet
+from fastNLP.core.instance import Instance
+from fastNLP.core.predictor import Predictor


 class API:
     def __init__(self):
         self.pipeline = None
         self.model = None

-    def predict(self):
-        pass
+    def predict(self, *args, **kwargs):
+        raise NotImplementedError

     def load(self, name):
         _dict = torch.load(name)
@@ -19,3 +23,47 @@ class API:
         _dict = {'pipeline': self.pipeline,
                  'model': self.model}
         torch.save(_dict, path)
+
+
+class POS_tagger(API):
+    """FastNLP API for Part-Of-Speech tagging.
+
+    """
+
+    def __init__(self):
+        super(POS_tagger, self).__init__()
+
+    def predict(self, query):
+        """
+
+        :param query: list of list of str. Each string is a token(word).
+        :return answer: list of list of str. Each string is a tag.
+        """
+        self.load("/home/zyfeng/fastnlp_0.2.0/reproduction/pos_tag_model/model_pp.pkl")
+
+        data = DataSet()
+        for example in query:
+            data.append(Instance(words=example))
+
+        data = self.pipeline(data)
+
+        predictor = Predictor()
+        outputs = predictor.predict(self.model, data)
+
+        answers = []
+        for out in outputs:
+            out = out.numpy()
+            for sent in out:
+                answers.append([self.tag_vocab.to_word(tag) for tag in sent])
+        return answers
+
+    def load(self, name):
+        _dict = torch.load(name)
+        self.pipeline = _dict['pipeline']
+        self.model = _dict['model']
+        self.tag_vocab = _dict["tag_vocab"]
+
+
+if __name__ == "__main__":
+    tagger = POS_tagger()
+    print(tagger.predict([["我", "是", "学生", "。"], ["我", "是", "学生", "。"]]))
@@ -11,7 +11,7 @@ class Pipeline:
         self.pipeline = []
         if isinstance(processors, list):
             for proc in processors:
-                assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(processor))
+                assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(proc))
             self.pipeline = processors

     def add_processor(self, processor):
@@ -9,7 +9,7 @@ class Batch(object):

    """

-    def __init__(self, dataset, batch_size, sampler, use_cuda, sort_in_batch=False, sort_key=None):
+    def __init__(self, dataset, batch_size, sampler, use_cuda):
        """

        :param dataset: a DataSet object
@@ -22,8 +22,6 @@ class Batch(object):
         self.batch_size = batch_size
         self.sampler = sampler
         self.use_cuda = use_cuda
-        self.sort_in_batch = sort_in_batch
-        self.sort_key = sort_key if sort_key is not None else 'word_seq'
         self.idx_list = None
         self.curidx = 0

@@ -119,7 +119,7 @@ class DataSet(object):
                 assert isinstance(val, bool)
                 self.field_arrays[name].is_target = val
             else:
-                raise KeyError
+                raise KeyError("{} is not a valid field name.".format(name))
         return self

     def set_need_tensor(self, **kwargs):
@@ -43,12 +43,11 @@ class SeqLabelEvaluator(Evaluator):
         :return accuracy:
         """
         truth = [item["truth"] for item in truth]
-        total_correct, total_count= 0., 0.
+        total_correct, total_count = 0., 0.
         for x, y in zip(predict, truth):
-            x = torch.Tensor(x)
+            x = torch.tensor(x)
             y = y.to(x)  # make sure they are in the same device
-            mask = x.ge(1).float()
-            # correct = torch.sum(x * mask.float() == (y * mask.long()).float())
+            mask = x.ge(1).long()
             correct = torch.sum(x * mask == y * mask)
             correct -= torch.sum(x.le(0))
             total_correct += float(correct)
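For context on the masking logic above (an illustrative sketch, not part of the commit): with tag id 0 reserved for padding, x.ge(1) keeps only real token positions, and subtracting x.le(0) removes the spurious matches where both prediction and truth are padded zeros. On toy tensors:

import torch

# hypothetical toy tag ids; 0 marks padding positions
x = torch.tensor([[3, 5, 2, 0, 0]])   # predicted tags
y = torch.tensor([[3, 4, 2, 0, 0]])   # gold tags

mask = x.ge(1).long()                       # 1 at real tokens, 0 at padding
correct = torch.sum(x * mask == y * mask)   # padded positions compare 0 == 0, so they count too
correct -= torch.sum(x.le(0))               # drop those spurious padding matches
print(int(correct))                         # 2 of the 3 real tokens are correct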
@@ -74,7 +74,7 @@ class Tester(object):
         output_list = []
         truth_list = []

-        data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda, sort_in_batch=True, sort_key='word_seq')
+        data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)

         with torch.no_grad():
             for batch_x, batch_y in data_iterator:
@@ -11,12 +11,14 @@ from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.sampler import RandomSampler
-from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
+from fastNLP.core.tester import Tester
 from fastNLP.saver.logger import create_logger
 from fastNLP.saver.model_saver import ModelSaver

 logger = create_logger(__name__, "./train_test.log")
+logger.disabled = True


 class Trainer(object):
     """Operations of training a model, including data loading, gradient descent, and validation.

@@ -138,23 +140,22 @@ class Trainer(object):
         print("training epochs started " + self.start_time)
         logger.info("training epochs started " + self.start_time)
         epoch, iters = 1, 0
-        while(1):
-            if self.n_epochs != -1 and epoch > self.n_epochs:
-                break
+        while epoch <= self.n_epochs:
             logger.info("training epoch {}".format(epoch))

             # prepare mini-batch iterator
             data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
-                                  use_cuda=self.use_cuda, sort_in_batch=True, sort_key='word_seq')
+                                  use_cuda=self.use_cuda)
             logger.info("prepared data iterator")

             # one forward and backward pass
-            iters = self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch, step=iters, dev_data=dev_data)
+            iters = self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch,
+                                     step=iters, dev_data=dev_data)

             # validation
             if self.validate:
                 self.valid_model()
-            self.save_model(self._model, 'training_model_'+self.start_time)
+            self.save_model(self._model, 'training_model_' + self.start_time)
             epoch += 1

     def _train_step(self, data_iterator, network, **kwargs):
@@ -171,13 +172,13 @@ class Trainer(object):
             loss = self.get_loss(prediction, batch_y)
             self.grad_backward(loss)
-            # if torch.rand(1).item() < 0.001:
-            #     print('[grads at epoch: {:>3} step: {:>4}]'.format(kwargs['epoch'], step))
-            #     for name, p in self._model.named_parameters():
-            #         if p.requires_grad:
-            #             print('\t{} {} {}'.format(name, tuple(p.size()), torch.sum(p.grad).item()))
             self.update()
             self._summary_writer.add_scalar("loss", loss.item(), global_step=step)
+            for name, param in self._model.named_parameters():
+                if param.requires_grad:
+                    self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
+                    self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
+                    self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)

             if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                 end = time.time()
|
||||
|
||||
def valid_model(self):
|
||||
if self.dev_data is None:
|
||||
raise RuntimeError(
|
||||
"self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
|
||||
raise RuntimeError(
|
||||
"self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
|
||||
logger.info("validation started")
|
||||
res = self.validator.test(self._model, self.dev_data)
|
||||
if self.save_best_dev and self.best_eval_result(res):
|
||||
logger.info('save best result! {}'.format(res))
|
||||
print('save best result! {}'.format(res))
|
||||
self.save_model(self._model, 'best_model_'+self.start_time)
|
||||
self.save_model(self._model, 'best_model_' + self.start_time)
|
||||
return res
|
||||
|
||||
def mode(self, model, is_test=False):
|
||||
@@ -230,7 +231,6 @@ class Trainer(object):
     def update(self):
         """Perform weight update on a model.
-
         For PyTorch, just call optimizer to update.
         """
         self._optimizer.step()

@@ -319,15 +319,17 @@ class Trainer(object):
         ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network)

     def _create_validator(self, valid_args):
-        raise NotImplementedError
+        return Tester(**valid_args)
+
+    def set_validator(self, validor):
+        self.validator = validor


 class SeqLabelTrainer(Trainer):
     """Trainer for Sequence Labeling

     """

     def __init__(self, **kwargs):
         print(
             "[FastNLP Warning] SeqLabelTrainer will be deprecated. Please use Trainer directly.")
@@ -116,11 +116,11 @@ class AdvSeqLabel(SeqLabeling):
         num_classes = args["num_classes"]

         self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
-        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.3, bidirectional=True)
+        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True)
         self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
         self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
         self.relu = torch.nn.ReLU()
-        self.drop = torch.nn.Dropout(0.3)
+        self.drop = torch.nn.Dropout(0.5)
         self.Linear2 = encoder.Linear(hidden_dim * 2 // 3, num_classes)

         self.Crf = decoder.CRF.ConditionalRandomField(num_classes)
@@ -135,7 +135,7 @@ class AdvSeqLabel(SeqLabeling):
         """
         word_seq = word_seq.long()
         word_seq_origin_len = word_seq_origin_len.long()
-        truth = truth.long()
+        truth = truth.long() if truth is not None else None
         self.mask = self.make_mask(word_seq, word_seq_origin_len)

         batch_size = word_seq.size(0)
@@ -3,6 +3,7 @@ from torch import nn

 from fastNLP.modules.utils import initial_parameter

+
 def log_sum_exp(x, dim=-1):
     max_value, _ = x.max(dim=dim, keepdim=True)
     res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value
@@ -91,7 +92,6 @@ class ConditionalRandomField(nn.Module):
             st_scores = self.start_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[0]]
             last_idx = mask.long().sum(0) - 1
             ed_scores = self.end_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[last_idx, batch_idx]]
-            print(score.size(), st_scores.size(), ed_scores.size())
             score += st_scores + ed_scores
         # return [B,]
         return score
@@ -128,7 +128,7 @@ class ConditionalRandomField(nn.Module):
         vpath = data.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long)
         vscore = data[0]
         if self.include_start_end_trans:
-            vscore += self.start_scores.view(1. -1)
+            vscore += self.start_scores.view(1, -1)
         for i in range(1, seq_len):
             prev_score = vscore.view(batch_size, n_tags, 1)
             cur_score = data[i].view(batch_size, 1, n_tags)
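Aside on the log_sum_exp helper shown above (an illustrative check, not part of the commit): subtracting the per-row maximum before exponentiating keeps the sum finite even for large CRF scores. A minimal sketch, assuming float32 scores:

import torch

x = torch.tensor([[1000.0, 1000.5, 999.0]])

naive = torch.log(torch.exp(x).sum(dim=-1))   # overflows: tensor([inf])

max_value, _ = x.max(dim=-1, keepdim=True)    # max-shift trick, as in log_sum_exp above
stable = torch.log(torch.exp(x - max_value).sum(dim=-1, keepdim=True)) + max_value
print(naive, stable.squeeze())                # inf vs. ~1001.10 (matches torch.logsumexp(x, -1))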
@@ -1,6 +1,6 @@
 [train]
-epochs = 30
-batch_size = 64
+epochs = 40
+batch_size = 8
 pickle_path = "./save/"
 validate = true
 save_best_dev = true
@@ -1,6 +1,6 @@
 [train]
 epochs = 5
-batch_size = 2
+batch_size = 64
 pickle_path = "./save/"
 validate = false
 save_best_dev = true
@@ -1,3 +1,4 @@
+import copy
 import os

 import torch
@@ -6,15 +7,20 @@ from fastNLP.api.pipeline import Pipeline
 from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
+from fastNLP.core.dataset import DataSet
+from fastNLP.core.instance import Instance
+from fastNLP.core.metrics import SeqLabelEvaluator
+from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.trainer import Trainer
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
 from fastNLP.models.sequence_modeling import AdvSeqLabel

 cfgfile = './pos_tag.cfg'
 # datadir = "/home/zyfeng/data/"
 # data_name = "POS_PD_1998.txt"
 datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
 data_name = "people_daily_raw.txt"


 pos_tag_data_path = os.path.join(datadir, data_name)
 pickle_path = "save"
 data_infer_path = os.path.join(datadir, "infer.utf8")
@@ -53,6 +59,9 @@ def train():
     seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
     seq_len_proc(dataset)

+    dev_set = copy.deepcopy(dataset)
+    dev_set.set_is_target(truth=True)
+
     print("processors defined")
     # dataset.set_is_target(tag_ids=True)
     model_param["vocab_size"] = len(word_vocab_proc.get_vocab())
@@ -63,12 +72,17 @@ def train():
     model = AdvSeqLabel(model_param)

     # call trainer to train
-    trainer = Trainer(**train_param.data)
-    trainer.train(model, dataset)
+    trainer = Trainer(epochs=train_param["epochs"],
+                      batch_size=train_param["batch_size"],
+                      validate=True,
+                      optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
+                      evaluator=SeqLabelEvaluator()
+                      )
+    trainer.train(model, dataset, dev_set)

     # save model & pipeline
-    pp = Pipeline([word_vocab_proc, word_indexer, seq_len_proc])
-    save_dict = {"pipeline": pp, "model": model}
+    pp = Pipeline([word_indexer, seq_len_proc])
+    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
     torch.save(save_dict, "model_pp.pkl")