Merge pull request #132 from FengZiYjun/v0.3.1

fastNLP V0.3.1
Coet 2019-02-06 10:40:53 +08:00 committed by GitHub
commit 13faa2b410
70 changed files with 3959 additions and 4834 deletions

codecov.yml Normal file
View File

@ -0,0 +1,5 @@
ignore:
- "reproduction" # ignore folders and all its contents
- "setup.py"
- "docs"
- "tutorials"

View File

@ -1,7 +1,8 @@
fastNLP quick-start tutorial
fastNLP 10-minute tutorial
===============
The original notebook is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_10min_tutorial.ipynb
fastNLP provides convenient utilities for data preprocessing and for training and testing models.
DataSet & Instance

View File

@ -2,6 +2,8 @@
FastNLP 1-minute tutorial
=====================
The original notebook is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_1min_tutorial.ipynb
step 1
------

View File

@ -0,0 +1,5 @@
fastNLP advanced tutorial
===============
The original notebook is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb

View File

@ -0,0 +1,5 @@
fastNLP developer guide
===============
The original document is at https://github.com/fastnlp/fastNLP/blob/master/tutorials/tutorial_for_developer.md

View File

@ -5,6 +5,7 @@ Installation
.. contents::
:local:
Make sure your environment satisfies the requirements listed in https://github.com/fastnlp/fastNLP/blob/master/requirements.txt .
Run the following commands to install fastNLP package:

View File

@ -6,4 +6,6 @@ Quickstart
../tutorials/fastnlp_1_minute_tutorial
../tutorials/fastnlp_10tmin_tutorial
../tutorials/fastnlp_advanced_tutorial
../tutorials/fastnlp_developer_guide

View File

@ -18,26 +18,27 @@ print(cws.predict(text))
# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?']
```
### Chinese word segmentation + POS tagging
### POS tagging
```python
text = ['编者按7月12日英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
# input: sequences that are already word-segmented
text = [['编者', '按:', '7月', '12日', '，', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '？']]
from fastNLP.api import POS
pos = POS(device='cpu')
print(pos.predict(text))
# [['编者/NN', '按/P', '/PU', '7月/NT', '12日/NR', '/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一/OD', '款高/NN', '科技/NN', '隐形/NN', '无/VE', '人机/NN', '雷电/NN', '之/DEG', '神/NN', '。/PU'], ['这/DT', '款/NN', '飞行/VV', '从/P', '外型/NN', '上/LC', '来/MSP', '看/VV', '酷似/VV', '电影/NN', '中/LC', '的/DEG', '太空/NN', '飞行器/NN', '/PU', '据/P', '英国/NR', '方面/NN', '介绍/VV', '/PU', '可以/VV', '实现/VV', '洲际/NN', '远程/NN', '打击/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无/VE', '人机/NN', '到底/AD', '有/VE', '多/CD', '厉害/NN', '/PU']]
# [['编者/NN', '按/NN', '7月/NT', '12日/NT', '/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '/PU']]
```
### Chinese word segmentation + POS tagging + dependency parsing
### Dependency parsing
```python
text = ['编者按7月12日英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
text = [['编者', '按:', '7月', '12日', '，', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '？']]
from fastNLP.api import Parser
parser = Parser(device='cpu')
print(parser.predict(text))
# [['12/nsubj', '12/prep', '2/punct', '5/nn', '2/pobj', '12/punct', '11/nn', '11/nn', '11/nn', '11/nn', '2/pobj', '0/root', '12/asp', '15/det', '16/nsubj', '21/rcmod', '16/cpm', '21/nummod', '21/nn', '21/nn', '22/top', '12/ccomp', '24/nn', '26/assmod', '24/assm', '22/dobj', '12/punct'], ['2/det', '8/xsubj', '8/mmod', '8/prep', '6/lobj', '4/plmod', '8/prtmod', '0/root', '8/ccomp', '11/lobj', '14/assmod', '11/assm', '14/nn', '9/dobj', '8/punct', '22/prep', '18/nn', '19/nsubj', '16/pccomp', '22/punct', '22/mmod', '8/dep', '25/nn', '25/nn', '22/dobj', '8/punct'], ['4/advmod', '3/det', '4/nsubj', '0/root', '4/dobj', '7/advmod', '4/conj', '9/nummod', '7/dobj', '4/punct']]
# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']]
```
See `examples.py` for complete examples.

View File

@ -9,9 +9,7 @@ from fastNLP.core.dataset import DataSet
from fastNLP.api.utils import load_url
from fastNLP.api.processor import ModelProcessor
from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.io.dataset_loader import ConllCWSReader, ConllxDataLoader
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
@ -19,9 +17,9 @@ from fastNLP.api.processor import IndexerProcessor
# TODO add pretrain urls
model_urls = {
"cws": "http://123.206.98.91:8888/download/cws_crf_1_11-457fc899.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190108-f3c60ee5.pkl",
"parser": "http://123.206.98.91:8888/download/biaffine_parser-3a2f052c.pkl"
"cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl",
"parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl"
}
@ -31,6 +29,16 @@ class API:
self._dict = None
def predict(self, *args, **kwargs):
"""Do prediction for the given input.
"""
raise NotImplementedError
def test(self, file_path):
"""Test performance over the given data set.
:param str file_path:
:return: a dictionary of metric values
"""
raise NotImplementedError
def load(self, path, device):
@ -69,12 +77,11 @@ class POS(API):
if not hasattr(self, "pipeline"):
raise ValueError("You have to load model first.")
sentence_list = []
sentence_list = content
# 1. check the type of the input content
if isinstance(content, str):
sentence_list.append(content)
elif isinstance(content, list):
sentence_list = content
for sentence in sentence_list:
if not all((type(obj) == str for obj in sentence)):
raise ValueError("Input must be list of list of string.")
# 2. build the DataSet
dataset = DataSet()
@ -83,36 +90,28 @@ class POS(API):
# 3. run the pipeline
self.pipeline(dataset)
def decode_tags(ins):
pred_tags = ins["tag"]
chars = ins["words"]
words = []
start_idx = 0
for idx, tag in enumerate(pred_tags):
if tag[0] == "S":
words.append(chars[start_idx:idx + 1] + "/" + tag[2:])
start_idx = idx + 1
elif tag[0] == "E":
words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
start_idx = idx + 1
return words
def merge_tag(words_list, tags_list):
rtn = []
for words, tags in zip(words_list, tags_list):
rtn.append([w + "/" + t for w, t in zip(words, tags)])
return rtn
dataset.apply(decode_tags, new_field_name="tag_output")
output = dataset.field_arrays["tag_output"].content
output = dataset.field_arrays["tag"].content
if isinstance(content, str):
return output[0]
elif isinstance(content, list):
return output
return merge_tag(content, output)
def test(self, file_path):
test_data = ZhConllPOSReader().load(file_path)
test_data = ConllxDataLoader().load(file_path)
tag_vocab = self._dict["tag_vocab"]
pipeline = self._dict["pipeline"]
save_dict = self._dict
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline
test_data.rename_field("pos_tags", "tag")
pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
@ -226,7 +225,7 @@ class CWS(API):
rec = eval_res['BMESF1PreRecMetric']['rec']
# print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))
return f1, pre, rec
return {"F1": f1, "precision": pre, "recall": rec}
class Parser(API):
@ -251,6 +250,7 @@ class Parser(API):
dataset.add_field('wp', pos_out)
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']], new_field_name='words')
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']], new_field_name='pos')
dataset.rename_field("words", "raw_words")
# 3. run the pipeline
self.pipeline(dataset)
@ -260,31 +260,74 @@ class Parser(API):
# output like: [['2/top', '0/root', '4/nn', '2/dep']]
return dataset.field_arrays['output'].content
def test(self, filepath):
data = ConllxDataLoader().load(filepath)
ds = DataSet()
for ins1, ins2 in zip(add_seg_tag(data), data):
ds.append(Instance(words=ins1[0], tag=ins1[1],
gold_words=ins2[0], gold_pos=ins2[1],
gold_heads=ins2[2], gold_head_tags=ins2[3]))
def load_test_file(self, path):
def get_one(sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
data = [get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))
return data_list
def test(self, filepath):
data = self.load_test_file(filepath)
def convert(data):
BOS = '<BOS>'
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + sample[2]
head_tags = [BOS] + sample[3]
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
tags=head_tags))
return dataset
ds = convert(data)
pp = self.pipeline
for p in pp:
if p.field_name == 'word_list':
p.field_name = 'gold_words'
elif p.field_name == 'pos_list':
p.field_name = 'gold_pos'
# ds.rename_field("words", "raw_words")
# ds.rename_field("tag", "pos")
pp(ds)
head_cor, label_cor, total = 0, 0, 0
for ins in ds:
head_gold = ins['gold_heads']
head_pred = ins['heads']
head_pred = ins['arc_pred']
length = len(head_gold)
total += length
for i in range(length):
head_cor += 1 if head_pred[i] == head_gold[i] else 0
uas = head_cor / total
print('uas:{:.2f}'.format(uas))
# print('uas:{:.2f}'.format(uas))
for p in pp:
if p.field_name == 'gold_words':
@ -292,7 +335,7 @@ class Parser(API):
elif p.field_name == 'gold_pos':
p.field_name = 'pos_list'
return uas
return {"USA": round(uas, 5)}
class Analyzer:

View File

@ -15,15 +15,42 @@ def chinese_word_segmentation():
print(cws.predict(text))
def chinese_word_segmentation_test():
cws = CWS(device='cpu')
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))
def pos_tagging():
# input: sequences that are already word-segmented
text = [['编者', '按:', '7月', '12日', '，', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '？']]
pos = POS(device='cpu')
print(pos.predict(text))
def pos_tagging_test():
pos = POS(device='cpu')
print(pos.test("../../test/data_for_tests/zh_sample.conllx"))
def syntactic_parsing():
text = [['编者', '按:', '7月', '12日', '，', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '？']]
parser = Parser(device='cpu')
print(parser.predict(text))
def syntactic_parsing_test():
parser = Parser(device='cpu')
print(parser.test("../../test/data_for_tests/zh_sample.conllx"))
if __name__ == "__main__":
# chinese_word_segmentation()
# chinese_word_segmentation_test()
# pos_tagging()
# pos_tagging_test()
syntactic_parsing()
# syntactic_parsing_test()

View File

@ -102,6 +102,7 @@ class PreAppendProcessor(Processor):
[data] + instance[field_name]
"""
def __init__(self, data, field_name, new_added_field_name=None):
super(PreAppendProcessor, self).__init__(field_name, new_added_field_name)
self.data = data
@ -116,6 +117,7 @@ class SliceProcessor(Processor):
Take a slice of a field; equivalent to instance[field_name][start:end:step].
"""
def __init__(self, start, end, step, field_name, new_added_field_name=None):
super(SliceProcessor, self).__init__(field_name, new_added_field_name)
for o in (start, end, step):
@ -132,6 +134,7 @@ class Num2TagProcessor(Processor):
Convert the numbers in a sentence into a given tag.
"""
def __init__(self, tag, field_name, new_added_field_name=None):
"""
@ -163,6 +166,7 @@ class IndexerProcessor(Processor):
Given a vocabulary, convert the specified field into index form. The field should be a 1-D list, e.g.
['', '', xxx]
"""
def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True):
assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
@ -215,6 +219,7 @@ class SeqLenProcessor(Processor):
Add a new sequence-length field based on an existing field, taking the length of that field's first dimension.
"""
def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True):
super(SeqLenProcessor, self).__init__(field_name, new_added_field_name)
self.is_input = is_input
@ -229,6 +234,7 @@ class SeqLenProcessor(Processor):
from fastNLP.core.utils import _build_args
class ModelProcessor(Processor):
def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
"""
@ -292,6 +298,7 @@ class Index2WordProcessor(Processor):
Convert an index field of a DataSet back into strings according to the vocab.
"""
def __init__(self, vocab, field_name, new_added_field_name):
super(Index2WordProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab
@ -303,7 +310,6 @@ class Index2WordProcessor(Processor):
class SetTargetProcessor(Processor):
# TODO; remove it.
def __init__(self, *fields, flag=True):
super(SetTargetProcessor, self).__init__(None, None)
self.fields = fields
@ -313,6 +319,7 @@ class SetTargetProcessor(Processor):
dataset.set_target(*self.fields, flag=self.flag)
return dataset
class SetInputProcessor(Processor):
def __init__(self, *fields, flag=True):
super(SetInputProcessor, self).__init__(None, None)
@ -322,3 +329,103 @@ class SetInputProcessor(Processor):
def process(self, dataset):
dataset.set_input(*self.fields, flag=self.flag)
return dataset
class VocabIndexerProcessor(Processor):
"""
Build a Vocabulary from the DataSet and use it to index the field numerically. The newly generated index field is stored under
new_added_filed_name; if new_added_field_name is not provided, the original field_name is overwritten.
"""
def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
verbose=0, is_input=True):
"""
:param field_name: the field from which the vocabulary is built and on which indexing is performed.
:param new_added_filed_name: name of the index field generated by indexing; if not given, field_name is overwritten.
:param min_freq: minimum word frequency allowed by the created Vocabulary.
:param max_size: maximum number of words allowed in the created Vocabulary.
:param verbose: 0 prints nothing, 1 prints information.
:param bool is_input:
"""
super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
self.min_freq = min_freq
self.max_size = max_size
self.verbose = verbose
self.is_input = is_input
def construct_vocab(self, *datasets):
"""
Build the vocabulary from the given DataSets.
:param datasets: DataSet objects used to build the vocabulary.
:return:
"""
self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
self.vocab.build_vocab()
if self.verbose:
print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))
def process(self, *datasets, only_index_dataset=None):
"""
If no Vocabulary has been built yet, build it from the given datasets; otherwise use the existing vocabulary. Once a vocabulary is
available, both datasets and only_index_dataset will be indexed.
:param datasets: data of type DataSet.
:param only_index_dataset: DataSet, or list of DataSet. These are only indexed; they are not used to build the vocabulary.
:return:
"""
if len(datasets) == 0 and not hasattr(self, 'vocab'):
raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
if not hasattr(self, 'vocab'):
self.construct_vocab(*datasets)
else:
if self.verbose:
print("Using constructed vocabulary with {} items.".format(len(self.vocab)))
to_index_datasets = []
if len(datasets) != 0:
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
if not (only_index_dataset is None):
if isinstance(only_index_dataset, list):
for dataset in only_index_dataset:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
elif isinstance(only_index_dataset, DataSet):
to_index_datasets.append(only_index_dataset)
else:
raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))
for dataset in to_index_datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name, is_input=self.is_input)
# return a single DataSet during inference, to stay consistent with other processors
if len(to_index_datasets) == 1:
return to_index_datasets[0]
def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
self.vocab = vocab
def delete_vocab(self):
del self.vocab
def get_vocab_size(self):
return len(self.vocab)
def set_verbose(self, verbose):
"""
Set the processor's verbosity.
:param verbose: int; 0 prints nothing, 1 prints vocab information.
:return:
"""
self.verbose = verbose
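As a quick illustration of the new VocabIndexerProcessor, a minimal usage sketch follows; the import paths and the toy data are assumptions, not part of this commit:
```python
# Minimal sketch (assumed import paths and toy data): build a vocab from one field and index it.
from fastNLP.core.dataset import DataSet
from fastNLP.api.processor import VocabIndexerProcessor

ds = DataSet({"words": [["this", "is", "a", "demo"], ["another", "one"]]})

proc = VocabIndexerProcessor(field_name="words", new_added_filed_name="word_index", verbose=1)
ds = proc.process(ds)  # builds the Vocabulary from ds, then writes the indexed "word_index" field

print(proc.get_vocab_size())
print(ds.field_arrays["word_index"].content)
```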

View File

@ -2,7 +2,7 @@ import numpy as np
import torch
from fastNLP.core.sampler import RandomSampler
import torch.multiprocessing as mp
class Batch(object):
"""Batch is an iterable object which iterates over mini-batches.
@ -16,10 +16,11 @@ class Batch(object):
:param int batch_size: the size of the batch
:param Sampler sampler: a Sampler object
:param bool as_numpy: If True, return Numpy array. Otherwise, return torch tensors.
:param bool prefetch: If True, use multiprocessing to fetch next batch when training.
:param str or torch.device device: the batch's device, if as_numpy is True, device is ignored.
"""
def __init__(self, dataset, batch_size, sampler=RandomSampler(), as_numpy=False):
def __init__(self, dataset, batch_size, sampler=RandomSampler(), as_numpy=False, prefetch=False):
self.dataset = dataset
self.batch_size = batch_size
self.sampler = sampler
@ -28,16 +29,12 @@ class Batch(object):
self.curidx = 0
self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0)
self.cur_batch_indices = None
self.prefetch = prefetch
self.lengths = 0
def __iter__(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
return self
def __next__(self):
def fetch_one(self):
if self.curidx >= len(self.idx_list):
raise StopIteration
return None
else:
endidx = min(self.curidx + self.batch_size, len(self.idx_list))
batch_x, batch_y = {}, {}
@ -48,7 +45,7 @@ class Batch(object):
for field_name, field in self.dataset.get_all_fields().items():
if field.is_target or field.is_input:
batch = field.get(indices)
if not self.as_numpy:
if not self.as_numpy and field.padder is not None:
batch = to_tensor(batch, field.dtype)
if field.is_target:
batch_y[field_name] = batch
@ -56,9 +53,29 @@ class Batch(object):
batch_x[field_name] = batch
self.curidx = endidx
return batch_x, batch_y
def __iter__(self):
"""
Iterate over the dataset and fetch batch data. The fetch process does not block the iteration process.
:return:
"""
if self.prefetch:
return run_batch_iter(self)
def batch_iter():
self.init_iter()
while 1:
res = self.fetch_one()
if res is None:
break
yield res
return batch_iter()
def init_iter(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
def __len__(self):
return self.num_batches
@ -67,8 +84,50 @@ class Batch(object):
def to_tensor(batch, dtype):
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
if dtype in (float, np.float32, np.float64):
batch = torch.FloatTensor(batch)
try:
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
if dtype in (float, np.float32, np.float64):
batch = torch.FloatTensor(batch)
except:
pass
return batch
def run_fetch(batch, q):
batch.init_iter()
# print('start fetch')
while 1:
res = batch.fetch_one()
# print('fetch one')
q.put(res)
if res is None:
# print('fetch done, waiting processing')
q.join()
break
# print('fetch exit')
def run_batch_iter(batch):
q = mp.JoinableQueue(maxsize=10)
fetch_p = mp.Process(target=run_fetch, args=(batch, q))
fetch_p.daemon = True
fetch_p.start()
# print('fork fetch process')
while 1:
try:
res = q.get(timeout=1)
q.task_done()
# print('get fetched')
if res is None:
break
yield res
except Exception as e:
if fetch_p.is_alive():
continue
else:
break
fetch_p.terminate()
fetch_p.join()
# print('iter done')
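For context, a minimal sketch of iterating with the new prefetch option; the toy DataSet and field names are assumptions. Because prefetching forks a worker process, the loop is kept under a __main__ guard:
```python
# Minimal sketch (assumed toy data): iterate mini-batches with background prefetching enabled.
from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import SequentialSampler

if __name__ == "__main__":
    ds = DataSet({"x": [[1, 2], [3, 4, 5], [6]], "y": [0, 1, 0]})
    ds.set_input("x")
    ds.set_target("y")

    # prefetch=True makes __iter__ delegate to run_batch_iter, which fetches batches in a child process
    for batch_x, batch_y in Batch(ds, batch_size=2, sampler=SequentialSampler(), prefetch=True):
        print(batch_x["x"].shape, batch_y["y"])
```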

View File

@ -1,3 +1,11 @@
import os
import torch
from tensorboardX import SummaryWriter
from fastNLP.io.model_io import ModelSaver, ModelLoader
class Callback(object):
"""An Interface for all callbacks.
@ -7,38 +15,42 @@ class Callback(object):
def __init__(self):
super(Callback, self).__init__()
self.trainer = None  # reassigned inside Trainer
def before_train(self):
def on_train_begin(self):
# before the main training loop
pass
def before_epoch(self, cur_epoch, total_epoch):
def on_epoch_begin(self, cur_epoch, total_epoch):
# at the beginning of each epoch
pass
def before_batch(self, batch_x, batch_y, indices):
def on_batch_begin(self, batch_x, batch_y, indices):
# at the beginning of each step/mini-batch
pass
def before_loss(self, batch_y, predict_y):
def on_loss_begin(self, batch_y, predict_y):
# after data_forward, and before loss computation
pass
def before_backward(self, loss, model):
def on_backward_begin(self, loss, model):
# after loss computation, and before gradient backward
pass
def after_backward(self, model):
def on_backward_end(self, model):
pass
def after_step(self, optimizer):
def on_step_end(self, optimizer):
pass
def after_batch(self, *args):
def on_batch_end(self, *args):
# at the end of each step/mini-batch
pass
def after_valid(self, eval_result, metric_key, optimizer):
def on_valid_begin(self):
pass
def on_valid_end(self, eval_result, metric_key, optimizer):
"""
Called after each evaluation on the validation set; eval_result is passed in.
@ -49,7 +61,7 @@ class Callback(object):
"""
pass
def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
"""
Called at the end of every epoch.
@ -60,7 +72,7 @@ class Callback(object):
"""
pass
def after_train(self, model):
def on_train_end(self, model):
"""
Called when training ends.
@ -69,16 +81,16 @@ class Callback(object):
"""
pass
def on_exception(self, exception, model, indices):
def on_exception(self, exception, model):
"""
Called when an exception occurs during training.
:param exception: some type of Exception, e.g. KeyboardInterrupt.
:param model: the model passed to the Trainer.
:param indices: index of the current batch.
:return:
"""
pass
def transfer(func):
"""装饰器将对CallbackManager的调用转发到各个Callback子类.
@ -125,91 +137,95 @@ class CallbackManager(Callback):
raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.")
@transfer
def before_train(self):
def on_train_begin(self):
pass
@transfer
def before_epoch(self, cur_epoch, total_epoch):
def on_epoch_begin(self, cur_epoch, total_epoch):
pass
@transfer
def before_batch(self, batch_x, batch_y, indices):
def on_batch_begin(self, batch_x, batch_y, indices):
pass
@transfer
def before_loss(self, batch_y, predict_y):
def on_loss_begin(self, batch_y, predict_y):
pass
@transfer
def before_backward(self, loss, model):
def on_backward_begin(self, loss, model):
pass
@transfer
def after_backward(self, model):
def on_backward_end(self, model):
pass
@transfer
def after_step(self, optimizer):
def on_step_end(self, optimizer):
pass
@transfer
def after_batch(self):
def on_batch_end(self):
pass
@transfer
def after_valid(self, eval_result, metric_key, optimizer):
def on_valid_begin(self):
pass
@transfer
def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_valid_end(self, eval_result, metric_key, optimizer):
pass
@transfer
def after_train(self, model):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
pass
@transfer
def on_exception(self, exception, model, indices):
def on_train_end(self, model):
pass
@transfer
def on_exception(self, exception, model):
pass
class DummyCallback(Callback):
def before_train(self, *arg):
def on_train_begin(self, *arg):
print(arg)
def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
print(cur_epoch, n_epoch, optimizer)
class EchoCallback(Callback):
def before_train(self):
def on_train_begin(self):
print("before_train")
def before_epoch(self, cur_epoch, total_epoch):
def on_epoch_begin(self, cur_epoch, total_epoch):
print("before_epoch")
def before_batch(self, batch_x, batch_y, indices):
def on_batch_begin(self, batch_x, batch_y, indices):
print("before_batch")
def before_loss(self, batch_y, predict_y):
def on_loss_begin(self, batch_y, predict_y):
print("before_loss")
def before_backward(self, loss, model):
def on_backward_begin(self, loss, model):
print("before_backward")
def after_batch(self):
def on_batch_end(self):
print("after_batch")
def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
print("after_epoch")
def after_train(self, model):
def on_train_end(self, model):
print("after_train")
class GradientClipCallback(Callback):
def __init__(self, parameters=None, clip_value=1, clip_type='norm'):
"""
Clip the parameters' gradients to a given range before each backward pass.
"""Clip the parameters' gradients to a given range before each backward pass.
:param parameters: None, torch.Tensor or List[torch.Tensor], usually obtained via model.parameters(). If None, all parameters
of the Trainer's model are clipped by default.
@ -231,12 +247,229 @@ class GradientClipCallback(Callback):
self.parameters = parameters
self.clip_value = clip_value
def after_backward(self, model):
def on_backward_end(self, model):
self.clip_fun(model.parameters(), self.clip_value)
class CallbackException(BaseException):
def __init__(self, msg):
super(CallbackException, self).__init__(msg)
class EarlyStopError(CallbackException):
def __init__(self, msg):
super(EarlyStopError, self).__init__(msg)
class EarlyStopCallback(Callback):
def __init__(self, patience):
"""
:param int patience: number of epochs to wait before stopping.
"""
super(EarlyStopCallback, self).__init__()
self.trainer = None # override by CallbackManager
self.patience = patience
self.wait = 0
self.epoch = 0
def on_valid_end(self, eval_result, metric_key, optimizer):
self.epoch += 1
if not self.trainer._better_eval_result(eval_result):
# current result is getting worse
if self.wait == self.patience:
raise EarlyStopError("Early stopping raised.")
else:
self.wait += 1
else:
self.wait = 0
def on_exception(self, exception, model):
if isinstance(exception, EarlyStopError):
print("Early Stopping triggered in epoch {}!".format(self.epoch))
else:
raise exception  # re-raise unfamiliar errors
class LRScheduler(Callback):
def __init__(self, lr_scheduler):
"""对PyTorch LR Scheduler的包装
:param lr_scheduler: PyTorch的lr_scheduler
"""
super(LRScheduler, self).__init__()
import torch.optim
if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler):
self.scheduler = lr_scheduler
else:
raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.")
def on_epoch_begin(self, cur_epoch, total_epoch):
self.scheduler.step()
print("scheduler step ", "lr=", self.trainer.optimizer.param_groups[0]["lr"])
class ControlC(Callback):
def __init__(self, quit_all):
"""
:param quit_all: if True, exit the whole program when Ctrl+C is detected; otherwise only quit the Trainer.
"""
super(ControlC, self).__init__()
if type(quit_all) != bool:
raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.")
self.quit_all = quit_all
def on_exception(self, exception, model):
if isinstance(exception, KeyboardInterrupt):
if self.quit_all is True:
import sys
sys.exit(0)  # exit the whole program
else:
pass
else:
raise exception  # re-raise unfamiliar errors
class SmoothValue(object):
def __init__(self, beta: float):
self.beta, self.n, self.mov_avg = beta, 0, 0
self.smooth = None
def add_value(self, val: float) -> None:
"Add `val` to calculate updated smoothed value."
self.n += 1
self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val
self.smooth = self.mov_avg / (1 - self.beta ** self.n)
class LRFinder(Callback):
def __init__(self, n_batch, start_lr=1e-6, end_lr=10):
"""用第一个 epoch 找最佳的学习率从第二个epoch开始应用它
:param n_batch: 一个epoch内的iteration数
:param start_lr: 学习率下界
:param end_lr: 学习率上界
"""
super(LRFinder, self).__init__()
self.start_lr, self.end_lr = start_lr, end_lr
self.num_it = n_batch
self.stop = False
self.best_loss = 0.
self.best_lr = None
self.loss_history = []
self.smooth_value = SmoothValue(0.8)
self.opt = None
scale = (self.end_lr - self.start_lr) / self.num_it
self.lr_gen = (self.start_lr + scale * (step + 1) for step in range(self.num_it))
self.find = None
self.loader = ModelLoader()
def on_epoch_begin(self, cur_epoch, total_epoch):
if cur_epoch == 1:
self.opt = self.trainer.optimizer # pytorch optimizer
self.opt.param_groups[0]["lr"] = self.start_lr
# save model
ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True)
self.find = True
def on_backward_begin(self, loss, model):
if self.find:
if torch.isnan(loss) or self.stop is True:
self.stop = True
return
loss_val = loss.detach().cpu().data
self.loss_history.append(loss_val)
self.smooth_value.add_value(loss_val)
if self.best_loss == 0. or self.smooth_value.smooth < self.best_loss:
self.best_loss = self.smooth_value.smooth
self.best_lr = self.opt.param_groups[0]["lr"]
def on_batch_end(self, *args):
if self.find:
lr = next(self.lr_gen, None)
if lr is None or self.stop is True or self.loss_history[-1] > 4 * self.best_loss:
self.stop = True
return
self.opt.param_groups[0]["lr"] = lr
# self.loader.load_pytorch(self.trainer.model, "tmp")
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
if cur_epoch == 1:
self.opt.param_groups[0]["lr"] = self.best_lr
self.find = False
# reset model
ModelLoader().load_pytorch(self.trainer.model, "tmp")
print("Model reset. \nFind best lr={}".format(self.best_lr))
class TensorboardCallback(Callback):
"""
Accepts one or more of the following strings as arguments:
- "model"
- "loss"
- "metric"
"""
def __init__(self, *options):
super(TensorboardCallback, self).__init__()
args = {"model", "loss", "metric"}
for opt in options:
if opt not in args:
raise ValueError("Unrecognized argument {}. Expect one of {}".format(opt, args))
self.options = options
self._summary_writer = None
self.graph_added = False
def on_train_begin(self):
save_dir = self.trainer.save_path
if save_dir is None:
path = os.path.join("./", 'tensorboard_logs_{}'.format(self.trainer.start_time))
else:
path = os.path.join(save_dir, 'tensorboard_logs_{}'.format(self.trainer.start_time))
self._summary_writer = SummaryWriter(path)
def on_batch_begin(self, batch_x, batch_y, indices):
if "model" in self.options and self.graph_added is False:
# tensorboardX has a serious bug here; drawing the model graph is not possible for now
# from fastNLP.core.utils import _build_args
# inputs = _build_args(self.trainer.model, **batch_x)
# args = tuple([value for value in inputs.values()])
# args = args[0] if len(args) == 1 else args
# self._summary_writer.add_graph(self.trainer.model, torch.zeros(32, 2))
self.graph_added = True
def on_backward_begin(self, loss, model):
if "loss" in self.options:
self._summary_writer.add_scalar("loss", loss.item(), global_step=self.trainer.step)
if "model" in self.options:
for name, param in self.trainer.model.named_parameters():
if param.requires_grad:
self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.trainer.step)
# self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.trainer.step)
self._summary_writer.add_scalar(name + "_grad_mean", param.grad.mean(),
global_step=self.trainer.step)
def on_valid_end(self, eval_result, metric_key, optimizer):
if "metric" in self.options:
for name, metric in eval_result.items():
for metric_key, metric_val in metric.items():
self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val,
global_step=self.trainer.step)
def on_train_end(self, model):
self._summary_writer.close()
del self._summary_writer
def on_exception(self, exception, model):
if hasattr(self, "_summary_writer"):
self._summary_writer.close()
del self._summary_writer
if __name__ == "__main__":
manager = CallbackManager(env={"n_epoch": 3}, callbacks=[DummyCallback(), DummyCallback()])
manager.before_train(10, 11, 12)
manager.on_train_begin(10, 11, 12)
# print(manager.after_epoch())
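Since the old before_*/after_* hook names are replaced by on_* names throughout, a minimal sketch of a callback written against the new interface may help; the LossLogger class and the Trainer wiring in the trailing comment are assumptions, not part of this commit:
```python
# Minimal sketch (assumed class and wiring): a custom callback using the renamed on_* hooks.
from fastNLP.core.callback import Callback, EarlyStopCallback, GradientClipCallback

class LossLogger(Callback):
    """Collect losses during each epoch and print their mean at the epoch's end."""
    def on_train_begin(self):
        self.losses = []

    def on_backward_begin(self, loss, model):
        # called after loss computation and before the gradient backward pass
        self.losses.append(loss.item())

    def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
        if self.losses:
            print("epoch {}/{}: mean loss {:.4f}".format(cur_epoch, n_epoch, sum(self.losses) / len(self.losses)))
        self.losses = []

# Such callbacks would be passed to the Trainer via its callbacks argument, e.g.
# callbacks=[LossLogger(), GradientClipCallback(clip_value=5), EarlyStopCallback(patience=3)]
```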

View File

@ -2,6 +2,7 @@ import _pickle as pickle
import numpy as np
from fastNLP.core.fieldarray import AutoPadder
from fastNLP.core.fieldarray import FieldArray
from fastNLP.core.instance import Instance
from fastNLP.core.utils import get_func_signature
@ -88,12 +89,13 @@ class DataSet(object):
raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}")
data_set = DataSet()
for field in self.field_arrays.values():
data_set.add_field(name=field.name,
fields=field.content[idx],
padding_val=field.padding_val,
is_input=field.is_input,
is_target=field.is_target)
data_set.add_field(name=field.name, fields=field.content[idx], padder=field.padder,
is_input=field.is_input, is_target=field.is_target)
return data_set
elif isinstance(idx, str):
if idx not in self:
raise KeyError("No such field called {} in DataSet.".format(idx))
return self.field_arrays[idx]
else:
raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx)))
@ -144,19 +146,23 @@ class DataSet(object):
if len(self.field_arrays) == 0:
# DataSet has no field yet
for name, field in ins.fields.items():
self.field_arrays[name] = FieldArray(name, [field])
field = field.tolist() if isinstance(field, np.ndarray) else field
self.field_arrays[name] = FieldArray(name, [field])  # the first sample must be wrapped in a list
else:
assert len(self.field_arrays) == len(ins.fields)
if len(self.field_arrays) != len(ins.fields):
raise ValueError(
"DataSet object has {} fields, but attempt to append an Instance object with {} fields."
.format(len(self.field_arrays), len(ins.fields)))
for name, field in ins.fields.items():
assert name in self.field_arrays
self.field_arrays[name].append(field)
def add_field(self, name, fields, padding_val=0, is_input=False, is_target=False):
def add_field(self, name, fields, padder=AutoPadder(pad_val=0), is_input=False, is_target=False):
"""Add a new field to the DataSet.
:param str name: the name of the field.
:param fields: a list of int, float, or other objects.
:param int padding_val: integer for padding.
:param padder: a PadderBase object that decides how this field is padded; the default is fine in most cases.
:param bool is_input: whether this field is model input.
:param bool is_target: whether this field is label or target.
"""
@ -164,8 +170,8 @@ class DataSet(object):
if len(self) != len(fields):
raise RuntimeError(f"The field to append must have the same size as dataset. "
f"Dataset size {len(self)} != field size {len(fields)}")
self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target,
is_input=is_input)
self.field_arrays[name] = FieldArray(name, fields, is_target=is_target, is_input=is_input,
padder=padder)
def delete_field(self, name):
"""Delete a field based on the field name.
@ -229,6 +235,25 @@ class DataSet(object):
else:
raise KeyError("{} is not a valid field name.".format(name))
def set_padder(self, field_name, padder):
"""
Set the padder for field_name.
:param field_name: str; the field whose padding method is set to padder.
:param padder: a PadderBase instance or None. None removes the padder, i.e. no padding is applied to this field.
:return:
"""
self.field_arrays[field_name].set_padder(padder)
def set_pad_val(self, field_name, pad_val):
"""
Set the pad value of a field.
:param field_name: str; the field whose pad_val is modified.
:param pad_val: int; the field's padder will use pad_val as the padding index.
:return:
"""
self.field_arrays[field_name].set_pad_val(pad_val)
def get_input_name(self):
"""Get all field names with `is_input` as True.
@ -254,7 +279,7 @@ class DataSet(object):
:return results: if new_field_name is not passed, returned values of the function over all instances.
"""
results = [func(ins) for ins in self._inner_iter()]
if len(list(filter(lambda x: x is not None, results))) == 0 and not (new_field_name is None): # all None
if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None
raise ValueError("{} always return None.".format(get_func_signature(func=func)))
extra_param = {}
@ -270,12 +295,11 @@ class DataSet(object):
extra_param['is_input'] = old_field.is_input
if 'is_target' not in extra_param:
extra_param['is_target'] = old_field.is_target
self.add_field(name=new_field_name,
fields=results,
padding_val=old_field.padding_val,
**extra_param)
self.add_field(name=new_field_name, fields=results, is_input=extra_param["is_input"],
is_target=extra_param["is_target"])
else:
self.add_field(name=new_field_name, fields=results, **extra_param)
self.add_field(name=new_field_name, fields=results, is_input=extra_param.get("is_input", None),
is_target=extra_param.get("is_target", None))
else:
return results
@ -314,8 +338,17 @@ class DataSet(object):
for field_name in self.field_arrays:
train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
train_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
train_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
train_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
train_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim
dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
dev_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
dev_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
dev_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
dev_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim
return train_set, dev_set
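A minimal sketch of the new per-field padding controls (set_pad_val / set_padder) on DataSet; the toy fields and values are assumptions:
```python
# Minimal sketch (assumed toy data): control padding per field with set_pad_val / set_padder.
from fastNLP.core.dataset import DataSet

ds = DataSet({"words": [[1, 2, 3], [4, 5]], "seq_len": [3, 2]})
ds.set_input("words", "seq_len")

ds.set_pad_val("words", -100)   # the AutoPadder of "words" now pads with -100
ds.set_padder("seq_len", None)  # remove the padder so "seq_len" is returned without padding
```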

View File

@ -1,51 +1,168 @@
import numpy as np
class PadderBase:
"""
All padders must inherit from this class and override the __call__() method.
Used to pad a batch. The elements passed in are handled in place: modifying an element directly may change the underlying data, so deepcopy it before any in-place modification.
"""
def __init__(self, pad_val=0, **kwargs):
self.pad_val = pad_val
def set_pad_val(self, pad_val):
self.pad_val = pad_val
def __call__(self, contents, field_name, field_ele_dtype):
"""
The input is a list. Suppose we have the following DataSet:
from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
dataset.append(Instance(word='this is a demo', length=4,
chars=[['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['d', 'e', 'm', 'o']]))
dataset.append(Instance(word='another one', length=2,
chars=[['a', 'n', 'o', 't', 'h', 'e', 'r'], ['o', 'n', 'e']]))
# Assume batch_size=2. Strings are used below only for readability; in practice word and chars may already be indices at padding time.
the pad_func of the word field will receive:
[
'this is a demo',
'another one'
]
the pad_func of the length field will receive:
[4, 2]
the pad_func of the chars field will receive:
[
[['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['d', 'e', 'm', 'o']],
[['a', 'n', 'o', 't', 'h', 'e', 'r'], ['o', 'n', 'e']]
]
That is, the contents of one field across all instances are collected into a single list and passed in.
:param contents: List[element]. The elements are passed in place: modifying an element directly may change the data, so deepcopy
it before any in-place modification.
:param field_name: str, the name of the field, used to locate errors.
:param field_ele_dtype: np.int64, np.float64 or np.str. The type of the innermost list elements of this field; it helps decide whether to pad and is rarely needed.
:return: List[padded_element] or np.array([padded_element])
"""
raise NotImplementedError
class AutoPadder(PadderBase):
"""
Automatically decides from the contents whether padding is needed.
(1) If the element type (the data type of the innermost list elements of the field, visible via FieldArray.dtype; e.g. the element
type of ['This', 'is', ...] is np.str and that of [[1,2], ...] is np.int64) is not (np.int64, np.float64), no padding is performed.
(2) If the element type is (np.int64, np.float64):
(2.1) If the field's content is a single value, e.g. sequence_length, no padding is performed.
(2.2) If the field's content is a List, the Lists within a batch are padded to the same length; if that List itself contains nested Lists that need padding, use another padder.
For example, an instance whose field is [1, 2, 3] can be padded, while [[1,2], [3,4, ...]] cannot.
"""
def __init__(self, pad_val=0):
"""
:param pad_val: int, the index used at padded positions.
"""
super().__init__(pad_val=pad_val)
def _is_two_dimension(self, contents):
"""
Check whether contents has exactly two dimensions. [[1,2], [3]] has two dimensions; [[[1,2], [3, 4, 5]], [[4,5]]] has three.
:param contents:
:return:
"""
value = contents[0]
if isinstance(value , (np.ndarray, list)):
value = value[0]
if isinstance(value, (np.ndarray, list)):
return False
return True
return False
def __call__(self, contents, field_name, field_ele_dtype):
if not is_iterable(contents[0]):
array = np.array([content for content in contents], dtype=field_ele_dtype)
elif field_ele_dtype in (np.int64, np.float64) and self._is_two_dimension(contents):
max_len = max([len(content) for content in contents])
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content in enumerate(contents):
array[i][:len(content)] = content
else: # should only be str
array = np.array([content for content in contents])
return array
class FieldArray(object):
"""``FieldArray`` is the collection of ``Instance``s of the same field.
It is the basic element of ``DataSet`` class.
:param str name: the name of the FieldArray
:param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray.
:param int padding_val: the integer for padding. Default: 0.
:param bool is_target: If True, this FieldArray is used to compute loss.
:param bool is_input: If True, this FieldArray is used to the model input.
:param padder: a PadderBase instance. In most cases this does not need to be set, unless padding over multiple dimensions is required (e.g. character-level padding for English).
"""
def __init__(self, name, content, padding_val=0, is_target=None, is_input=None):
def __init__(self, name, content, is_target=None, is_input=None, padder=AutoPadder(pad_val=0)):
"""DataSet在初始化时会有两类方法对FieldArray操作
1 如果DataSet使用dict初始化那么在add_field中会构造FieldArray
1.1) 二维list DataSet({"x": [[1, 2], [3, 4]]})
1.2) 二维array DataSet({"x": np.array([[1, 2], [3, 4]])})
1.3) 三维list DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
1.4) list of array: DataSet({"x": [np.array([1,2,3]), np.array([1,2,3])]})
2 如果DataSet使用list of Instance 初始化,那么在append中会先对第一个样本初始化FieldArray
然后后面的样本使用FieldArray.append进行添加
2.1) 一维list DataSet([Instance(x=[1, 2, 3, 4])])
2.2) 一维array DataSet([Instance(x=np.array([1, 2, 3, 4]))])
2.3) 二维list DataSet([Instance(x=[[1, 2], [3, 4]])])
2.4) 二维array DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
类型检查(dtype check)发生在当该field被设置为is_input或者is_target时
"""
self.name = name
if isinstance(content, list):
content = content
# if the DataSet is initialized from a dict, content may be a 2-D list / 2-D array / 3-D list
# if the DataSet is initialized from a list of Instance, content may be [list]/[array]/[2-D list]
for idx, item in enumerate(content):
# this is the first sample when initializing from a list of Instance: FieldArray(name, [field])
# convert [np.array] into a list of lists
# [array, array, array] is also supported
if isinstance(item, np.ndarray):
content[idx] = content[idx].tolist()
elif isinstance(content, np.ndarray):
content = content.tolist() # convert np.ndarray into 2-D list
else:
raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content)))
self.content = content
self.padding_val = padding_val
if len(content) == 0:
raise RuntimeError("Cannot initialize FieldArray with empty list.")
self._is_target = None
self.content = content  # 1-D, 2-D or 3-D list; shapes may be ragged
self.content_dim = None  # number of dimensions of the content list
self.set_padder(padder)
self.BASIC_TYPES = (int, float, str)  # basic Python types accepted in content; np.array is intentionally excluded
self.pytype = None
self.dtype = None
self._is_input = None
self._is_target = None
self.BASIC_TYPES = (int, float, str, np.ndarray)
self.is_2d_list = False
self.pytype = None # int, float, str, or np.ndarray
self.dtype = None # np.int64, np.float64, np.str
if is_input is not None:
if is_input is not None or is_target is not None:
self.is_input = is_input
if is_target is not None:
self.is_target = is_target
def _set_dtype(self):
self.pytype = self._type_detection(self.content)
self.dtype = self._map_to_np_type(self.pytype)
@property
def is_input(self):
return self._is_input
@is_input.setter
def is_input(self, value):
"""
Called when field_array.is_input = True / False is assigned.
"""
if value is True:
self.pytype = self._type_detection(self.content)
self.dtype = self._map_to_np_type(self.pytype)
self._set_dtype()
self._is_input = value
@property
@ -54,46 +171,99 @@ class FieldArray(object):
@is_target.setter
def is_target(self, value):
"""
Called when field_array.is_target = True / False is assigned.
"""
if value is True:
self.pytype = self._type_detection(self.content)
self.dtype = self._map_to_np_type(self.pytype)
self._set_dtype()
self._is_target = value
def _type_detection(self, content):
"""
:param content: a list of int, float, str or np.ndarray, or a list of list of one.
:return type: one of int, float, str, np.ndarray
"""当该field被设置为is_input或者is_target时被调用
"""
if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list):
# content is a 2-D list
if not all(isinstance(_, list) for _ in content): # strict check 2-D list
raise TypeError("Please provide 2-D list.")
type_set = set([self._type_detection(x) for x in content])
if len(type_set) == 2 and int in type_set and float in type_set:
type_set = {float}
elif len(type_set) > 1:
raise TypeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set))
self.is_2d_list = True
if len(content) == 0:
raise RuntimeError("Empty list in Field {}.".format(self.name))
type_set = set([type(item) for item in content])
if list in type_set:
if len(type_set) > 1:
# list and non-list types mixed together
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
# >1维list
inner_type_set = set()
for l in content:
[inner_type_set.add(type(obj)) for obj in l]
if list not in inner_type_set:
# 2-D list
self.content_dim = 2
return self._basic_type_detection(inner_type_set)
else:
if len(inner_type_set) == 1:
# >2维list
inner_inner_type_set = set()
for _2d_list in content:
for _1d_list in _2d_list:
[inner_inner_type_set.add(type(obj)) for obj in _1d_list]
if list in inner_inner_type_set:
raise RuntimeError("FieldArray cannot handle 4-D or more-D list.")
# 3-D list
self.content_dim = 3
return self._basic_type_detection(inner_inner_type_set)
else:
# list and non-list types mixed together
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, inner_type_set))
else:
# 1-D list
for content_type in type_set:
if content_type not in self.BASIC_TYPES:
raise RuntimeError("Unexpected data type in Field '{}'. Expect one of {}. Got {}.".format(
self.name, self.BASIC_TYPES, content_type))
self.content_dim = 1
return self._basic_type_detection(type_set)
def _basic_type_detection(self, type_set):
"""
:param type_set: a set of Python types
:return: one of self.BASIC_TYPES
"""
if len(type_set) == 1:
return type_set.pop()
elif isinstance(content, list):
# content is a 1-D list
if len(content) == 0:
# the old error is not informative enough.
raise RuntimeError("Cannot create FieldArray with an empty list. Or one element in the list is empty.")
type_set = set([type(item) for item in content])
if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES:
return type_set.pop()
elif len(type_set) == 2 and float in type_set and int in type_set:
elif len(type_set) == 2:
# multiple basic types; up-casting may be needed
if float in type_set and int in type_set:
# up-cast int to float
return float
else:
raise TypeError("Cannot create FieldArray with type {}".format(*type_set))
# str mixed with int or float
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
else:
raise TypeError("Cannot create FieldArray with type {}".format(type(content)))
# str, int and float mixed together
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
def _1d_list_check(self, val):
"""如果不是1D list就报错
"""
type_set = set((type(obj) for obj in val))
if any(obj not in self.BASIC_TYPES for obj in type_set):
raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set))
self._basic_type_detection(type_set)
# otherwise: _basic_type_detection will raise error
return True
def _2d_list_check(self, val):
"""如果不是2D list 就报错
"""
type_set = set(type(obj) for obj in val)
if list(type_set) != [list]:
raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set))
inner_type_set = set()
for l in val:
for obj in l:
inner_type_set.add(type(obj))
self._basic_type_detection(inner_type_set)
return True
@staticmethod
def _map_to_np_type(basic_type):
@ -108,38 +278,39 @@ class FieldArray(object):
:param val: int, float, str, or a list of one.
"""
if self.is_target is True or self.is_input is True:
# only check type when used as target or input
if isinstance(val, list):
pass
elif isinstance(val, tuple):  # make sure the outermost container is a list
val = list(val)
elif isinstance(val, np.ndarray):
val = val.tolist()
elif any((isinstance(val, t) for t in self.BASIC_TYPES)):
pass
else:
raise RuntimeError(
"Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES))
val_type = type(val)
if val_type == list: # shape check
if self.is_2d_list is False:
raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.")
if self.is_input is True or self.is_target is True:
if type(val) == list:
if len(val) == 0:
raise RuntimeError("Cannot append an empty list.")
val_list_type = set([type(_) for _ in val]) # type check
if len(val_list_type) == 2 and int in val_list_type and float in val_list_type:
# up-cast int to float
val_type = float
elif len(val_list_type) == 1:
val_type = val_list_type.pop()
raise ValueError("Cannot append an empty list.")
if self.content_dim == 2 and self._1d_list_check(val):
# 1-D list check
pass
elif self.content_dim == 3 and self._2d_list_check(val):
# 2-D list check
pass
else:
raise TypeError("Cannot append a list of {}".format(val_list_type))
raise RuntimeError(
"Dimension not matched: expect dim={}, got {}.".format(self.content_dim - 1, val))
elif type(val) in self.BASIC_TYPES and self.content_dim == 1:
# scalar check
if type(val) == float and self.pytype == int:
self.pytype = float
self.dtype = self._map_to_np_type(self.pytype)
else:
if self.is_2d_list is True:
raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.")
if val_type == float and self.pytype == int:
# up-cast
self.pytype = float
self.dtype = self._map_to_np_type(self.pytype)
elif val_type == int and self.pytype == float:
pass
elif val_type == self.pytype:
pass
else:
raise TypeError("Cannot append type {} into type {}".format(val_type, self.pytype))
raise RuntimeError(
"Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES))
self.content.append(val)
def __getitem__(self, indices):
@ -149,28 +320,44 @@ class FieldArray(object):
assert isinstance(idx, int)
self.content[idx] = val
def get(self, indices):
def get(self, indices, pad=True):
"""Fetch instances based on indices.
:param indices: an int, or a list of int.
:param pad: bool, whether to pad the returned result.
:return:
"""
if isinstance(indices, int):
return self.content[indices]
if self.is_input is False and self.is_target is False:
raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name))
batch_size = len(indices)
if not is_iterable(self.content[0]):
array = np.array([self.content[i] for i in indices], dtype=self.dtype)
elif self.dtype in (np.int64, np.float64):
max_len = max([len(self.content[i]) for i in indices])
array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype)
for i, idx in enumerate(indices):
array[i][:len(self.content[idx])] = self.content[idx]
else: # should only be str
array = np.array([self.content[i] for i in indices])
return array
contents = [self.content[i] for i in indices]
if self.padder is None or pad is False:
return np.array(contents)
else:
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype)
def set_padder(self, padder):
"""
Set the padding method.
:param padder: a PadderBase instance or None. None removes the padder.
:return:
"""
if padder is not None:
assert isinstance(padder, PadderBase), "padder must be of type PadderBase."
self.padder = padder
def set_pad_val(self, pad_val):
"""
Change the padder's pad_val.
:param pad_val: int
:return:
"""
if self.padder is not None:
self.padder.set_pad_val(pad_val)
def __len__(self):
"""Returns the size of FieldArray.
@ -186,3 +373,80 @@ def is_iterable(content):
except TypeError:
return False
return True
class EngChar2DPadder(PadderBase):
"""
Performs character-level 2-D padding for English. The corresponding field content should look like [['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']]
(written as str here for clarity; in practice they should be character indices).
The padded batch has shape (batch_size, max_sentence_length, max_word_length), where max_sentence_length is the length of the longest sentence
and max_word_length is the length of the longest word.
"""
def __init__(self, pad_val=0, pad_length=0):
"""
:param pad_val: int, the index used at padded positions.
:param pad_length: int. If 0, the longest word length in the batch is used as the padding length; if greater than 0, every word is
padded or truncated to that length.
"""
super().__init__(pad_val=pad_val)
self.pad_length = pad_length
def _exactly_three_dims(self, contents, field_name):
"""
Check that contents has exactly three dimensions; raise an error otherwise. In theory the first dimension is the batch, the second the words, and the third the characters.
:param contents:
:param field_name: str
:return:
"""
if not isinstance(contents, list):
raise TypeError("contents should be a list, not {}.".format(type(contents)))
value = contents[0]
try:
value = value[0]
except:
raise ValueError("Field:{} only has one dimension.".format(field_name))
try:
value = value[0]
except:
raise ValueError("Field:{} only has two dimensions.".format(field_name))
if is_iterable(value):
raise ValueError("Field:{} has more than 3 dimension.".format(field_name))
def __call__(self, contents, field_name, field_ele_dtype):
"""
The expected input looks like:
[
[[0, 2], [2, 3, 4], ..],
[[9, 8, 2, 4], [1, 2,], ...],
....
]
:param contents:
:param field_name:
:param field_ele_dtype
:return:
"""
if field_ele_dtype not in (np.int64, np.float64):
raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format(
field_name, field_ele_dtype
))
self._exactly_three_dims(contents, field_name)
if self.pad_length < 1:
max_char_length = max(max([[len(char_lst) for char_lst in word_lst] for word_lst in contents]))
else:
max_char_length = self.pad_length
max_sent_length = max(len(word_lst) for word_lst in contents)
batch_size = len(contents)
dtype = type(contents[0][0][0])
padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val,
dtype=dtype)
for b_idx, word_lst in enumerate(contents):
for c_idx, char_lst in enumerate(word_lst):
chars = char_lst[:max_char_length]
padded_array[b_idx, c_idx, :len(chars)] = chars
return padded_array
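A minimal sketch of attaching EngChar2DPadder to a three-dimensional character field; the field name and the index values are assumptions:
```python
# Minimal sketch (assumed toy data): 2-D character padding with EngChar2DPadder.
from fastNLP.core.dataset import DataSet
from fastNLP.core.fieldarray import EngChar2DPadder

# each instance is a list of words, each word a list of character indices
ds = DataSet({"chars": [[[2, 3], [4, 5, 6]], [[7]]]})
ds.set_input("chars")
ds.set_padder("chars", EngChar2DPadder(pad_val=0))

batch = ds.field_arrays["chars"].get([0, 1])
print(batch.shape)  # (batch_size, max_sentence_length, max_word_length) -> (2, 2, 3)
```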

View File

@ -11,6 +11,10 @@ class Instance(object):
"""
def __init__(self, **fields):
"""
:param fields: 1-D or 2-D list or np.array
"""
self.fields = fields
def add_field(self, field_name, field):
@ -32,5 +36,5 @@ class Instance(object):
def __repr__(self):
s = '\''
return "{" + ",\n".join(
"\'" + field_name + "\': " + str(self.fields[field_name]) +\
"\'" + field_name + "\': " + str(self.fields[field_name]) + \
f" type={(str(type(self.fields[field_name]))).split(s)[1]}" for field_name in self.fields) + "}"

View File

@ -1,7 +1,11 @@
from collections import defaultdict
import torch
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core import Batch
from fastNLP.core import DataSet
from fastNLP.core import SequentialSampler
from fastNLP.core.utils import _build_args
class Predictor(object):
@ -13,37 +17,55 @@ class Predictor(object):
Currently, Predictor does not support GPU.
"""
def __init__(self):
def __init__(self, network):
if not isinstance(network, torch.nn.Module):
raise ValueError(
"Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network)))
self.network = network
self.batch_size = 1
self.batch_output = []
def predict(self, network, data):
def predict(self, data, seq_len_field_name=None):
"""Perform inference using the trained model.
:param network: a PyTorch model (cpu)
:param data: a DataSet object.
:param str seq_len_field_name: field name indicating sequence lengths
:return: list of batch outputs
"""
# turn on the testing mode; clean up the history
self.mode(network, test=True)
batch_output = []
if not isinstance(data, DataSet):
raise ValueError("Only Dataset class is allowed, not {}.".format(type(data)))
if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data))
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)
self.network.eval()
batch_output = defaultdict(list)
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False,
prefetch=False)
for batch_x, _ in data_iterator:
with torch.no_grad():
prediction = self.data_forward(network, batch_x)
batch_output.append(prediction)
if hasattr(self.network, "predict"):
predict_func = self.network.predict
else:
predict_func = self.network.forward
with torch.no_grad():
for batch_x, _ in data_iterator:
refined_batch_x = _build_args(predict_func, **batch_x)
prediction = predict_func(**refined_batch_x)
if seq_len_field_name is not None:
seq_lens = batch_x[seq_len_field_name].tolist()
for key, value in prediction.items():
value = value.cpu().numpy()
if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
batch_output[key].extend(value.tolist())
else:
if seq_len_field_name is not None:
tmp_batch = []
for idx, seq_len in enumerate(seq_lens):
tmp_batch.append(value[idx, :seq_len])
batch_output[key].extend(tmp_batch)
else:
batch_output[key].append(value)
return batch_output
def mode(self, network, test=True):
if test:
network.eval()
else:
network.train()
def data_forward(self, network, x):
"""Forward through network."""
y = network(**x)
return y
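With the reworked interface, the model is handed to the constructor and `predict` only takes the DataSet (plus an optional sequence-length field name), returning a dict of per-sample outputs. A hedged end-to-end sketch with a toy model; the import paths and the DataSet-from-dict constructor are assumptions about this version of fastNLP.

```python
import torch
from fastNLP.core.dataset import DataSet
from fastNLP.core.predictor import Predictor

class ToyModel(torch.nn.Module):
    """A toy model whose forward returns a dict, as Predictor expects."""
    def forward(self, x):
        return {"pred": x.sum(dim=1)}

ds = DataSet({"x": [[1, 2, 3], [4, 5, 6]], "seq_len": [3, 3]})
ds.set_input("x", "seq_len")

predictor = Predictor(ToyModel())
outputs = predictor.predict(ds, seq_len_field_name="seq_len")
print(outputs["pred"])  # one summed value per sample, e.g. [6, 15]
```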

View File

@ -5,7 +5,6 @@ from datetime import timedelta
import numpy as np
import torch
from tensorboardX import SummaryWriter
from torch import nn
try:
@ -14,7 +13,7 @@ except:
from fastNLP.core.utils import pseudo_tqdm as tqdm
from fastNLP.core.batch import Batch
from fastNLP.core.callback import CallbackManager
from fastNLP.core.callback import CallbackManager, CallbackException
from fastNLP.core.dataset import DataSet
from fastNLP.core.losses import _prepare_losser
from fastNLP.core.metrics import _prepare_metrics
@ -34,8 +33,8 @@ from fastNLP.core.utils import get_func_signature
class Trainer(object):
def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
validate_every=-1, dev_data=None, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0),
check_code_level=0, metric_key=None, sampler=RandomSampler(), use_tqdm=True, use_cuda=False,
callbacks=None):
check_code_level=0, metric_key=None, sampler=RandomSampler(), prefetch=False, use_tqdm=True,
use_cuda=False, callbacks=None):
"""
:param DataSet train_data: the training data
:param torch.nn.modules.module model: a PyTorch model
@ -46,20 +45,23 @@ class Trainer(object):
:param int print_every: step interval to print next training information. Default: -1(no print).
:param int validate_every: step interval to do next validation. Default: -1(validate every epoch).
:param DataSet dev_data: the validation data
:param bool use_cuda: whether to use CUDA in training.
:param str save_path: file path to save models
:param Optimizer optimizer: an optimizer object
:param int check_code_level: level of FastNLP code checker. -1: don't check, 0: ignore. 1: warning. 2: strict.\\
`ignore` will not check unused fields; `warning` will warn if some fields are not used; `strict` means
it will raise an error if some fields are not used.
it will raise an error if some fields are not used. The check works by running the code on a very small batch
(two samples by default); in principle this does not modify any parameters, it only verifies that the code can run.
However, if (1) the model hard-codes the batch size to a fixed value, or (2) the model accumulates the number of
forward passes, a few extra computations may occur. In those cases it is recommended to set check_code_level to -1.
:param str metric_key: a single indicator used to decide the best model based on metric results. It must be one
of the keys returned by the FIRST metric in `metrics`. If the overall result gets better if the indicator gets
smaller, add "-" in front of the string. For example::
metric_key="-PPL" # language model gets better as perplexity gets smaller
:param BaseSampler sampler: method used to generate batch data.
:param prefetch: bool, whether to use an extra process to prefetch batch data
:param bool use_tqdm: whether to use tqdm to show train progress.
:param callbacks: List[Callback]. Callbacks used to adjust the training process; for example, early stopping and
negative sampling can be implemented through the callback mechanism.
"""
super(Trainer, self).__init__()
@ -114,7 +116,11 @@ class Trainer(object):
self.print_every = int(print_every)
self.validate_every = int(validate_every) if validate_every!=0 else -1
self.best_metric_indicator = None
self.best_dev_epoch = None
self.best_dev_step = None
self.best_dev_perf = None
self.sampler = sampler
self.prefetch = prefetch
self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks)
if isinstance(optimizer, torch.optim.Optimizer):
@ -175,32 +181,26 @@ class Trainer(object):
"""
results = {}
if self.n_epochs <= 0:
print(f"training epoch is {self.n_epochs}, nothing was done.")
results['seconds'] = 0.
return results
try:
if torch.cuda.is_available() and self.use_cuda:
self.model = self.model.cuda()
self._model_device = self.model.parameters().__next__().device
self._mode(self.model, is_test=False)
self.start_time = str(datetime.now().strftime('%Y-%m-%d %H-%M-%S'))
self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
start_time = time.time()
print("training epochs started " + self.start_time, flush=True)
if self.save_path is None:
class psudoSW:
def __getattr__(self, item):
def pass_func(*args, **kwargs):
pass
return pass_func
self._summary_writer = psudoSW()
else:
path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time))
self._summary_writer = SummaryWriter(path)
self.callback_manager.before_train()
self._train()
self.callback_manager.after_train(self.model)
try:
self.callback_manager.on_train_begin()
self._train()
self.callback_manager.on_train_end(self.model)
except (CallbackException, KeyboardInterrupt) as e:
self.callback_manager.on_exception(e, self.model)
if self.dev_data is not None:
print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
@ -216,8 +216,7 @@ class Trainer(object):
else:
print("Fail to reload best model.")
finally:
self._summary_writer.close()
del self._summary_writer
pass
results['seconds'] = round(time.time() - start_time, 2)
return results
@ -229,42 +228,36 @@ class Trainer(object):
inner_tqdm = tqdm
self.step = 0
start = time.time()
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False)
total_steps = data_iterator.num_batches * self.n_epochs
total_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * self.n_epochs
with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
avg_loss = 0
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
for epoch in range(1, self.n_epochs+1):
pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
# early stopping
self.callback_manager.before_epoch(epoch, self.n_epochs)
self.callback_manager.on_epoch_begin(epoch, self.n_epochs)
for batch_x, batch_y in data_iterator:
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
indices = data_iterator.get_batch_indices()
# negative sampling; replace unknown; re-weight batch_y
self.callback_manager.before_batch(batch_x, batch_y, indices)
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
prediction = self._data_forward(self.model, batch_x)
# edit prediction
self.callback_manager.before_loss(batch_y, prediction)
self.callback_manager.on_loss_begin(batch_y, prediction)
loss = self._compute_loss(prediction, batch_y)
avg_loss += loss.item()
# Is loss NaN or inf? requires_grad = False
self.callback_manager.before_backward(loss, self.model)
self.callback_manager.on_backward_begin(loss, self.model)
self._grad_backward(loss)
# gradient clipping
self.callback_manager.after_backward(self.model)
self.callback_manager.on_backward_end(self.model)
self._update()
# lr scheduler; lr_finder; one_cycle
self.callback_manager.after_step(self.optimizer)
self.callback_manager.on_step_end(self.optimizer)
self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
for name, param in self.model.named_parameters():
if param.requires_grad:
self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step)
# self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
# self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
if (self.step+1) % self.print_every == 0:
if self.use_tqdm:
print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
@ -277,11 +270,10 @@ class Trainer(object):
pbar.set_postfix_str(print_output)
avg_loss = 0
self.step += 1
# do nothing
self.callback_manager.after_batch()
self.callback_manager.on_batch_end()
if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
(self.validate_every < 0 and self.step % len(data_iterator)) == 0) \
(self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
and self.dev_data is not None:
eval_res = self._do_validation(epoch=epoch, step=self.step)
eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
@ -289,35 +281,29 @@ class Trainer(object):
self.tester._format_eval_results(eval_res)
pbar.write(eval_str)
# if self.validate_every < 0 and self.dev_data:
# eval_res = self._do_validation(epoch=epoch, step=self.step)
# eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
# self.tester._format_eval_results(eval_res)
# pbar.write(eval_str)
if epoch != self.n_epochs:
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
as_numpy=False)
# ================= mini-batch end ==================== #
# lr decay; early stopping
self.callback_manager.after_epoch(epoch, self.n_epochs, self.optimizer)
self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer)
# =============== epochs end =================== #
pbar.close()
# ============ tqdm end ============== #
def _do_validation(self, epoch, step):
self.callback_manager.on_valid_begin()
res = self.tester.test()
for name, metric in res.items():
for metric_key, metric_val in metric.items():
self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val,
global_step=self.step)
if self._better_eval_result(res):
if self.save_path is not None:
self._save_model(self.model,
"best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]))
else:
self._best_model_states = {name:param.cpu().clone() for name, param in self.model.named_parameters()}
self._best_model_states = {name: param.cpu().clone() for name, param in self.model.named_parameters()}
self.best_dev_perf = res
self.best_dev_epoch = epoch
self.best_dev_step = step
# get validation results; adjust optimizer
self.callback_manager.after_valid(res, self.metric_key, self.optimizer)
self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer)
return res
def _mode(self, model, is_test=False):
@ -365,12 +351,23 @@ class Trainer(object):
return self.losser(predict, truth)
def _save_model(self, model, model_name, only_param=False):
""" 存储不含有显卡信息的state_dict或model
:param model:
:param model_name:
:param only_param:
:return:
"""
if self.save_path is not None:
model_name = os.path.join(self.save_path, model_name)
model_path = os.path.join(self.save_path, model_name)
if only_param:
torch.save(model.state_dict(), model_name)
state_dict = model.state_dict()
for key in state_dict:
state_dict[key] = state_dict[key].cpu()
torch.save(state_dict, model_path)
else:
torch.save(model, model_name)
model.cpu()
torch.save(model, model_path)
model.cuda()
def _load_model(self, model, model_name, only_param=False):
# Return a bool indicating whether the model was reloaded successfully
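The diff above renames the callback hooks from `before_*`/`after_*` to `on_*` (`on_train_begin`, `on_batch_begin`, `on_backward_begin`, `on_valid_end`, ...). Below is a minimal sketch of a custom callback using the new names; the hook signatures are inferred from the CallbackManager calls in `_train` and should be treated as assumptions.

```python
from fastNLP.core.callback import Callback

class LogLossCallback(Callback):
    """Print the training loss every 100 steps via the renamed on_* hooks."""

    def on_train_begin(self):
        print("training started")

    def on_backward_begin(self, loss, model):
        # called just before loss.backward(), see Trainer._train above
        if self.trainer.step % 100 == 0:
            print("step {}: loss {:.4f}".format(self.trainer.step, loss.item()))

    def on_train_end(self, model):
        print("training finished")
```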

View File

@ -186,11 +186,12 @@ def _check_function_or_method(func):
raise TypeError(f"{type(func)} is not a method or function.")
def _move_dict_value_to_device(*args, device: torch.device):
def _move_dict_value_to_device(*args, device: torch.device, non_blocking=False):
"""
move data to the model's device; each element in *args should be a dict. This is an in-place change.
:param device: torch.device
:param non_blocking: bool, whether to move the data to the device asynchronously; requires the tensors to use pin_memory()
:param args:
:return:
"""
@ -201,7 +202,7 @@ def _move_dict_value_to_device(*args, device: torch.device):
if isinstance(arg, dict):
for key, value in arg.items():
if isinstance(value, torch.Tensor):
arg[key] = value.to(device)
arg[key] = value.to(device, non_blocking=non_blocking)
else:
raise TypeError("Only support `dict` type right now.")

View File

@ -11,18 +11,24 @@ class BaseLoader(object):
@staticmethod
def load_lines(data_path):
"""按行读取舍弃每行两侧空白字符返回list of str
"""
with open(data_path, "r", encoding="utf=8") as f:
text = f.readlines()
return [line.strip() for line in text]
@classmethod
def load(cls, data_path):
"""先按行读取去除一行两侧空白再提取每行的字符。返回list of list of str
"""
with open(data_path, "r", encoding="utf-8") as f:
text = f.readlines()
return [[word for word in sent.strip()] for sent in text]
@classmethod
def load_with_cache(cls, data_path, cache_path):
"""缓存版的load
"""
if os.path.isfile(cache_path) and os.path.getmtime(data_path) < os.path.getmtime(cache_path):
with open(cache_path, 'rb') as f:
return pickle.load(f)

View File

@ -11,7 +11,6 @@ class ConfigLoader(BaseLoader):
:param str data_path: path to the config
"""
def __init__(self, data_path=None):
super(ConfigLoader, self).__init__()
if data_path is not None:
@ -30,7 +29,7 @@ class ConfigLoader(BaseLoader):
Example::
test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})
"""
assert isinstance(sections, dict)
@ -202,8 +201,6 @@ class ConfigSaver(object):
continue
if '=' not in line:
# log = create_logger(__name__, './config_saver.log')
# log.error("can NOT load config file [%s]" % self.file_path)
raise RuntimeError("can NOT load config file {}".__format__(self.file_path))
key = line.split('=', maxsplit=1)[0].strip()
@ -263,10 +260,6 @@ class ConfigSaver(object):
change_file = True
break
if section_file[k] != section[k]:
# logger = create_logger(__name__, "./config_loader.log")
# logger.warning("section [%s] in config file [%s] has been changed" % (
# section_name, self.file_path
# ))
change_file = True
break
if not change_file:

View File

@ -90,6 +90,7 @@ class NativeDataSetLoader(DataSetLoader):
"""A simple example of DataSetLoader
"""
def __init__(self):
super(NativeDataSetLoader, self).__init__()
@ -107,6 +108,7 @@ class RawDataSetLoader(DataSetLoader):
"""A simple example of raw data reader
"""
def __init__(self):
super(RawDataSetLoader, self).__init__()
@ -124,8 +126,8 @@ class RawDataSetLoader(DataSetLoader):
DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata')
class POSDataSetLoader(DataSetLoader):
"""Dataset Loader for a POS Tag dataset.
class DummyPOSReader(DataSetLoader):
"""A simple reader for a dummy POS tagging dataset.
In these datasets, each line is divided by "\t". The first column is the token and the second
column is the label. Different sentences are divided by an empty line.
@ -142,8 +144,9 @@ class POSDataSetLoader(DataSetLoader):
In this example, there are two sentences "Tom and Jerry ." and "Hello world !". Each word has its own label.
"""
def __init__(self):
super(POSDataSetLoader, self).__init__()
super(DummyPOSReader, self).__init__()
def load(self, data_path):
"""
@ -191,16 +194,14 @@ class POSDataSetLoader(DataSetLoader):
return convert_seq2seq_dataset(data)
DataLoaderRegister.set_reader(POSDataSetLoader, 'read_pos')
DataLoaderRegister.set_reader(DummyPOSReader, 'read_pos')
class TokenizeDataSetLoader(DataSetLoader):
class DummyCWSReader(DataSetLoader):
"""Load pku dataset for Chinese word segmentation.
"""
Data set loader for tokenization data sets
"""
def __init__(self):
super(TokenizeDataSetLoader, self).__init__()
super(DummyCWSReader, self).__init__()
def load(self, data_path, max_seq_len=32):
"""Load pku dataset for Chinese word segmentation.
@ -253,11 +254,11 @@ class TokenizeDataSetLoader(DataSetLoader):
return convert_seq2seq_dataset(data)
class ClassDataSetLoader(DataSetLoader):
class DummyClassificationReader(DataSetLoader):
"""Loader for a dummy classification data set"""
def __init__(self):
super(ClassDataSetLoader, self).__init__()
super(DummyClassificationReader, self).__init__()
def load(self, data_path):
assert os.path.exists(data_path)
@ -268,7 +269,7 @@ class ClassDataSetLoader(DataSetLoader):
@staticmethod
def parse(lines):
"""
"""每行第一个token是标签其余是字/词;由空格分隔。
:param lines: lines from dataset
:return: list(list(list())): the three level of lists are words, sentence, and dataset
@ -324,16 +325,11 @@ class ConllLoader(DataSetLoader):
pass
class LMDataSetLoader(DataSetLoader):
"""Language Model Dataset Loader
This loader produces data for language model training in a supervised way.
That means it has X and Y.
class DummyLMReader(DataSetLoader):
"""A Dummy Language Model Dataset Reader
"""
def __init__(self):
super(LMDataSetLoader, self).__init__()
super(DummyLMReader, self).__init__()
def load(self, data_path):
if not os.path.exists(data_path):
@ -361,19 +357,25 @@ class LMDataSetLoader(DataSetLoader):
class PeopleDailyCorpusLoader(DataSetLoader):
"""人民日报数据集
"""
People Daily Corpus: Chinese word segmentation, POS tag, NER
"""
def __init__(self):
super(PeopleDailyCorpusLoader, self).__init__()
self.pos = True
self.ner = True
def load(self, data_path):
def load(self, data_path, pos=True, ner=True):
"""
:param str data_path: path to the data file
:param bool pos: whether to include POS tags
:param bool ner: whether to include named entity (NER) tags
:return: a DataSet object
"""
self.pos, self.ner = pos, ner
with open(data_path, "r", encoding="utf-8") as f:
sents = f.readlines()
pos_tag_examples = []
ner_examples = []
examples = []
for sent in sents:
if len(sent) <= 2:
continue
@ -407,40 +409,44 @@ class PeopleDailyCorpusLoader(DataSetLoader):
sent_ner.append(ner_tag)
sent_pos_tag.append(pos)
sent_words.append(token)
pos_tag_examples.append([sent_words, sent_pos_tag])
ner_examples.append([sent_words, sent_ner])
# List[List[List[str], List[str]]]
# ner_examples not used
return self.convert(pos_tag_examples)
example = [sent_words]
if self.pos is True:
example.append(sent_pos_tag)
if self.ner is True:
example.append(sent_ner)
examples.append(example)
return self.convert(examples)
def convert(self, data):
data_set = DataSet()
for item in data:
sent_words, sent_pos_tag = item[0], item[1]
data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
data_set.apply(lambda ins: len(ins), new_field_name="seq_len")
data_set.set_target("tags")
data_set.set_input("sent_words")
data_set.set_input("seq_len")
sent_words = item[0]
if self.pos is True and self.ner is True:
instance = Instance(words=sent_words, pos_tags=item[1], ner=item[2])
elif self.pos is True:
instance = Instance(words=sent_words, pos_tags=item[1])
elif self.ner is True:
instance = Instance(words=sent_words, ner=item[1])
else:
instance = Instance(words=sent_words)
data_set.append(instance)
data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
return data_set
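With the new `pos`/`ner` switches, the loader decides which tag fields end up in the returned DataSet. A hedged usage sketch; the file path is a placeholder and the import path assumes `fastNLP.io.dataset_loader`.

```python
from fastNLP.io.dataset_loader import PeopleDailyCorpusLoader

loader = PeopleDailyCorpusLoader()
# keep POS tags, skip NER labels; "data/people_daily.txt" is a placeholder path
ds = loader.load("data/people_daily.txt", pos=True, ner=False)
print(ds[0]["words"], ds[0]["pos_tags"], ds[0]["seq_len"])
```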
class Conll2003Loader(DataSetLoader):
"""Self-defined loader of conll2003 dataset
"""Loader for conll2003 dataset
More information about the given dataset can be found at
https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
"""
def __init__(self):
super(Conll2003Loader, self).__init__()
def load(self, dataset_path):
with open(dataset_path, "r", encoding="utf-8") as f:
lines = f.readlines()
# Parse the dataset line by line
parsed_data = []
sentence = []
tokens = []
@ -467,21 +473,20 @@ class Conll2003Loader(DataSetLoader):
lambda labels: labels[1], sample[1]))
label2_list = list(map(
lambda labels: labels[2], sample[1]))
dataset.append(Instance(token_list=sample[0],
label0_list=label0_list,
label1_list=label1_list,
label2_list=label2_list))
dataset.append(Instance(tokens=sample[0],
pos=label0_list,
chucks=label1_list,
ner=label2_list))
return dataset
class SNLIDataSetLoader(DataSetLoader):
class SNLIDataSetReader(DataSetLoader):
"""A data set loader for SNLI data set.
"""
def __init__(self):
super(SNLIDataSetLoader, self).__init__()
super(SNLIDataSetReader, self).__init__()
def load(self, path_list):
"""
@ -540,3 +545,298 @@ class SNLIDataSetLoader(DataSetLoader):
data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
data_set.set_target("truth")
return data_set
class ConllCWSReader(object):
def __init__(self):
pass
def load(self, path, cut_long_sent=False):
"""
The returned DataSet contains only the field raw_sentence, whose content is a str.
The input is assumed to be in CoNLL format: sentences are separated by blank lines and each line has 7 columns.
::
1 编者按 编者按 NN O 11 nmod:topic
2 PU O 11 punct
3 7 7 NT DATE 4 compound:nn
4 12 12 NT DATE 11 nmod:tmod
5 PU O 11 punct
1 DT O 3 det
2 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.strip().split())
if len(sample) > 0:
datalist.append(sample)
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_char_lst(sample)
if res is None:
continue
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:
ds.append(Instance(raw_sentence=raw_sentence))
return ds
def get_char_lst(self, sample):
if len(sample) == 0:
return None
text = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
return text
class NaiveCWSReader(DataSetLoader):
"""
This reader assumes the word segmentation dataset is already separated by spaces,
for example::
这是 fastNLP , 一个 非常 good 的 包 .
or that each part is additionally followed by a POS tag,
for example::
也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
"""
def __init__(self, in_word_splitter=None):
super(NaiveCWSReader, self).__init__()
self.in_word_splitter = in_word_splitter
def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
"""
Supported input formats (by default \t or spaces are used as the separator):
这是 fastNLP , 一个 非常 good 的 包 .
也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
If the splitter is not None, the second format is assumed: each part such as "也/D" is split by the splitter
and the first piece is kept, e.g. "也/D".split('/')[0].
:param filepath:
:param in_word_splitter:
:param cut_long_sent:
:return:
"""
if in_word_splitter is None:
in_word_splitter = self.in_word_splitter
dataset = DataSet()
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if len(line.replace(' ', '')) == 0:  # empty lines are not accepted
continue
if in_word_splitter is not None:
words = []
for part in line.split():
word = part.split(in_word_splitter)[0]
words.append(word)
line = ' '.join(words)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for sent in sents:
instance = Instance(raw_sentence=sent)
dataset.append(instance)
return dataset
def cut_long_sentence(sent, max_sample_length=200):
"""
Split sentences longer than max_sample_length into several segments. Splitting only happens at spaces, so the resulting segments may be longer or shorter than max_sample_length.
:param sent: str.
:param max_sample_length: int.
:return: list of str.
"""
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence
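Because `cut_long_sentence` only splits at spaces, each resulting piece ends just past `max_sample_length` non-space characters (the last one may be shorter). A quick check of the function defined above:

```python
from fastNLP.io.dataset_loader import cut_long_sentence

sent = " ".join("token{}".format(i) for i in range(100))   # a long space-separated sentence
pieces = cut_long_sentence(sent, max_sample_length=50)
print(len(pieces), [len(p.replace(" ", "")) for p in pieces])
# joining the pieces with single spaces recovers the original tokens
```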
class ZhConllPOSReader(object):
"""读取中文Conll格式。返回“字级别”的标签使用BMES记号扩展原来的词级别标签。
"""
def __init__(self):
pass
def load(self, path):
"""
The returned DataSet contains the following fields:
words: list of str,
tag: list of str, with BMES tags added; e.g. the original sequence ['VP', 'NN', 'NN', ..] becomes ["S-VP", "B-NN", "M-NN", ..]
The input is assumed to be in CoNLL format: sentences are separated by blank lines and each line has 7 columns.
::
1 编者按 编者按 NN O 11 nmod:topic
2 PU O 11 punct
3 7 7 NT DATE 4 compound:nn
4 12 12 NT DATE 11 nmod:tmod
5 PU O 11 punct
1 DT O 3 det
2 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
char_seq.extend(list(word))
if len(word) == 1:
pos_seq.append('S-{}'.format(tag))
elif len(word) > 1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word) - 2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
else:
raise ValueError("Zero length of word detected.")
ds.append(Instance(words=char_seq,
tag=pos_seq))
return ds
def get_one(self, sample):
if len(sample) == 0:
return None
text = []
pos_tags = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
pos_tags.append(t2)
return text, pos_tags
class ConllxDataLoader(object):
"""返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。
"""
def load(self, path):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
data = [self.get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))
ds = DataSet()
for example in data_list:
ds.append(Instance(words=example[0],
pos_tags=example[1],
heads=example[2],
labels=example[3]))
return ds
def get_one(self, sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]
def add_seg_tag(data):
"""
:param data: list of ([word], [pos], [heads], [head_tags])
:return: list of ([word], [pos])
"""
_processed = []
for word_list, pos_list, _, _ in data:
new_sample = []
for word, pos in zip(word_list, pos_list):
if len(word) == 1:
new_sample.append((word, 'S-' + pos))
else:
new_sample.append((word[0], 'B-' + pos))
for c in word[1:-1]:
new_sample.append((c, 'M-' + pos))
new_sample.append((word[-1], 'E-' + pos))
_processed.append(list(map(list, zip(*new_sample))))
return _processed
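`add_seg_tag` turns word-level POS annotations into character-level BMES-style tags, which is the format the character-based taggers above consume. A small worked example; the sample sentence is made up, and the heads/head labels are required by the input format but unused.

```python
from fastNLP.io.dataset_loader import add_seg_tag

data = [(["编者", "按"], ["NN", "NN"], [2, 0], ["nmod", "root"])]
chars, tags = add_seg_tag(data)[0]
print(chars)  # ['编', '者', '按']
print(tags)   # ['B-NN', 'E-NN', 'S-NN']
```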

View File

@ -101,9 +101,12 @@ class EmbedLoader(BaseLoader):
"""
if vocab is None:
raise RuntimeError("You must provide a vocabulary.")
embedding_matrix = np.zeros(shape=(len(vocab), emb_dim))
embedding_matrix = np.zeros(shape=(len(vocab), emb_dim), dtype=np.float32)
hit_flags = np.zeros(shape=(len(vocab),), dtype=int)
with open(emb_file, "r", encoding="utf-8") as f:
startline = f.readline()
if len(startline.split()) > 2:
f.seek(0)
for line in f:
word, vector = EmbedLoader.parse_glove_line(line)
if word in vocab:

362
fastNLP/models/bert.py Normal file
View File

@ -0,0 +1,362 @@
"""
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0.
"""
import copy
import json
import math
import os
import torch
from torch import nn
CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'
def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
def swish(x):
return x * torch.sigmoid(x)
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps
def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias
class BertEmbeddings(nn.Module):
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)
def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
class BertSelfAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)
self.dropout = nn.Dropout(attention_probs_dropout_prob)
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)
query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer
class BertSelfOutput(nn.Module):
def __init__(self, hidden_size, hidden_dropout_prob):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)
def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output
class BertIntermediate(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_act):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(hidden_size, intermediate_size)
self.intermediate_act_fn = ACT2FN[hidden_act] \
if isinstance(hidden_act, str) else hidden_act
def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
class BertOutput(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
super(BertOutput, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)
def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertLayer(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob)
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act)
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob)
def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class BertEncoder(nn.Module):
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertEncoder, self).__init__()
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers
class BertPooler(nn.Module):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
class BertModel(nn.Module):
"""Bidirectional Embedding Representations from Transformers.
If you want to use pre-trained weights, please download from the following sources provided by pytorch-pretrained-BERT.
sources::
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
Construct a BERT model with pre-trained weights::
model = BertModel.from_pretrained("path/to/weights/directory")
"""
def __init__(self, vocab_size,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02, **kwargs):
super(BertModel, self).__init__()
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
type_vocab_size, hidden_dropout_prob)
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size,
hidden_act)
self.pooler = BertPooler(hidden_size)
self.initializer_range = initializer_range
self.apply(self.init_bert_weights)
def init_bert_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output
@classmethod
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs):
# Load config
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
config = json.load(open(config_file, "r"))
# config = BertConfig.from_json_file(config_file)
# logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS)
state_dict = torch.load(weights_path)
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
if len(missing_keys) > 0:
print("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
return model
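A quick smoke test of the `BertModel` defined above, instantiated from scratch with a tiny configuration (real use would go through `BertModel.from_pretrained` as the docstring shows); the import path follows the new file location `fastNLP/models/bert.py`.

```python
import torch
from fastNLP.models.bert import BertModel

# a tiny configuration, just to exercise the forward pass
model = BertModel(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                  num_attention_heads=4, intermediate_size=64)
input_ids = torch.randint(0, 100, (2, 7))        # [batch, seq_len]
attention_mask = torch.ones_like(input_ids)
encoded, pooled = model(input_ids, attention_mask=attention_mask,
                        output_all_encoded_layers=False)
print(encoded.shape, pooled.shape)               # torch.Size([2, 7, 32]) torch.Size([2, 32])
```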

View File

@ -1,17 +1,20 @@
import copy
from collections import defaultdict
import numpy as np
import torch
from collections import defaultdict
from torch import nn
from torch.nn import functional as F
from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP.modules.dropout import TimestepDropout
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.utils import seq_mask
from fastNLP.core.losses import LossFunc
from fastNLP.core.metrics import MetricBase
from fastNLP.core.utils import seq_lens_to_masks
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.dropout import TimestepDropout
from fastNLP.modules.encoder.transformer import TransformerEncoder
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.utils import seq_mask
def mst(scores):
"""
@ -197,53 +200,64 @@ class BiaffineParser(GraphParser):
pos_vocab_size,
pos_emb_dim,
num_label,
word_hid_dim=100,
pos_hid_dim=100,
rnn_layers=1,
rnn_hidden_size=200,
arc_mlp_size=100,
label_mlp_size=100,
dropout=0.3,
use_var_lstm=False,
encoder='lstm',
use_greedy_infer=False):
super(BiaffineParser, self).__init__()
rnn_out_size = 2 * rnn_hidden_size
word_hid_dim = pos_hid_dim = rnn_hidden_size
self.word_embedding = nn.Embedding(num_embeddings=word_vocab_size, embedding_dim=word_emb_dim)
self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim)
self.word_fc = nn.Linear(word_emb_dim, word_hid_dim)
self.pos_fc = nn.Linear(pos_emb_dim, pos_hid_dim)
self.word_norm = nn.LayerNorm(word_hid_dim)
self.pos_norm = nn.LayerNorm(pos_hid_dim)
self.use_var_lstm = use_var_lstm
if use_var_lstm:
self.lstm = VarLSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
input_dropout=dropout,
hidden_dropout=dropout,
bidirectional=True)
self.encoder_name = encoder
self.max_len = 512
if encoder == 'var-lstm':
self.encoder = VarLSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
input_dropout=dropout,
hidden_dropout=dropout,
bidirectional=True)
elif encoder == 'lstm':
self.encoder = nn.LSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
dropout=dropout,
bidirectional=True)
elif encoder == 'transformer':
n_head = 16
d_k = d_v = int(rnn_out_size / n_head)
if (d_k * n_head) != rnn_out_size:
raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size))
self.position_emb = nn.Embedding(num_embeddings=self.max_len,
embedding_dim=rnn_out_size,)
self.encoder = TransformerEncoder(num_layers=rnn_layers,
model_size=rnn_out_size,
inner_size=1024,
key_size=d_k,
value_size=d_v,
num_head=n_head,
dropout=dropout,)
else:
self.lstm = nn.LSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
dropout=dropout,
bidirectional=True)
raise ValueError('unsupported encoder type: {}'.format(encoder))
self.arc_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size),
nn.LayerNorm(arc_mlp_size),
self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2),
nn.ELU(),
TimestepDropout(p=dropout),)
self.arc_dep_mlp = copy.deepcopy(self.arc_head_mlp)
self.label_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, label_mlp_size),
nn.LayerNorm(label_mlp_size),
nn.ELU(),
TimestepDropout(p=dropout),)
self.label_dep_mlp = copy.deepcopy(self.label_head_mlp)
self.arc_mlp_size = arc_mlp_size
self.label_mlp_size = label_mlp_size
self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True)
self.label_predictor = LabelBilinear(label_mlp_size, label_mlp_size, num_label, bias=True)
self.use_greedy_infer = use_greedy_infer
@ -286,24 +300,27 @@ class BiaffineParser(GraphParser):
word, pos = self.word_fc(word), self.pos_fc(pos)
word, pos = self.word_norm(word), self.pos_norm(pos)
x = torch.cat([word, pos], dim=2) # -> [N,L,C]
del word, pos
# lstm, extract features
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
x = x[sort_idx]
x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
feat, _ = self.lstm(x) # -> [N,L,C]
feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
feat = feat[unsort_idx]
# encoder, extract features
if self.encoder_name.endswith('lstm'):
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
x = x[sort_idx]
x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
feat, _ = self.encoder(x) # -> [N,L,C]
feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
feat = feat[unsort_idx]
else:
seq_range = torch.arange(seq_len, dtype=torch.long, device=x.device)[None,:]
x = x + self.position_emb(seq_range)
feat = self.encoder(x, mask.float())
# for arc biaffine
# mlp, reduce dim
arc_dep = self.arc_dep_mlp(feat)
arc_head = self.arc_head_mlp(feat)
label_dep = self.label_dep_mlp(feat)
label_head = self.label_head_mlp(feat)
del feat
feat = self.mlp(feat)
arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size
arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz]
label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:]
# biaffine arc classifier
arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L]
@ -349,7 +366,7 @@ class BiaffineParser(GraphParser):
batch_size, seq_len, _ = arc_pred.shape
flip_mask = (mask == 0)
_arc_pred = arc_pred.clone()
_arc_pred.masked_fill_(flip_mask.unsqueeze(1), -np.inf)
_arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf'))
arc_logits = F.log_softmax(_arc_pred, dim=2)
label_logits = F.log_softmax(label_pred, dim=2)
batch_index = torch.arange(batch_size, device=arc_logits.device, dtype=torch.long).unsqueeze(1)
@ -357,12 +374,11 @@ class BiaffineParser(GraphParser):
arc_loss = arc_logits[batch_index, child_index, arc_true]
label_loss = label_logits[batch_index, child_index, label_true]
arc_loss = arc_loss[:, 1:]
label_loss = label_loss[:, 1:]
float_mask = mask[:, 1:].float()
arc_nll = -(arc_loss*float_mask).mean()
label_nll = -(label_loss*float_mask).mean()
byte_mask = flip_mask.byte()
arc_loss.masked_fill_(byte_mask, 0)
label_loss.masked_fill_(byte_mask, 0)
arc_nll = -arc_loss.mean()
label_nll = -label_loss.mean()
return arc_nll + label_nll
def predict(self, word_seq, pos_seq, seq_lens):
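The `use_var_lstm` flag is replaced by an `encoder` argument that selects between 'lstm', 'var-lstm' and 'transformer'. A hedged construction sketch; the leading `word_vocab_size`/`word_emb_dim` arguments are cut off in the hunk above, so the exact keyword names are assumptions.

```python
from fastNLP.models.biaffine_parser import BiaffineParser

parser = BiaffineParser(word_vocab_size=10000, word_emb_dim=100,
                        pos_vocab_size=50, pos_emb_dim=50,
                        num_label=40,
                        rnn_layers=2, rnn_hidden_size=256,
                        arc_mlp_size=500, label_mlp_size=100,
                        dropout=0.3,
                        encoder='transformer',   # 'lstm', 'var-lstm' or 'transformer'
                        use_greedy_infer=False)
```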

View File

@ -4,6 +4,7 @@ import torch
import torch.nn.functional as F
from torch import nn
from fastNLP.modules.dropout import TimestepDropout
from fastNLP.modules.utils import mask_softmax
@ -23,46 +24,89 @@ class Attention(torch.nn.Module):
class DotAtte(nn.Module):
def __init__(self, key_size, value_size):
def __init__(self, key_size, value_size, dropout=0.1):
super(DotAtte, self).__init__()
self.key_size = key_size
self.value_size = value_size
self.scale = math.sqrt(key_size)
self.drop = nn.Dropout(dropout)
self.softmax = nn.Softmax(dim=2)
def forward(self, Q, K, V, seq_mask=None):
def forward(self, Q, K, V, mask_out=None):
"""
:param Q: [batch, seq_len, key_size]
:param K: [batch, seq_len, key_size]
:param V: [batch, seq_len, value_size]
:param seq_mask: [batch, seq_len]
:param mask_out: [batch, seq_len]
"""
output = torch.matmul(Q, K.transpose(1, 2)) / self.scale
if seq_mask is not None:
output.masked_fill_(seq_mask.lt(1), -float('inf'))
output = nn.functional.softmax(output, dim=2)
if mask_out is not None:
output.masked_fill_(mask_out, -float('inf'))
output = self.softmax(output)
output = self.drop(output)
return torch.matmul(output, V)
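The rewritten `DotAtte` is standard scaled dot-product attention, softmax(QK^T / sqrt(key_size)) V, with dropout applied to the attention weights. A shape check; the import path `fastNLP.modules.aggregator.attention` is inferred from the relative import in the Transformer encoder below.

```python
import torch
from fastNLP.modules.aggregator.attention import DotAtte

atte = DotAtte(key_size=16, value_size=16, dropout=0.1)
Q = torch.rand(2, 5, 16)      # [batch, seq_len, key_size]
K = torch.rand(2, 5, 16)
V = torch.rand(2, 5, 16)      # [batch, seq_len, value_size]
out = atte(Q, K, V)           # no mask: attend over the full sequence
print(out.shape)              # torch.Size([2, 5, 16])
```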
class MultiHeadAtte(nn.Module):
def __init__(self, input_size, output_size, key_size, value_size, num_atte):
super(MultiHeadAtte, self).__init__()
self.in_linear = nn.ModuleList()
for i in range(num_atte * 3):
out_feat = key_size if (i % 3) != 2 else value_size
self.in_linear.append(nn.Linear(input_size, out_feat))
self.attes = nn.ModuleList([DotAtte(key_size, value_size) for _ in range(num_atte)])
self.out_linear = nn.Linear(value_size * num_atte, output_size)
def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1):
"""
def forward(self, Q, K, V, seq_mask=None):
heads = []
for i in range(len(self.attes)):
j = i * 3
qi, ki, vi = self.in_linear[j](Q), self.in_linear[j+1](K), self.in_linear[j+2](V)
headi = self.attes[i](qi, ki, vi, seq_mask)
heads.append(headi)
output = torch.cat(heads, dim=2)
return self.out_linear(output)
:param input_size: int, the input dimension, which is also the output dimension
:param key_size: int, the dimension of each head
:param value_size: int, the dimension of the value in each head
:param num_head: int, the number of heads
:param dropout: float
"""
super(MultiHeadAtte, self).__init__()
self.input_size = input_size
self.key_size = key_size
self.value_size = value_size
self.num_head = num_head
in_size = key_size * num_head
self.q_in = nn.Linear(input_size, in_size)
self.k_in = nn.Linear(input_size, in_size)
self.v_in = nn.Linear(input_size, in_size)
self.attention = DotAtte(key_size=key_size, value_size=value_size)
self.out = nn.Linear(value_size * num_head, input_size)
self.drop = TimestepDropout(dropout)
self.reset_parameters()
def reset_parameters(self):
sqrt = math.sqrt
nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size)))
nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size)))
nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size)))
nn.init.xavier_normal_(self.out.weight)
def forward(self, Q, K, V, atte_mask_out=None):
"""
:param Q: [batch, seq_len, model_size]
:param K: [batch, seq_len, model_size]
:param V: [batch, seq_len, model_size]
:param atte_mask_out: [batch, 1, seq_len] or None
"""
batch, seq_len, _ = Q.size()
d_k, d_v, n_head = self.key_size, self.value_size, self.num_head
# input linear
q = self.q_in(Q).view(batch, seq_len, n_head, d_k)
k = self.k_in(K).view(batch, seq_len, n_head, d_k)
v = self.v_in(V).view(batch, seq_len, n_head, d_k)
# transpose q, k and v to do batch attention
q = q.permute(2, 0, 1, 3).contiguous().view(-1, seq_len, d_k)
k = k.permute(2, 0, 1, 3).contiguous().view(-1, seq_len, d_k)
v = v.permute(2, 0, 1, 3).contiguous().view(-1, seq_len, d_v)
if atte_mask_out is not None:
atte_mask_out = atte_mask_out.repeat(n_head, 1, 1)
atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, seq_len, d_v)
# concat all heads, do output linear
atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, seq_len, -1)
output = self.drop(self.out(atte))
return output
class Bi_Attention(nn.Module):

View File

@ -1,29 +1,56 @@
from torch import nn
from ..aggregator.attention import MultiHeadAtte
from ..other_modules import LayerNormalization
from ..dropout import TimestepDropout
class TransformerEncoder(nn.Module):
class SubLayer(nn.Module):
def __init__(self, input_size, output_size, key_size, value_size, num_atte):
super(TransformerEncoder.SubLayer, self).__init__()
self.atte = MultiHeadAtte(input_size, output_size, key_size, value_size, num_atte)
self.norm1 = LayerNormalization(output_size)
self.ffn = nn.Sequential(nn.Linear(output_size, output_size),
nn.ReLU(),
nn.Linear(output_size, output_size))
self.norm2 = LayerNormalization(output_size)
def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1):
"""
def forward(self, input, seq_mask):
attention = self.atte(input)
:param model_size: int, the input dimension, which is also the output dimension
:param inner_size: int, the hidden size of the FFN layer
:param key_size: int, the dimension of each head
:param value_size: int, the dimension of the value in each head
:param num_head: int, the number of heads
:param dropout: float
"""
super(TransformerEncoder.SubLayer, self).__init__()
self.atte = MultiHeadAtte(model_size, key_size, value_size, num_head, dropout)
self.norm1 = nn.LayerNorm(model_size)
self.ffn = nn.Sequential(nn.Linear(model_size, inner_size),
nn.ReLU(),
nn.Linear(inner_size, model_size),
TimestepDropout(dropout),)
self.norm2 = nn.LayerNorm(model_size)
def forward(self, input, seq_mask=None, atte_mask_out=None):
"""
:param input: [batch, seq_len, model_size]
:param seq_mask: [batch, seq_len]
:return: [batch, seq_len, model_size]
"""
attention = self.atte(input, input, input, atte_mask_out)
norm_atte = self.norm1(attention + input)
attention *= seq_mask
output = self.ffn(norm_atte)
return self.norm2(output + norm_atte)
output = self.norm2(output + norm_atte)
output *= seq_mask
return output
def __init__(self, num_layers, **kargs):
super(TransformerEncoder, self).__init__()
self.layers = nn.Sequential(*[self.SubLayer(**kargs) for _ in range(num_layers)])
self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)])
def forward(self, x, seq_mask=None):
return self.layers(x, seq_mask)
output = x
if seq_mask is None:
atte_mask_out = None
else:
atte_mask_out = (seq_mask < 1)[:,None,:]
seq_mask = seq_mask[:,:,None]
for layer in self.layers:
output = layer(output, seq_mask, atte_mask_out)
return output
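Each SubLayer is now a standard post-norm Transformer block (multi-head attention, residual + LayerNorm, position-wise FFN, residual + LayerNorm), and the encoder applies the sequence mask both to the attention scores and to the layer outputs. A shape-check sketch whose constructor keywords follow the SubLayer signature above:

```python
import torch
from fastNLP.modules.encoder.transformer import TransformerEncoder

enc = TransformerEncoder(num_layers=2, model_size=64, inner_size=128,
                         key_size=16, value_size=16, num_head=4, dropout=0.1)
x = torch.rand(2, 7, 64)           # [batch, seq_len, model_size]
seq_mask = torch.ones(2, 7)        # 1 for real tokens, 0 for padding
out = enc(x, seq_mask=seq_mask)
print(out.shape)                   # torch.Size([2, 7, 64])
```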

View File

@ -1,8 +1,9 @@
[train]
n_epochs = 40
n_epochs = 20
batch_size = 32
use_cuda = true
validate_every = 500
use_tqdm=true
validate_every = 1000
use_golden_train=true
[test]
@ -16,20 +17,18 @@ use_cuda = true
[model]
word_vocab_size = -1
word_emb_dim = 100
word_emb_dim = 300
pos_vocab_size = -1
pos_emb_dim = 100
word_hid_dim = 100
pos_hid_dim = 100
rnn_layers = 3
rnn_hidden_size = 400
rnn_hidden_size = 256
arc_mlp_size = 500
label_mlp_size = 100
num_label = -1
dropout = 0.33
use_var_lstm=true
dropout = 0.3
encoder="var-lstm"
use_greedy_infer=false
[optim]
lr = 3e-4
lr = 2e-3
;weight_decay = 3e-5

View File

@ -5,7 +5,7 @@ sys.path.extend(['/home/yfshao/workdir/dev_fastnlp'])
import torch
import argparse
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.io.dataset_loader import ConllxDataLoader, add_seg_tag
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

View File

@ -4,25 +4,23 @@ import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
import fastNLP
import torch
from fastNLP.core.trainer import Trainer
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.models.biaffine_parser import BiaffineParser, ParserMetric, ParserLoss
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.core.tester import Tester
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.io.model_io import ModelLoader
from fastNLP.io.embed_loader import EmbedLoader
from fastNLP.io.model_io import ModelSaver
from reproduction.Biaffine_parser.util import ConllxDataLoader, MyDataloader
from fastNLP.io.dataset_loader import ConllxDataLoader
from fastNLP.api.processor import *
from fastNLP.io.embed_loader import EmbedLoader
from fastNLP.core.callback import Callback
BOS = '<BOS>'
EOS = '<EOS>'
UNK = '<UNK>'
PAD = '<PAD>'
NUM = '<NUM>'
ENG = '<ENG>'
@ -33,11 +31,11 @@ if len(os.path.dirname(__file__)) != 0:
def convert(data):
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + list(map(int, sample[2]))
head_tags = [BOS] + sample[3]
dataset.append(Instance(words=word_seq,
word_seq = [BOS] + sample['words']
pos_seq = [BOS] + sample['pos_tags']
heads = [0] + sample['heads']
head_tags = [BOS] + sample['labels']
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
@ -50,24 +48,11 @@ def load(path):
return convert(data)
# datadir = "/mnt/c/Me/Dev/release-2.2-st-train-dev-data/ud-treebanks-v2.2/UD_English-EWT"
# datadir = "/home/yfshao/UD_English-EWT"
# train_data_name = "en_ewt-ud-train.conllu"
# dev_data_name = "en_ewt-ud-dev.conllu"
# emb_file_name = '/home/yfshao/glove.6B.100d.txt'
# loader = ConlluDataLoader()
# datadir = '/home/yfshao/workdir/parser-data/'
# train_data_name = "train_ctb5.txt"
# dev_data_name = "dev_ctb5.txt"
# test_data_name = "test_ctb5.txt"
datadir = "/home/yfshao/workdir/ctb7.0/"
datadir = "/remote-home/yfshao/workdir/ctb9.0/"
train_data_name = "train.conllx"
dev_data_name = "dev.conllx"
test_data_name = "test.conllx"
# emb_file_name = "/home/yfshao/workdir/parser-data/word_OOVthr_30_100v.txt"
emb_file_name = "/home/yfshao/workdir/word_vector/cc.zh.300.vec"
emb_file_name = "/remote-home/yfshao/workdir/word_vector/cc.zh.300.vec"
cfgfile = './cfg.cfg'
processed_datadir = './save'
@ -113,27 +98,23 @@ def update_v(vocab, data, field):
data.apply(lambda x: vocab.add_word_lst(x[field]), new_field_name=None)
print('load raw data and preprocess')
# use pretrained embedding
word_v = Vocabulary()
word_v.unknown_label = UNK
pos_v = Vocabulary()
word_v = Vocabulary(unknown=UNK, padding=PAD)
pos_v = Vocabulary(unknown=None, padding=PAD)
tag_v = Vocabulary(unknown=None, padding=None)
train_data = load(os.path.join(datadir, train_data_name))
dev_data = load(os.path.join(datadir, dev_data_name))
test_data = load(os.path.join(datadir, test_data_name))
print(train_data[0])
num_p = Num2TagProcessor('words', 'words')
print('load raw data and preprocess')
num_p = Num2TagProcessor(tag=NUM, field_name='raw_words', new_added_field_name='words')
for ds in (train_data, dev_data, test_data):
num_p(ds)
update_v(word_v, train_data, 'words')
update_v(pos_v, train_data, 'pos')
update_v(tag_v, train_data, 'tags')
print('vocab build success {}, {}, {}'.format(len(word_v), len(pos_v), len(tag_v)))
# embed, _ = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
# print(embed.size())
# Model
model_args['word_vocab_size'] = len(word_v)
@ -141,7 +122,7 @@ model_args['pos_vocab_size'] = len(pos_v)
model_args['num_label'] = len(tag_v)
model = BiaffineParser(**model_args.data)
model.reset_parameters()
print(model)
word_idxp = IndexerProcessor(word_v, 'words', 'word_seq')
pos_idxp = IndexerProcessor(pos_v, 'pos', 'pos_seq')
@ -164,7 +145,6 @@ for ds in (train_data, dev_data, test_data):
if train_args['use_golden_train']:
train_data.set_input('gold_heads', flag=True)
train_args.data.pop('use_golden_train')
ignore_label = pos_v['punct']
print(test_data[0])
print('train len {}'.format(len(train_data)))
@ -172,44 +152,62 @@ print('dev len {}'.format(len(dev_data)))
print('test len {}'.format(len(test_data)))
def train(path):
# test saving pipeline
save_pipe(path)
embed = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
embed = torch.tensor(embed, dtype=torch.float32)
# embed = EmbedLoader.fast_load_embedding(emb_dim=model_args['word_emb_dim'], emb_file=emb_file_name, vocab=word_v)
# embed = torch.tensor(embed, dtype=torch.float32)
# model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=True)
model.word_embedding.padding_idx = word_v.padding_idx
model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
model.pos_embedding.padding_idx = pos_v.padding_idx
model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)
class MyCallback(Callback):
def on_step_end(self, optimizer):
step = self.trainer.step
# learning rate decay
if step > 0 and step % 1000 == 0:
for pg in optimizer.param_groups:
pg['lr'] *= 0.93
print('decay lr to {}'.format([pg['lr'] for pg in optimizer.param_groups]))
if step == 3000:
# start training embedding
print('start training embedding at {}'.format(step))
model = self.trainer.model
for m in model.modules():
if isinstance(m, torch.nn.Embedding):
m.weight.requires_grad = True
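A quick sanity check of the schedule MyCallback implements (a sketch only, not part of the commit; the 2e-3 starting rate is taken from the [optim] section of cfg.cfg above): the learning rate is multiplied by 0.93 every 1000 steps, and the embeddings stay frozen until step 3000.

```python
# Sketch only: effective learning rate produced by MyCallback.on_step_end,
# assuming the initial lr of 2e-3 from cfg.cfg.
base_lr = 2e-3
for step in (1000, 3000, 10000):
    lr = base_lr * 0.93 ** (step // 1000)   # one decay per 1000 steps
    print("step {}: lr ~ {:.6f}".format(step, lr))
```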
# Trainer
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
**train_args.data,
optimizer=fastNLP.Adam(**optim_args.data),
save_path=path)
# model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
model.word_embedding.padding_idx = word_v.padding_idx
model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
model.pos_embedding.padding_idx = pos_v.padding_idx
model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)
# try:
# ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
# print('model parameter loaded!')
# except Exception as _:
# print("No saved model. Continue.")
# pass
save_path=path,
callbacks=[MyCallback()])
# Start training
trainer.train()
print("Training finished!")
# save pipeline
save_pipe(path)
print('pipe saved')
try:
trainer.train()
print("Training finished!")
finally:
# save pipeline
save_pipe(path)
print('pipe saved')
def save_pipe(path):
pipe = Pipeline(processors=[num_p, word_idxp, pos_idxp, seq_p, set_input_p])
pipe.add_processor(ModelProcessor(model=model, batch_size=32))
pipe.add_processor(label_toword_p)
torch.save(pipe, os.path.join(path, 'pipe.pkl'))
os.makedirs(path, exist_ok=True)
torch.save({'pipeline': pipe,
'names': 'num word_idx pos_idx seq set_input model tag_to_word'.split(),
}, os.path.join(path, 'pipe.pkl'))
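For reference, a minimal sketch of how the artifact written by save_pipe could be loaded back; the path and field names follow the code above, but the snippet itself is an assumption, not part of the commit.

```python
# Sketch only: reload the dict written by save_pipe() (the path is an assumption).
import torch

saved = torch.load('./save/pipe.pkl')
pipe = saved['pipeline']   # Pipeline: num/word/pos indexers, ModelProcessor, tag_to_word
# The pipeline expects a DataSet with the same fields convert() produces
# (raw_words, pos); calling pipe(dataset) then fills in the parser predictions.
```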
def test(path):
@ -234,16 +232,11 @@ def test(path):
print("Testing Test data")
tester.test(model, test_data)
def build_pipe(parser_pipe_path):
parser_pipe = torch.load(parser_pipe_path)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Run the Biaffine dependency parser')
parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer', 'save'])
parser.add_argument('--mode', help='set the model\'s mode', choices=['train', 'test', 'infer'])
parser.add_argument('--path', type=str, default='')
# parser.add_argument('--dst', type=str, default='')
args = parser.parse_args()
@ -253,12 +246,6 @@ if __name__ == "__main__":
test(args.path)
elif args.mode == 'infer':
pass
# elif args.mode == 'save':
# print(f'save model from {args.path} to {args.dst}')
# save_model(args.path, args.dst)
# load_path = os.path.dirname(args.dst)
# print(f'save pipeline in {load_path}')
# build(load_path)
else:
print('no mode specified for model!')
parser.print_help()

View File

@ -1,34 +1,3 @@
class ConllxDataLoader(object):
def load(self, path):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
data = [self.get_one(sample) for sample in datalist]
return list(filter(lambda x: x is not None, data))
def get_one(self, sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]
class MyDataloader:
def load(self, data_path):
with open(data_path, "r", encoding="utf-8") as f:
@ -56,23 +25,3 @@ class MyDataloader:
return data
def add_seg_tag(data):
"""
:param data: list of ([word], [pos], [heads], [head_tags])
:return: list of ([word], [pos])
"""
_processed = []
for word_list, pos_list, _, _ in data:
new_sample = []
for word, pos in zip(word_list, pos_list):
if len(word) == 1:
new_sample.append((word, 'S-' + pos))
else:
new_sample.append((word[0], 'B-' + pos))
for c in word[1:-1]:
new_sample.append((c, 'M-' + pos))
new_sample.append((word[-1], 'E-' + pos))
_processed.append(list(map(list, zip(*new_sample))))
return _processed

View File

@ -0,0 +1,3 @@

View File

@ -1,11 +1,11 @@
from torch import nn
import torch
import torch.nn.functional as F
from torch import nn
from fastNLP.modules.decoder.MLP import MLP
from fastNLP.models.base_model import BaseModel
from reproduction.chinese_word_segment.utils import seq_lens_to_mask
from fastNLP.modules.decoder.MLP import MLP
from reproduction.Chinese_word_segmentation.utils import seq_lens_to_mask
class CWSBiLSTMEncoder(BaseModel):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,

View File

@ -0,0 +1,125 @@
"""
Use a Transformer as the encoder for Chinese word segmentation.
"""
from torch import nn
import torch
from fastNLP.modules.encoder.transformer import TransformerEncoder
from fastNLP.modules.decoder.CRF import ConditionalRandomField,seq_len_to_byte_mask
from fastNLP.modules.decoder.CRF import allowed_transitions
class TransformerCWS(nn.Module):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4):
super().__init__()
self.embedding = nn.Embedding(vocab_num, embed_dim)
input_size = embed_dim
if bigram_vocab_num:
self.bigram_embedding = nn.Embedding(bigram_vocab_num, bigram_embed_dim)
input_size += num_bigram_per_char*bigram_embed_dim
self.drop = nn.Dropout(embed_drop_p, inplace=True)
self.fc1 = nn.Linear(input_size, hidden_size)
value_size = hidden_size//num_heads
self.transformer = TransformerEncoder(num_layers, model_size=hidden_size, inner_size=hidden_size,
key_size=value_size,
value_size=value_size, num_head=num_heads)
self.fc2 = nn.Linear(hidden_size, tag_size)
allowed_trans = allowed_transitions({0:'b', 1:'m', 2:'e', 3:'s'}, encoding_type='bmes')
self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
allowed_transitions=allowed_trans)
def forward(self, chars, target, seq_lens, bigrams=None):
masks = seq_len_to_byte_mask(seq_lens).float()
x = self.embedding(chars)
batch_size = x.size(0)
length = x.size(1)
if hasattr(self, 'bigram_embedding'):
bigrams = self.bigram_embedding(bigrams) # batch_size x seq_lens x per_char x embed_size
x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
self.drop(x)
x = self.fc1(x)
feats = self.transformer(x, masks)
feats = self.fc2(feats)
losses = self.crf(feats, target, masks.float())
pred_dict = {}
pred_dict['seq_lens'] = seq_lens
pred_dict['loss'] = torch.mean(losses)
return pred_dict
def predict(self, chars, seq_lens, bigrams=None):
masks = seq_len_to_byte_mask(seq_lens).float()
x = self.embedding(chars)
batch_size = x.size(0)
length = x.size(1)
if hasattr(self, 'bigram_embedding'):
bigrams = self.bigram_embedding(bigrams) # batch_size x seq_lens x per_char x embed_size
x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
self.drop(x)
x = self.fc1(x)
feats = self.transformer(x, masks)
feats = self.fc2(feats)
probs = self.crf.viterbi_decode(feats, masks, get_score=False)
return {'pred': probs, 'seq_lens':seq_lens}
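For completeness, a small sketch of the decoding path through predict(); the shapes mirror the __main__ example at the bottom of this file and the random tensors are placeholders only (not part of the commit).

```python
# Sketch only: viterbi decoding with TransformerCWS.predict, using the same
# toy shapes as the __main__ block below (random placeholder inputs).
import torch

model = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100,
                       num_bigram_per_char=8, hidden_size=200, embed_drop_p=0.3,
                       num_layers=1, num_heads=8, tag_size=4)
chars = torch.randint(10, size=(4, 7)).long()
bigrams = torch.randint(10, size=(4, 56)).long()
seq_lens = torch.ones(4).long() * 7
output = model.predict(chars, seq_lens, bigrams=bigrams)   # {'pred': ..., 'seq_lens': ...}
```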
class NoamOpt(torch.optim.Optimizer):
"Optim wrapper that implements rate."
def __init__(self, model_size, factor, warmup, optimizer):
super().__init__([torch.nn.Parameter(torch.ones(1))], {})
self.optimizer = optimizer
self._step = 0
self.warmup = warmup
self.factor = factor
self.model_size = model_size
self._rate = 0
def step(self, **kwargs):
"Update parameters and rate"
self._step += 1
rate = self.rate()
for p in self.optimizer.param_groups:
p['lr'] = rate
self._rate = rate
self.optimizer.step()
def rate(self, step=None):
"Implement `lrate` above"
if step is None:
step = self._step
return self.factor * \
(self.model_size ** (-0.5) *
min(step ** (-0.5), step * self.warmup ** (-1.5)))
if __name__ == '__main__':
transformer = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100, num_bigram_per_char=8,
hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4)
chars = torch.randint(10, size=(4, 7)).long()
bigrams = torch.randint(10, size=(4, 56)).long()
seq_lens = torch.ones(4).long()*7
target = torch.randint(4, size=(4, 7))
print(transformer(chars, target, seq_lens, bigrams))
optimizer = torch.optim.Adam(transformer.parameters())
opt = NoamOpt(10, 1, 400, optimizer)
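The schedule implemented by NoamOpt.rate() above is the usual Transformer warmup, rate = factor * model_size^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)), which peaks around step == warmup. A standalone sketch of the values it produces (not part of the commit):

```python
# Sketch only: the same formula as NoamOpt.rate(), with the toy arguments
# used in the example above (model_size=10, factor=1, warmup=400).
def noam_rate(step, model_size=10, factor=1, warmup=400):
    return factor * (model_size ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)

for step in (100, 400, 1600):
    print(step, round(noam_rate(step), 5))   # rises until step 400, then decays as 1/sqrt(step)
```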

View File

@ -4,7 +4,7 @@ import re
from fastNLP.api.processor import Processor
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
from reproduction.chinese_word_segment.process.span_converter import SpanConverter
from reproduction.Chinese_word_segmentation.process.span_converter import SpanConverter
_SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'
@ -226,109 +226,6 @@ class Pre2Post2BigramProcessor(BigramProcessor):
return bigrams
# A vocabulary needs to be built here, but the following issue came up:
# (1) doing it as a Processor would not return a DataSet in this case, so vocabulary building is implemented
# in a different way instead of going through a Processor.
# TODO: unify the vocab-building and indexing steps.
class VocabIndexerProcessor(Processor):
"""
Build a Vocabulary from a DataSet and use it to index the field numerically. The newly generated index field is stored
under new_added_filed_name; if it is not provided, the original field_name is overwritten.
"""
def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
verbose=0, is_input=True):
"""
:param field_name: the field to build the vocabulary from and the field to index.
:param new_added_filed_name: name of the index field generated during indexing; if not given, field_name is overwritten.
:param min_freq: minimum word frequency required by the created Vocabulary.
:param max_size: maximum number of words allowed in the created Vocabulary.
:param verbose: 0 prints nothing; 1 prints progress information.
:param bool is_input:
"""
super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
self.min_freq = min_freq
self.max_size = max_size
self.verbose =verbose
self.is_input = is_input
def construct_vocab(self, *datasets):
"""
Build the vocabulary from the DataSets passed in.
:param datasets: DataSet objects used to build the vocabulary.
:return:
"""
self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
self.vocab.build_vocab()
if self.verbose:
print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))
def process(self, *datasets, only_index_dataset=None):
"""
If no Vocabulary has been built yet, build one from the DataSets passed in datasets; otherwise reuse the existing
vocabulary. The datasets as well as only_index_dataset are then indexed.
:param datasets: DataSet objects.
:param only_index_dataset: DataSet, or list of DataSet. These are only indexed; they are not used to build the vocabulary.
:return:
"""
if len(datasets)==0 and not hasattr(self,'vocab'):
raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
if not hasattr(self, 'vocab'):
self.construct_vocab(*datasets)
else:
if self.verbose:
print("Using constructed vocabulary with {} items.".format(len(self.vocab)))
to_index_datasets = []
if len(datasets)!=0:
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
if not (only_index_dataset is None):
if isinstance(only_index_dataset, list):
for dataset in only_index_dataset:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
elif isinstance(only_index_dataset, DataSet):
to_index_datasets.append(only_index_dataset)
else:
raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))
for dataset in to_index_datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name, is_input=self.is_input)
# return a single DataSet so that inference stays consistent with the other processors
if len(to_index_datasets) == 1:
return to_index_datasets[0]
def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
self.vocab = vocab
def delete_vocab(self):
del self.vocab
def get_vocab_size(self):
return len(self.vocab)
def set_verbose(self, verbose):
"""
Set the verbosity of the processor.
:param verbose: int, 0 prints nothing; 1 prints vocab information.
:return:
"""
self.verbose = verbose
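A minimal usage sketch of the processor above; the field names here are made up, and after this commit the same class is importable from fastNLP.api.processor, as the new train_pos_tag.py does.

```python
# Sketch only: build the vocabulary from a training DataSet and index a dev
# DataSet with the same vocabulary (field names are assumptions).
from fastNLP.api.processor import VocabIndexerProcessor
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

train = DataSet([Instance(word_seq=['this', 'is', 'a', 'demo'])])
dev = DataSet([Instance(word_seq=['another', 'demo'])])

proc = VocabIndexerProcessor('word_seq', new_added_filed_name='word_ids')
proc.process(train, only_index_dataset=dev)   # vocab built from train; both sets get word_ids
print(proc.get_vocab_size())
```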
class VocabProcessor(Processor):
def __init__(self, field_name, min_freq=1, max_size=None):

View File

@ -4,7 +4,7 @@ from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.utils import ClassPreprocess as Preprocess
from fastNLP.io.config_io import ConfigLoader
from fastNLP.io.config_io import ConfigSection
from fastNLP.io.dataset_loader import ClassDataSetLoader as Dataset_loader
from fastNLP.io.dataset_loader import DummyClassificationReader as Dataset_loader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.aggregator.self_attention import SelfAttention
from fastNLP.modules.decoder.MLP import MLP

View File

@ -0,0 +1,29 @@
from fastNLP.io.dataset_loader import ZhConllPOSReader
def cut_long_sentence(sent, max_sample_length=200):
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence
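A worked example of cut_long_sentence defined above (sketch only; the tiny max_sample_length is just to make the behaviour visible). Cuts happen at existing spaces, so pieces can end up slightly longer than the limit.

```python
# Sketch only: cut_long_sentence splits at existing spaces once the accumulated
# space-free length exceeds max_sample_length (default 200; 4 here for illustration).
sent = '这 是 一个 用来 演示 切分 的 句子'
print(cut_long_sentence(sent, max_sample_length=4))
# ['这 是 一个 用来', '演示 切分 的', '句子']
```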
if __name__ == '__main__':
reader = ZhConllPOSReader()
d = reader.load('/home/hyan/train.conllx')
print(d)

View File

@ -10,7 +10,7 @@ eval_sort_key = 'accuracy'
[model]
rnn_hidden_units = 300
word_emb_dim = 100
word_emb_dim = 300
dropout = 0.5
use_crf = true
print_every_step = 10

View File

@ -0,0 +1,163 @@
import argparse
import os
import pickle
import sys
import torch
# in order to run fastNLP without installation
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.api.pipeline import Pipeline
from fastNLP.api.processor import SeqLenProcessor, VocabIndexerProcessor, SetInputProcessor, IndexerProcessor
from fastNLP.core.metrics import SpanFPreRecMetric
from fastNLP.core.trainer import Trainer
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.io.dataset_loader import ConllxDataLoader
from fastNLP.api.processor import ModelProcessor, Index2WordProcessor
cfgfile = './pos_tag.cfg'
pickle_path = "save"
def load_tencent_embed(embed_path, word2id):
hit = 0
with open(embed_path, "rb") as f:
embed_dict = pickle.load(f)
embedding_tensor = torch.randn(len(word2id), 200)
for key in word2id:
if key in embed_dict:
embedding_tensor[word2id[key]] = torch.Tensor(embed_dict[key])
hit += 1
print("vocab_size={} hit={} hit/vocab_size={}".format(len(word2id), hit, hit / len(word2id)))
return embedding_tensor
def train(train_data_path, dev_data_path, checkpoint=None, save=None):
# load config
train_param = ConfigSection()
model_param = ConfigSection()
ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
print("config loaded")
# Data Loader
print("loading training set...")
dataset = ConllxDataLoader().load(train_data_path, return_dataset=True)
print("loading dev set...")
dev_data = ConllxDataLoader().load(dev_data_path, return_dataset=True)
print(dataset)
print("================= dataset ready =====================")
dataset.rename_field("tag", "truth")
dev_data.rename_field("tag", "truth")
vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
tag_proc = VocabIndexerProcessor("truth", is_input=True)
seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len", is_input=True)
set_input_proc = SetInputProcessor("word_seq", "word_seq_origin_len")
vocab_proc(dataset)
tag_proc(dataset)
seq_len_proc(dataset)
# index dev set
word_vocab, tag_vocab = vocab_proc.vocab, tag_proc.vocab
dev_data.apply(lambda ins: [word_vocab.to_index(w) for w in ins["words"]], new_field_name="word_seq")
dev_data.apply(lambda ins: [tag_vocab.to_index(w) for w in ins["truth"]], new_field_name="truth")
dev_data.apply(lambda ins: len(ins["word_seq"]), new_field_name="word_seq_origin_len")
# set input & target
dataset.set_input("word_seq", "word_seq_origin_len", "truth")
dev_data.set_input("word_seq", "word_seq_origin_len", "truth")
dataset.set_target("truth", "word_seq_origin_len")
dev_data.set_target("truth", "word_seq_origin_len")
# dataset.set_is_target(tag_ids=True)
model_param["vocab_size"] = vocab_proc.get_vocab_size()
model_param["num_classes"] = tag_proc.get_vocab_size()
print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))
# define a model
if checkpoint is None:
# pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
pre_trained = None
model = AdvSeqLabel(model_param, id2words=None, emb=pre_trained)
print(model)
else:
model = torch.load(checkpoint)
# call trainer to train
trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"),
dev_data=dev_data, metric_key="f",
use_tqdm=True, use_cuda=True, print_every=10, n_epochs=20, save_path=save)
trainer.train(load_best_model=True)
# save model & pipeline
model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")
pp = Pipeline([vocab_proc, seq_len_proc, set_input_proc, model_proc, id2tag])
save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
torch.save(save_dict, os.path.join(save, "model_pp.pkl"))
print("pipeline saved")
def run_test(test_path):
test_data = ConllxDataLoader().load(test_path, return_dataset=True)
with open("model_pp_0117.pkl", "rb") as f:
save_dict = torch.load(f)
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline
pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
truth = test_data.field_arrays["truth"].content
seq_len = test_data.field_arrays["word_seq_origin_len"].content
# padding by hand
max_length = max([len(seq) for seq in prediction])
for idx in range(len(prediction)):
prediction[idx] = list(prediction[idx]) + ([0] * (max_length - len(prediction[idx])))
truth[idx] = list(truth[idx]) + ([0] * (max_length - len(truth[idx])))
evaluator = SpanFPreRecMetric(tag_vocab=tag_vocab, pred="predict", target="truth",
seq_lens="word_seq_origin_len")
evaluator({"predict": torch.Tensor(prediction), "word_seq_origin_len": torch.Tensor(seq_len)},
{"truth": torch.Tensor(truth)})
test_result = evaluator.get_metric()
f1 = round(test_result['f'] * 100, 2)
pre = round(test_result['pre'] * 100, 2)
rec = round(test_result['rec'] * 100, 2)
return {"F1": f1, "precision": pre, "recall": rec}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--train", type=str, help="training conll file", default="/home/zyfeng/data/sample.conllx")
parser.add_argument("--dev", type=str, help="dev conll file", default="/home/zyfeng/data/sample.conllx")
parser.add_argument("--test", type=str, help="test conll file", default=None)
parser.add_argument("--save", type=str, help="path to save", default=None)
parser.add_argument("-c", "--restart", action="store_true", help="whether to continue training")
parser.add_argument("-cp", "--checkpoint", type=str, help="checkpoint of the trained model")
args = parser.parse_args()
if args.test is not None:
print(run_test(args.test))
else:
if args.restart is True:
# resume training: python train_pos_tag.py -c -cp ./save/best_model.pkl
if args.checkpoint is None:
raise RuntimeError("Please provide the checkpoint. -cp ")
train(args.train, args.dev, args.checkpoint, save=args.save)
else:
# train from scratch: python train_pos_tag.py
train(args.train, args.dev, save=args.save)

0
reproduction/__init__.py Normal file
View File

View File

@ -1,197 +0,0 @@
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.io.dataset_loader import DataSetLoader
def cut_long_sentence(sent, max_sample_length=200):
"""
Cut a sentence longer than max_sample_length into several pieces. Cuts only happen at existing spaces, so the resulting pieces may end up longer or shorter than max_sample_length.
:param sent: str.
:param max_sample_length: int.
:return: list of str.
"""
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence
class NaiveCWSReader(DataSetLoader):
"""
This reader assumes the segmentation dataset is already whitespace-separated, e.g.
这是 fastNLP , 一个 非常 good .
or, alternatively, that each part also carries a pos tag, e.g.
/D /P 團員/Na 之中/Ng /COMMACATEGORY
"""
def __init__(self, in_word_splitter=None):
super().__init__()
self.in_word_splitter = in_word_splitter
def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
"""
The following input formats are accepted (by default \t or spaces are used as the separator):
这是 fastNLP , 一个 非常 good .
/D /P 團員/Na 之中/Ng /COMMACATEGORY
If splitter is not None, the second form is assumed; each part is split by the splitter and the first piece is kept, e.g. "也/D".split('/')[0].
:param filepath:
:param in_word_splitter:
:return:
"""
if in_word_splitter == None:
in_word_splitter = self.in_word_splitter
dataset = DataSet()
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if len(line.replace(' ', ''))==0: # empty lines are not accepted
continue
if not in_word_splitter is None:
words = []
for part in line.split():
word = part.split(in_word_splitter)[0]
words.append(word)
line = ' '.join(words)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for sent in sents:
instance = Instance(raw_sentence=sent)
dataset.append(instance)
return dataset
class POSCWSReader(DataSetLoader):
"""
Supports the following format: one word per line, with blank lines marking the boundary between two sentences.
N
N
N
...
I-PER
I-PER
N
N
N
...
:param filepath:
:return:
"""
def __init__(self, in_word_splitter=None):
super().__init__()
self.in_word_splitter = in_word_splitter
def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
if in_word_splitter is None:
in_word_splitter = self.in_word_splitter
dataset = DataSet()
with open(filepath, 'r') as f:
words = []
for line in f:
line = line.strip()
if len(line) == 0: # new line
if len(words)==0: # empty lines are not accepted
continue
line = ' '.join(words)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for sent in sents:
instance = Instance(raw_sentence=sent)
dataset.append(instance)
words = []
else:
line = line.split()[0]
if in_word_splitter is None:
words.append(line)
else:
words.append(line.split(in_word_splitter)[0])
return dataset
class ConllCWSReader(object):
def __init__(self):
pass
def load(self, path, cut_long_sent=False):
"""
The returned DataSet contains only the raw_sentence field, whose content is a str.
The input is assumed to be in conll format: two sentences are separated by a blank line and each line has 7 columns, e.g.
1 编者按 编者按 NN O 11 nmod:topic
2 PU O 11 punct
3 7 7 NT DATE 4 compound:nn
4 12 12 NT DATE 11 nmod:tmod
5 PU O 11 punct
1 DT O 3 det
2 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_char_lst(sample)
if res is None:
continue
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:
ds.append(Instance(raw_sentence=raw_sentence))
return ds
def get_char_lst(self, sample):
if len(sample)==0:
return None
text = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
return text

View File

@ -1,151 +0,0 @@
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.utils import load_pickle
from fastNLP.io.model_io import ModelLoader, ModelSaver
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.utils import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator
# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'
cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")
def infer():
# Config Loader
test_args = ConfigSection()
ConfigLoader().load_config(cfgfile, {"POS_test": test_args})
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)
# Define the same model
model = AdvSeqLabel(test_args)
try:
ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print('model loaded!')
except Exception as e:
print('cannot load model!')
raise
# Data Loader
infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
print('data loaded')
# Inference interface
infer = SeqLabelInfer(pickle_path)
results = infer.predict(model, infer_data)
print(results)
print("Inference finished!")
def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})
print("loading data set...")
data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
data.load(cws_data_path)
data_train, data_dev = data.split(ratio=0.3)
train_args["vocab_size"] = len(data.word_vocab)
train_args["num_classes"] = len(data.label_vocab)
print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))
change_field_is_target(data_dev, "truth", True)
save_pickle(data_dev, "./save/", "data_dev.pkl")
save_pickle(data.word_vocab, "./save/", "word2id.pkl")
save_pickle(data.label_vocab, "./save/", "label2id.pkl")
# Trainer
trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
validate=train_args["validate"],
use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"],
save_best_dev=True, print_every_step=10, model_name="trained_model.pkl",
evaluator=SeqLabelEvaluator())
# Model
model = AdvSeqLabel(train_args)
try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as e:
print("No saved model. Continue.")
pass
# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")
# Saver
saver = ModelSaver("./save/trained_model.pkl")
saver.save_pytorch(model)
print("Model saved!")
def predict():
# Config Loader
test_args = ConfigSection()
ConfigLoader().load_config(cfgfile, {"POS_test": test_args})
# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)
# load dev data
dev_data = load_pickle(pickle_path, "data_dev.pkl")
# Define the same model
model = AdvSeqLabel(test_args)
# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print("model loaded!")
# Tester
test_args["evaluator"] = SeqLabelEvaluator()
tester = SeqLabelTester(**test_args.data)
# Start testing
tester.test(model, dev_data)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
parser.add_argument('--mode', help='set the model\'s mode', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
predict()
elif args.mode == 'infer':
infer()
else:
print('no mode specified for model!')
parser.print_help()

View File

@ -1,153 +0,0 @@
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
def cut_long_sentence(sent, max_sample_length=200):
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence
class ConllPOSReader(object):
# The returned DataSet contains two fields: words (list of list, the inner list holds characters) and tag (list of str, each str a BMES-style tag).
def __init__(self):
pass
def load(self, path):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
if len(word)==1:
char_seq.append(word)
pos_seq.append('S-{}'.format(tag))
elif len(word)>1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word)-2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
char_seq.extend(list(word))
else:
raise ValueError("Zero length of word detected.")
ds.append(Instance(words=char_seq,
tag=pos_seq))
return ds
class ZhConllPOSReader(object):
# reader for the Chinese conll format
def __init__(self):
pass
def load(self, path):
"""
The returned DataSet contains the following fields:
words: list of str,
tag: list of str, with BMES tags added, e.g. an original sequence ['VP', 'NN', 'NN', ...] becomes ["S-VP", "B-NN", "M-NN", ...].
The input is assumed to be in conll format: two sentences are separated by a blank line and each line has 7 columns, e.g.
1 编者按 编者按 NN O 11 nmod:topic
2 PU O 11 punct
3 7 7 NT DATE 4 compound:nn
4 12 12 NT DATE 11 nmod:tmod
5 PU O 11 punct
1 DT O 3 det
2 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)
ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
char_seq.extend(list(word))
if len(word)==1:
pos_seq.append('S-{}'.format(tag))
elif len(word)>1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word)-2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
else:
raise ValueError("Zero length of word detected.")
ds.append(Instance(words=char_seq,
tag=pos_seq))
return ds
def get_one(self, sample):
if len(sample)==0:
return None
text = []
pos_tags = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
pos_tags.append(t2)
return text, pos_tags
if __name__ == '__main__':
reader = ZhConllPOSReader()
d = reader.load('/home/hyan/train.conllx')
print(d)

View File

@ -1,113 +0,0 @@
import argparse
import os
import pickle
import sys
import torch
# in order to run fastNLP without installation
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from fastNLP.api.pipeline import Pipeline
from fastNLP.api.processor import SeqLenProcessor
from fastNLP.core.metrics import SpanFPreRecMetric
from fastNLP.core.trainer import Trainer
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.models.sequence_modeling import AdvSeqLabel
from reproduction.chinese_word_segment.process.cws_processor import VocabIndexerProcessor
from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
from fastNLP.api.processor import ModelProcessor, Index2WordProcessor
cfgfile = './pos_tag.cfg'
pickle_path = "save"
def load_tencent_embed(embed_path, word2id):
hit = 0
with open(embed_path, "rb") as f:
embed_dict = pickle.load(f)
embedding_tensor = torch.randn(len(word2id), 200)
for key in word2id:
if key in embed_dict:
embedding_tensor[word2id[key]] = torch.Tensor(embed_dict[key])
hit += 1
print("vocab_size={} hit={} hit/vocab_size={}".format(len(word2id), hit, hit / len(word2id)))
return embedding_tensor
def train(checkpoint=None):
# load config
train_param = ConfigSection()
model_param = ConfigSection()
ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
print("config loaded")
# Data Loader
dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
print(dataset)
print("dataset transformed")
dataset.rename_field("tag", "truth")
vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
tag_proc = VocabIndexerProcessor("truth")
seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len", is_input=True)
vocab_proc(dataset)
tag_proc(dataset)
seq_len_proc(dataset)
dataset.set_input("word_seq", "word_seq_origin_len", "truth")
dataset.set_target("truth", "word_seq_origin_len")
print("processors defined")
# dataset.set_is_target(tag_ids=True)
model_param["vocab_size"] = vocab_proc.get_vocab_size()
model_param["num_classes"] = tag_proc.get_vocab_size()
print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))
# define a model
if checkpoint is None:
# pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
pre_trained = None
model = AdvSeqLabel(model_param, id2words=tag_proc.vocab.idx2word, emb=pre_trained)
print(model)
else:
model = torch.load(checkpoint)
# call trainer to train
trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"),
dev_data=dataset, metric_key="f",
use_tqdm=True, use_cuda=True, print_every=5, n_epochs=6, save_path="./save")
trainer.train(load_best_model=True)
# save model & pipeline
model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")
pp = Pipeline([vocab_proc, seq_len_proc, model_proc, id2tag])
save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
torch.save(save_dict, "model_pp.pkl")
print("pipeline saved")
torch.save(model, "./save/best_model.pkl")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--restart", action="store_true", help="whether to continue training")
parser.add_argument("-cp", "--checkpoint", type=str, help="checkpoint of the trained model")
args = parser.parse_args()
if args.restart is True:
# resume training: python train_pos_tag.py -c -cp ./save/best_model.pkl
if args.checkpoint is None:
raise RuntimeError("Please provide the checkpoint. -cp ")
train(args.checkpoint)
else:
# train from scratch: python train_pos_tag.py
train()

View File

@ -1,9 +1,12 @@
import random
import unittest
from fastNLP import Vocabulary
import numpy as np
from fastNLP import Vocabulary, Instance
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor, PreAppendProcessor, SliceProcessor, Num2TagProcessor, \
IndexerProcessor, VocabProcessor, SeqLenProcessor
IndexerProcessor, VocabProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor, SetTargetProcessor, \
SetInputProcessor, VocabIndexerProcessor
from fastNLP.core.dataset import DataSet
@ -53,3 +56,46 @@ class TestProcessor(unittest.TestCase):
ds = proc(ds)
for data in ds.field_arrays["len"].content:
self.assertEqual(data, 30)
def test_ModelProcessor(self):
from fastNLP.models.cnn_text_classification import CNNText
model = CNNText(100, 100, 5)
ins_list = []
for _ in range(64):
seq_len = np.random.randint(5, 30)
ins_list.append(Instance(word_seq=[np.random.randint(0, 100) for _ in range(seq_len)], seq_lens=seq_len))
data_set = DataSet(ins_list)
data_set.set_input("word_seq", "seq_lens")
proc = ModelProcessor(model)
data_set = proc(data_set)
self.assertTrue("pred" in data_set)
def test_Index2WordProcessor(self):
vocab = Vocabulary()
vocab.add_word_lst(["a", "b", "c", "d", "e"])
proc = Index2WordProcessor(vocab, "tag_id", "tag")
data_set = DataSet([Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
data_set = proc(data_set)
self.assertTrue("tag" in data_set)
def test_SetTargetProcessor(self):
proc = SetTargetProcessor("a", "b", "c")
data_set = DataSet({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
data_set = proc(data_set)
self.assertTrue(data_set["a"].is_target)
self.assertTrue(data_set["b"].is_target)
self.assertTrue(data_set["c"].is_target)
def test_SetInputProcessor(self):
proc = SetInputProcessor("a", "b", "c")
data_set = DataSet({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
data_set = proc(data_set)
self.assertTrue(data_set["a"].is_input)
self.assertTrue(data_set["b"].is_input)
self.assertTrue(data_set["c"].is_input)
def test_VocabIndexerProcessor(self):
proc = VocabIndexerProcessor("word_seq", "word_ids")
data_set = DataSet([Instance(word_seq=["a", "b", "c", "d", "e"])])
data_set = proc(data_set)
self.assertTrue("word_ids" in data_set)

View File

@ -1,13 +1,44 @@
import time
import unittest
import numpy as np
import torch
from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.dataset import construct_dataset
from fastNLP.core.instance import Instance
from fastNLP.core.sampler import SequentialSampler
def generate_fake_dataset(num_samples=1000):
"""
The generated DataSet contains the fields {'1': [], '2': [], '3': [], '4': []}.
:param num_samples: number of samples.
:return:
"""
max_len = 50
min_len = 10
num_features = 4
data_dict = {}
for i in range(num_features):
data = []
lengths = np.random.randint(min_len, max_len, size=(num_samples))
for length in lengths:
data.append(np.random.randint(100, size=length))
data_dict[str(i)] = data
dataset = DataSet(data_dict)
for i in range(num_features):
if np.random.randint(2) == 0:
dataset.set_input(str(i))
else:
dataset.set_target(str(i))
return dataset
class TestCase1(unittest.TestCase):
def test_simple(self):
dataset = construct_dataset(
@ -31,3 +62,116 @@ class TestCase1(unittest.TestCase):
self.assertEqual(len(y["y"]), 4)
self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
self.assertListEqual(list(y["y"][-1]), [5, 6])
def test_list_padding(self):
ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))
def test_numpy_padding(self):
ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))
def test_list_to_tensor(self):
ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
self.assertTrue(isinstance(y["y"], torch.Tensor))
self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_numpy_to_tensor(self):
ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
self.assertTrue(isinstance(y["y"], torch.Tensor))
self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_list_of_list_to_tensor(self):
ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
[Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
self.assertTrue(isinstance(y["y"], torch.Tensor))
self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_list_of_numpy_to_tensor(self):
ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
[Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
print(x, y)
def test_sequential_batch(self):
batch_size = 32
pause_seconds = 0.01
num_samples = 1000
dataset = generate_fake_dataset(num_samples)
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_x, batch_y in batch:
time.sleep(pause_seconds)
"""
def test_multi_workers_batch(self):
batch_size = 32
pause_seconds = 0.01
num_samples = 1000
dataset = generate_fake_dataset(num_samples)
num_workers = 1
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers)
for batch_x, batch_y in batch:
time.sleep(pause_seconds)
num_workers = 2
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers)
end1 = time.time()
for batch_x, batch_y in batch:
time.sleep(pause_seconds)
"""
"""
def test_pin_memory(self):
batch_size = 32
pause_seconds = 0.01
num_samples = 1000
dataset = generate_fake_dataset(num_samples)
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), pin_memory=True)
# OOM happens here
# for batch_x, batch_y in batch:
# time.sleep(pause_seconds)
num_workers = 2
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers,
pin_memory=True)
# OOM happens here
# for batch_x, batch_y in batch:
# time.sleep(pause_seconds)
"""

View File

@ -1,40 +1,47 @@
import unittest
import numpy as np
import torch
from fastNLP.core.callback import EchoCallback
from fastNLP.core.callback import EchoCallback, EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \
LRFinder, \
TensorboardCallback
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.losses import BCELoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.optimizer import SGD
from fastNLP.core.trainer import Trainer
from fastNLP.models.base_model import NaiveClassifier
def prepare_env():
def prepare_fake_dataset():
mean = np.array([-3, -3])
cov = np.array([[1, 0], [0, 1]])
class_A = np.random.multivariate_normal(mean, cov, size=(1000,))
mean = np.array([3, 3])
cov = np.array([[1, 0], [0, 1]])
class_B = np.random.multivariate_normal(mean, cov, size=(1000,))
data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
[Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
return data_set
data_set = prepare_fake_dataset()
data_set.set_input("x")
data_set.set_target("y")
model = NaiveClassifier(2, 1)
return data_set, model
class TestCallback(unittest.TestCase):
def test_case(self):
def prepare_fake_dataset():
mean = np.array([-3, -3])
cov = np.array([[1, 0], [0, 1]])
class_A = np.random.multivariate_normal(mean, cov, size=(1000,))
mean = np.array([3, 3])
cov = np.array([[1, 0], [0, 1]])
class_B = np.random.multivariate_normal(mean, cov, size=(1000,))
data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
[Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
return data_set
data_set = prepare_fake_dataset()
data_set.set_input("x")
data_set.set_target("y")
model = NaiveClassifier(2, 1)
def test_echo_callback(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=1,
n_epochs=2,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
@ -42,3 +49,90 @@ class TestCallback(unittest.TestCase):
use_tqdm=False,
callbacks=[EchoCallback()])
trainer.train()
def test_gradient_clip(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[GradientClipCallback(model.parameters(), clip_value=2)])
trainer.train()
def test_early_stop(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.01),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[EarlyStopCallback(5)])
trainer.train()
def test_lr_scheduler(self):
data_set, model = prepare_env()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=optimizer,
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))])
trainer.train()
def test_KeyBoardInterrupt(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[ControlC(False)])
trainer.train()
def test_LRFinder(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[LRFinder(len(data_set) // 32)])
trainer.train()
def test_TensorboardCallback(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[TensorboardCallback("loss", "metric")])
trainer.train()

View File

@ -6,15 +6,29 @@ from fastNLP.core.fieldarray import FieldArray
from fastNLP.core.instance import Instance
class TestDataSet(unittest.TestCase):
class TestDataSetInit(unittest.TestCase):
"""初始化DataSet的办法有以下几种
1) 用dict:
1.1) 二维list DataSet({"x": [[1, 2], [3, 4]]})
1.2) 二维array DataSet({"x": np.array([[1, 2], [3, 4]])})
1.3) 三维list DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
2) 用list of Instance:
2.1) 一维list DataSet([Instance(x=[1, 2, 3, 4])])
2.2) 一维array DataSet([Instance(x=np.array([1, 2, 3, 4]))])
2.3) 二维list DataSet([Instance(x=[[1, 2], [3, 4]])])
2.4) 二维array DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
只接受纯list或者最外层ndarray
"""
def test_init_v1(self):
# 1-D list
ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
self.assertEqual(ds.field_arrays["y"].content, [[5, 6], ] * 40)
def test_init_v2(self):
# from a dict
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
@ -28,6 +42,8 @@ class TestDataSet(unittest.TestCase):
with self.assertRaises(ValueError):
_ = DataSet(0.00001)
class TestDataSetMethods(unittest.TestCase):
def test_append(self):
dd = DataSet()
for _ in range(3):

View File

@ -5,8 +5,65 @@ import numpy as np
from fastNLP.core.fieldarray import FieldArray
class TestFieldArrayInit(unittest.TestCase):
"""
1) If a DataSet is initialized from a dict, the FieldArray is constructed inside add_field:
1.1) 2-D list: DataSet({"x": [[1, 2], [3, 4]]})
1.2) 2-D array: DataSet({"x": np.array([[1, 2], [3, 4]])})
1.3) 3-D list: DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
2) If a DataSet is initialized from a list of Instance, the FieldArray is first built from the first sample in append,
and later samples are added via FieldArray.append:
2.1) 1-D list: DataSet([Instance(x=[1, 2, 3, 4])])
2.2) 1-D array: DataSet([Instance(x=np.array([1, 2, 3, 4]))])
2.3) 2-D list: DataSet([Instance(x=[[1, 2], [3, 4]])])
2.4) 2-D array: DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
"""
def test_init_v1(self):
# 2-D list
fa = FieldArray("x", [[1, 2], [3, 4]] * 5, is_input=True)
def test_init_v2(self):
# 2-D array
fa = FieldArray("x", np.array([[1, 2], [3, 4]] * 5), is_input=True)
def test_init_v3(self):
# 3-D list
fa = FieldArray("x", [[[1, 2], [3, 4]], [[1, 2], [3, 4]]], is_input=True)
def test_init_v7(self):
# list of array
fa = FieldArray("x", [np.array([[1, 2], [3, 4]]), np.array([[1, 2], [3, 4]])], is_input=True)
self.assertEqual(fa.pytype, int)
self.assertEqual(fa.dtype, np.int)
def test_init_v4(self):
# 1-D list
val = [1, 2, 3, 4]
fa = FieldArray("x", [val], is_input=True)
fa.append(val)
def test_init_v5(self):
# 1-D array
val = np.array([1, 2, 3, 4])
fa = FieldArray("x", [val], is_input=True)
fa.append(val)
def test_init_v6(self):
# 2-D list
val = [[1, 2], [3, 4]]
fa = FieldArray("x", [val], is_input=True)
fa.append(val)
def test_init_v8(self):
# 2-D array
val = np.array([[1, 2], [3, 4]])
fa = FieldArray("x", [val], is_input=True)
fa.append(val)
class TestFieldArray(unittest.TestCase):
def test(self):
def test_main(self):
fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True)
self.assertEqual(len(fa), 5)
fa.append(6)
@ -42,13 +99,13 @@ class TestFieldArray(unittest.TestCase):
self.assertEqual(fa.pytype, str)
def test_support_np_array(self):
fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=True)
self.assertEqual(fa.dtype, np.ndarray)
self.assertEqual(fa.pytype, np.ndarray)
fa = FieldArray("y", np.array([[1.1, 2.2, 3.3, 4.4, 5.5]]), is_input=True)
self.assertEqual(fa.dtype, np.float64)
self.assertEqual(fa.pytype, float)
fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5]))
self.assertEqual(fa.dtype, np.ndarray)
self.assertEqual(fa.pytype, np.ndarray)
self.assertEqual(fa.dtype, np.float64)
self.assertEqual(fa.pytype, float)
fa = FieldArray("my_field", np.random.rand(3, 5), is_input=True)
# in this case, pytype is actually a float. We do not care about it.
@ -97,3 +154,65 @@ class TestFieldArray(unittest.TestCase):
fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
self.assertEqual(len(fa), 3)
self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])
class TestPadder(unittest.TestCase):
def test01(self):
"""
Test that AutoPadder works as expected.
:return:
"""
from fastNLP.core.fieldarray import AutoPadder
padder = AutoPadder()
content = ['This is a str', 'this is another str']
self.assertListEqual(content, padder(content, None, np.str).tolist())
content = [1, 2]
self.assertListEqual(content, padder(content, None, np.int64).tolist())
content = [[1,2], [3], [4]]
self.assertListEqual([[1,2], [3, 0], [4, 0]],
padder(content, None, np.int64).tolist())
content = [
[[1, 2, 3], [4, 5], [7,8,9,10]],
[[1]]
]
self.assertListEqual(content,
padder(content, None, np.int64).tolist())
def test02(self):
"""
Test that EngChar2DPadder is used correctly.
:return:
"""
from fastNLP.core.fieldarray import EngChar2DPadder
padder = EngChar2DPadder(pad_length=0)
contents = [1, 2]
# 1-D input is not allowed
with self.assertRaises(ValueError):
padder(contents, None, np.int64)
contents = [[1, 2]]
# 2-D input is not allowed
with self.assertRaises(ValueError):
padder(contents, None, np.int64)
contents = [[[[1, 2]]]]
# more than 3 dimensions are not allowed
with self.assertRaises(ValueError):
padder(contents, None, np.int64)
contents = [
[[1, 2, 3], [4, 5], [7,8,9,10]],
[[1]]
]
self.assertListEqual([[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]], [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]],
padder(contents, None, np.int64).tolist())
padder = EngChar2DPadder(pad_length=5, pad_val=-100)
self.assertListEqual(
[[[1, 2, 3, -100, -100], [4, 5, -100, -100, -100], [7, 8, 9, 10, -100]],
[[1, -100, -100, -100, -100], [-100, -100, -100, -100, -100], [-100, -100, -100, -100, -100]]],
padder(contents, None, np.int64).tolist()
)

View File

@ -1,4 +1,5 @@
import unittest
from collections import defaultdict
import numpy as np
import torch
@ -23,12 +24,26 @@ def prepare_fake_dataset():
return data_set
class LinearModel(torch.nn.Module):
def __init__(self):
super(LinearModel, self).__init__()
self.linear = Linear(2, 1)
def forward(self, x):
return {"predict": self.linear(x)}
class TestPredictor(unittest.TestCase):
def test(self):
predictor = Predictor()
model = Linear(2, 1)
def test_simple(self):
model = LinearModel()
predictor = Predictor(model)
data = prepare_fake_dataset()
data.set_input("x")
ans = predictor.predict(model, data)
self.assertEqual(len(ans), 2000)
self.assertTrue(isinstance(ans[0], torch.Tensor))
ans = predictor.predict(data)
self.assertTrue(isinstance(ans, defaultdict))
self.assertTrue("predict" in ans)
self.assertTrue(isinstance(ans["predict"], list))
def test_sequence(self):
# test sequence input/output
pass

View File

@ -237,6 +237,32 @@ class TrainerTestGround(unittest.TestCase):
use_tqdm=False,
print_every=2)
def test_case2(self):
# check wrong metrics
data_set = prepare_fake_dataset2('x1', 'x2')
"""
def test_trainer_multiprocess(self):
dataset = prepare_fake_dataset2('x1', 'x2')
dataset.set_input('x1', 'x2', 'y', flag=True)
class Model(nn.Module):
def __init__(self):
super().__init__()
self.fc = nn.Linear(5, 4)
def forward(self, x1, x2, y):
x1 = self.fc(x1)
x2 = self.fc(x2)
x = x1 + x2
loss = F.cross_entropy(x, y)
return {'loss': loss}
model = Model()
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=True,
print_every=2,
num_workers=2,
pin_memory=False,
timeout=0,
)
trainer.train()
"""

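The disabled test above passes DataLoader-style options (`num_workers`, `pin_memory`, `timeout`) to the Trainer. For reference only, here are the same knobs on a plain `torch.utils.data.DataLoader`; this illustrates what the PyTorch arguments mean, not how fastNLP wires them internally.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":            # guard needed when worker processes are spawned
    ds = TensorDataset(torch.randn(64, 5), torch.randint(0, 4, (64,)))
    loader = DataLoader(
        ds,
        batch_size=8,
        num_workers=2,     # worker subprocesses that pre-fetch batches
        pin_memory=False,  # page-locked host memory for faster GPU copies
        timeout=0,         # seconds to wait for a batch from workers; 0 = wait forever
    )
    for batch_x, batch_y in loader:
        pass               # consume one epoch of batches
```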
File diff suppressed because it is too large

View File

@ -1,2 +0,0 @@
迈向充满希望的新世纪——一九九八年新年讲话
附图片1张

View File

@ -0,0 +1,100 @@
1 上海 _ NR NR _ 3 nsubj _ _
2 积极 _ AD AD _ 3 advmod _ _
3 准备 _ VV VV _ 0 root _ _
4 迎接 _ VV VV _ 3 ccomp _ _
5 欧元 _ NN NN _ 6 nn _ _
6 诞生 _ NN NN _ 4 dobj _ _
1 新华社 _ NR NR _ 7 dep _ _
2 上海 _ NR NR _ 7 dep _ _
3 十二月 _ NT NT _ 7 dep _ _
4 三十日 _ NT NT _ 7 dep _ _
5 电 _ NN NN _ 7 dep _ _
6 ( _ PU PU _ 7 punct _ _
7 记者 _ NN NN _ 0 root _ _
8 潘清 _ NR NR _ 7 dep _ _
9 ) _ PU PU _ 7 punct _ _
1 即将 _ AD AD _ 2 advmod _ _
2 诞生 _ VV VV _ 4 rcmod _ _
3 的 _ DEC DEC _ 2 cpm _ _
4 欧元 _ NN NN _ 6 nsubj _ _
5 , _ PU PU _ 6 punct _ _
6 引起 _ VV VV _ 0 root _ _
7 了 _ AS AS _ 6 asp _ _
8 上海 _ NR NR _ 14 nn _ _
9 这 _ DT DT _ 14 det _ _
10 个 _ M M _ 9 clf _ _
11 中国 _ NR NR _ 13 nn _ _
12 金融 _ NN NN _ 13 nn _ _
13 中心 _ NN NN _ 14 nn _ _
14 城市 _ NN NN _ 16 assmod _ _
15 的 _ DEG DEG _ 14 assm _ _
16 关注 _ NN NN _ 6 dobj _ _
17 。 _ PU PU _ 6 punct _ _
1 上海 _ NR NR _ 2 nn _ _
2 银行界 _ NN NN _ 4 nsubj _ _
3 纷纷 _ AD AD _ 4 advmod _ _
4 推出 _ VV VV _ 0 root _ _
5 了 _ AS AS _ 4 asp _ _
6 与 _ P P _ 8 prep _ _
7 之 _ PN PN _ 6 pobj _ _
8 相关 _ VA VA _ 15 rcmod _ _
9 的 _ DEC DEC _ 8 cpm _ _
10 外汇 _ NN NN _ 15 nn _ _
11 业务 _ NN NN _ 15 nn _ _
12 品种 _ NN NN _ 15 conj _ _
13 和 _ CC CC _ 15 cc _ _
14 服务 _ NN NN _ 15 nn _ _
15 举措 _ NN NN _ 4 dobj _ _
16 , _ PU PU _ 4 punct _ _
17 积极 _ AD AD _ 18 advmod _ _
18 准备 _ VV VV _ 4 dep _ _
19 启动 _ VV VV _ 18 ccomp _ _
20 欧元 _ NN NN _ 21 nn _ _
21 业务 _ NN NN _ 19 dobj _ _
22 。 _ PU PU _ 4 punct _ _
1 一些 _ CD CD _ 8 nummod _ _
2 热衷于 _ VV VV _ 8 rcmod _ _
3 个人 _ NN NN _ 5 nn _ _
4 外汇 _ NN NN _ 5 nn _ _
5 交易 _ NN NN _ 2 dobj _ _
6 的 _ DEC DEC _ 2 cpm _ _
7 上海 _ NR NR _ 8 nn _ _
8 市民 _ NN NN _ 13 nsubj _ _
9 , _ PU PU _ 13 punct _ _
10 也 _ AD AD _ 13 advmod _ _
11 对 _ P P _ 13 prep _ _
12 欧元 _ NN NN _ 11 pobj _ _
13 表示 _ VV VV _ 0 root _ _
14 出 _ VV VV _ 13 rcomp _ _
15 极 _ AD AD _ 16 advmod _ _
16 大 _ VA VA _ 18 rcmod _ _
17 的 _ DEC DEC _ 16 cpm _ _
18 兴趣 _ NN NN _ 13 dobj _ _
19 。 _ PU PU _ 13 punct _ _
1 继 _ P P _ 38 prep _ _
2 上海 _ NR NR _ 6 nn _ _
3 大众 _ NR NR _ 6 nn _ _
4 汽车 _ NN NN _ 6 nn _ _
5 有限 _ JJ JJ _ 6 amod _ _
6 公司 _ NN NN _ 13 nsubj _ _
7 十八日 _ NT NT _ 13 tmod _ _
8 在 _ P P _ 13 prep _ _
9 中国 _ NR NR _ 10 nn _ _
10 银行 _ NN NN _ 12 nn _ _
11 上海 _ NR NR _ 12 nn _ _
12 分行 _ NN NN _ 8 pobj _ _
13 开立 _ VV VV _ 19 lccomp _ _
14 上海 _ NR NR _ 16 dep _ _
15 第一 _ OD OD _ 16 ordmod _ _
16 个 _ M M _ 18 clf _ _
17 欧元 _ NN NN _ 18 nn _ _
18 帐户 _ NN NN _ 13 dobj _ _
19 后 _ LC LC _ 1 plmod _ _
20 , _ PU PU _ 38 punct _ _
21 工商 _ NN NN _ 28 nn _ _
22 银行 _ NN NN _ 28 conj _ _
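
The sample above (shown truncated) follows the 10-column CoNLL-X layout: ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, PHEAD, PDEPREL, with blank lines separating sentences. As a reading aid, here is a minimal plain-Python reader sketch; `read_conllx` is illustrative and is not fastNLP's ConllxDataLoader.

```python
# Illustrative reader for 10-column CoNLL-X files such as the sample above.
def read_conllx(path):
    sentences, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            cols = line.split()
            if not cols:               # a blank line ends the current sentence
                if current:
                    sentences.append(current)
                    current = []
                continue
            current.append({
                "id": int(cols[0]),    # token index, 1-based
                "form": cols[1],       # the word itself
                "pos": cols[3],        # POS tag
                "head": int(cols[6]),  # index of the head token, 0 = root
                "deprel": cols[7],     # dependency relation label
            })
    if current:
        sentences.append(current)
    return sentences
```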

View File

@ -1,24 +1,27 @@
import unittest
from fastNLP.io.dataset_loader import Conll2003Loader
from fastNLP.io.dataset_loader import Conll2003Loader, PeopleDailyCorpusLoader, ConllCWSReader, \
ZhConllPOSReader, ConllxDataLoader
class TestDatasetLoader(unittest.TestCase):
def test_case_1(self):
'''
def test_Conll2003Loader(self):
"""
Test the loader of the Conll2003 dataset
'''
"""
dataset_path = "test/data_for_tests/conll_2003_example.txt"
loader = Conll2003Loader()
dataset_2003 = loader.load(dataset_path)
for item in dataset_2003:
len0 = len(item["label0_list"])
len1 = len(item["label1_list"])
len2 = len(item["label2_list"])
lentoken = len(item["token_list"])
self.assertNotEqual(len0, 0)
self.assertEqual(len0, len1)
self.assertEqual(len1, len2)
def test_PeopleDailyCorpusLoader(self):
data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt")
def test_ConllCWSReader(self):
dataset = ConllCWSReader().load("test/data_for_tests/conll_example.txt")
def test_ZhConllPOSReader(self):
dataset = ZhConllPOSReader().load("test/data_for_tests/zh_sample.conllx")
def test_ConllxDataLoader(self):
dataset = ConllxDataLoader().load("test/data_for_tests/zh_sample.conllx")

21
test/models/test_bert.py Normal file
View File

@ -0,0 +1,21 @@
import unittest
import torch
from fastNLP.models.bert import BertModel
class TestBert(unittest.TestCase):
def test_bert_1(self):
# model = BertModel.from_pretrained("/home/zyfeng/data/bert-base-chinese")
model = BertModel(vocab_size=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
for layer in all_encoder_layers:
self.assertEqual(tuple(layer.shape), (2, 3, 768))
self.assertEqual(tuple(pooled_output.shape), (2, 768))

View File

@ -1,8 +1,8 @@
from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
import fastNLP
import unittest
import fastNLP
from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
data_file = """
1 The _ DET DT _ 3 det _ _
2 new _ ADJ JJ _ 3 amod _ _
@ -41,6 +41,7 @@ data_file = """
"""
def init_data():
ds = fastNLP.DataSet()
v = {'word_seq': fastNLP.Vocabulary(),
@ -60,28 +61,31 @@ def init_data():
data.append(line)
for name in ['word_seq', 'pos_seq', 'label_true']:
ds.apply(lambda x: ['<st>']+list(x[name]), new_field_name=name)
ds.apply(lambda x: ['<st>'] + list(x[name]), new_field_name=name)
ds.apply(lambda x: v[name].add_word_lst(x[name]))
for name in ['word_seq', 'pos_seq', 'label_true']:
ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)
ds.apply(lambda x: [0]+list(map(int, x['arc_true'])), new_field_name='arc_true')
ds.apply(lambda x: [0] + list(map(int, x['arc_true'])), new_field_name='arc_true')
ds.apply(lambda x: len(x['word_seq']), new_field_name='seq_lens')
ds.set_input('word_seq', 'pos_seq', 'seq_lens', flag=True)
ds.set_target('arc_true', 'label_true', 'seq_lens', flag=True)
return ds, v['word_seq'], v['pos_seq'], v['label_true']
class TestBiaffineParser(unittest.TestCase):
def test_train(self):
ds, v1, v2, v3 = init_data()
model = BiaffineParser(word_vocab_size=len(v1), word_emb_dim=30,
pos_vocab_size=len(v2), pos_emb_dim=30,
num_label=len(v3), use_var_lstm=True)
num_label=len(v3), encoder='var-lstm')
trainer = fastNLP.Trainer(model=model, train_data=ds, dev_data=ds,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
batch_size=1, validate_every=10,
n_epochs=10, use_cuda=False, use_tqdm=False)
trainer.train(load_best_model=False)
if __name__ == '__main__':
unittest.main()
unittest.main()

View File

@ -1,91 +0,0 @@
import unittest
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Tester
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.models import CNNText
class TestTutorial(unittest.TestCase):
def test_tutorial(self):
# 从csv读取数据到DataSet
sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
sep='\t')
print(len(dataset))
print(dataset[0])
dataset.append(Instance(raw_sentence='fake data', label='0'))
dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# label转int
dataset.apply(lambda x: int(x['label']), new_field_name='label')
# 使用空格分割句子
def split_sent(ins):
return ins['raw_sentence'].split()
dataset.apply(split_sent, new_field_name='words')
# 增加长度信息
dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
print(len(dataset))
print(dataset[0])
# DataSet.drop(func)筛除数据
dataset.drop(lambda x: x['seq_len'] <= 3)
print(len(dataset))
# 设置DataSet中哪些field要转为tensor
# set targetloss或evaluate中的golden计算loss模型评估时使用
dataset.set_target("label")
# set input模型forward时使用
dataset.set_input("words")
# 分出测试集、训练集
test_data, train_data = dataset.split(0.5)
print(len(test_data))
print(len(train_data))
# 构建词表, Vocabulary.add(word)
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
vocab.build_vocab()
# index句子, Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
print(test_data[0])
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
from fastNLP import Trainer
from copy import deepcopy
# 更改DataSet中对应field的名称要以模型的forward等参数名一致
train_data.rename_field('words', 'word_seq') # input field 与 forward 参数一致
train_data.rename_field('label', 'label_seq')
test_data.rename_field('words', 'word_seq')
test_data.rename_field('label', 'label_seq')
# 实例化Trainer传入模型和数据进行训练
copy_model = deepcopy(model)
overfit_trainer = Trainer(train_data=test_data, model=copy_model,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4,
dev_data=test_data, save_path="./save")
overfit_trainer.train()
trainer = Trainer(train_data=train_data, model=model,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4,
dev_data=test_data, save_path="./save")
trainer.train()
print('Train finished!')
# 使用fastNLP的Tester测试脚本
tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)

432
test/test_tutorials.py Normal file
View File

@ -0,0 +1,432 @@
import unittest
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
class TestTutorial(unittest.TestCase):
def test_fastnlp_10min_tutorial(self):
# read the csv file into a DataSet
sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
sep='\t')
print(len(dataset))
print(dataset[0])
print(dataset[-3])
dataset.append(Instance(raw_sentence='fake data', label='0'))
# convert the raw sentences to lowercase
dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
dataset.apply(lambda x: int(x['label']), new_field_name='label')
# split sentences on whitespace
def split_sent(ins):
return ins['raw_sentence'].split()
dataset.apply(split_sent, new_field_name='words')
# add sequence-length information
dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
print(len(dataset))
print(dataset[0])
# DataSet.drop(func) filters out examples
dataset.drop(lambda x: x['seq_len'] <= 3)
print(len(dataset))
# choose which DataSet fields should be converted to tensors
# set target: the golden labels used when computing the loss and evaluating the model
dataset.set_target("label")
# set input: the fields fed to the model's forward()
dataset.set_input("words", "seq_len")
# split into a test set and a training set
test_data, train_data = dataset.split(0.5)
print(len(test_data))
print(len(train_data))
# build the vocabulary with Vocabulary.add(word)
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
vocab.build_vocab()
# index the sentences with Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
print(test_data[0])
# these preprocessing tools can also be used for reinforcement-learning or GAN-style projects
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler
batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
print("batch_x has: ", batch_x)
print("batch_y has: ", batch_y)
break
from fastNLP.models import CNNText
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
from fastNLP import Trainer
from copy import deepcopy
# rename DataSet fields so that they match the parameter names of the model's forward()
train_data.rename_field('words', 'word_seq')  # the input field must match the forward() argument name
train_data.rename_field('label', 'label_seq')
test_data.rename_field('words', 'word_seq')
test_data.rename_field('label', 'label_seq')
loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")
# instantiate a Trainer with the model and data, then train
# first overfit on test_data to make sure the model implementation is correct
copy_model = deepcopy(model)
overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
loss=loss,
metrics=metric,
save_path=None,
batch_size=32,
n_epochs=5)
overfit_trainer.train()
# train on train_data and validate on test_data
trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"),
save_path=None,
batch_size=32,
n_epochs=5)
trainer.train()
print('Train finished!')
# use Tester to evaluate the model on test_data
from fastNLP import Tester
tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)
def test_fastnlp_1min_tutorial(self):
# tutorials/fastnlp_1min_tutorial.ipynb
data_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
print(ds[1])
# convert the raw sentences to lowercase
ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)
def split_sent(ins):
return ins['raw_sentence'].split()
ds.apply(split_sent, new_field_name='words', is_input=True)
# split into training / validation sets
train_data, dev_data = ds.split(0.3)
print("Train size: ", len(train_data))
print("Test size: ", len(dev_data))
from fastNLP import Vocabulary
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
# index the sentences with Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq',
is_input=True)
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq',
is_input=True)
from fastNLP.models import CNNText
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
trainer = Trainer(model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric()
)
trainer.train()
print('Train finished!')
def test_fastnlp_advanced_tutorial(self):
import os
os.chdir("tutorials/fastnlp_advanced_tutorial")
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Trainer
from fastNLP import Tester
# ### Instance
# Instance表示一个样本由一个或者多个field域、属性、特征组成每个field具有自己的名字以及值
# 在初始化Instance的时候可以定义它包含的field使用"field_name=field_value"的写法
# In[2]:
# 组织一个Instance这个Instance由premise、hypothesis、label三个field组成
instance = Instance(premise='an premise example .', hypothesis='an hypothesis example.', label=1)
instance
# In[3]:
data_set = DataSet([instance] * 5)
data_set.append(instance)
data_set[-2:]
# In[4]:
# 如果某一个field的类型与dataset对应的field类型不一样仍可被加入dataset中
instance2 = Instance(premise='the second premise example .', hypothesis='the second hypothesis example.',
label='1')
try:
data_set.append(instance2)
except:
pass
data_set[-2:]
# In[5]:
# 如果某一个field的名字不对则该instance不能被append到dataset中
instance3 = Instance(premises='the third premise example .', hypothesis='the third hypothesis example.',
label=1)
try:
data_set.append(instance3)
except:
print('cannot append instance')
pass
data_set[-2:]
# In[6]:
# 除了文本以外还可以将tensor作为其中一个field的value
import torch
tensor_ins = Instance(image=torch.randn(5, 5), label=0)
ds = DataSet()
ds.append(tensor_ins)
ds
from fastNLP import DataSet
from fastNLP import Instance
# 从csv读取数据到DataSet
# 类csv文件即每一行为一个example的文件都可以使用这种方法进行数据读取
dataset = DataSet.read_csv('tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), sep='\t')
# 查看DataSet的大小
len(dataset)
# In[8]:
# 使用数字索引[k]获取第k个样本
dataset[0]
# In[9]:
# 获取的样本是一个Instance
type(dataset[0])
# In[10]:
# 使用数字索引[a: b]获取第a到第b个样本
dataset[0: 3]
# In[11]:
# 索引也可以是负数
dataset[-1]
data_path = ['premise', 'hypothesis', 'label']
# 读入文件
with open(data_path[0]) as f:
premise = f.readlines()
with open(data_path[1]) as f:
hypothesis = f.readlines()
with open(data_path[2]) as f:
label = f.readlines()
assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)
# 组织DataSet
data_set = DataSet()
for p, h, l in zip(premise, hypothesis, label):
p = p.strip() # 将行末空格去除
h = h.strip() # 将行末空格去除
data_set.append(Instance(premise=p, hypothesis=h, truth=l))
data_set[0]
# ### DataSet的其他操作
# 在构建完毕DataSet后仍然可以对DataSet的内容进行操作函数接口为DataSet.apply()
# In[13]:
# 将premise域的所有文本转成小写
data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
data_set[-2:]
# In[14]:
# label转int
data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
data_set[-2:]
# In[15]:
# 使用空格分割句子
def split_sent(ins):
return ins['premise'].split()
data_set.apply(split_sent, new_field_name='premise')
data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
data_set[-2:]
# In[16]:
# 筛选数据
origin_data_set_len = len(data_set)
data_set.drop(lambda x: len(x['premise']) <= 6)
origin_data_set_len, len(data_set)
# In[17]:
# 增加长度信息
data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
data_set[-1]
# In[18]:
# 设定特征域、标签域
data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
data_set.set_target("truth")
# In[19]:
# 重命名field
data_set.rename_field('truth', 'label')
data_set[-1]
# In[20]:
# 切分训练、验证集、测试集
train_data, vad_data = data_set.split(0.5)
dev_data, test_data = vad_data.split(0.4)
len(train_data), len(dev_data), len(test_data)
# In[21]:
# 深拷贝一个数据集
import copy
train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
del copy
# 初始化词表该词表最大的vocab_size为10000词表中每个词出现的最低频率为2'<unk>'表示未知词语,'<pad>'表示padding词语
# Vocabulary默认初始化参数为max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')
# 构建词表
train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
vocab.build_vocab()
# In[23]:
# 根据词表index句子
train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
train_data[-1], dev_data[-1], test_data[-1]
# 读入vocab文件
with open('vocab.txt') as f:
lines = f.readlines()
vocabs = []
for line in lines:
vocabs.append(line.strip())
# 实例化Vocabulary
vocab_bert = Vocabulary(unknown=None, padding=None)
# 将vocabs列表加入Vocabulary
vocab_bert.add_word_lst(vocabs)
# 构建词表
vocab_bert.build_vocab()
# 更新unknown与padding的token文本
vocab_bert.unknown = '[UNK]'
vocab_bert.padding = '[PAD]'
# In[25]:
# 根据词表index句子
train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']], new_field_name='premise')
train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
new_field_name='hypothesis')
dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']], new_field_name='premise')
dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
train_data_2[-1], dev_data_2[-1]
# step 1加载模型参数非必选
from fastNLP.io.config_io import ConfigSection, ConfigLoader
args = ConfigSection()
ConfigLoader().load_config("./data/config", {"esim_model": args})
args["vocab_size"] = len(vocab)
args.data
# In[27]:
# step 2加载ESIM模型
from fastNLP.models import ESIM
model = ESIM(**args.data)
model
# In[28]:
# 另一个例子加载CNN文本分类模型
from fastNLP.models import CNNText
cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
cnn_text_model
from fastNLP import CrossEntropyLoss
from fastNLP import Adam
from fastNLP import AccuracyMetric
trainer = Trainer(
train_data=train_data,
model=model,
loss=CrossEntropyLoss(pred='pred', target='label'),
metrics=AccuracyMetric(),
n_epochs=3,
batch_size=16,
print_every=-1,
validate_every=-1,
dev_data=dev_data,
use_cuda=False,
optimizer=Adam(lr=1e-3, weight_decay=0),
check_code_level=-1,
metric_key='acc',
use_tqdm=False,
)
trainer.train()
tester = Tester(
data=test_data,
model=model,
metrics=AccuracyMetric(),
batch_size=args["batch_size"],
)
tester.test()
os.chdir("../..")

View File

@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n"
]
},
{
"data": {
"text/plain": [
"DataSet({'raw_sent': this is a bad idea . type=str,\n",
"'label': 0 type=int,\n",
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
"'words': [4, 2, 5, 6, 7, 3] type=list},\n",
"{'raw_sent': it is great . type=str,\n",
"'label': 1 type=int,\n",
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
"'words': [8, 2, 9, 3] type=list})"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 假设有以下的DataSet, 这里只是为了举例所以只选择了两个sample\n",
"import sys\n",
"import os\n",
"sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP')\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"from fastNLP import Vocabulary\n",
"\n",
"dataset = DataSet()\n",
"dataset.append(Instance(raw_sent='This is a bad idea .', label=0))\n",
"dataset.append(Instance(raw_sent='It is great .', label=1))\n",
"\n",
"# 按照fastNLP_10min_tutorial.ipynb的步骤对数据进行一些处理。这里为了演示padding操作把field的名称做了一些改变\n",
"dataset.apply(lambda x:x['raw_sent'].lower(), new_field_name='raw_sent')\n",
"dataset.apply(lambda x:x['raw_sent'].split(), new_field_name='word_str_lst')\n",
"\n",
"# 建立Vocabulary\n",
"word_vocab = Vocabulary()\n",
"dataset.apply(lambda x:word_vocab.update(x['word_str_lst']))\n",
"dataset.apply(lambda x:[word_vocab.to_index(word) for word in x['word_str_lst']], new_field_name='words')\n",
"\n",
"# 检查以下是否得到我们想要的结果了\n",
"dataset[:2]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[4, 2, 5, 6, 7, 3],\n",
" [8, 2, 9, 3, 0, 0]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\"\\n结果中\\n Batch会对元素类型(元素即最内层的数据raw_sent为strword_str_lst为strwords为int, label为int)为int或者float的数据进行默认\\n padding而非int或float的则不进行padding。但若每个Instance中该field为二维数据也不进行padding。因为二维数据的padding涉及到\\n 两个维度的padding不容易自动判断padding的形式。\\n'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将field设置为input或者target\n",
"dataset.set_input('word_str_lst')\n",
"dataset.set_input('words')\n",
"dataset.set_target('label')\n",
"\n",
"# 使用Batch取出batch数据\n",
"from fastNLP.core.batch import Batch\n",
"from fastNLP.core.sampler import RandomSampler\n",
"\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"\"\"\"\"\n",
"结果中\n",
" Batch会对元素类型(元素即最内层的数据raw_sent为strword_str_lst为strwords为int, label为int)为int或者float的数据进行默认\n",
" padding而非int或float的则不进行padding。但若每个Instance中该field为二维数据也不进行padding。因为二维数据的padding涉及到\n",
" 两个维度的padding不容易自动判断padding的形式。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['it', 'is', 'great', '.']),\n",
" list(['this', 'is', 'a', 'bad', 'idea', '.'])], dtype=object), 'words': tensor([[ 8, 2, 9, 3, -100, -100],\n",
" [ 4, 2, 5, 6, 7, 3]])}\n",
"batch_y has: {'label': tensor([1, 0])}\n"
]
}
],
"source": [
"# 所有的pad_val都默认为0如果需要修改某一个field的默认pad值可以通过DataSet.set_pad_val(field_name, pad_val)进行修改\n",
"# 若需要将word的padding修改为-100\n",
"dataset.set_pad_val('words', pad_val=-100)\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"# pad的值修改为-100了"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataSet({'raw_sent': this is a bad idea . type=str,\n",
"'label': 0 type=int,\n",
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
"'words': [4, 2, 5, 6, 7, 3] type=list,\n",
"'char_str_lst': [['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['b', 'a', 'd'], ['i', 'd', 'e', 'a'], ['.']] type=list,\n",
"'chars': [[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]] type=list},\n",
"{'raw_sent': it is great . type=str,\n",
"'label': 1 type=int,\n",
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
"'words': [8, 2, 9, 3] type=list,\n",
"'char_str_lst': [['i', 't'], ['i', 's'], ['g', 'r', 'e', 'a', 't'], ['.']] type=list,\n",
"'chars': [[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]] type=list})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 若需要使用二维padding或指定padding方式可以通过设置该field的padder实现下面以英文的character padding为例。在某些场景下可能想要\n",
"# 使用英文word的character作为特征character的padding为二维paddingfastNLP默认只会进行一维padding。\n",
"\n",
"dataset.apply(lambda x: [[c for c in word] for word in x['word_str_lst']], new_field_name='char_str_lst')\n",
"char_vocab = Vocabulary()\n",
"dataset.apply(lambda x:[char_vocab.update(chars) for chars in x['char_str_lst']])\n",
"dataset.apply(lambda x:[[char_vocab.to_index(c) for c in chars] for chars in x['char_str_lst']],new_field_name='chars')\n",
"dataset[:2]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': array([list([[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]]),\n",
" list([[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]])], dtype=object)}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n 其它field与之前的是相同的。chars因为存在两个维度需要padding不能自动决定padding方式所以直接输出了原始形式。\\n'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 如果不针对二维的character指定padding方法\n",
"dataset.set_input('chars')\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
" \n",
"\"\"\"\n",
" 其它field与之前的是相同的。chars因为存在两个维度需要padding不能自动决定padding方式所以直接输出了原始形式。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
" [ 2, 5, 0, 0],\n",
" [ 3, 0, 0, 0],\n",
" [10, 3, 6, 0],\n",
" [ 2, 6, 7, 3],\n",
" [ 8, 0, 0, 0]],\n",
"\n",
" [[ 2, 4, 0, 0],\n",
" [ 2, 5, 0, 0],\n",
" [11, 12, 7, 3],\n",
" [ 8, 0, 0, 0],\n",
" [ 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0]]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n chars被正确padding了\\n'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 若要使用二维padding需要手动设置padding方式\n",
"from fastNLP.core.fieldarray import EngChar2DPadder\n",
"dataset.set_padder('chars', EngChar2DPadder())\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
" \n",
"\"\"\"\n",
" chars被正确padding了\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'raw_sent': ['this is a bad idea .', 'it is great . '], 'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
" [ 2, 5, 0, 0],\n",
" [ 3, 0, 0, 0],\n",
" [10, 3, 6, 0],\n",
" [ 2, 6, 7, 3],\n",
" [ 8, 0, 0, 0]],\n",
"\n",
" [[ 2, 4, 0, 0],\n",
" [ 2, 5, 0, 0],\n",
" [11, 12, 7, 3],\n",
" [ 8, 0, 0, 0],\n",
" [ 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0]]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n raw_sent正确输出对应内容也进行了pad。\\n'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 如果AutoPad与EngChar2DPadder不能满足需要可以自己实现Padder对象。这里举一个例子比如需要把raw_sentence pad到一样长\n",
"from fastNLP.core.fieldarray import PadderBase\n",
"\n",
"class PadStr(PadderBase):\n",
" def __init__(self, pad_val=' '):\n",
" super().__init__(pad_val=pad_val) #让父类管理pad_val的值这样可以通过DataSet.set_pad_val()修改到该值\n",
" \n",
" def __call__(self, contents, field_name, field_ele_dtype):\n",
" \"\"\"\n",
" 如果以上面的例子举例在raw_sent这个field进行pad时传入的\n",
" contents:\n",
" [\n",
" 'This is a bad idea .',\n",
" 'It is great .'\n",
" ]\n",
" field_name: 'raw_sent'当前field的名称主要用于帮助debug。\n",
" field_ele_dtype: np.str. 这个参数基本都用不上是该field中内部元素的类型\n",
" \"\"\"\n",
" max_len = max([len(str_) for str_ in contents])\n",
" pad_strs = []\n",
" for content in contents:\n",
" pad_strs.append(content + (max_len-len(content))*self.pad_val)\n",
" return pad_strs\n",
"\n",
"dataset.set_input('raw_sent')\n",
"dataset.set_padder('raw_sent', PadStr())\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"\n",
"\"\"\"\n",
" raw_sent正确输出对应内容也进行了pad。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
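
Outside the notebook, the two-level character padding it demonstrates (pad every sentence to the longest sentence, every word to the longest word) can be reproduced with plain numpy. `pad_chars` below is a hypothetical helper, not fastNLP's EngChar2DPadder, which may additionally cap or fix the character dimension.

```python
# Standalone illustration of 2-D (character-level) padding.
import numpy as np

def pad_chars(batch, pad_val=0):
    max_words = max(len(sent) for sent in batch)
    max_chars = max(len(word) for sent in batch for word in sent)
    out = np.full((len(batch), max_words, max_chars), pad_val, dtype=np.int64)
    for i, sent in enumerate(batch):
        for j, word in enumerate(sent):
            out[i, j, :len(word)] = word
    return out

chars = [[[1, 2, 3], [4, 5]], [[6]]]
print(pad_chars(chars).tolist())
# [[[1, 2, 3], [4, 5, 0]], [[6, 0, 0], [0, 0, 0]]]
```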

View File

@ -0,0 +1,97 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## fastNLP测试说明\n",
"### 测试环境\n",
"fastNLP使用pytest对代码进行单元测试测试代码在test文件夹下测试所需数据在test/data_for_tests文件夹下\n",
"测试的步骤主要分为准备数据,执行测试,比对结果,清除环境四步\n",
"测试代码以test_xxx.py命名以DataSet的测试代码为例测试代码文件名为test_dataset.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import unittest # 单元测试需要用到unittest\n",
"\n",
"from fastNLP.core.dataset import DataSet\n",
"from fastNLP.core.fieldarray import FieldArray\n",
"from fastNLP.core.instance import Instance\n",
"# 在这个单元测试文件中需要测试DataSet、FieldArray、以及Instance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class TestDataSet(unittest.TestCase): # 类名字以Test打头继承unittest.TestCase\n",
"\n",
" def test_init_v1(self): # 测试样例1 函数名称以test_打头\n",
" # 该测试样例测试的是DataSet的初始化\n",
" ins = Instance(x=[1, 2, 3, 4], y=[5, 6]) # 准备数据\n",
" ds = DataSet([ins] * 40) # 执行测试调用DataSet的初始化函数\n",
" self.assertTrue(\"x\" in ds.field_arrays and \"y\" in ds.field_arrays) # 比对结果:'x'跟'y'都是ds的field\n",
" self.assertEqual(ds.field_arrays[\"x\"].content, [[1, 2, 3, 4], ] * 40) # 比对结果: field 'x'的内容正确\n",
" self.assertEqual(ds.field_arrays[\"y\"].content, [[5, 6], ] * 40) # 比对结果: field 'y'的内容正确\n",
" \n",
" def test_init_v2(self): # 测试样例2该样例测试DataSet的另一种初始化方式\n",
" ds = DataSet({\"x\": [[1, 2, 3, 4]] * 40, \"y\": [[5, 6]] * 40})\n",
" self.assertTrue(\"x\" in ds.field_arrays and \"y\" in ds.field_arrays)\n",
" self.assertEqual(ds.field_arrays[\"x\"].content, [[1, 2, 3, 4], ] * 40)\n",
" self.assertEqual(ds.field_arrays[\"y\"].content, [[5, 6], ] * 40)\n",
" \n",
" def test_init_assert(self): # 测试样例3该样例测试不规范初始化DataSet时是否会报正确错误\n",
" with self.assertRaises(AssertionError):\n",
" _ = DataSet({\"x\": [[1, 2, 3, 4]] * 40, \"y\": [[5, 6]] * 100})\n",
" with self.assertRaises(AssertionError):\n",
" _ = DataSet([[1, 2, 3, 4]] * 10)\n",
" with self.assertRaises(ValueError):\n",
" _ = DataSet(0.00001)\n",
" \n",
" def test_contains(self): # 测试样例4该样例测试DataSet的contains函数是功能测试\n",
" ds = DataSet({\"x\": [[1, 2, 3, 4]] * 40, \"y\": [[5, 6]] * 40})\n",
" self.assertTrue(\"x\" in ds)\n",
" self.assertTrue(\"y\" in ds)\n",
" self.assertFalse(\"z\" in ds)\n",
" \n",
" # 更多测试样例见test/core/test_dataset.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
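
To run such a test module the way the notebook describes (fastNLP's tests run under pytest), pytest can also be invoked programmatically; a minimal sketch, assuming it is executed from the repository root:

```python
# Equivalent to running `pytest -q test/core/test_dataset.py` from the shell.
import pytest

exit_code = pytest.main(["-q", "test/core/test_dataset.py"])
print("pytest exit code:", exit_code)  # 0 means all collected tests passed
```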