Merge branch 'dev0.5.0' of https://github.com/fastnlp/fastNLP into dev0.5.0

This commit is contained in:
yh 2019-09-25 19:00:29 +08:00
commit 685e9900e5
22 changed files with 265 additions and 1360 deletions

View File

@ -2,6 +2,6 @@ fastNLP.core.callback
=====================
.. automodule:: fastNLP.core.callback
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, CallbackException, EarlyStopError
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError
:inherited-members:

View File

@ -2,6 +2,6 @@ fastNLP.io.loader
=================
.. automodule:: fastNLP.io.loader
:members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CoReferenceLoader
:members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, CoReferenceLoader
:inherited-members:

View File

@ -2,6 +2,6 @@ fastNLP.io.pipe
===============
.. automodule:: fastNLP.io.pipe
:members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CoReferencePipe
:members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, LCQMCPipe, CNXNLIPipe, BQCorpusPipe, RenamePipe, GranularizePipe, MachingTruncatePipe, CoReferencePipe
:inherited-members:

View File

@ -2,7 +2,7 @@ fastNLP.io
==========
.. automodule:: fastNLP.io
:members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver
:members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver
:inherited-members:
Submodules

View File

@ -2,7 +2,7 @@ fastNLP
=======
.. automodule:: fastNLP
:members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger
:members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger
:inherited-members:
Submodules

View File

@ -1,67 +1,132 @@
===================================================
Customizing Your Training Process with Callback
===================================================
During training we often want to use tricks to improve model performance, such as adjusting the learning rate, or we want to print information about the training run.
For this we provide the Callback class, which inserts code into the Trainer to carry out custom operations.
- What is a Callback
- Using Callbacks
- Commonly used Callbacks
- Custom Callbacks
We use the same task as in :doc:`/user/quickstart` for the detailed walkthrough:
given a piece of review text, predict whether its sentiment is positive (label=1), negative (label=0), or neutral (label=2), and use :class:`~fastNLP.Trainer` and :class:`~fastNLP.Tester` for quick training and testing.
For data processing and the choice of Loss and Optimizer, see the other tutorials; here we only add learning-rate decay during training.
---------------------
Building and Using Callbacks
What is a Callback
---------------------
Creating a Callback
We can subclass fastNLP's :class:`~fastNLP.Callback` to define our own Callback.
Here we implement a Callback that decays the learning rate linearly.
A Callback is a module tightly coupled with the Trainer: it lets you add custom operations while the Trainer trains, such as gradient clipping, learning-rate scheduling, or evaluating model performance. A defined Callback is invoked at specific stages of training.
.. code-block:: python
fastNLP ships with many commonly used Callbacks that work out of the box.
import fastNLP
class LRDecay(fastNLP.Callback):
def __init__(self):
super(LRDecay, self).__init__()
self.base_lrs = []
self.delta = []
Using Callbacks
---------------------
def on_train_begin(self):
# Initialization; called only once, when training begins
self.base_lrs = [pg['lr'] for pg in self.optimizer.param_groups]
self.delta = [float(lr) / self.n_epochs for lr in self.base_lrs]
Using Callbacks is simple: collect the callbacks you need in a list and pass it to the Trainer via the ``callbacks`` parameter. The Trainer then automatically executes the operations these Callbacks specify during training.
def on_epoch_end(self):
# Update the learning rate at the end of each epoch
ep = self.epoch
lrs = [lr - d * ep for lr, d in zip(self.base_lrs, self.delta)]
self.change_lr(lrs)
def change_lr(self, lrs):
for pg, lr in zip(self.optimizer.param_groups, lrs):
pg['lr'] = lr
.. code-block:: python
Here, every method of :class:`~fastNLP.Callback` whose name starts with ``on_`` is called at a specific point during :class:`~fastNLP.Trainer` training.
For example, on_train_begin() is called when training starts, and on_epoch_end() is called at the end of each epoch.
For the full list of such methods, see the :class:`~fastNLP.Callback` documentation.
from fastNLP import (Callback, EarlyStopCallback,
Trainer, CrossEntropyLoss, AccuracyMetric)
from fastNLP.models import CNNText
import torch.cuda
In addition, for convenience, attributes of the :class:`~fastNLP.Trainer` can be accessed from inside a :class:`~fastNLP.Callback`, such as optimizer, epoch, and step, which correspond to the optimizer used for training, the current epoch number, and the current total step count.
For the full list of accessible attributes, see the :class:`~fastNLP.Callback` documentation.
# prepare data
def get_data():
from fastNLP.io import ChnSentiCorpPipe as pipe
data = pipe().process_from_file()
print(data)
data.rename_field('chars', 'words')
train_data = data.datasets['train']
dev_data = data.datasets['dev']
test_data = data.datasets['test']
vocab = data.vocabs['words']
tgt_vocab = data.vocabs['target']
return train_data, dev_data, test_data, vocab, tgt_vocab
Using a Callback
Once a :class:`~fastNLP.Callback` is defined, it can be passed to the Trainer's ``callbacks`` parameter and used during actual training.
# prepare model
train_data, dev_data, _, vocab, tgt_vocab = get_data()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))
.. code-block:: python
"""
data preprocessing, model definition, and so on
"""
trainer = fastNLP.Trainer(
model=model, train_data=train_data, dev_data=dev_data,
optimizer=optimizer, metrics=metrics,
batch_size=10, n_epochs=100,
callbacks=[LRDecay()])
# define callback
callbacks=[EarlyStopCallback(5)]
# pass callbacks to Trainer
def train_with_callback(cb_list):
trainer = Trainer(
device=device,
n_epochs=3,
model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric(),
callbacks=cb_list,
check_code_level=-1
)
trainer.train()
train_with_callback(callbacks)
Callbacks in fastNLP
---------------------
fastNLP provides many commonly used Callbacks, such as gradient clipping, early stopping during training, evaluating a dev set, fitlog, and so on. For the specific Callbacks, see fastNLP.core.callback.
.. code-block:: python
from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback
callbacks = [
EarlyStopCallback(5),
GradientClipCallback(clip_value=5, clip_type='value'),
EvaluateCallback(dev_data)
]
train_with_callback(callbacks)
Custom Callbacks
---------------------
As an example, here is a simple Callback that prints the average training loss of each epoch.
1. Create the Callback
To customize a Callback, we implement a class that inherits from fastNLP.Callback. Here we define MyCallBack, which inherits from fastNLP.Callback.
2. Specify the stages at which the Callback is invoked
Every method of Callback whose name starts with `on_` is called at a specific stage of Trainer training. For example, on_train_begin() is called when training starts, and on_epoch_end()
is called at the end of each epoch. For the full list of such methods, see the Callback documentation. Here, MyCallBack uses on_backward_begin(), called once the loss has been computed, to record
the current loss, and on_epoch_end(), called at the end of each epoch, to compute and log that epoch's average loss.
3. Use Callback attributes to access the Trainer's internal state
For convenience, Callback attributes give access to the corresponding information in the Trainer, such as optimizer, epoch, and n_epochs, which correspond to the optimizer used for training,
the current epoch number, and the total number of epochs. For the full list of accessible attributes, see the Callback documentation. Here, to compute the average loss, MyCallBack needs the
number of steps in the current epoch; the self.step attribute gives how many steps have been trained so far.
.. code-block:: python
from fastNLP import Callback
from fastNLP import logger
class MyCallBack(Callback):
"""Print average loss in each epoch"""
def __init__(self):
super().__init__()
self.total_loss = 0
self.start_step = 0
def on_backward_begin(self, loss):
self.total_loss += loss.item()
def on_epoch_end(self):
n_steps = self.step - self.start_step
avg_loss = self.total_loss / n_steps
logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)
self.start_step = self.step
callbacks = [MyCallBack()]
train_with_callback(callbacks)

View File

@ -86,7 +86,7 @@ Vocabulary
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])
For the `no_create_entry` parameter of :class:`~fastNLP.Vocabulary` , it is recommended to set it to True when adding words that come from the test and dev sets, or to pass the dev and test
sets via the `no_create_entry_dataset` parameter. The rationale: when the downstream model uses pretrained embeddings (including glove, word2vec, elmo, and bert) and will fine-tune them,
building the vocabulary only from the training data means that words appearing only in test and dev cannot take full advantage of the information in the pretrained embeddings (because they
are treated as unk), so taking test and dev into account when building the vocabulary yields better final results. Used together with the various Embeddings in fastNLP, this has the following effects:
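
A minimal sketch of this recommendation (assuming ``tr_data``, ``dev_data``, and ``test_data`` are existing :class:`~fastNLP.DataSet` objects with a 'chars' field, as in the snippet above):

.. code-block:: python

    from fastNLP import Vocabulary

    vocab = Vocabulary()
    # Words from the training set create real entries in the vocabulary;
    # words seen only in dev/test are added as no_create_entry, so a pretrained
    # embedding can still supply their vectors instead of mapping them to unk.
    vocab.from_dataset(tr_data, field_name='chars',
                       no_create_entry_dataset=[dev_data, test_data])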

View File

@ -187,7 +187,7 @@ Using BertEmbedding
torch.Size([1, 7, 768])
In the English Bert model, a single English word may be split into multiple subwords; for example, "fairness" is split into ``["fair", "##ness"]``, so one word then corresponds to two outputs.
:class:`~fastNLP.embeddings.BertEmbedding` uses a pooling method to merge the subword representations of a word into a single vector; the method is selected via pool_method.
The supported pool_method values are "first" (use the representation of fair as the representation of fairness), "last" (use the representation of ##ness), "max" (take the element-wise
max over fair and ##ness), and "avg" (take the element-wise average over fair and ##ness).
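
A short sketch of selecting the pooling method (hedged: ``vocab`` is assumed to be built already, and 'en' names a downloadable English Bert model in fastNLP 0.5):

.. code-block:: python

    from fastNLP.embeddings import BertEmbedding

    # pool_method controls how the subword vectors of one word are merged:
    # 'first', 'last', 'max', or 'avg', as described above.
    embed = BertEmbedding(vocab, model_dir_or_name='en', pool_method='avg')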
@ -200,8 +200,8 @@ Using BertEmbedding
torch.Size([1, 5, 768])
In addition, following `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_,
when Bert handles tasks with two sentences (such as matching or Q&A tasks), the sentences are joined by [SEP]; tokens of the first sentence get token_type embedding 0, and
tokens of the second sentence get token_type embedding 1. BertEmbedding automatically recognizes the [SEP] between the sentences and sets the corresponding token_type_id correctly.
.. code-block:: python
@ -230,7 +230,7 @@ Part VI: Using character-level embeddings
-----------------------------------------------------
Besides pretrained embeddings, fastNLP also provides two kinds of character embedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` and
:class:`~fastNLP.embeddings.LSTMCharEmbedding` . Normally, using a character embedding requires splitting words into characters during preprocessing, which
makes preprocessing very tedious. In fastNLP, using a character embedding only requires passing in a :class:`~fastNLP.Vocabulary` , and this
Vocabulary is the same one used by the other Embeddings. Let's look at two examples.
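
For instance, a minimal sketch (assuming a word-level ``vocab`` built as in the earlier sections; the parameter values are illustrative):

.. code-block:: python

    from fastNLP.embeddings import CNNCharEmbedding

    # The word-level Vocabulary is passed directly; words are split into
    # characters inside the embedding, so no extra preprocessing is needed.
    char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)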
@ -298,11 +298,12 @@ Part VII: Stacking multiple embeddings
torch.Size([1, 5, 114])
:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` ,
:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` , and so on can all be concatenated with one another.
:class:`~fastNLP.embeddings.StackEmbedding` is used the same way as any other Embedding: feed in indexes and it returns the corresponding representations. However, the Embeddings being stacked
must use the same :class:`~fastNLP.Vocabulary` , because only with the same :class:`~fastNLP.Vocabulary` is it guaranteed that a given index refers to the same word or character.
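
A minimal sketch of stacking (assuming a shared ``vocab``; the pretrained-embedding name follows fastNLP 0.5's naming convention and is illustrative):

.. code-block:: python

    from fastNLP.embeddings import StaticEmbedding, CNNCharEmbedding, StackEmbedding

    word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
    char_embed = CNNCharEmbedding(vocab, embed_size=64)
    # Both parts index with the same vocab, so a given index always refers to
    # the same word; their outputs are concatenated along the last dimension.
    embed = StackEmbedding([word_embed, char_embed])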
-----------------------------------------------------------
Part VIII: Other Notes on Embedding
-----------------------------------------------------------

View File

@ -20,7 +20,7 @@ Part I: The dataset container DataBundle
to hold multiple datasets ( :class:`~fastNLP.DataSet` ) for the same task together with their vocabularies ( :class:`~fastNLP.Vocabulary` ). Examples below
show how :class:`~fastNLP.io.DataBundle` is used.
In fastNLP, :class:`~fastNLP.io.DataBundle` is mainly used by the various :class:`~fastNLP.io.Loader` and :class:`~fastNLP.io.Pipe` classes.
Below we first introduce :class:`~fastNLP.io.Loader` and :class:`~fastNLP.io.Pipe` .
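
As a preview, a hedged sketch of the typical Loader-then-Pipe flow (ChnSentiCorp is used for illustration; in fastNLP 0.5, load() with no path downloads the dataset):

.. code-block:: python

    from fastNLP.io import ChnSentiCorpLoader, ChnSentiCorpPipe

    # A Loader reads the raw files into a DataBundle of DataSets ...
    data_bundle = ChnSentiCorpLoader().load()
    # ... and a Pipe tokenizes/indexes it, adding Vocabularies to the bundle.
    data_bundle = ChnSentiCorpPipe().process(data_bundle)
    print(data_bundle)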
Part II: Loaders for the various datasets

View File

@ -47,7 +47,7 @@ __all__ = [
"SNLILoader",
"QNLILoader",
"RTELoader",
"XNLILoader",
"CNXNLILoader",
"BQCorpusLoader",
"LCQMCLoader",
@ -70,32 +70,61 @@ __all__ = [
"WeiboNERPipe",
"CWSPipe",
"Pipe",
"CWSPipe",
"YelpFullPipe",
"YelpPolarityPipe",
"SSTPipe",
"SST2Pipe",
"IMDBPipe",
"ChnSentiCorpPipe",
"THUCNewsPipe",
"WeiboSenti100kPipe",
"Conll2003NERPipe",
"OntoNotesNERPipe",
"MsraNERPipe",
"WeiboNERPipe",
"PeopleDailyPipe",
"Conll2003Pipe",
"MatchingBertPipe",
"RTEBertPipe",
"SNLIBertPipe",
"QuoraBertPipe",
"QNLIBertPipe",
"MNLIBertPipe",
"CNXNLIBertPipe",
"BQCorpusBertPipe",
"LCQMCBertPipe",
"MatchingPipe",
"RTEPipe",
"SNLIPipe",
"QuoraPipe",
"QNLIPipe",
"MNLIPipe",
"LCQMCPipe",
"CNXNLIPipe",
"BQCorpusPipe",
"RenamePipe",
"GranularizePipe",
"MachingTruncatePipe",
'ModelLoader',
'ModelSaver',
]
from .embed_loader import EmbedLoader
from .data_bundle import DataBundle
from .model_io import ModelLoader, ModelSaver
from .loader import *
from .pipe import *
import sys
from .data_bundle import DataBundle
from .embed_loader import EmbedLoader
from .loader import *
from .model_io import ModelLoader, ModelSaver
from .pipe import *
from ..doc_utils import doc_process
doc_process(sys.modules[__name__])

View File

@ -54,7 +54,9 @@ __all__ = [
'SSTLoader',
'SST2Loader',
"ChnSentiCorpLoader",
"THUCNewsLoader",
"WeiboSenti100kLoader",
'ConllLoader',
'Conll2003Loader',
'Conll2003NERLoader',
@ -63,26 +65,31 @@ __all__ = [
"MsraNERLoader",
"PeopleDailyNERLoader",
"WeiboNERLoader",
'CSVLoader',
'JsonLoader',
'CWSLoader',
'MNLILoader',
"QuoraLoader",
"SNLILoader",
"QNLILoader",
"RTELoader",
"CNXNLILoader",
"BQCorpusLoader",
"LCQMCLoader",
"CoReferenceLoader"
]
from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader
from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, \
ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader
from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader
from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
from .coreference import CoReferenceLoader
from .csv import CSVLoader
from .cws import CWSLoader
from .json import JsonLoader
from .loader import Loader
from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader
from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
from .coreference import CoReferenceLoader
from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, \
LCQMCLoader

View File

@ -409,6 +409,7 @@ class THUCNewsLoader(Loader):
.. csv-table::
:header: "raw_words", "target"
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
"...", "..."
@ -446,13 +447,18 @@ class WeiboSenti100kLoader(Loader):
Alias:
Dataset description: Weibo sentiment classification (binary).
The raw data looks like:

.. code-block:: text

    label text
    0 六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]
    1 听过一场笑死了昂一听茄子脱口秀从此节操是路人[嘻嘻] //@中国梦网官微:@Pencil彭赛 @茄子脱口秀 [圣诞帽][圣诞树][平安果]

After loading, the Dataset has the following structure:
.. csv-table::
:header: "raw_chars", "target"
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0"
"...", "..."

View File

@ -15,14 +15,14 @@ import os
import warnings
from typing import Union, Dict
from .csv import CSVLoader
from .json import JsonLoader
from .loader import Loader
from .. import DataBundle
from ..utils import check_loader_paths
from ...core.const import Const
from ...core.dataset import DataSet
from ...core.instance import Instance
from .csv import CSVLoader
from ..utils import check_loader_paths
class MNLILoader(Loader):
@ -348,8 +348,9 @@ class CNXNLILoader(Loader):
.. csv-table::
:header: "raw_chars1", "raw_chars2", "target"
"从概念上看,奶油收入有两个基本方面产品和地理.", "产品和地理是什么使奶油抹霜工作.", "1"
""...", "...", "..."
"...", "...", "..."
"""
@ -412,6 +413,7 @@ class BQCorpusLoader(Loader):
.. csv-table::
:header: "raw_chars1", "raw_chars2", "target"
"不是邀请的如何贷款?", "我不是你们邀请的客人可以贷款吗?", "1"
"如何满足微粒银行的审核", "建设银行有微粒贷的资格吗", "0"
"...", "...", "..."
@ -448,20 +450,26 @@ class BQCorpusLoader(Loader):
class LCQMCLoader(Loader):
"""
Alias:
r"""
Dataset description: sentence-pair matching (question matching).
The raw data looks like:

.. code-block:: text

    '喜欢打篮球的男生喜欢什么样的女生\t爱打篮球的男生喜欢什么样的女生\t1\n'
    '晚上睡觉带着耳机听音乐有什么害处吗?\t孕妇可以戴耳机听音乐吗?\t0\n'

After loading, the Dataset has the following structure:
.. csv-table::
:header: "raw_chars1", "raw_chars2", "target"
"喜欢打篮球的男生喜欢什么样的女生?", "爱打篮球的男生喜欢什么样的女生?", "1"
"晚上睡觉带着耳机听音乐有什么害处吗?", "妇可以戴耳机听音乐吗?", "0"
""...", "...", "..."
"...", "...", "..."
"""
def __init__(self):

View File

@ -9,9 +9,9 @@ A Pipe processes the data read by a Loader; every Pipe contains ``proce
"""
__all__ = [
"Pipe",
"CWSPipe",
"YelpFullPipe",
"YelpPolarityPipe",
"SSTPipe",
@ -20,35 +20,46 @@ __all__ = [
"ChnSentiCorpPipe",
"THUCNewsPipe",
"WeiboSenti100kPipe",
"Conll2003NERPipe",
"OntoNotesNERPipe",
"MsraNERPipe",
"WeiboNERPipe",
"PeopleDailyPipe",
"Conll2003Pipe",
"MatchingBertPipe",
"RTEBertPipe",
"SNLIBertPipe",
"QuoraBertPipe",
"QNLIBertPipe",
"MNLIBertPipe",
"CNXNLIBertPipe",
"BQCorpusBertPipe",
"LCQMCBertPipe",
"MatchingPipe",
"RTEPipe",
"SNLIPipe",
"QuoraPipe",
"QNLIPipe",
"MNLIPipe",
"LCQMCPipe",
"CNXNLIPipe",
"BQCorpusPipe",
"RenamePipe",
"GranularizePipe",
"MachingTruncatePipe",
"CoReferencePipe"
]
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, \
WeiboSenti100kPipe
from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
from .pipe import Pipe
from .conll import Conll2003Pipe
from .cws import CWSPipe
from .coreference import CoReferencePipe
from .cws import CWSPipe
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CNXNLIBertPipe, CNXNLIPipe, BQCorpusBertPipe, \
LCQMCPipe, BQCorpusPipe, LCQMCBertPipe, RenamePipe, GranularizePipe, MachingTruncatePipe
from .pipe import Pipe

View File

@ -21,11 +21,11 @@ from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_insta
from ..data_bundle import DataBundle
from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader
from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader
from ...core._logger import logger
from ...core.const import Const
from ...core.dataset import DataSet
from ...core.instance import Instance
from ...core.vocabulary import Vocabulary
from ...core._logger import logger
nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')
@ -718,6 +718,7 @@ class THUCNewsPipe(_CLSPipe):
.. csv-table::
:header: "raw_words", "target"
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
"...", "..."
@ -826,6 +827,7 @@ class WeiboSenti100kPipe(_CLSPipe):
.. csv-table::
:header: "raw_chars", "target"
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0"
"...", "..."

View File

@ -16,20 +16,24 @@ __all__ = [
"QuoraPipe",
"QNLIPipe",
"MNLIPipe",
"LCQMCPipe",
"CNXNLIPipe",
"BQCorpusPipe",
"LCQMCPipe",
"RenamePipe",
"GranularizePipe",
"MachingTruncatePipe",
]
import warnings
from .pipe import Pipe
from .utils import get_tokenizer
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
from ..data_bundle import DataBundle
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, \
LCQMCLoader
from ...core._logger import logger
from ...core.const import Const
from ...core.vocabulary import Vocabulary
from ...core._logger import logger
from ..data_bundle import DataBundle
class MatchingBertPipe(Pipe):
@ -145,7 +149,7 @@ class MatchingBertPipe(Pipe):
f"data set but not in train data set!."
warnings.warn(warn_msg)
logger.warning(warn_msg)
has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if
dataset.has_field(Const.TARGET)]
target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET)
@ -294,7 +298,7 @@ class MatchingPipe(Pipe):
f"data set but not in train data set!."
warnings.warn(warn_msg)
logger.warning(warn_msg)
has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if
dataset.has_field(Const.TARGET)]
target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET)
@ -345,8 +349,9 @@ class MNLIPipe(MatchingPipe):
data_bundle = MNLILoader().load(paths)
return self.process(data_bundle)
class LCQMCPipe(MatchingPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = LCQMCLoader().load(paths)
data_bundle = RenamePipe().process(data_bundle)
data_bundle = self.process(data_bundle)
@ -358,14 +363,14 @@ class CNXNLIPipe(MatchingPipe):
def process_from_file(self, paths=None):
data_bundle = CNXNLILoader().load(paths)
data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
data_bundle = RenamePipe().process(data_bundle)  # rename the fields of the Chinese data
data_bundle = self.process(data_bundle)
data_bundle = RenamePipe().process(data_bundle)
return data_bundle
class BQCorpusPipe(MatchingPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = BQCorpusLoader().load(paths)
data_bundle = RenamePipe().process(data_bundle)
data_bundle = self.process(data_bundle)
@ -374,12 +379,12 @@ class BQCorpusPipe(MatchingPipe):
class RenamePipe(Pipe):
def __init__(self, task = 'cn-nli'):
def __init__(self, task='cn-nli'):
super().__init__()
self.task = task
def process(self, data_bundle: DataBundle): # rename field name for Chinese Matching dataset
if(self.task == 'cn-nli'):
if (self.task == 'cn-nli'):
for name, dataset in data_bundle.datasets.items():
if (dataset.has_field(Const.RAW_CHARS(0))):
dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS
@ -392,12 +397,12 @@ class RenamePipe(Pipe):
else:
raise RuntimeError(
"field name of dataset is not qualified. It should have ether RAW_CHARS or WORDS")
elif(self.task == 'cn-nli-bert'):
elif (self.task == 'cn-nli-bert'):
for name, dataset in data_bundle.datasets.items():
if (dataset.has_field(Const.RAW_CHARS(0))):
dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS
dataset.rename_field(Const.RAW_CHARS(1), Const.RAW_WORDS(1))
elif(dataset.has_field(Const.RAW_WORDS(0))):
elif (dataset.has_field(Const.RAW_WORDS(0))):
dataset.rename_field(Const.RAW_WORDS(0), Const.RAW_CHARS(0))
dataset.rename_field(Const.RAW_WORDS(1), Const.RAW_CHARS(1))
dataset.rename_field(Const.INPUT, Const.CHAR_INPUT)
@ -409,15 +414,15 @@ class RenamePipe(Pipe):
raise RuntimeError(
"Only support task='cn-nli' or 'cn-nli-bert'"
)
return data_bundle
class GranularizePipe(Pipe):
def __init__(self, task = None):
def __init__(self, task=None):
super().__init__()
self.task = task
def _granularize(self, data_bundle, tag_map):
"""
This function converts the contents of the 'target' column in data_bundle.
@ -434,21 +439,22 @@ class GranularizePipe(Pipe):
dataset.drop(lambda ins: ins[Const.TARGET] == -100)
data_bundle.set_dataset(dataset, name)
return data_bundle
def process(self, data_bundle: DataBundle):
task_tag_dict = {
'XNLI':{'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2}
'XNLI': {'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2}
}
if self.task in task_tag_dict:
data_bundle = self._granularize(data_bundle=data_bundle, tag_map= task_tag_dict[self.task])
data_bundle = self._granularize(data_bundle=data_bundle, tag_map=task_tag_dict[self.task])
else:
raise RuntimeError(f"Only support {task_tag_dict.keys()} task_tag_map.")
return data_bundle
class MachingTruncatePipe(Pipe): #truncate sentence for bert, modify seq_len
class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len
def __init__(self):
super().__init__()
def process(self, data_bundle: DataBundle):
for name, dataset in data_bundle.datasets.items():
pass
@ -456,7 +462,7 @@ class MachingTruncatePipe(Pipe): #truncate sentence for bert, modify seq_len
class LCQMCBertPipe(MatchingBertPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = LCQMCLoader().load(paths)
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
data_bundle = self.process(data_bundle)
@ -465,7 +471,7 @@ class LCQMCBertPipe(MatchingBertPipe):
class BQCorpusBertPipe(MatchingBertPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = BQCorpusLoader().load(paths)
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
data_bundle = self.process(data_bundle)
@ -474,7 +480,7 @@ class BQCorpusBertPipe(MatchingBertPipe):
class CNXNLIBertPipe(MatchingBertPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = CNXNLILoader().load(paths)
data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)

View File

@ -152,8 +152,7 @@ class BiAttention(nn.Module):
:param torch.Tensor premise_mask: [batch_size, a_seq_len]
:param torch.Tensor hypothesis_batch: [batch_size, b_seq_len, hidden_size]
:param torch.Tensor hypothesis_mask: [batch_size, b_seq_len]
:return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size]
torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size]
:return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size] torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size]
"""
similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1)
.contiguous())

View File

@ -1,280 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# 快速入门"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.io import CSVLoader\n",
"\n",
"loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n",
"dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'] type=list}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将所有字母转为小写, 并所有句子变成单词序列\n",
"dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n",
"dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"# 使用Vocabulary类统计单词并将单词序列转化为数字序列\n",
"vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n",
"vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n",
"'target': 1 type=int}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将label转为整数并设置为 target\n",
"dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CNNText(\n",
" (embed): Embedding(\n",
" 177, 50\n",
" (dropout): Dropout(p=0.0)\n",
" )\n",
" (conv_pool): ConvMaxpool(\n",
" (convs): ModuleList(\n",
" (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n",
" (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n",
" (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n",
" )\n",
" )\n",
" (dropout): Dropout(p=0.1)\n",
" (fc): Linear(in_features=12, out_features=5, bias=True)\n",
")"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.models import CNNText\n",
"model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(62, 15)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 分割训练集/验证集\n",
"train_data, dev_data = dataset.split(0.2)\n",
"len(train_data), len(dev_data)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-09-10-59-39\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.333333\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.533333\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.533333\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.533333\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.6\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.8\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.8\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.733333\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.733333\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.733333\n",
"\n",
"\n",
"In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.8}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 12,\n",
" 'seconds': 0.22}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric\n",
"\n",
"# 定义trainer并进行训练\n",
"trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n",
" loss=CrossEntropyLoss(), metrics=AccuracyMetric())\n",
"trainer.train()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -1,77 +0,0 @@
A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 1
This quiet , introspective and entertaining independent is worth seeking . 4
Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . 1
A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . 3
Aggressive self-glorification and a manipulative whitewash . 1
A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . 4
Narratively , Trouble Every Day is a plodding mess . 1
The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations 3
But it does n't leave you with much . 1
You could hate it for the same reason . 1
There 's little to recommend Snow Dogs , unless one considers cliched dialogue and perverse escapism a source of high hilarity . 1
Kung Pow is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that . 1
The performances are an absolute joy . 4
Fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense . 3
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2
While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances . 1
The latest vapid actor 's exercise to appropriate the structure of Arthur Schnitzler 's Reigen . 1
More vaudeville show than well-constructed narrative , but on those terms it 's inoffensive and actually rather sweet . 2
Nothing more than a run-of-the-mill action flick . 2
Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire . 0
Ice Age is the first computer-generated feature cartoon to feel like other movies , and that makes for some glacial pacing early on . 2
There 's very little sense to what 's going on here , but the makers serve up the cliches with considerable dash . 2
Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different . 2
They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid . 1
It almost feels as if the movie is more interested in entertaining itself than in amusing us . 1
The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2

View File

@ -1,831 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# 详细指南"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据读入"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.io import CSVLoader\n",
"\n",
"loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n",
"dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n",
"dataset[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instance表示一个样本由一个或多个field属性特征组成每个field有名字和值。\n",
"\n",
"在初始化Instance时即可定义它包含的域使用 \"field_name=field_value\"的写法。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': fake data type=str,\n",
"'label': 0 type=str}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Instance\n",
"\n",
"dataset.append(Instance(raw_sentence='fake data', label='0'))\n",
"dataset[-1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据处理"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n",
"'target': 1 type=int}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"# 将所有字母转为小写, 并所有句子变成单词序列\n",
"dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n",
"dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')\n",
"\n",
"# 使用Vocabulary类统计单词并将单词序列转化为数字序列\n",
"vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n",
"vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n",
"\n",
"# 将label转为整数\n",
"dataset.apply(lambda x: int(x['label']), new_field_name='target')\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n",
"'target': 1 type=int,\n",
"'seq_len': 37 type=int}\n"
]
}
],
"source": [
"# 增加长度信息\n",
"dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')\n",
"print(dataset[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用内置模块CNNText\n",
"设置为符合内置模块的名称"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CNNText(\n",
" (embed): Embedding(\n",
" 177, 50\n",
" (dropout): Dropout(p=0.0)\n",
" )\n",
" (conv_pool): ConvMaxpool(\n",
" (convs): ModuleList(\n",
" (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n",
" (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n",
" (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n",
" )\n",
" )\n",
" (dropout): Dropout(p=0.1)\n",
" (fc): Linear(in_features=12, out_features=5, bias=True)\n",
")"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.models import CNNText\n",
"\n",
"model_cnn = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"model_cnn"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"我们在使用内置模块的时候,还应该使用应该注意把 field 设定成符合内置模型输入输出的名字。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"words\n",
"seq_len\n",
"target\n"
]
}
],
"source": [
"from fastNLP import Const\n",
"\n",
"dataset.rename_field('words', Const.INPUT)\n",
"dataset.rename_field('seq_len', Const.INPUT_LEN)\n",
"dataset.rename_field('target', Const.TARGET)\n",
"\n",
"dataset.set_input(Const.INPUT, Const.INPUT_LEN)\n",
"dataset.set_target(Const.TARGET)\n",
"\n",
"print(Const.INPUT)\n",
"print(Const.INPUT_LEN)\n",
"print(Const.TARGET)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 分割训练集/验证集/测试集"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(64, 7, 7)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dev_data, test_data = dataset.split(0.1)\n",
"train_data, dev_data = train_dev_data.split(0.1)\n",
"len(train_data), len(dev_data), len(test_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 训练(model_cnn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### loss\n",
"训练模型需要提供一个损失函数\n",
"\n",
"下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n",
"\n",
"pred参数对应的是模型的forward返回的dict的一个key的名字这里是\"output\"。\n",
"\n",
"target参数对应的是dataset作为标签的field的名字这里是\"label_seq\"。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import CrossEntropyLoss\n",
"\n",
"# loss = CrossEntropyLoss()\n",
"# 等价于\n",
"loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
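{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Const members are plain field-name strings, so the call above can also be spelled out literally. A sketch (assuming Const.OUTPUT == 'pred' and Const.TARGET == 'target', matching the \"pred\" key our custom model returns later):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch: equivalent to the cell above, with the assumed literal field names\n",
"loss = CrossEntropyLoss(pred='pred', target='target')"
]
},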
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Metric\n",
"定义评价指标\n",
"\n",
"这里使用准确率。参数的“命名规则”跟上面类似。\n",
"\n",
"pred参数对应的是模型的predict方法返回的dict的一个key的名字这里是\"predict\"。\n",
"\n",
"target参数对应的是dataset作为标签的field的名字这里是\"label_seq\"。"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"\n",
"# metrics=AccuracyMetric()\n",
"# 等价于\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-12-21-38-34\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n",
"\n",
"\n",
"In Epoch:8/Step:16, got best dev performance:AccuracyMetric: acc=0.857143\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n",
" 'best_epoch': 8,\n",
" 'best_step': 16,\n",
" 'seconds': 0.21}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"\n",
"trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n",
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 测试(model_cnn)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tester] \n",
"AccuracyMetric: acc=0.857143\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.857143}}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"\n",
"tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 编写自己的模型\n",
"\n",
"完全支持 pytorch 的模型,与 pytorch 唯一不同的是返回结果是一个字典,字典中至少需要包含 \"pred\" 这个字段"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"\n",
"class LSTMText(nn.Module):\n",
" def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):\n",
" super().__init__()\n",
"\n",
" self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
" self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout)\n",
" self.fc = nn.Linear(hidden_dim * 2, output_dim)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, words):\n",
" # (input) words : (batch_size, seq_len)\n",
" words = words.permute(1,0)\n",
" # words : (seq_len, batch_size)\n",
"\n",
" embedded = self.dropout(self.embedding(words))\n",
" # embedded : (seq_len, batch_size, embedding_dim)\n",
" output, (hidden, cell) = self.lstm(embedded)\n",
" # output: (seq_len, batch_size, hidden_dim * 2)\n",
" # hidden: (num_layers * 2, batch_size, hidden_dim)\n",
" # cell: (num_layers * 2, batch_size, hidden_dim)\n",
"\n",
" hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n",
" hidden = self.dropout(hidden)\n",
" # hidden: (batch_size, hidden_dim * 2)\n",
"\n",
" pred = self.fc(hidden.squeeze(0))\n",
" # result: (batch_size, output_dim)\n",
" return {\"pred\":pred}"
]
},
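{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Metric section above mentioned a predict method: if the model defines one, the Tester calls it instead of forward. A minimal sketch of an optional predict for LSTMText (an assumption for illustration; it reuses forward and returns class indices under the same \"pred\" key):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch: an optional predict() that evaluation can use instead of forward;\n",
"# it reuses forward and returns argmax class indices under the same 'pred' key\n",
"def predict(self, words):\n",
"    output = self.forward(words)\n",
"    _, pred = output['pred'].max(dim=1)\n",
"    return {'pred': pred}\n",
"\n",
"LSTMText.predict = predict  # attached here for illustration; normally defined inside the class"
]
},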
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-12-21-38-36\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.714286\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n",
"\n",
"\n",
"In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.857143\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 12,\n",
" 'seconds': 2.15}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_lstm = LSTMText(len(vocab),50,5)\n",
"trainer = Trainer(model=model_lstm, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tester] \n",
"AccuracyMetric: acc=0.857143\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.857143}}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 使用 Batch编写自己的训练过程"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 0 Avg Loss: 3.11 18ms\n",
"Epoch 1 Avg Loss: 2.88 30ms\n",
"Epoch 2 Avg Loss: 2.69 42ms\n",
"Epoch 3 Avg Loss: 2.47 54ms\n",
"Epoch 4 Avg Loss: 2.38 67ms\n",
"Epoch 5 Avg Loss: 2.10 78ms\n",
"Epoch 6 Avg Loss: 2.06 91ms\n",
"Epoch 7 Avg Loss: 1.92 103ms\n",
"Epoch 8 Avg Loss: 1.91 114ms\n",
"Epoch 9 Avg Loss: 1.76 126ms\n",
"[tester] \n",
"AccuracyMetric: acc=0.571429\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.571429}}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import Batch\n",
"import torch\n",
"import time\n",
"\n",
"model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"\n",
"def train(epoch, data):\n",
" optim = torch.optim.Adam(model.parameters(), lr=0.001)\n",
" lossfunc = torch.nn.CrossEntropyLoss()\n",
" batch_size = 32\n",
"\n",
" # 定义一个Batch传入DataSet规定batch_size和去batch的规则。\n",
" # 顺序Sequential随机Random相似长度组成一个batchBucket\n",
" train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n",
" train_batch = Batch(batch_size=batch_size, dataset=data, sampler=train_sampler)\n",
" \n",
" start_time = time.time()\n",
" for i in range(epoch):\n",
" loss_list = []\n",
" for batch_x, batch_y in train_batch:\n",
" optim.zero_grad()\n",
" output = model(batch_x['words'])\n",
" loss = lossfunc(output['pred'], batch_y['target'])\n",
" loss.backward()\n",
" optim.step()\n",
" loss_list.append(loss.item())\n",
" print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n",
" print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n",
" loss_list.clear()\n",
" \n",
"train(10, train_data)\n",
"tester = Tester(test_data, model, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
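{
"cell_type": "markdown",
"metadata": {},
"source": [
"The sampler is pluggable: the comment in train() names Sequential, Random, and Bucket strategies. A minimal sketch of random batching (assuming RandomSampler is importable from the top-level package, like BucketSampler):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import RandomSampler  # assumed import path, mirroring BucketSampler\n",
"\n",
"# a sketch: same Batch construction as in train(), but with shuffled batches\n",
"random_batch = Batch(batch_size=32, dataset=train_data, sampler=RandomSampler())\n",
"for batch_x, batch_y in random_batch:\n",
"    print(batch_x['words'].shape, batch_y['target'].shape)\n",
"    break  # just inspect the first batch"
]
},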
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 使用 Callback 实现自己想要的效果"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-12-21-38-40\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Sum Time: 51ms\n",
"\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Sum Time: 69ms\n",
"\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Sum Time: 91ms\n",
"\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 107ms\n",
"\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 125ms\n",
"\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 142ms\n",
"\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 158ms\n",
"\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 176ms\n",
"\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.714286\n",
"\n",
"Sum Time: 193ms\n",
"\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Sum Time: 212ms\n",
"\n",
"\n",
"\n",
"In Epoch:10/Step:20, got best dev performance:AccuracyMetric: acc=0.857143\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n",
" 'best_epoch': 10,\n",
" 'best_step': 20,\n",
" 'seconds': 0.2}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Callback\n",
"\n",
"start_time = time.time()\n",
"\n",
"class MyCallback(Callback):\n",
" def on_epoch_end(self):\n",
" print('Sum Time: {:d}ms\\n\\n'.format(round((time.time()-start_time)*1000)))\n",
" \n",
"\n",
"model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n",
" loss=CrossEntropyLoss(), metrics=AccuracyMetric(), callbacks=[MyCallback()])\n",
"trainer.train()"
]
},
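{
"cell_type": "markdown",
"metadata": {},
"source": [
"Callback exposes hooks at other points of the training loop besides on_epoch_end. A minimal sketch that could be passed as callbacks=[TimerCallback()] (the on_train_begin/on_train_end hooks and the self.epoch / self.n_epochs attributes are assumed from the Callback base class):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"class TimerCallback(Callback):\n",
"    # a sketch: the hook names and the self.epoch / self.n_epochs attributes\n",
"    # are assumed to be provided by the Callback base class\n",
"    def on_train_begin(self):\n",
"        self._start = time.time()\n",
"\n",
"    def on_epoch_end(self):\n",
"        print('Epoch {}/{}: {:d}ms elapsed'.format(\n",
"            self.epoch, self.n_epochs, round((time.time() - self._start) * 1000)))\n",
"\n",
"    def on_train_end(self):\n",
"        print('Total training time: {:d}ms'.format(round((time.time() - self._start) * 1000)))"
]
},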
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@ -1,41 +0,0 @@
{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"##1. 命名实体识别(name entity recognition, NER)\n",
"命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。\n",
"如下面的例子中\n",
"\n",
"我来自复旦大学。\n",
"\n",
"其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题现在一般被转换为了\n",
"在本tutorial中我们将通过fastNLP尝试写出一个\n",
"\n",
"##2. 数据\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}