diff --git a/docs/source/fastNLP.core.callback.rst b/docs/source/fastNLP.core.callback.rst index 75b5d0cd..5a508e03 100644 --- a/docs/source/fastNLP.core.callback.rst +++ b/docs/source/fastNLP.core.callback.rst @@ -2,6 +2,6 @@ fastNLP.core.callback ===================== .. automodule:: fastNLP.core.callback - :members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, CallbackException, EarlyStopError + :members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError :inherited-members: diff --git a/docs/source/fastNLP.io.loader.rst b/docs/source/fastNLP.io.loader.rst index f6c72be8..c6d0dc55 100644 --- a/docs/source/fastNLP.io.loader.rst +++ b/docs/source/fastNLP.io.loader.rst @@ -2,6 +2,6 @@ fastNLP.io.loader ================= .. 
automodule:: fastNLP.io.loader - :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CoReferenceLoader + :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, CoReferenceLoader :inherited-members: diff --git a/docs/source/fastNLP.io.pipe.rst b/docs/source/fastNLP.io.pipe.rst index ee389e8c..178d35a9 100644 --- a/docs/source/fastNLP.io.pipe.rst +++ b/docs/source/fastNLP.io.pipe.rst @@ -2,6 +2,6 @@ fastNLP.io.pipe =============== .. 
automodule:: fastNLP.io.pipe - :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CoReferencePipe + :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, LCQMCPipe, CNXNLIPipe, BQCorpusPipe, RenamePipe, GranularizePipe, MachingTruncatePipe, CoReferencePipe :inherited-members: diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index 7118039d..54373df4 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -2,7 +2,7 @@ fastNLP.io ========== .. 
automodule:: fastNLP.io - :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver + :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver :inherited-members: 子模块 diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index e92807d7..097ad0b2 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -2,7 +2,7 @@ fastNLP ======= .. 
automodule:: fastNLP - :members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger + :members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger :inherited-members: 子模块 diff --git a/docs/source/tutorials/tutorial_10_callback.rst b/docs/source/tutorials/tutorial_10_callback.rst index dc50aca5..133ca695 100644 --- a/docs/source/tutorials/tutorial_10_callback.rst +++ b/docs/source/tutorials/tutorial_10_callback.rst @@ -1,67 +1,132 @@ =================================================== -使用Callback自定义你的训练过程 +使用 Callback 自定义你的训练过程 =================================================== -在训练时,我们常常要使用trick来提高模型的性能(如调节学习率),或者要打印训练中的信息。 -这里我们提供Callback类,在Trainer中插入代码,完成一些自定义的操作。 +- 什么是 Callback +- 使用 Callback +- 一些常用的 Callback +- 自定义实现 Callback -我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。 -给出一段评价性文字,预测其情感倾向是积极(label=1)、消极(label=0)还是中性(label=2),使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。 
-关于数据处理,Loss和Optimizer的选择可以看其他教程,这里仅在训练时加入学习率衰减。 ---------------------- -Callback的构建和使用 +什么是Callback --------------------- -创建Callback - 我们可以继承fastNLP :class:`~fastNLP.Callback` 类来定义自己的Callback。 - 这里我们实现一个让学习率线性衰减的Callback。 +Callback 是与 Trainer 紧密结合的模块,利用 Callback 可以在 Trainer 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。 - .. code-block:: python +fastNLP 中提供了很多常用的 Callback ,开箱即用。 - import fastNLP - class LRDecay(fastNLP.Callback): - def __init__(self): - super(LRDecay, self).__init__() - self.base_lrs = [] - self.delta = [] +使用 Callback +--------------------- - def on_train_begin(self): - # 初始化,仅训练开始时调用 - self.base_lrs = [pg['lr'] for pg in self.optimizer.param_groups] - self.delta = [float(lr) / self.n_epochs for lr in self.base_lrs] +使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。 - def on_epoch_end(self): - # 每个epoch结束时,更新学习率 - ep = self.epoch - lrs = [lr - d * ep for lr, d in zip(self.base_lrs, self.delta)] - self.change_lr(lrs) - def change_lr(self, lrs): - for pg, lr in zip(self.optimizer.param_groups, lrs): - pg['lr'] = lr +.. 
code-block:: python - 这里,:class:`~fastNLP.Callback` 中所有以 ``on_`` 开头的类方法会在 :class:`~fastNLP.Trainer` 的训练中在特定时间调用。 - 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。 - 具体有哪些类方法,参见文档 :class:`~fastNLP.Callback` 。 + from fastNLP import (Callback, EarlyStopCallback, + Trainer, CrossEntropyLoss, AccuracyMetric) + from fastNLP.models import CNNText + import torch.cuda - 另外,为了使用方便,可以在 :class:`~fastNLP.Callback` 内部访问 :class:`~fastNLP.Trainer` 中的属性,如 optimizer, epoch, step,分别对应训练时的优化器,当前epoch数,和当前的总step数。 - 具体可访问的属性,参见文档 :class:`~fastNLP.Callback` 。 + # prepare data + def get_data(): + from fastNLP.io import ChnSentiCorpPipe as pipe + data = pipe().process_from_file() + print(data) + data.rename_field('chars', 'words') + train_data = data.datasets['train'] + dev_data = data.datasets['dev'] + test_data = data.datasets['test'] + vocab = data.vocabs['words'] + tgt_vocab = data.vocabs['target'] + return train_data, dev_data, test_data, vocab, tgt_vocab -使用Callback - 在定义好 :class:`~fastNLP.Callback` 之后,就能将它传入Trainer的 ``callbacks`` 参数,在实际训练时使用。 + # prepare model + train_data, dev_data, _, vocab, tgt_vocab = get_data() + device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + model = CNNText((len(vocab),50), num_classes=len(tgt_vocab)) - .. 
code-block:: python - - """ - 数据预处理,模型定义等等 - """ - - trainer = fastNLP.Trainer( - model=model, train_data=train_data, dev_data=dev_data, - optimizer=optimizer, metrics=metrics, - batch_size=10, n_epochs=100, - callbacks=[LRDecay()]) + # define callback + callbacks=[EarlyStopCallback(5)] + # pass callbacks to Trainer + def train_with_callback(cb_list): + trainer = Trainer( + device=device, + n_epochs=3, + model=model, + train_data=train_data, + dev_data=dev_data, + loss=CrossEntropyLoss(), + metrics=AccuracyMetric(), + callbacks=cb_list, + check_code_level=-1 + ) trainer.train() + + train_with_callback(callbacks) + + + +fastNLP 中的 Callback +--------------------- + +fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 :mod:`fastNLP.core.callback` 。 + +.. code-block:: python + + from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback + callbacks = [ + EarlyStopCallback(5), + GradientClipCallback(clip_value=5, clip_type='value'), + EvaluateCallback(dev_data) + ] + + train_with_callback(callbacks) + +自定义 Callback +--------------------- + +这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。 + +1. 创建 Callback + + 要自定义 Callback,我们要实现一个类,继承 fastNLP.Callback。这里我们定义 MyCallBack ,继承 fastNLP.Callback 。 + +2. 指定 Callback 调用的阶段 + + Callback 中所有以 ``on_`` 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() + 会在每个 epoch 结束时调用。 具体有哪些类方法,参见 Callback 文档。这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录 + 当前 loss,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。 + +3. 使用 Callback 的属性访问 Trainer 的内部信息 + + 为了方便使用,可以使用 Callback 的属性,访问 Trainer 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器, + 当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见文档 Callback 。这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步 + 数,可以通过 self.step 属性得到当前训练了多少步。 + +.. 
code-block:: python + + from fastNLP import Callback + from fastNLP import logger + + class MyCallBack(Callback): + """Print average loss in each epoch""" + def __init__(self): + super().__init__() + self.total_loss = 0 + self.start_step = 0 + + def on_backward_begin(self, loss): + self.total_loss += loss.item() + + def on_epoch_end(self): + n_steps = self.step - self.start_step + avg_loss = self.total_loss / n_steps + logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss) + self.total_loss, self.start_step = 0, self.step # reset accumulators for the next epoch + + callbacks = [MyCallBack()] + train_with_callback(callbacks) + diff --git a/docs/source/tutorials/tutorial_2_vocabulary.rst b/docs/source/tutorials/tutorial_2_vocabulary.rst index fffb94c6..e5a83fc0 100644 --- a/docs/source/tutorials/tutorial_2_vocabulary.rst +++ b/docs/source/tutorials/tutorial_2_vocabulary.rst @@ -86,7 +86,7 @@ Vocabulary vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) - :class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集 +:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集 传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的 情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们 会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。通过与fastNLP中的各种Embedding配合使用,会有如下的效果, diff --git a/docs/source/tutorials/tutorial_3_embedding.rst b/docs/source/tutorials/tutorial_3_embedding.rst index 7de2bb1b..521992ec 100644 --- a/docs/source/tutorials/tutorial_3_embedding.rst +++ b/docs/source/tutorials/tutorial_3_embedding.rst @@ -187,7 +187,7 @@ BertEmbedding的使用 torch.Size([1, 7, 768]) 在英文Bert模型中,一个英文单词可能会被切分为多个subword,例如"fairness"会被拆分为 ``["fair", "##ness"]`` ,这样一个word对应的将有两个输出, - :class:`~fastNLP.embeddings.BertEmbedding` 会使用pooling方法将一个word的subword的表示合并成一个vector,通过pool_method可以控制 +:class:`~fastNLP.embeddings.BertEmbedding` 
会使用pooling方法将一个word的subword的表示合并成一个vector,通过pool_method可以控制 该pooling方法,支持的有"first"(即使用fair的表示作为fairness的表示), "last"(使用##ness的表示作为fairness的表示), "max"(对fair和 ##ness在每一维上做max),"avg"(对fair和##ness每一维做average)。 @@ -200,8 +200,8 @@ BertEmbedding的使用 torch.Size([1, 5, 768]) -另外,根据 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding - `_ ,Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,前一句话的token embedding为0, +另外,根据 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `_ , +Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,前一句话的token embedding为0, 后一句话的token embedding为1。BertEmbedding能够自动识别句子中间的[SEP]来正确设置对应的token_type_id的。 .. code-block:: python @@ -230,7 +230,7 @@ Part VI: 使用character-level的embedding ----------------------------------------------------- 除了预训练的embedding以外,fastNLP还提供了两种Character Embedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` 和 - :class:`~fastNLP.embeddings.LSTMCharEmbedding` 。一般在使用character embedding时,需要在预处理的时候将word拆分成character,这 +:class:`~fastNLP.embeddings.LSTMCharEmbedding` 。一般在使用character embedding时,需要在预处理的时候将word拆分成character,这 会使得预处理过程变得非常繁琐。在fastNLP中,使用character embedding也只需要传入 :class:`~fastNLP.Vocabulary` 即可,而且该 Vocabulary与其它Embedding使用的Vocabulary是一致的,下面我们看两个例子。 @@ -298,11 +298,12 @@ Part VII: 叠加使用多个embedding torch.Size([1, 5, 114]) - :class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` , - :class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` 等都可以互相拼接。 - :class:`~fastNLP.embeddings.StackEmbedding` 的使用也是和其它Embedding是一致的,即输出index返回对应的表示。但能够拼接起来的Embedding +:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` , +:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` 等都可以互相拼接。 +:class:`~fastNLP.embeddings.StackEmbedding` 的使用也是和其它Embedding是一致的,即输出index返回对应的表示。但能够拼接起来的Embedding 必须使用同样的 :class:`~fastNLP.Vocabulary` ,因为只有使用同样的 
:class:`~fastNLP.Vocabulary` 才能保证同一个index指向的是同一个词或字 + ----------------------------------------------------------- Part VIII: Embedding的其它说明 ----------------------------------------------------------- diff --git a/docs/source/tutorials/tutorial_4_load_dataset.rst b/docs/source/tutorials/tutorial_4_load_dataset.rst index 628f8809..a93ae8d5 100644 --- a/docs/source/tutorials/tutorial_4_load_dataset.rst +++ b/docs/source/tutorials/tutorial_4_load_dataset.rst @@ -20,7 +20,7 @@ Part I: 数据集容器DataBundle 来承载同一个任务的多个数据集 :class:`~fastNLP.DataSet` 以及它们的词表 :class:`~fastNLP.Vocabulary` 。下面会有例子介绍 :class:`~fastNLP.io.DataBundle` 的相关使用。 - :class:`~fastNLP.io.DataBundle` 在fastNLP中主要在各个 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 中被使用。 +:class:`~fastNLP.io.DataBundle` 在fastNLP中主要在各个 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 中被使用。 下面我们先介绍一下 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 。 Part II: 加载的各种数据集的Loader diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 63fde69a..54d2d8b6 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -47,7 +47,7 @@ __all__ = [ "SNLILoader", "QNLILoader", "RTELoader", - "XNLILoader", + "CNXNLILoader", "BQCorpusLoader", "LCQMCLoader", @@ -70,32 +70,61 @@ __all__ = [ "WeiboNERPipe", "CWSPipe", - + + "Pipe", + + "CWSPipe", + + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + "IMDBPipe", + "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe", + + "Conll2003NERPipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "WeiboNERPipe", + "PeopleDailyPipe", + "Conll2003Pipe", + "MatchingBertPipe", "RTEBertPipe", "SNLIBertPipe", "QuoraBertPipe", "QNLIBertPipe", "MNLIBertPipe", + "CNXNLIBertPipe", + "BQCorpusBertPipe", + "LCQMCBertPipe", "MatchingPipe", "RTEPipe", "SNLIPipe", "QuoraPipe", "QNLIPipe", "MNLIPipe", + "LCQMCPipe", + "CNXNLIPipe", + "BQCorpusPipe", + "RenamePipe", + "GranularizePipe", + "MachingTruncatePipe", 'ModelLoader', 'ModelSaver', ] -from .embed_loader import 
EmbedLoader -from .data_bundle import DataBundle -from .model_io import ModelLoader, ModelSaver - -from .loader import * -from .pipe import * - import sys + +from .data_bundle import DataBundle +from .embed_loader import EmbedLoader +from .loader import * +from .model_io import ModelLoader, ModelSaver +from .pipe import * from ..doc_utils import doc_process + doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 4ad228b0..5fb9fd91 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -54,7 +54,9 @@ __all__ = [ 'SSTLoader', 'SST2Loader', "ChnSentiCorpLoader", - + "THUCNewsLoader", + "WeiboSenti100kLoader", + 'ConllLoader', 'Conll2003Loader', 'Conll2003NERLoader', @@ -63,26 +65,31 @@ __all__ = [ "MsraNERLoader", "PeopleDailyNERLoader", "WeiboNERLoader", - + 'CSVLoader', 'JsonLoader', - + 'CWSLoader', - + 'MNLILoader', "QuoraLoader", "SNLILoader", "QNLILoader", "RTELoader", - + "CNXNLILoader", + "BQCorpusLoader", + "LCQMCLoader", + "CoReferenceLoader" ] -from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader +from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, \ + ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader +from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader +from .coreference import CoReferenceLoader from .csv import CSVLoader from .cws import CWSLoader from .json import JsonLoader from .loader import Loader -from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader -from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader -from .coreference import CoReferenceLoader \ No newline at end of file +from .matching import MNLILoader, QuoraLoader, SNLILoader, 
QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, \ + LCQMCLoader diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index 004f3ebd..e0c894a2 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -409,6 +409,7 @@ class THUCNewsLoader(Loader): .. csv-table:: :header: "raw_words", "target" + "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育" "...", "..." @@ -446,13 +447,18 @@ class WeiboSenti100kLoader(Loader): 别名: 数据集简介:微博sentiment classification,二分类 原始数据内容为: - label text - 0 六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒] - 1 听过一场!笑死了昂,一听茄子脱口秀,从此节操是路人![嘻嘻] //@中国梦网官微:@Pencil彭赛 @茄子脱口秀 [圣诞帽][圣诞树][平安果] + + .. code-block:: text + + label text + 0 六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒] + 1 听过一场!笑死了昂,一听茄子脱口秀,从此节操是路人![嘻嘻] //@中国梦网官微:@Pencil彭赛 @茄子脱口秀 [圣诞帽][圣诞树][平安果] + 读取后的Dataset将具有以下数据结构: .. csv-table:: :header: "raw_chars", "target" + "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0" "...", "..." diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index 80889507..77dcb521 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -15,14 +15,14 @@ import os import warnings from typing import Union, Dict +from .csv import CSVLoader from .json import JsonLoader from .loader import Loader from .. import DataBundle +from ..utils import check_loader_paths from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance -from .csv import CSVLoader -from ..utils import check_loader_paths class MNLILoader(Loader): @@ -348,8 +349,9 @@ class CNXNLILoader(Loader): .. csv-table:: :header: "raw_chars1", "raw_chars2", "target" + "从概念上看,奶油收入有两个基本方面产品和地理.", "产品和地理是什么使奶油抹霜工作.", "1" - ""...", "...", "..." + "...", "...", "..." """ @@ -412,6 +413,7 @@ class BQCorpusLoader(Loader): .. 
csv-table:: :header: "raw_chars1", "raw_chars2", "target" + "不是邀请的如何贷款?", "我不是你们邀请的客人可以贷款吗?", "1" "如何满足微粒银行的审核", "建设银行有微粒贷的资格吗", "0" "...", "...", "..." @@ -448,20 +450,26 @@ class BQCorpusLoader(Loader): class LCQMCLoader(Loader): - """ - 别名: + r""" 数据集简介:句对匹配(question matching) + 原始数据为: - '喜欢打篮球的男生喜欢什么样的女生\t爱打篮球的男生喜欢什么样的女生\t1\n' - '晚上睡觉带着耳机听音乐有什么害处吗?\t孕妇可以戴耳机听音乐吗?\t0\n' - 读取后的Dataset将具有以下的数据结构: - + + .. code-block:: text + + '喜欢打篮球的男生喜欢什么样的女生\t爱打篮球的男生喜欢什么样的女生\t1\n' + '晚上睡觉带着耳机听音乐有什么害处吗?\t孕妇可以戴耳机听音乐吗?\t0\n' + + 读取后的Dataset将具有以下的数据结构 + .. csv-table:: :header: "raw_chars1", "raw_chars2", "target" + "喜欢打篮球的男生喜欢什么样的女生?", "爱打篮球的男生喜欢什么样的女生?", "1" "晚上睡觉带着耳机听音乐有什么害处吗?", "妇可以戴耳机听音乐吗?", "0" - ""...", "...", "..." - + "...", "...", "..." + + """ def __init__(self): diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 212f9e66..e30978be 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -9,9 +9,9 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce """ __all__ = [ "Pipe", - + "CWSPipe", - + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -20,35 +20,46 @@ __all__ = [ "ChnSentiCorpPipe", "THUCNewsPipe", "WeiboSenti100kPipe", - + "Conll2003NERPipe", "OntoNotesNERPipe", "MsraNERPipe", "WeiboNERPipe", "PeopleDailyPipe", "Conll2003Pipe", - + "MatchingBertPipe", "RTEBertPipe", "SNLIBertPipe", "QuoraBertPipe", "QNLIBertPipe", "MNLIBertPipe", + "CNXNLIBertPipe", + "BQCorpusBertPipe", + "LCQMCBertPipe", "MatchingPipe", "RTEPipe", "SNLIPipe", "QuoraPipe", "QNLIPipe", "MNLIPipe", - + "LCQMCPipe", + "CNXNLIPipe", + "BQCorpusPipe", + "RenamePipe", + "GranularizePipe", + "MachingTruncatePipe", + "CoReferencePipe" ] -from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe +from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, \ + WeiboSenti100kPipe from .conll import Conll2003NERPipe, 
OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe -from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ - MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe -from .pipe import Pipe from .conll import Conll2003Pipe -from .cws import CWSPipe from .coreference import CoReferencePipe +from .cws import CWSPipe +from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ + MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CNXNLIBertPipe, CNXNLIPipe, BQCorpusBertPipe, \ + LCQMCPipe, BQCorpusPipe, LCQMCBertPipe, RenamePipe, GranularizePipe, MachingTruncatePipe +from .pipe import Pipe diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index 1c44cc23..ab31c9de 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -21,11 +21,11 @@ from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_insta from ..data_bundle import DataBundle from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader +from ...core._logger import logger from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance from ...core.vocabulary import Vocabulary -from ...core._logger import logger nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') @@ -718,6 +718,7 @@ class THUCNewsPipe(_CLSPipe): .. csv-table:: :header: "raw_words", "target" + "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育" "...", "..." @@ -826,6 +827,7 @@ class WeiboSenti100kPipe(_CLSPipe): .. csv-table:: :header: "raw_chars", "target" + "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0" "...", "..." 
diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 90cf17df..dac21dca 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -16,20 +16,24 @@ __all__ = [ "QuoraPipe", "QNLIPipe", "MNLIPipe", + "LCQMCPipe", "CNXNLIPipe", "BQCorpusPipe", - "LCQMCPipe", + "RenamePipe", + "GranularizePipe", + "MachingTruncatePipe", ] import warnings from .pipe import Pipe from .utils import get_tokenizer -from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader +from ..data_bundle import DataBundle +from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, \ + LCQMCLoader +from ...core._logger import logger from ...core.const import Const from ...core.vocabulary import Vocabulary -from ...core._logger import logger -from ..data_bundle import DataBundle class MatchingBertPipe(Pipe): @@ -145,7 +149,7 @@ class MatchingBertPipe(Pipe): f"data set but not in train data set!." warnings.warn(warn_msg) logger.warning(warn_msg) - + has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) @@ -294,7 +298,7 @@ class MatchingPipe(Pipe): f"data set but not in train data set!." 
warnings.warn(warn_msg) logger.warning(warn_msg) - + has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) @@ -345,8 +349,9 @@ class MNLIPipe(MatchingPipe): data_bundle = MNLILoader().load(paths) return self.process(data_bundle) + class LCQMCPipe(MatchingPipe): - def process_from_file(self, paths = None): + def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) data_bundle = RenamePipe().process(data_bundle) data_bundle = self.process(data_bundle) @@ -358,14 +363,14 @@ class CNXNLIPipe(MatchingPipe): def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) data_bundle = GranularizePipe(task='XNLI').process(data_bundle) - data_bundle = RenamePipe().process(data_bundle) #使中文数据的field + data_bundle = RenamePipe().process(data_bundle) # 使中文数据的field data_bundle = self.process(data_bundle) data_bundle = RenamePipe().process(data_bundle) return data_bundle class BQCorpusPipe(MatchingPipe): - def process_from_file(self, paths = None): + def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) data_bundle = RenamePipe().process(data_bundle) data_bundle = self.process(data_bundle) @@ -374,12 +379,12 @@ class BQCorpusPipe(MatchingPipe): class RenamePipe(Pipe): - def __init__(self, task = 'cn-nli'): + def __init__(self, task='cn-nli'): super().__init__() self.task = task - + def process(self, data_bundle: DataBundle): # rename field name for Chinese Matching dataset - if(self.task == 'cn-nli'): + if (self.task == 'cn-nli'): for name, dataset in data_bundle.datasets.items(): if (dataset.has_field(Const.RAW_CHARS(0))): dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS @@ -392,12 +397,12 @@ class RenamePipe(Pipe): else: raise RuntimeError( "field name of dataset is not qualified. 
It should have ether RAW_CHARS or WORDS") - elif(self.task == 'cn-nli-bert'): + elif (self.task == 'cn-nli-bert'): for name, dataset in data_bundle.datasets.items(): if (dataset.has_field(Const.RAW_CHARS(0))): dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS dataset.rename_field(Const.RAW_CHARS(1), Const.RAW_WORDS(1)) - elif(dataset.has_field(Const.RAW_WORDS(0))): + elif (dataset.has_field(Const.RAW_WORDS(0))): dataset.rename_field(Const.RAW_WORDS(0), Const.RAW_CHARS(0)) dataset.rename_field(Const.RAW_WORDS(1), Const.RAW_CHARS(1)) dataset.rename_field(Const.INPUT, Const.CHAR_INPUT) @@ -409,15 +414,15 @@ class RenamePipe(Pipe): raise RuntimeError( "Only support task='cn-nli' or 'cn-nli-bert'" ) - + return data_bundle class GranularizePipe(Pipe): - def __init__(self, task = None): + def __init__(self, task=None): super().__init__() self.task = task - + def _granularize(self, data_bundle, tag_map): """ 该函数对data_bundle中'target'列中的内容进行转换。 @@ -434,21 +439,22 @@ class GranularizePipe(Pipe): dataset.drop(lambda ins: ins[Const.TARGET] == -100) data_bundle.set_dataset(dataset, name) return data_bundle - + def process(self, data_bundle: DataBundle): task_tag_dict = { - 'XNLI':{'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2} + 'XNLI': {'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2} } if self.task in task_tag_dict: - data_bundle = self._granularize(data_bundle=data_bundle, tag_map= task_tag_dict[self.task]) + data_bundle = self._granularize(data_bundle=data_bundle, tag_map=task_tag_dict[self.task]) else: raise RuntimeError(f"Only support {task_tag_dict.keys()} task_tag_map.") return data_bundle -class MachingTruncatePipe(Pipe): #truncate sentence for bert, modify seq_len +class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len def __init__(self): super().__init__() + def process(self, data_bundle: DataBundle): for name, dataset in data_bundle.datasets.items(): pass @@ -456,7 
+462,7 @@ class MachingTruncatePipe(Pipe): #truncate sentence for bert, modify seq_len class LCQMCBertPipe(MatchingBertPipe): - def process_from_file(self, paths = None): + def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) data_bundle = self.process(data_bundle) @@ -465,7 +471,7 @@ class LCQMCBertPipe(MatchingBertPipe): class BQCorpusBertPipe(MatchingBertPipe): - def process_from_file(self, paths = None): + def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) data_bundle = self.process(data_bundle) @@ -474,7 +480,7 @@ class BQCorpusBertPipe(MatchingBertPipe): class CNXNLIBertPipe(MatchingBertPipe): - def process_from_file(self, paths = None): + def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) data_bundle = GranularizePipe(task='XNLI').process(data_bundle) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py index fdfcf0fd..b48be579 100644 --- a/fastNLP/modules/encoder/attention.py +++ b/fastNLP/modules/encoder/attention.py @@ -152,8 +152,7 @@ class BiAttention(nn.Module): :param torch.Tensor premise_mask: [batch_size, a_seq_len] :param torch.Tensor hypothesis_batch: [batch_size, b_seq_len, hidden_size] :param torch.Tensor hypothesis_mask: [batch_size, b_seq_len] - :return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size] - torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size] + :return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size] torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size] """ similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) .contiguous()) diff --git a/tutorials/quickstart.ipynb b/tutorials/quickstart.ipynb deleted file mode 100644 
index 00c30c93..00000000 --- a/tutorials/quickstart.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# 快速入门" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str}" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.io import CSVLoader\n", - "\n", - "loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n", - "dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'] type=list}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 将所有字母转为小写, 并所有句子变成单词序列\n", - "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n", - "dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Vocabulary\n", - "\n", - "# 使用Vocabulary类统计单词,并将单词序列转化为数字序列\n", - "vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n", - "vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n", - "'target': 1 type=int}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 将label转为整数,并设置为 target\n", - "dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNNText(\n", - " (embed): Embedding(\n", - " 177, 50\n", - " (dropout): Dropout(p=0.0)\n", - " )\n", - " (conv_pool): ConvMaxpool(\n", - " (convs): ModuleList(\n", - " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", - " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", - " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", - " )\n", - " )\n", - " (dropout): Dropout(p=0.1)\n", - " (fc): Linear(in_features=12, out_features=5, bias=True)\n", - ")" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.models import CNNText\n", - "model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "model" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(62, 15)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 分割训练集/验证集\n", - "train_data, dev_data = dataset.split(0.2)\n", - "len(train_data), len(dev_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n", - "target fields after batch(if 
batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-09-10-59-39\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.333333\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.533333\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.533333\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.533333\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.6\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.8\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.8\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.733333\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.733333\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. 
AccuracyMetric: acc=0.733333\n", - "\n", - "\n", - "In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.8}},\n", - " 'best_epoch': 6,\n", - " 'best_step': 12,\n", - " 'seconds': 0.22}" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric\n", - "\n", - "# 定义trainer并进行训练\n", - "trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n", - " loss=CrossEntropyLoss(), metrics=AccuracyMetric())\n", - "trainer.train()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/tutorials/sample_data/tutorial_sample_dataset.csv b/tutorials/sample_data/tutorial_sample_dataset.csv deleted file mode 100644 index e5c0a74f..00000000 --- a/tutorials/sample_data/tutorial_sample_dataset.csv +++ /dev/null @@ -1,77 +0,0 @@ -A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 1 -This quiet , introspective and entertaining independent is worth seeking . 4 -Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . 1 -A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . 3 -Aggressive self-glorification and a manipulative whitewash . 
1 -A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . 4 -Narratively , Trouble Every Day is a plodding mess . 1 -The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations 3 -But it does n't leave you with much . 1 -You could hate it for the same reason . 1 -There 's little to recommend Snow Dogs , unless one considers cliched dialogue and perverse escapism a source of high hilarity . 1 -Kung Pow is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that . 1 -The performances are an absolute joy . 4 -Fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense . 3 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 
2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 -While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances . 1 -The latest vapid actor 's exercise to appropriate the structure of Arthur Schnitzler 's Reigen . 1 -More vaudeville show than well-constructed narrative , but on those terms it 's inoffensive and actually rather sweet . 2 -Nothing more than a run-of-the-mill action flick . 2 -Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire . 0 -Ice Age is the first computer-generated feature cartoon to feel like other movies , and that makes for some glacial pacing early on . 2 -There 's very little sense to what 's going on here , but the makers serve up the cliches with considerable dash . 2 -Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different . 2 -They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid . 1 -It almost feels as if the movie is more interested in entertaining itself than in amusing us . 1 -The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 
1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 
2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 2 -I still like Moonlight Mile , better judgment be damned . 3 -A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3 -a bilingual charmer , just like the woman who inspired it 3 -Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2 -As inept as big-screen remakes of The Avengers and The Wild Wild West . 1 -It 's everything you 'd expect -- but nothing more . 2 -Best indie of the year , so far . 4 -Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3 -It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1 -That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2 -The plot is romantic comedy boilerplate from start to finish . 2 -It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2 -A film that clearly means to preach exclusively to the converted . 
2 \ No newline at end of file diff --git a/tutorials/tutorial_1.ipynb b/tutorials/tutorial_1.ipynb deleted file mode 100644 index db302238..00000000 --- a/tutorials/tutorial_1.ipynb +++ /dev/null @@ -1,831 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# 详细指南" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 数据读入" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str}" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.io import CSVLoader\n", - "\n", - "loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n", - "dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n", - "dataset[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Instance表示一个样本,由一个或多个field(域,属性,特征)组成,每个field有名字和值。\n", - "\n", - "在初始化Instance时即可定义它包含的域,使用 \"field_name=field_value\"的写法。" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': fake data type=str,\n", - "'label': 0 type=str}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Instance\n", - "\n", - "dataset.append(Instance(raw_sentence='fake data', label='0'))\n", - "dataset[-1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 数据处理" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'raw_sentence': A series of escapades 
demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n", - "'target': 1 type=int}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Vocabulary\n", - "\n", - "# 将所有字母转为小写, 并所有句子变成单词序列\n", - "dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n", - "dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')\n", - "\n", - "# 使用Vocabulary类统计单词,并将单词序列转化为数字序列\n", - "vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n", - "vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n", - "\n", - "# 将label转为整数\n", - "dataset.apply(lambda x: int(x['label']), new_field_name='target')\n", - "dataset[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n", - "'label': 1 type=str,\n", - "'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 
type=str,\n", - "'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n", - "'target': 1 type=int,\n", - "'seq_len': 37 type=int}\n" - ] - } - ], - "source": [ - "# 增加长度信息\n", - "dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')\n", - "print(dataset[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 使用内置模块CNNText\n", - "设置为符合内置模块的名称" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "CNNText(\n", - " (embed): Embedding(\n", - " 177, 50\n", - " (dropout): Dropout(p=0.0)\n", - " )\n", - " (conv_pool): ConvMaxpool(\n", - " (convs): ModuleList(\n", - " (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n", - " (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n", - " (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n", - " )\n", - " )\n", - " (dropout): Dropout(p=0.1)\n", - " (fc): Linear(in_features=12, out_features=5, bias=True)\n", - ")" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP.models import CNNText\n", - "\n", - "model_cnn = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "model_cnn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "我们在使用内置模块的时候,还应该使用应该注意把 field 设定成符合内置模型输入输出的名字。" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "words\n", - "seq_len\n", - "target\n" - ] - } - ], - "source": [ - "from fastNLP import Const\n", - "\n", - "dataset.rename_field('words', Const.INPUT)\n", - "dataset.rename_field('seq_len', Const.INPUT_LEN)\n", - "dataset.rename_field('target', Const.TARGET)\n", - "\n", - "dataset.set_input(Const.INPUT, Const.INPUT_LEN)\n", - 
"dataset.set_target(Const.TARGET)\n", - "\n", - "print(Const.INPUT)\n", - "print(Const.INPUT_LEN)\n", - "print(Const.TARGET)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 分割训练集/验证集/测试集" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(64, 7, 7)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_dev_data, test_data = dataset.split(0.1)\n", - "train_data, dev_data = train_dev_data.split(0.1)\n", - "len(train_data), len(dev_data), len(test_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 训练(model_cnn)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### loss\n", - "训练模型需要提供一个损失函数\n", - "\n", - "下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n", - "\n", - "pred参数对应的是模型的forward返回的dict的一个key的名字,这里是\"output\"。\n", - "\n", - "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import CrossEntropyLoss\n", - "\n", - "# loss = CrossEntropyLoss()\n", - "# 等价于\n", - "loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Metric\n", - "定义评价指标\n", - "\n", - "这里使用准确率。参数的“命名规则”跟上面类似。\n", - "\n", - "pred参数对应的是模型的predict方法返回的dict的一个key的名字,这里是\"predict\"。\n", - "\n", - "target参数对应的是dataset作为标签的field的名字,这里是\"label_seq\"。" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from fastNLP import AccuracyMetric\n", - "\n", - "# metrics=AccuracyMetric()\n", - "# 等价于\n", - "metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": 
"stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n", - "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-12-21-38-34\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.428571\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. 
AccuracyMetric: acc=0.857143\n", - "\n", - "\n", - "In Epoch:8/Step:16, got best dev performance:AccuracyMetric: acc=0.857143\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n", - " 'best_epoch': 8,\n", - " 'best_step': 16,\n", - " 'seconds': 0.21}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Trainer\n", - "\n", - "trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n", - "trainer.train()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 测试(model_cnn)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[tester] \n", - "AccuracyMetric: acc=0.857143\n" - ] - }, - { - "data": { - "text/plain": [ - "{'AccuracyMetric': {'acc': 0.857143}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Tester\n", - "\n", - "tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n", - "tester.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 编写自己的模型\n", - "\n", - "完全支持 pytorch 的模型,与 pytorch 唯一不同的是返回结果是一个字典,字典中至少需要包含 \"pred\" 这个字段" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "\n", - "class LSTMText(nn.Module):\n", - " def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):\n", - " super().__init__()\n", - "\n", - " self.embedding = nn.Embedding(vocab_size, embedding_dim)\n", - " self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout)\n", - " self.fc = nn.Linear(hidden_dim * 2, output_dim)\n", - 
" self.dropout = nn.Dropout(dropout)\n", - "\n", - " def forward(self, words):\n", - " # (input) words : (batch_size, seq_len)\n", - " words = words.permute(1,0)\n", - " # words : (seq_len, batch_size)\n", - "\n", - " embedded = self.dropout(self.embedding(words))\n", - " # embedded : (seq_len, batch_size, embedding_dim)\n", - " output, (hidden, cell) = self.lstm(embedded)\n", - " # output: (seq_len, batch_size, hidden_dim * 2)\n", - " # hidden: (num_layers * 2, batch_size, hidden_dim)\n", - " # cell: (num_layers * 2, batch_size, hidden_dim)\n", - "\n", - " hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n", - " hidden = self.dropout(hidden)\n", - " # hidden: (batch_size, hidden_dim * 2)\n", - "\n", - " pred = self.fc(hidden.squeeze(0))\n", - " # result: (batch_size, output_dim)\n", - " return {\"pred\":pred}" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n", - "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-12-21-38-36\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. 
AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.714286\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n", - "\n", - "\n", - "In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.857143\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n", - " 'best_epoch': 6,\n", - " 'best_step': 12,\n", - " 'seconds': 2.15}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_lstm = LSTMText(len(vocab),50,5)\n", - "trainer = Trainer(model=model_lstm, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n", - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[tester] \n", - "AccuracyMetric: acc=0.857143\n" - ] - }, - { - "data": { - "text/plain": [ - "{'AccuracyMetric': {'acc': 0.857143}}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())\n", - "tester.test()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 使用 Batch编写自己的训练过程" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Epoch 0 Avg Loss: 3.11 18ms\n", - "Epoch 1 Avg Loss: 2.88 30ms\n", - "Epoch 2 Avg Loss: 2.69 42ms\n", - "Epoch 3 Avg Loss: 2.47 54ms\n", - "Epoch 4 Avg Loss: 2.38 67ms\n", - "Epoch 5 Avg Loss: 2.10 78ms\n", - "Epoch 6 Avg Loss: 2.06 91ms\n", - "Epoch 7 Avg Loss: 1.92 103ms\n", - "Epoch 8 Avg Loss: 1.91 114ms\n", - "Epoch 9 Avg Loss: 1.76 126ms\n", - "[tester] \n", - "AccuracyMetric: acc=0.571429\n" - ] - }, - { - "data": { - "text/plain": [ - "{'AccuracyMetric': {'acc': 0.571429}}" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import BucketSampler\n", - "from fastNLP import Batch\n", - "import torch\n", - "import time\n", - "\n", - "model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "\n", - "def train(epoch, data):\n", - " optim = torch.optim.Adam(model.parameters(), lr=0.001)\n", - " lossfunc = torch.nn.CrossEntropyLoss()\n", - " batch_size = 32\n", - "\n", - " # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。\n", - " # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n", - " train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n", - " train_batch = Batch(batch_size=batch_size, dataset=data, sampler=train_sampler)\n", - " \n", - " start_time = time.time()\n", - " for i in range(epoch):\n", - " loss_list = []\n", - " for batch_x, batch_y in train_batch:\n", - " optim.zero_grad()\n", - " output = model(batch_x['words'])\n", - " loss = lossfunc(output['pred'], batch_y['target'])\n", - " loss.backward()\n", - " optim.step()\n", - " loss_list.append(loss.item())\n", - " print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n", - " print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n", - " loss_list.clear()\n", - " \n", - "train(10, train_data)\n", - "tester = Tester(test_data, model, metrics=AccuracyMetric())\n", - "tester.test()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "### 使用 Callback 实现自己想要的效果" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "input fields after batch(if batch size is 2):\n", - "\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n", - "\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "target fields after batch(if batch size is 2):\n", - "\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n", - "\n", - "training epochs started 2019-05-12-21-38-40\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Sum Time: 51ms\n", - "\n", - "\n", - "Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Sum Time: 69ms\n", - "\n", - "\n", - "Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.285714\n", - "\n", - "Sum Time: 91ms\n", - "\n", - "\n", - "Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 107ms\n", - "\n", - "\n", - "Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 125ms\n", - "\n", - "\n", - "Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 142ms\n", - "\n", - "\n", - "Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 158ms\n", - "\n", - "\n", - "Evaluation at Epoch 8/10. Step:16/20. 
AccuracyMetric: acc=0.571429\n", - "\n", - "Sum Time: 176ms\n", - "\n", - "\n", - "Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.714286\n", - "\n", - "Sum Time: 193ms\n", - "\n", - "\n", - "Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n", - "\n", - "Sum Time: 212ms\n", - "\n", - "\n", - "\n", - "In Epoch:10/Step:20, got best dev performance:AccuracyMetric: acc=0.857143\n", - "Reloaded the best model.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n", - " 'best_epoch': 10,\n", - " 'best_step': 20,\n", - " 'seconds': 0.2}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from fastNLP import Callback\n", - "\n", - "start_time = time.time()\n", - "\n", - "class MyCallback(Callback):\n", - " def on_epoch_end(self):\n", - " print('Sum Time: {:d}ms\\n\\n'.format(round((time.time()-start_time)*1000)))\n", - " \n", - "\n", - "model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n", - "trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n", - " loss=CrossEntropyLoss(), metrics=AccuracyMetric(), callbacks=[MyCallback()])\n", - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/tutorials/tutorial_callback.ipynb b/tutorials/tutorial_10_callback.ipynb similarity index 100% rename from tutorials/tutorial_callback.ipynb rename to tutorials/tutorial_10_callback.ipynb diff --git 
a/tutorials/命名实体识别.ipynb b/tutorials/命名实体识别.ipynb deleted file mode 100644 index 95975f2c..00000000 --- a/tutorials/命名实体识别.ipynb +++ /dev/null @@ -1,41 +0,0 @@ -{ - "cells": [ - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "##1. 命名实体识别(name entity recognition, NER)\n", - "命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。\n", - "如下面的例子中\n", - "\n", - "我来自复旦大学。\n", - "\n", - "其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题现在一般被转换为了\n", - "在本tutorial中我们将通过fastNLP尝试写出一个\n", - "\n", - "##2. 数据\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}