!11 update packaging related

Merge pull request !11 from yh_cc/auto-1850475-master-1657210864575
yh_cc 2022-10-17 08:59:36 +00:00 committed by Gitee
commit 4f2fd20ade
1125 changed files with 111080 additions and 268833 deletions

Jenkinsfile

@ -1,36 +1,96 @@
pipeline {
agent {
docker {
image 'ubuntu_tester'
args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci'
}
agent any
options {
timeout(time:30, unit: 'MINUTES')
}
environment {
TRAVIS = 1
PJ_NAME = 'fastNLP'
POST_URL = 'https://open.feishu.cn/open-apis/bot/v2/hook/14719364-818d-4f88-9057-7c9f0eaaf6ae'
POST_URL = 'https://open.feishu.cn/open-apis/bot/v2/hook/2f7122e3-3459-43d2-a9e4-ddd77bfc4282'
}
stages {
stage('Package Installation') {
steps {
sh 'python setup.py install'
}
}
stage('Parallel Stages') {
parallel {
stage('Document Building') {
stage('Test Other'){
agent {
docker {
image 'fnlp:other'
args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci'
}
}
steps {
sh 'cd docs && make prod'
sh 'rm -rf /docs/${PJ_NAME}'
sh 'mv docs/build/html /docs/${PJ_NAME}'
sh 'pytest ./tests --durations=0 --html=other.html --self-contained-html -m "not (torch or paddle or paddledist or jittor or oneflow or deepspeed or oneflowdist or torchpaddle or torchjittor or torchoneflow)"'
}
post {
always {
sh 'html_path=/ci/${PJ_NAME}/report-${BUILD_NUMBER}-${GIT_BRANCH#*/}-${GIT_COMMIT} && mkdir -p ${html_path} && mv other.html ${html_path}'
}
}
}
stage('Package Testing') {
stage('Test Torch-1.11') {
agent {
docker {
image 'fnlp:torch-1.11'
args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci --gpus all --shm-size 1G'
}
}
steps {
sh 'pip install fitlog'
sh 'pytest ./tests --html=test_results.html --self-contained-html'
sh 'pytest ./tests/ --durations=0 --html=torch-1.11.html --self-contained-html -m torch'
}
post {
always {
sh 'html_path=/ci/${PJ_NAME}/report-${BUILD_NUMBER}-${GIT_BRANCH#*/}-${GIT_COMMIT} && mkdir -p ${html_path} && mv torch-1.11.html ${html_path}'
}
}
}
stage('Test Torch-1.6') {
agent {
docker {
image 'fnlp:torch-1.6'
args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci --gpus all --shm-size 1G'
}
}
steps {
sh 'pytest ./tests/ --durations=0 --html=torch-1.6.html --self-contained-html -m torch'
}
post {
always {
sh 'html_path=/ci/${PJ_NAME}/report-${BUILD_NUMBER}-${GIT_BRANCH#*/}-${GIT_COMMIT} && mkdir -p ${html_path} && mv torch-1.6.html ${html_path}'
}
}
}
stage('Test Paddle') {
agent {
docker {
image 'fnlp:paddle'
args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci --gpus all --shm-size 1G'
}
}
steps {
sh 'pytest ./tests --durations=0 --html=paddle.html --self-contained-html -m paddle --co'
sh 'FASTNLP_BACKEND=paddle pytest ./tests --durations=0 --html=paddle_with_backend.html --self-contained-html -m paddle --co'
sh 'FASTNLP_BACKEND=paddle pytest ./tests/core/drivers/paddle_driver/test_dist_utils.py --durations=0 --html=paddle_dist_utils.html --self-contained-html --co'
sh 'FASTNLP_BACKEND=paddle pytest ./tests/core/drivers/paddle_driver/test_fleet.py --durations=0 --html=paddle_fleet.html --self-contained-html --co'
sh 'FASTNLP_BACKEND=paddle pytest ./tests/core/controllers/test_trainer_paddle.py --durations=0 --html=paddle_trainer.html --self-contained-html --co'
}
post {
always {
sh 'html_path=/ci/${PJ_NAME}/report-${BUILD_NUMBER}-${GIT_BRANCH#*/}-${GIT_COMMIT} && mkdir -p ${html_path} && mv paddle*.html ${html_path}'
}
}
}
// stage('Test Jittor') {
// agent {
// docker {
// image 'fnlp:jittor'
// args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci --gpus all --shm-size 1G'
// }
// }
// steps {
// // sh 'pip install fitlog'
// // sh 'pytest ./tests --html=test_results.html --self-contained-html'
// sh 'pytest ./tests --durations=0 --html=jittor.html --self-contained-html -m jittor --co'
// }
// }
}
}
}
@ -40,8 +100,7 @@ pipeline {
}
success {
sh 'post 0'
sh 'post github'
// sh 'post github'
}
}
}

File diff suppressed because one or more lines are too long

.gitignore

@ -1,18 +0,0 @@
.gitignore
.DS_Store
.ipynb_checkpoints
*.pyc
__pycache__
*.swp
.vscode/
.idea/**
caches
# fitlog
.fitlog
logs/
.fitconfig
docs/build

.travis.yml

@ -1,30 +0,0 @@
language: python
python:
- "3.6"
env:
- TRAVIS=1
# command to install dependencies
install:
- pip install --quiet -r requirements.txt
- pip install --quiet fitlog
- pip install pytest>=3.6
- pip install pytest-cov
# command to run tests
script:
# - python -m spacy download en
- pytest --cov=fastNLP tests/
after_success:
- bash <(curl -s https://codecov.io/bash)
notifications:
webhooks:
urls:
- https://open.feishu.cn/officialapp/notify/55ba4b15d04608e875c122f11484a4e2fa807c42b9ca074509bea654d1b99ca6
on_success: always # default: always
on_failure: always # default: always
on_start: never # default: never
on_cancel: always # default: always
on_error: always # default: always

MANIFEST.in

@ -2,6 +2,4 @@ include requirements.txt
include LICENSE
include README.md
prune tests/
prune reproduction/
prune fastNLP/api
prune fastNLP/automl
prune tutorials/

README.md

@ -1,110 +1,239 @@
# fastNLP
[![Build Status](https://travis-ci.org/fastnlp/fastNLP.svg?branch=master)](https://travis-ci.org/fastnlp/fastNLP)
[![codecov](https://codecov.io/gh/fastnlp/fastNLP/branch/master/graph/badge.svg)](https://codecov.io/gh/fastnlp/fastNLP)
[![Pypi](https://img.shields.io/pypi/v/fastNLP.svg)](https://pypi.org/project/fastNLP)
![Hex.pm](https://img.shields.io/hexpm/l/plug.svg)
[![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest)
fastNLP is a lightweight framework for natural language processing (NLP); its goal is to make it fast to implement NLP tasks and build complex models.
[//]: # ([![Build Status]&#40;https://travis-ci.org/fastnlp/fastNLP.svg?branch=master&#41;]&#40;https://travis-ci.org/fastnlp/fastNLP&#41;)
[//]: # ([![codecov]&#40;https://codecov.io/gh/fastnlp/fastNLP/branch/master/graph/badge.svg&#41;]&#40;https://codecov.io/gh/fastnlp/fastNLP&#41;)
[//]: # ([![Pypi]&#40;https://img.shields.io/pypi/v/fastNLP.svg&#41;]&#40;https://pypi.org/project/fastNLP&#41;)
[//]: # (![Hex.pm]&#40;https://img.shields.io/hexpm/l/plug.svg&#41;)
[//]: # ([![Documentation Status]&#40;https://readthedocs.org/projects/fastnlp/badge/?version=latest&#41;]&#40;http://fastnlp.readthedocs.io/?badge=latest&#41;)
fastNLP is a lightweight natural language processing (NLP) toolkit whose goal is to cut down the engineering code in user projects, such as data-processing loops, training loops, and multi-GPU launching.
fastNLP has the following features:
- a unified tabular data container that simplifies data preprocessing;
- built-in Loaders and Pipes for many datasets, removing the need for preprocessing code;
- a variety of convenient NLP utilities, e.g. loading embeddings (including ELMo and BERT) and caching intermediate data;
- automatic download of some [datasets and pretrained models](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0);
- a range of neural network components and reproduced models (covering Chinese word segmentation, named entity recognition, dependency parsing, text classification, text matching, coreference resolution, summarization, and more);
- a Trainer with many built-in Callback functions for experiment logging, exception capture, and so on.
- Convenient. During data processing, the apply functions let you avoid explicit loops and speed things up with multiple worker processes (see the short example below); the training loop is easy to customize.
- Efficient. Switch to fp16, multi-GPU training, ZeRO optimization, and more without changing your code.
- Compatible. fastNLP supports several deep learning frameworks as backends.
> :warning: **To stay compatible with different deep learning frameworks, fastNLP 1.0.0 and later use a redesigned architecture and are therefore not fully backward compatible with earlier fastNLP releases;
> code based on older fastNLP versions needs some adjustment.**
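As a quick illustration of the apply-style preprocessing mentioned above, here is a minimal sketch (toy data; it assumes `apply_field` accepts `num_proc` the same way `apply_field_more` does in the examples further below):
```python
from fastNLP import DataSet

def split_words(sent):
    # tokenize one value of the field; defined at module level so it can be
    # dispatched to worker processes
    return sent.split()

dataset = DataSet({'raw_words': ["This is the first instance .",
                                 "Second instance .",
                                 "Third instance ."]})
# apply the function to every value of 'raw_words' and store the result in a
# new 'words' field, instead of writing an explicit loop over instances;
# num_proc > 1 runs the function in several processes (assumed supported here)
dataset.apply_field(split_words, field_name='raw_words',
                    new_field_name='words', num_proc=2)
print(dataset)
```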
## fastNLP Documentation
[Chinese documentation](http://www.fastnlp.top/docs/fastNLP/master/index.html)
## Installation
fastNLP depends on the following packages:
+ numpy>=1.14.2
+ torch>=1.0.0
+ tqdm>=4.28.1
+ nltk>=3.4.1
+ requests
+ spacy
+ prettytable>=0.7.2
The installation of torch may depend on your operating system and CUDA version; see the [PyTorch website](https://pytorch.org/) for details.
Once the dependencies are installed, run the following command on the command line to complete the installation:
fastNLP can be installed with the following command:
```shell
pip install fastNLP
python -m spacy download en
pip install fastNLP>=1.0.0alpha
```
To install an earlier version of fastNLP, specify the version number, for example:
```shell
pip install fastNLP==0.7.1
```
In addition, please install the deep learning framework you intend to use as the backend.
<details>
<summary>Pytorch</summary>
Below is a text-classification example using PyTorch; it requires torch>=1.6.0.
```python
from fastNLP.io import ChnSentiCorpLoader
from functools import partial
from fastNLP import cache_results
from fastNLP.transformers.torch import BertTokenizer
# 使用cache_results装饰器装饰函数将prepare_data的返回结果缓存到caches/cache.pkl再次运行时如果
# 该文件还存在,将自动读取缓存文件,而不再次运行预处理代码。
@cache_results('caches/cache.pkl')
def prepare_data():
# 会自动下载数据,并且可以通过文档看到返回的 dataset 应该是包含"raw_words"和"target"两个field的
data_bundle = ChnSentiCorpLoader().load()
# 使用tokenizer对数据进行tokenize
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')
tokenize = partial(tokenizer, max_length=256) # 限制数据的最大长度
data_bundle.apply_field_more(tokenize, field_name='raw_chars', num_proc=4) # 会新增"input_ids", "attention_mask"等field进入dataset中
data_bundle.apply_field(int, field_name='target', new_field_name='labels') # 将int函数应用到每个target上并且放入新的labels field中
return data_bundle
data_bundle = prepare_data()
print(data_bundle.get_dataset('train')[:4])
# 初始化model, optimizer
from fastNLP.transformers.torch import BertForSequenceClassification
from torch import optim
model = BertForSequenceClassification.from_pretrained('hfl/chinese-bert-wwm')
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# 准备dataloader
from fastNLP import prepare_dataloader
dls = prepare_dataloader(data_bundle, batch_size=32)
# 准备训练
from fastNLP import Trainer, Accuracy, LoadBestModelCallback, TorchWarmupCallback, Event
callbacks = [
TorchWarmupCallback(warmup=0.1, schedule='linear'), # 训练过程中调整学习率。
LoadBestModelCallback() # 将在训练结束之后加载性能最优的model
]
# 在训练特定时机加入一些操作, 不同时机能够获取到的参数不一样可以通过Trainer.on函数的文档查看每个时机的参数
@Trainer.on(Event.on_before_backward())
def print_loss(trainer, outputs):
if trainer.global_forward_batches % 10 == 0: # 每10个batch打印一次loss。
print(outputs.loss.item())
trainer = Trainer(model=model, train_dataloader=dls['train'], optimizers=optimizer,
device=0, evaluate_dataloaders=dls['dev'], metrics={'acc': Accuracy()},
callbacks=callbacks, monitor='acc#acc',n_epochs=5,
# Accuracy的update()函数需要predtarget两个参数它们实际对应的就是以下的field。
evaluate_input_mapping={'labels': 'target'}, # 在评测时将dataloader中会输入到模型的labels重新命名为target
evaluate_output_mapping={'logits': 'pred'} # 在评测时将model输出中的logits重新命名为pred
)
trainer.run()
# 在测试集合上进行评测
from fastNLP import Evaluator
evaluator = Evaluator(model=model, dataloaders=dls['test'], metrics={'acc': Accuracy()},
# Accuracy的update()函数需要predtarget两个参数它们实际对应的就是以下的field。
output_mapping={'logits': 'pred'},
input_mapping={'labels': 'target'})
evaluator.run()
```
## fastNLP Tutorials
Chinese [documentation](http://www.fastnlp.top/docs/fastNLP/) and [tutorials](http://www.fastnlp.top/docs/fastNLP/user/quickstart.html)
More material is available through the links below:
### Quick Start
- [Quick-1. Text Classification](http://www.fastnlp.top/docs/fastNLP/tutorials/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.html)
- [Quick-2. Sequence Labeling](http://www.fastnlp.top/docs/fastNLP/tutorials/%E5%BA%8F%E5%88%97%E6%A0%87%E6%B3%A8.html)
- [0. Get started with fastNLP and torch in 10 minutes](http://www.fastnlp.top/docs/fastNLP/master/tutorials/torch/fastnlp_torch_tutorial.html)
### Detailed Tutorials
- [1. Preprocessing text with DataSet](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_1_data_preprocess.html)
- [2. Converting text to indices with Vocabulary](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_2_vocabulary.html)
- [3. Turning text into vectors with the Embedding module](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_3_embedding.html)
- [4. Loading and processing datasets with Loader and Pipe](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_4_load_dataset.html)
- [5. Building a text classifier I: quick training and testing with Trainer and Tester](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_5_loss_optimizer.html)
- [6. Building a text classifier II: a custom training loop with DataSetIter](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html)
- [7. Evaluating your model quickly with Metric](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_7_metrics.html)
- [8. Building custom models quickly with Modules and Models](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_8_modules_models.html)
- [9. Customizing the training loop with Callback](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_9_callback.html)
### Extended Tutorials
- [Extend-1. The many uses of BertEmbedding](http://www.fastnlp.top/docs/fastNLP/tutorials/extend_1_bert_embedding.html)
- [Extend-2. A brief introduction to distributed training](http://www.fastnlp.top/docs/fastNLP/tutorials/extend_2_dist.html)
- [Extend-3. Using fitlog alongside fastNLP for research](http://www.fastnlp.top/docs/fastNLP/tutorials/extend_3_fitlog.html)
- [1. Basic usage of Trainer and Evaluator](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_0.html)
- [2. Basic usage of DataSet and Vocabulary](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_1.html)
- [3. Basic usage of DataBundle and Tokenizer](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_2.html)
- [4. Internals and basic usage of TorchDataloader](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_3.html)
- [5. Predefined models in fastNLP](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_4.html)
- [6. A deeper look at Trainer and Evaluator](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_4.html)
- [7. Combining fastNLP with paddle or jittor](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_5.html)
- [8. SST-2 classification with Bert + fine-tuning](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_e1.html)
- [9. SST-2 classification with Bert + prompt](http://www.fastnlp.top/docs/fastNLP/master/tutorials/basic/fastnlp_tutorial_e2.html)
## Built-in Components
</details>
Most neural networks used for NLP tasks can be viewed as a combination of word embeddings and two kinds of modules: encoders and decoders.
<details>
<summary>Paddle</summary>
Below is a text-classification example using paddle; it requires paddle>=2.2.0 and paddlenlp>=2.3.3.
Taking text classification as an example, the figure below shows the workflow of a BiLSTM+Attention text classifier:
```python
from fastNLP.io import ChnSentiCorpLoader
from functools import partial
# 会自动下载数据,并且可以通过文档看到返回的 dataset 应该是包含"raw_words"和"target"两个field的
data_bundle = ChnSentiCorpLoader().load()
# 使用tokenizer对数据进行tokenize
from paddlenlp.transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm')
tokenize = partial(tokenizer, max_length=256) # 限制一下最大长度
data_bundle.apply_field_more(tokenize, field_name='raw_chars', num_proc=4) # 会新增"input_ids", "attention_mask"等field进入dataset中
data_bundle.apply_field(int, field_name='target', new_field_name='labels') # 将int函数应用到每个target上并且放入新的labels field中
print(data_bundle.get_dataset('train')[:4])
# 初始化 model
from paddlenlp.transformers import BertForSequenceClassification, LinearDecayWithWarmup
from paddle import optimizer, nn
class SeqClsModel(nn.Layer):
def __init__(self, model_checkpoint, num_labels):
super(SeqClsModel, self).__init__()
self.num_labels = num_labels
self.bert = BertForSequenceClassification.from_pretrained(model_checkpoint)
def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
logits = self.bert(input_ids, token_type_ids, position_ids, attention_mask)
return logits
def train_step(self, input_ids, labels, token_type_ids=None, position_ids=None, attention_mask=None):
logits = self(input_ids, token_type_ids, position_ids, attention_mask)
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1, )))
return {
"logits": logits,
"loss": loss,
}
def evaluate_step(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
logits = self(input_ids, token_type_ids, position_ids, attention_mask)
return {
"logits": logits,
}
model = SeqClsModel('hfl/chinese-bert-wwm', num_labels=2)
# 准备dataloader
from fastNLP import prepare_dataloader
dls = prepare_dataloader(data_bundle, batch_size=16)
# 训练过程中调整学习率。
scheduler = LinearDecayWithWarmup(2e-5, total_steps=20 * len(dls['train']), warmup=0.1)
optimizer = optimizer.AdamW(parameters=model.parameters(), learning_rate=scheduler)
# 准备训练
from fastNLP import Trainer, Accuracy, LoadBestModelCallback, Event
callbacks = [
LoadBestModelCallback() # 将在训练结束之后加载性能最优的model
]
# 在训练特定时机加入一些操作, 不同时机能够获取到的参数不一样可以通过Trainer.on函数的文档查看每个时机的参数
@Trainer.on(Event.on_before_backward())
def print_loss(trainer, outputs):
if trainer.global_forward_batches % 10 == 0: # 每10个batch打印一次loss。
print(outputs["loss"].item())
trainer = Trainer(model=model, train_dataloader=dls['train'], optimizers=optimizer,
device=0, evaluate_dataloaders=dls['dev'], metrics={'acc': Accuracy()},
callbacks=callbacks, monitor='acc#acc',
# Accuracy的update()函数需要predtarget两个参数它们实际对应的就是以下的field。
evaluate_output_mapping={'logits': 'pred'},
evaluate_input_mapping={'labels': 'target'}
)
trainer.run()
# 在测试集合上进行评测
from fastNLP import Evaluator
evaluator = Evaluator(model=model, dataloaders=dls['test'], metrics={'acc': Accuracy()},
# Accuracy的update()函数需要predtarget两个参数它们实际对应的就是以下的field。
output_mapping={'logits': 'pred'},
input_mapping={'labels': 'target'})
evaluator.run()
```
More material is available through the links below:
### Quick Start
- [0. Get started with fastNLP and paddle in 10 minutes](http://www.fastnlp.top/docs/fastNLP/master/tutorials/torch/fastnlp_torch_tutorial.html)
### Detailed Tutorials
- [1. Chinese sentiment analysis with paddlenlp and fastNLP](http://www.fastnlp.top/docs/fastNLP/master/tutorials/paddle/fastnlp_tutorial_paddle_e1.html)
- [2. Training a Chinese reading-comprehension task with paddlenlp and fastNLP](http://www.fastnlp.top/docs/fastNLP/master/tutorials/paddle/fastnlp_tutorial_paddle_e2.html)
</details>
<details>
<summary>oneflow</summary>
</details>
![](./docs/source/figures/text_classification.png)
In the embeddings module, fastNLP ships with several kinds of embeddings: static embeddings (GloVe, word2vec), contextual embeddings
(ELMo, BERT), and character embeddings (CNN- or LSTM-based CharEmbedding).
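For example, the short sketch below mirrors the embedding tutorial further down in this changeset: it loads 50-dimensional GloVe vectors (downloaded on first use) for a small vocabulary and looks them up for one sentence.
```python
import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# load pre-trained 50-dimensional GloVe vectors for the words in the vocabulary
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')

# map the sentence to indices and look up the embeddings
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())  # torch.Size([1, 5, 50])
```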
At the same time, the modules package provides many components of both kinds, helping users quickly assemble the networks they need. The roles and common components of the two kinds of modules are:
<table>
<tr>
<td><b> Type </b></td>
<td><b> Role </b></td>
<td><b> Examples </b></td>
</tr>
<tr>
<td> encoder </td>
<td> encodes the input into vectors with representational power </td>
<td> Embedding, RNN, CNN, Transformer, ... </td>
</tr>
<tr>
<td> decoder </td>
<td> decodes a vector that carries some representation into the desired output form </td>
<td> MLP, CRF, ... </td>
</tr>
</table>
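To make the encoder/decoder split concrete, here is a rough sketch in plain PyTorch (not fastNLP's own modules; all sizes are arbitrary) of the embeddings → encoder → decoder layout the table describes:
```python
import torch
from torch import nn

class TinyTextClassifier(nn.Module):
    """embeddings -> encoder (LSTM) -> decoder (MLP), as in the table above."""
    def __init__(self, vocab_size=1000, embed_dim=50, hidden_dim=64, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)                  # word embeddings
        self.encoder = nn.LSTM(embed_dim, hidden_dim, batch_first=True)   # encode tokens into vectors
        self.decoder = nn.Sequential(nn.Linear(hidden_dim, hidden_dim),   # decode the representation
                                     nn.ReLU(),
                                     nn.Linear(hidden_dim, num_classes))  # into class scores

    def forward(self, words):                        # words: [batch_size, seq_len]
        hidden, _ = self.encoder(self.embed(words))  # hidden: [batch_size, seq_len, hidden_dim]
        return self.decoder(hidden[:, -1])           # classify from the last time step

logits = TinyTextClassifier()(torch.randint(0, 1000, (2, 7)))
print(logits.shape)  # torch.Size([2, 2])
```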
<details>
<summary>jittor</summary>
</details>
## Project Structure
<div align=center><img width="450" height="350" src="./docs/source/figures/workflow.png"/></div>
The overall workflow of fastNLP is shown in the figure above, and the project structure is as follows:
The fastNLP project structure is as follows:
<table>
<tr>
@ -135,4 +264,3 @@ fastNLP的大致工作流程如上图所示而项目结构如下
<hr>
*In memory of @FengZiYjun. May his soul rest in peace. We will miss you very very much!*


@ -1,5 +0,0 @@
ignore:
- "reproduction" # ignore folders and all its contents
- "setup.py"
- "docs"
- "tutorials"

docs/Makefile

@ -6,24 +6,35 @@ SPHINXOPTS =
SPHINXAPIDOC = sphinx-apidoc
SPHINXBUILD = sphinx-build
SPHINXPROJ = fastNLP
SPHINXEXCLUDE = ../fastNLP/transformers/*
SOURCEDIR = source
BUILDDIR = build
PORT = 8000
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS)
apidoc:
$(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ)
$(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) $(SPHINXEXCLUDE)
server:
cd build/html && python -m http.server
cd build/html && python -m http.server $(PORT)
delete:
rm -f source/$(SPHINXPROJ).* source/modules.rst && rm -rf build
web:
make html && make server
dev:
rm -f source/$(SPHINXPROJ).* source/modules.rst && rm -rf build && make apidoc && make html && make server
make delete && make apidoc && make html && make server
versions:
sphinx-multiversion "$(SOURCEDIR)" "$(BUILDDIR)" && cd build && python -m http.server $(PORT)
prod:
make apidoc && make html
make apidoc && make html
.PHONY: help Makefile

docs/README.md

@ -1,40 +0,0 @@
# Quick Start: Writing fastNLP Documentation
This guide is for fastNLP documentation authors, who fall into two groups: contributing developers and documentation maintainers. In most cases you belong to the former group
and only need to understand part of the overall setup.
## Contributing Developers
The fastNLP documentation is generated with [Sphinx](http://sphinx.pocoo.org/), which builds on the
[reStructuredText markup language](http://docutils.sourceforge.net/rst.html), and is built and hosted automatically by [Read the Docs](https://readthedocs.org/).
An ordinary developer only needs to write documentation that follows reStructuredText syntax and submit it through a [PR](https://help.github.com/en/articles/about-pull-requests)
to contribute to the fastNLP documentation.
If you want to build the documentation locally and write longer passages, install Sphinx and the sphinx-rtd-theme theme:
```bash
fastNLP/docs> pip install sphinx
fastNLP/docs> pip install sphinx-rtd-theme
```
Then run the `make dev` command in this directory (Linux and macOS only); you should see output like the following:
```bash
fastNLP/docs> make dev
rm -rf build/html && make html && make server
Running Sphinx v1.5.6
making output directory...
......
Build finished. The HTML pages are in build/html.
cd build/html && python -m http.server
Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
```
You can now open http://localhost:8000/ in your browser to read the documentation. If you are working on a remote server, use http://{server-ip}:8000/ instead;
port 8000 on the server must be reachable. If port 8000 is already taken on your machine or the remote server, the program falls back to 8001, 8002, and so on.
When you are done, press Control (Ctrl) + C to stop the process.
We list the reStructuredText constructs most commonly used in the fastNLP documentation [here](./source/user/example.rst) (use Raw mode when viewing it on the web);
reading it is a quick way to get started. Most of the fastNLP documentation is written inside the code and extracted by Sphinx.
## Documentation Maintainers
Documentation maintainers need to understand every target in the Makefile and to know that the current documentation structure
was obtained by manually editing the output of sphinx-apidoc.
Maintainers should keep raising the level of automation of the whole setup and make sure contributing developers do not break the overall structure of the documentation project.


@ -1,191 +0,0 @@
import inspect
import os
import sys
def _colored_string(string: str, color: str or int) -> str:
"""在终端中显示一串有颜色的文字
:param string: 在终端中显示的文字
:param color: 文字的颜色
:return:
"""
if isinstance(color, str):
color = {
"black": 30, "Black": 30, "BLACK": 30,
"red": 31, "Red": 31, "RED": 31,
"green": 32, "Green": 32, "GREEN": 32,
"yellow": 33, "Yellow": 33, "YELLOW": 33,
"blue": 34, "Blue": 34, "BLUE": 34,
"purple": 35, "Purple": 35, "PURPLE": 35,
"cyan": 36, "Cyan": 36, "CYAN": 36,
"white": 37, "White": 37, "WHITE": 37
}[color]
return "\033[%dm%s\033[0m" % (color, string)
def gr(string, flag):
if flag:
return _colored_string(string, "green")
else:
return _colored_string(string, "red")
def find_all_modules():
modules = {}
children = {}
to_doc = set()
root = '../fastNLP'
for path, dirs, files in os.walk(root):
for file in files:
if file.endswith('.py'):
name = ".".join(path.split('/')[1:])
if file.split('.')[0] != "__init__":
name = name + '.' + file.split('.')[0]
__import__(name)
m = sys.modules[name]
modules[name] = m
try:
m.__all__
except:
print(name, "__all__ missing")
continue
if m.__doc__ is None:
print(name, "__doc__ missing")
continue
if "undocumented" not in m.__doc__:
to_doc.add(name)
for module in to_doc:
t = ".".join(module.split('.')[:-1])
if t in to_doc:
if t not in children:
children[t] = set()
children[t].add(module)
for m in children:
children[m] = sorted(children[m])
return modules, to_doc, children
def create_rst_file(modules, name, children):
m = modules[name]
with open("./source/" + name + ".rst", "w") as fout:
t = "=" * len(name)
fout.write(name + "\n")
fout.write(t + "\n")
fout.write("\n")
fout.write(".. automodule:: " + name + "\n")
if name != "fastNLP.core" and len(m.__all__) > 0:
fout.write(" :members: " + ", ".join(m.__all__) + "\n")
short = name[len("fastNLP."):]
if not (short.startswith('models') or short.startswith('modules') or short.startswith('embeddings')):
fout.write(" :inherited-members:\n")
fout.write("\n")
if name in children:
fout.write("子模块\n------\n\n.. toctree::\n :maxdepth: 1\n\n")
for module in children[name]:
fout.write(" " + module + "\n")
def check_file(m, name):
names = name.split('.')
test_name = "test." + ".".join(names[1:-1]) + ".test_" + names[-1]
try:
__import__(test_name)
tm = sys.modules[test_name]
except ModuleNotFoundError:
tm = None
tested = tm is not None
funcs = {}
classes = {}
for item, obj in inspect.getmembers(m):
if inspect.isclass(obj) and obj.__module__ == name and not obj.__name__.startswith('_'):
this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm), {})
for i in dir(obj):
func = getattr(obj, i)
if inspect.isfunction(func) and not i.startswith('_'):
this[2][i] = (func.__doc__ is not None, False)
classes[obj.__name__] = this
if inspect.isfunction(obj) and obj.__module__ == name and not obj.__name__.startswith('_'):
this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm)) # docs
funcs[obj.__name__] = this
return funcs, classes
def check_files(modules, out=None):
for name in sorted(modules.keys()):
print(name, file=out)
funcs, classes = check_file(modules[name], name)
if out is None:
for f in funcs:
print("%-30s \t %s \t %s" % (f, gr("文档", funcs[f][0]), gr("测试", funcs[f][1])))
for c in classes:
print("%-30s \t %s \t %s" % (c, gr("文档", classes[c][0]), gr("测试", classes[c][1])))
methods = classes[c][2]
for f in methods:
print(" %-28s \t %s" % (f, gr("文档", methods[f][0])))
else:
for f in funcs:
if not funcs[f][0]:
print("缺少文档 %s" % (f), file=out)
if not funcs[f][1]:
print("缺少测试 %s" % (f), file=out)
for c in classes:
if not classes[c][0]:
print("缺少文档 %s" % (c), file=out)
if not classes[c][1]:
print("缺少测试 %s" % (c), file=out)
methods = classes[c][2]
for f in methods:
if not methods[f][0]:
print("缺少文档 %s" % (c + "." + f), file=out)
print(file=out)
def main_check():
sys.path.append("..")
print(_colored_string('Getting modules...', "Blue"))
modules, to_doc, children = find_all_modules()
print(_colored_string('Done!', "Green"))
print(_colored_string('Creating rst files...', "Blue"))
for name in to_doc:
create_rst_file(modules, name, children)
print(_colored_string('Done!', "Green"))
print(_colored_string('Checking all files...', "Blue"))
check_files(modules, out=open("results.txt", "w"))
print(_colored_string('Done!', "Green"))
def check_file_r(file_path):
with open(file_path) as fin:
content = fin.read()
index = -3
cuts = []
while index != -1:
index = content.find('"""',index+3)
cuts.append(index)
cuts = cuts[:-1]
assert len(cuts)%2 == 0
write_content = ""
last = 0
for i in range(len(cuts)//2):
start, end = cuts[i+i], cuts[i+i+1]
if content[start-1] == "r":
write_content += content[last:end+3]
else:
write_content += content[last:start] + "r"
write_content += content[start:end+3]
last = end + 3
write_content += content[last:]
with open(file_path, "w") as fout:
fout.write(write_content)
def add_r(base_path='../fastNLP'):
for path, _, files in os.walk(base_path):
for f in files:
if f.endswith(".py"):
check_file_r(os.path.abspath(os.path.join(path,f)))
# sys.exit(0)
if __name__ == "__main__":
add_r()


@ -1,4 +1,4 @@
sphinx==3.2.1
docutils==0.16
sphinx-rtd-theme==0.5.0
readthedocs-sphinx-search==0.1.0rc3
sphinx
sphinx_rtd_theme
sphinx_autodoc_typehints
sphinx-multiversion


@ -1,260 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BertEmbedding的各种用法\n",
"Bert自从在 BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding 中被提出后因其性能卓越受到了极大的关注在这里我们展示一下在fastNLP中如何使用Bert进行各类任务。其中中文Bert我们使用的模型的权重来自于 中文Bert预训练 。\n",
"\n",
"为了方便大家的使用fastNLP提供了预训练的Embedding权重及数据集的自动下载支持自动下载的Embedding和数据集见 数据集 。或您可从 使用Embedding模块将文本转成向量 与 使用Loader和Pipe加载并处理数据集 了解更多相关信息\n",
"\n",
"\n",
"下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。\n",
"\n",
"## 1. 使用Bert进行文本分类\n",
"\n",
"文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类\n",
"\n",
" *1, 商务大床房房间很大床有2M宽整体感觉经济实惠不错!*\n",
"\n",
"这里我们使用fastNLP提供自动下载的微博分类进行测试"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import WeiboSenti100kPipe\n",
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForSequenceClassification\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"import torch\n",
"\n",
"data_bundle =WeiboSenti100kPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"# 载入BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)\n",
"\n",
"# 载入模型\n",
"model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 训练模型\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model,\n",
" optimizer=Adam(model_params=model.parameters(), lr=2e-5),\n",
" loss=CrossEntropyLoss(), device=device,\n",
" batch_size=8, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n",
"trainer.train()\n",
"\n",
"# 测试结果\n",
"from fastNLP import Tester\n",
"\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用Bert进行命名实体识别\n",
"\n",
"命名实体识别是给定一句话标记出其中的实体。一般序列标注的任务都使用conll格式conll格式是至一行中通过制表符分隔不同的内容使用空行分隔 两句话,例如下面的例子\n",
"\n",
"```\n",
" 中 B-ORG\n",
" 共 I-ORG\n",
" 中 I-ORG\n",
" 央 I-ORG\n",
" 致 O\n",
" 中 B-ORG\n",
" 国 I-ORG\n",
" 致 I-ORG\n",
" 公 I-ORG\n",
" 党 I-ORG\n",
" 十 I-ORG\n",
" 一 I-ORG\n",
" 大 I-ORG\n",
" 的 O\n",
" 贺 O\n",
" 词 O\n",
"```\n",
"\n",
"这部分内容请参考 快速实现序列标注模型\n",
"\n",
"## 3. 使用Bert进行文本匹配\n",
"\n",
"文本匹配任务是指给定两句话判断他们的关系。比如,给定两句话判断前一句是否和后一句具有因果关系或是否是矛盾关系;或者给定两句话判断两句话是否 具有相同的意思。这里我们使用"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import CNXNLIBertPipe\n",
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForSentenceMatching\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"from fastNLP.core.optimizer import AdamW\n",
"from fastNLP.core.callback import WarmupCallback\n",
"from fastNLP import Tester\n",
"import torch\n",
"\n",
"data_bundle = CNXNLIBertPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"print(data_bundle)\n",
"\n",
"# 载入BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)\n",
"\n",
"# 载入模型\n",
"model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 训练模型\n",
"callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ]\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model,\n",
" optimizer=AdamW(params=model.parameters(), lr=4e-5),\n",
" loss=CrossEntropyLoss(), device=device,\n",
" batch_size=8, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=AccuracyMetric(), n_epochs=5, print_every=1,\n",
" update_every=8, callbacks=callbacks)\n",
"trainer.train()\n",
"\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. 使用Bert进行中文问答\n",
"\n",
"问答任务是给定一段内容,以及一个问题,需要从这段内容中找到答案。 例如:\n",
"\n",
"```\n",
"\"context\": \"锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常\n",
"用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及\n",
"作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合\n",
"相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单\n",
"皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大\n",
"钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师\n",
"傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼\n",
"和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:\",\n",
"\"question\": \"锣鼓经是什么?\",\n",
"\"answers\": [\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" },\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" },\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" }\n",
"]\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"您可以通过以下的代码训练 (原文代码:[CMRC2018](https://github.com/ymcui/cmrc2018) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForQuestionAnswering\n",
"from fastNLP.core.losses import CMRC2018Loss\n",
"from fastNLP.core.metrics import CMRC2018Metric\n",
"from fastNLP.io.pipe.qa import CMRC2018BertPipe\n",
"from fastNLP import Trainer, BucketSampler\n",
"from fastNLP import WarmupCallback, GradientClipCallback\n",
"from fastNLP.core.optimizer import AdamW\n",
"import torch\n",
"\n",
"data_bundle = CMRC2018BertPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"print(data_bundle)\n",
"\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True,\n",
" dropout=0.5, word_dropout=0.01)\n",
"model = BertForQuestionAnswering(embed)\n",
"loss = CMRC2018Loss()\n",
"metric = CMRC2018Metric()\n",
"\n",
"wm_callback = WarmupCallback(schedule='linear')\n",
"gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')\n",
"callbacks = [wm_callback, gc_callback]\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
"\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,\n",
" sampler=BucketSampler(seq_len_field_name='context_len'),\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric,\n",
" callbacks=callbacks, device=device, batch_size=6, num_workers=2, n_epochs=2, print_every=1,\n",
" test_use_tqdm=False, update_every=10)\n",
"trainer.train(load_best_model=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练结果(和论文中报道的基本一致):\n",
"\n",
"```\n",
" In Epoch:2/Step:1692, got best dev performance:\n",
" CMRC2018Metric: f1=85.61, em=66.08\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -1,292 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+------------------------------+---------------------------------------------+---------+\n",
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
"+------------------------------+---------------------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
" 'seq_len': [6, 3, 3]}\n",
"dataset = DataSet(data)\n",
"# 传入的dict的每个key的value应该为具有相同长度的list\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的构建"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet()\n",
"instance = Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6)\n",
"dataset.append(instance)\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet([\n",
" Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6),\n",
" Instance(raw_words=\"Second instance .\",\n",
" words=['Second', 'instance', '.'],\n",
" seq_len=3)\n",
" ])\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的删除"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----+---+\n",
"| a | c |\n",
"+----+---+\n",
"| -5 | 0 |\n",
"| -4 | 0 |\n",
"| -3 | 0 |\n",
"| -2 | 0 |\n",
"| -1 | 0 |\n",
"| 0 | 0 |\n",
"| 1 | 0 |\n",
"| 2 | 0 |\n",
"| 3 | 0 |\n",
"| 4 | 0 |\n",
"+----+---+"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+---+\n",
"| c |\n",
"+---+\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"+---+"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 不改变dataset生成一个删除了满足条件的instance的新 DataSet\n",
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
"# 在dataset中删除满足条件的instance\n",
"dataset.drop(lambda ins:ins['a']<0)\n",
"# 删除第3个instance\n",
"dataset.delete_instance(2)\n",
"# 删除名为'a'的field\n",
"dataset.delete_field('a')\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 简单的数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 检查是否存在名为'a'的field\n",
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
"# 将名为'a'的field改名为'b'\n",
"dataset.rename_field('c', 'b')\n",
"# DataSet的长度\n",
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+------------------------------+-------------------------------------------------+\n",
"| raw_words | words |\n",
"+------------------------------+-------------------------------------------------+\n",
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
"| Second instance . | ['Second', 'instance', '.'] |\n",
"| Third instance . | ['Third', 'instance', '.'] |\n",
"+------------------------------+-------------------------------------------------+"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
"dataset = DataSet(data)\n",
"\n",
"# 将句子分成单词形式, 详见DataSet.apply()方法\n",
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
"\n",
"# 或使用DataSet.apply_field()\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 除了匿名函数,也可以定义函数传递进去\n",
"def get_words(instance):\n",
" sentence = instance['raw_words']\n",
" words = sentence.split()\n",
" return words\n",
"dataset.apply(get_words, new_field_name='words')\n",
"dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -1,343 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的 Vocabulary\n",
"## 构建 Vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(['复', '旦', '大', '学']) # 加入新的字\n",
"vocab.add_word('上海') # `上海`会作为一个整体\n",
"vocab.to_index('复') # 应该会为3\n",
"vocab.to_index('我') # 会输出1Vocabulary中默认pad的index为0, unk(没有找到的词)的index为1\n",
"\n",
"# 在构建target的Vocabulary时词表中应该用不上pad和unk可以通过以下的初始化\n",
"vocab = Vocabulary(unknown=None, padding=None)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.add_word_lst(['positive', 'negative'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.to_index('positive')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 没有设置 unk 的情况"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ValueError",
"evalue": "word `neutral` not in vocabulary",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-c6d424040b45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvocab\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'neutral'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# 会报错因为没有unk这种情况\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36mto_index\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \"\"\"\n\u001b[0;32m--> 416\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 417\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrebuild\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munknown\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word `{}` not in vocabulary\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0m_check_build_vocab\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: word `neutral` not in vocabulary"
]
}
],
"source": [
"vocab.to_index('neutral') # 会报错因为没有unk这种情况"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 设置 unk 的情况"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, '<unk>')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary(unknown='<unk>', padding=None)\n",
"vocab.add_word_lst(['positive', 'negative'])\n",
"vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------------------------------------+--------+\n",
"| chars | target |\n",
"+---------------------------------------------------+--------+\n",
"| [4, 2, 2, 5, 6, 7, 3] | 0 |\n",
"| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |\n",
"+---------------------------------------------------+--------+\n"
]
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"dataset = DataSet({'chars': [\n",
" ['今', '天', '天', '气', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['neutral', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.from_dataset(dataset, field_name='chars')\n",
"vocab.index_dataset(dataset, field_name='chars')\n",
"\n",
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"target_vocab.from_dataset(dataset, field_name='target')\n",
"target_vocab.index_dataset(dataset, field_name='target')\n",
"print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['今', '天', '心', '情', '很']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"tr_data = DataSet({'chars': [\n",
" ['今', '天', '心', '情', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"dev_data = DataSet({'chars': [\n",
" ['住', '宿', '条', '件', '还', '不', '错'],\n",
" ['糟', '糕', '的', '天', '气', '', '无', '法', '出', '行', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"# 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。\n",
"vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▎ | 2.31M/63.5M [00:00<00:02, 22.9MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://212.129.155.247/embedding/glove.6B.50d.zip not found in cache, downloading to /tmp/tmpvziobj_e\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 63.5M/63.5M [00:01<00:00, 41.3MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish download from http://212.129.155.247/embedding/glove.6B.50d.zip\n",
"Copy file to /remote-home/ynzheng/.fastNLP/embedding/glove.6B.50d\n",
"Found 2 out of 6 words in the pre-training embedding.\n",
"tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, -0.7208, -0.2931, -0.7468, 0.6512,\n",
" 0.4730, -0.7401, 0.1877, -0.3828, -0.5590, 0.4295, -0.2698, -0.4238,\n",
" -0.3124, 1.3423, -0.7857, -0.6302, 0.9182, 0.2113, -0.5744, 1.4549,\n",
" 0.7546, -1.6165, -0.0085, 0.0029, 0.5130, -0.4745, 2.5306, 0.8594,\n",
" -0.3067, 0.0578, 0.6623, 0.2080, 0.6424, -0.5246, -0.0534, 1.1404,\n",
" -0.1370, -0.1836, 0.4546, -0.5096, -0.0255, -0.0286, 0.1805, -0.4483,\n",
" 0.4053, -0.3682]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1320, -0.2392, 0.1732, -0.2390, -0.0463, 0.0494, 0.0488, -0.0886,\n",
" 0.0224, -0.1300, 0.0369, 0.1800, 0.0750, -0.0183, 0.2264, 0.1628,\n",
" 0.1261, -0.1259, 0.1663, -0.1230, -0.1904, -0.0532, 0.1397, -0.0259,\n",
" -0.1799, 0.0226, 0.1858, 0.1981, 0.1338, 0.2394, 0.0248, 0.0203,\n",
" -0.1722, -0.1683, -0.1892, 0.0874, 0.0562, -0.0394, 0.0306, -0.1761,\n",
" 0.1015, -0.0171, 0.1172, 0.1357, 0.1519, -0.0011, 0.1572, 0.1265,\n",
" -0.2391, -0.0258]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, -0.2616, 0.2357, 0.1308, -0.0118,\n",
" 1.7659, 0.2078, 0.2620, -0.1643, -0.8464, 0.0201, 0.0702, 0.3978,\n",
" 0.1528, -0.2021, -1.6184, -0.5433, -0.1786, 0.5389, 0.4987, -0.1017,\n",
" 0.6626, -1.7051, 0.0572, -0.3241, -0.6683, 0.2665, 2.8420, 0.2684,\n",
" -0.5954, -0.5004, 1.5199, 0.0396, 1.6659, 0.9976, -0.5597, -0.7049,\n",
" -0.0309, -0.2830, -0.1356, 0.6429, 0.4149, 1.2362, 0.7659, 0.9780,\n",
" 0.5851, -0.3018]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word('train')\n",
"vocab.add_word('only_in_train') # 仅在train出现但肯定在预训练词表中不存在\n",
"vocab.add_word('test', no_create_entry=True) # 该词只在dev或test中出现\n",
"vocab.add_word('only_in_test', no_create_entry=True) # 这个词在预训练的词表中找不到\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(embed(torch.LongTensor([vocab.to_index('train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('test')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -1,524 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"torch.Size([1, 5, 50])\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
"print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 5, 30])\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"from fastNLP.embeddings import ElmoEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 512])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
"print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 1536])\n"
]
}
],
"source": [
"# 使用后面两层的输出\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
"print(embed(words).size()) # 结果将是在最后一维做拼接"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 7, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
"print(embed(words).size()) # 结果将在序列维度上增加2\n",
"# 取出句子的cls表示\n",
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 10 words out of 10.\n",
"torch.Size([1, 9, 768])\n"
]
}
],
"source": [
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import CNNCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50返回的embedding结果维度大小为64。\n",
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import LSTMCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50返回的embedding结果维度大小为64。\n",
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"50\n",
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"30\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"256\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"512\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"768\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"1536\n",
"80\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(static_embed.embedding_dim) # 50\n",
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
"print(char_embed.embedding_dim) # 30\n",
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
"print(elmo_embed_1.embedding_dim) # 256\n",
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
"print(elmo_embed_2.embedding_dim) # 512\n",
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_1.embedding_dim) # 768\n",
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_2.embedding_dim) # 1536\n",
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
"print(stack_embed.embedding_dim) # 80"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
"embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练词向量时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 out of 4 words have frequency less than 2.\n",
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 out of 5 words have frequency less than 2.\n",
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,309 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Loader和Pipe加载并处理数据集\n",
"\n",
"这一部分是关于如何加载数据集的教程\n",
"\n",
"## Part I: 数据集容器DataBundle\n",
"\n",
"而由于对于同一个任务训练集验证集和测试集会共用同一个词表以及具有相同的目标值所以在fastNLP中我们使用了 DataBundle 来承载同一个任务的多个数据集 DataSet 以及它们的词表 Vocabulary 。下面会有例子介绍 DataBundle 的相关使用。\n",
"\n",
"DataBundle 在fastNLP中主要在各个 Loader 和 Pipe 中被使用。 下面我们先介绍一下 Loader 和 Pipe 。\n",
"\n",
"## Part II: 加载的各种数据集的Loader\n",
"\n",
"在fastNLP中所有的 Loader 都可以通过其文档判断其支持读取的数据格式,以及读取之后返回的 DataSet 的格式, 例如 ChnSentiCorpLoader \n",
"\n",
"- download() 函数:自动将该数据集下载到缓存地址,默认缓存地址为~/.fastNLP/datasets/。由于版权等原因不是所有的Loader都实现了该方法。该方法会返回下载后文件所处的缓存地址。\n",
"\n",
"- _load() 函数:从一个数据文件中读取数据,返回一个 DataSet 。返回的DataSet的格式可从Loader文档判断。\n",
"\n",
"- load() 函数:从文件或者文件夹中读取数据为 DataSet 并将它们组装成 DataBundle。支持接受的参数类型有以下的几种\n",
"\n",
" - None, 将尝试读取自动缓存的数据仅支持提供了自动下载数据的Loader\n",
" - 文件夹路径, 默认将尝试在该文件夹下匹配文件名中含有 train , test , dev 的文件,如果有多个文件含有相同的关键字,将无法通过该方式读取\n",
" - dict, 例如{'train':\"/path/to/tr.conll\", 'dev':\"/to/validate.conll\", \"test\":\"/to/te.conll\"}。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1944 instances.\n",
"\ttrain has 17196 instances.\n",
"\tdev has 1858 instances.\n",
"\n"
]
}
],
"source": [
"from fastNLP.io import CWSLoader\n",
"\n",
"loader = CWSLoader(dataset_name='pku')\n",
"data_bundle = loader.load()\n",
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里表示一共有3个数据集。其中\n",
"\n",
" 3个数据集的名称分别为train、dev、test分别有17223、1831、1944个instance\n",
"\n",
"也可以取出DataSet并打印DataSet中的具体内容"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------------------------------------------------------------+\n",
"| raw_words |\n",
"+----------------------------------------------------------------+\n",
"| 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ... |\n",
"| 中共中央 总书记 、 国家 主席 江 泽民 |\n",
"+----------------------------------------------------------------+\n"
]
}
],
"source": [
"tr_data = data_bundle.get_dataset('train')\n",
"print(tr_data[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part III: 使用Pipe对数据集进行预处理\n",
"\n",
"通过 Loader 可以将文本数据读入,但并不能直接被神经网络使用,还需要进行一定的预处理。\n",
"\n",
"在fastNLP中我们使用 Pipe 的子类作为数据预处理的类, Loader 和 Pipe 一般具备一一对应的关系,该关系可以从其名称判断, 例如 CWSLoader 与 CWSPipe 是一一对应的。一般情况下Pipe处理包含以下的几个过程\n",
"1. 将raw_words或 raw_chars进行tokenize以切分成不同的词或字; \n",
"2. 再建立词或字的 Vocabulary , 并将词或字转换为index; \n",
"3. 将target 列建立词表并将target列转为index;\n",
"\n",
"所有的Pipe都可通过其文档查看该Pipe支持处理的 DataSet 以及返回的 DataBundle 中的Vocabulary的情况; 如 OntoNotesNERPipe\n",
"\n",
"各种数据集的Pipe当中都包含了以下的两个函数:\n",
"\n",
"- process() 函数:对输入的 DataBundle 进行处理, 然后返回处理之后的 DataBundle 。process函数的文档中包含了该Pipe支持处理的DataSet的格式。\n",
"- process_from_file() 函数输入数据集所在文件夹使用对应的Loader读取数据(所以该函数支持的参数类型是由于其对应的Loader的load函数决定的)然后调用相对应的process函数对数据进行预处理。相当于是把Load和process放在一个函数中执行。\n",
"\n",
"接着上面 CWSLoader 的例子,我们展示一下 CWSPipe 的功能:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1944 instances.\n",
"\ttrain has 17196 instances.\n",
"\tdev has 1858 instances.\n",
"In total 2 vocabs:\n",
"\tchars has 4777 entries.\n",
"\ttarget has 4 entries.\n",
"\n"
]
}
],
"source": [
"from fastNLP.io import CWSPipe\n",
"\n",
"data_bundle = CWSPipe().process(data_bundle)\n",
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"表示一共有3个数据集和2个词表。其中\n",
"\n",
"- 3个数据集的名称分别为train、dev、test分别有17223、1831、1944个instance\n",
"- 2个词表分别为chars词表与target词表。其中chars词表为句子文本所构建的词表一共有4777个不同的字target词表为目标标签所构建的词表一共有4种标签。\n",
"\n",
"相较于之前CWSLoader读取的DataBundle新增了两个Vocabulary。 我们可以打印一下处理之后的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------+---------------------+---------------------+---------+\n",
"| raw_words | chars | target | seq_len |\n",
"+---------------------+---------------------+---------------------+---------+\n",
"| 迈向 充满 希望... | [1224, 178, 674,... | [0, 1, 0, 1, 0, ... | 29 |\n",
"| 中共中央 总书记... | [11, 212, 11, 33... | [0, 3, 3, 1, 0, ... | 15 |\n",
"+---------------------+---------------------+---------------------+---------+\n"
]
}
],
"source": [
"tr_data = data_bundle.get_dataset('train')\n",
"print(tr_data[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到有两列为int的field: chars和target。这两列的名称同时也是DataBundle中的Vocabulary的名称。可以通过下列的代码获取并查看Vocabulary的 信息"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocabulary(['B', 'E', 'S', 'M']...)\n"
]
}
],
"source": [
"vocab = data_bundle.get_vocab('target')\n",
"print(vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part IV: fastNLP封装好的Loader和Pipe\n",
"\n",
"fastNLP封装了多种任务/数据集的 Loader 和 Pipe 并提供自动下载功能,具体参见文档 [数据集](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0)\n",
"\n",
"## Part V: 不同格式类型的基础Loader\n",
"\n",
"除了上面提到的针对具体任务的Loader我们还提供了CSV格式和JSON格式的Loader\n",
"\n",
"**CSVLoader** 读取CSV类型的数据集文件。例子如下\n",
"\n",
"```python\n",
"from fastNLP.io.loader import CSVLoader\n",
"data_set_loader = CSVLoader(\n",
" headers=('raw_words', 'target'), sep='\\t'\n",
")\n",
"```\n",
"\n",
"表示将CSV文件中每一行的第一项将填入'raw_words' field第二项填入'target' field。其中项之间由'\\t'分割开来\n",
"\n",
"```python\n",
"data_set = data_set_loader._load('path/to/your/file')\n",
"```\n",
"\n",
"文件内容样例如下\n",
"\n",
"```csv\n",
"But it does not leave you with much . 1\n",
"You could hate it for the same reason . 1\n",
"The performances are an absolute joy . 4\n",
"```\n",
"\n",
"读取之后的DataSet具有以下的field\n",
"\n",
"| raw_words | target |\n",
"| --------------------------------------- | ------ |\n",
"| But it does not leave you with much . | 1 |\n",
"| You could hate it for the same reason . | 1 |\n",
"| The performances are an absolute joy . | 4 |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**JsonLoader** 读取Json类型的数据集文件数据必须按行存储每行是一个包含各类属性的Json对象。例子如下\n",
"\n",
"```python\n",
"from fastNLP.io.loader import JsonLoader\n",
"loader = JsonLoader(\n",
" fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'}\n",
")\n",
"```\n",
"\n",
"表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'raw_words1'、'raw_words2'、'target'这三个fields\n",
"\n",
"```python\n",
"data_set = loader._load('path/to/your/file')\n",
"```\n",
"\n",
"数据集内容样例如下\n",
"```\n",
"{\"annotator_labels\": [\"neutral\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"neutral\", ... }\n",
"{\"annotator_labels\": [\"contradiction\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"contradiction\", ... }\n",
"{\"annotator_labels\": [\"entailment\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"entailment\", ... }\n",
"```\n",
"\n",
"读取之后的DataSet具有以下的field\n",
"\n",
"| raw_words0 | raw_words1 | target |\n",
"| ------------------------------------------------------ | ------------------------------------------------- | ------------- |\n",
"| A person on a horse jumps over a broken down airplane. | A person is training his horse for a competition. | neutral |\n",
"| A person on a horse jumps over a broken down airplane. | A person is at a diner, ordering an omelette. | contradiction |\n",
"| A person on a horse jumps over a broken down airplane. | A person is outdoors, on a horse. | entailment |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,603 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Trainer和Tester快速训练和测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据读入和处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/remote-home/ynzheng/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/io/loader/classification.py:340: UserWarning: SST2's test file has no target.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1821 instances.\n",
"\ttrain has 67349 instances.\n",
"\tdev has 872 instances.\n",
"In total 2 vocabs:\n",
"\twords has 16292 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| raw_words | target | words | seq_len |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| hide new secretions from the p... | 1 | [4110, 97, 12009, 39, 2, 6843,... | 7 |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)\n"
]
}
],
"source": [
"from fastNLP.io import SST2Pipe\n",
"\n",
"pipe = SST2Pipe()\n",
"databundle = pipe.process_from_file()\n",
"vocab = databundle.get_vocab('words')\n",
"print(databundle)\n",
"print(databundle.get_dataset('train')[0])\n",
"print(databundle.get_vocab('words'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4925 872 75\n"
]
}
],
"source": [
"train_data = databundle.get_dataset('train')[:5000]\n",
"train_data, test_data = train_data.split(0.015)\n",
"dev_data = databundle.get_dataset('dev')\n",
"print(len(train_data),len(dev_data),len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------+--------+-------+---------+\n",
"| field_names | raw_words | target | words | seq_len |\n",
"+-------------+-----------+--------+-------+---------+\n",
"| is_input | False | False | True | True |\n",
"| is_target | False | True | False | False |\n",
"| ignore_type | | False | False | False |\n",
"| pad_value | | 0 | 0 | 0 |\n",
"+-------------+-----------+--------+-------+---------+\n"
]
},
{
"data": {
"text/plain": [
"<prettytable.PrettyTable at 0x7f49ec540160>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.print_field_meta()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用内置模型训练"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.models import CNNText\n",
"\n",
"#词嵌入的维度\n",
"EMBED_DIM = 100\n",
"\n",
"#使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数\n",
"#还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值\n",
"model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"from fastNLP import Const\n",
"\n",
"# metrics=AccuracyMetric() 在本例中与下面这行代码等价\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import CrossEntropyLoss\n",
"\n",
"# loss = CrossEntropyLoss() 在本例中与下面这行代码等价\n",
"loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# 这表示构建了一个损失函数类由func计算损失函数其中将从模型返回值或者DataSet的target=True的field\n",
"# 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数\n",
"# 传入func作为一个名为`target`的参数\n",
"#下面自己构建了一个交叉熵函数和之后直接使用fastNLP中的交叉熵函数是一个效果\n",
"import torch\n",
"from fastNLP import LossFunc\n",
"func = torch.nn.functional.cross_entropy\n",
"loss_func = LossFunc(func, input=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import torch.optim as optim\n",
"\n",
"#使用 torch.optim 定义优化器\n",
"optimizer=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 4]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-11-31-25\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=3080.0), HTML(value='')), layout=Layout(d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.75 seconds!\n",
"\r",
"Evaluation on dev at Epoch 1/10. Step:308/3080: \n",
"\r",
"AccuracyMetric: acc=0.751147\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.83 seconds!\n",
"\r",
"Evaluation on dev at Epoch 2/10. Step:616/3080: \n",
"\r",
"AccuracyMetric: acc=0.755734\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.32 seconds!\n",
"\r",
"Evaluation on dev at Epoch 3/10. Step:924/3080: \n",
"\r",
"AccuracyMetric: acc=0.758028\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.88 seconds!\n",
"\r",
"Evaluation on dev at Epoch 4/10. Step:1232/3080: \n",
"\r",
"AccuracyMetric: acc=0.741972\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.96 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:1540/3080: \n",
"\r",
"AccuracyMetric: acc=0.728211\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.87 seconds!\n",
"\r",
"Evaluation on dev at Epoch 6/10. Step:1848/3080: \n",
"\r",
"AccuracyMetric: acc=0.755734\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.04 seconds!\n",
"\r",
"Evaluation on dev at Epoch 7/10. Step:2156/3080: \n",
"\r",
"AccuracyMetric: acc=0.732798\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.57 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:2464/3080: \n",
"\r",
"AccuracyMetric: acc=0.747706\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.48 seconds!\n",
"\r",
"Evaluation on dev at Epoch 9/10. Step:2772/3080: \n",
"\r",
"AccuracyMetric: acc=0.732798\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.48 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:3080/3080: \n",
"\r",
"AccuracyMetric: acc=0.740826\n",
"\n",
"\r\n",
"In Epoch:3/Step:924, got best dev performance:\n",
"AccuracyMetric: acc=0.758028\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.758028}},\n",
" 'best_epoch': 3,\n",
" 'best_step': 924,\n",
" 'seconds': 160.58}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"\n",
"#训练的轮数和batch size\n",
"N_EPOCHS = 10\n",
"BATCH_SIZE = 16\n",
"\n",
"#如果在定义trainer的时候没有传入optimizer参数模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3\n",
"#这里只使用了loss作为损失函数输入感兴趣可以尝试其他损失函数如之前自定义的loss_func作为输入\n",
"trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,\n",
"optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.43 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.773333\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.773333}}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"\n",
"tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,681 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Trainer和Tester快速训练和测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据读入和处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/remote-home/ynzheng/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/io/loader/classification.py:340: UserWarning: SST2's test file has no target.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1821 instances.\n",
"\ttrain has 67349 instances.\n",
"\tdev has 872 instances.\n",
"In total 2 vocabs:\n",
"\twords has 16292 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| raw_words | target | words | seq_len |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| hide new secretions from the p... | 1 | [4110, 97, 12009, 39, 2, 6843,... | 7 |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)\n"
]
}
],
"source": [
"from fastNLP.io import SST2Pipe\n",
"\n",
"pipe = SST2Pipe()\n",
"databundle = pipe.process_from_file()\n",
"vocab = databundle.get_vocab('words')\n",
"print(databundle)\n",
"print(databundle.get_dataset('train')[0])\n",
"print(databundle.get_vocab('words'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4925 872 75\n"
]
}
],
"source": [
"train_data = databundle.get_dataset('train')[:5000]\n",
"train_data, test_data = train_data.split(0.015)\n",
"dev_data = databundle.get_dataset('dev')\n",
"print(len(train_data),len(dev_data),len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------+--------+-------+---------+\n",
"| field_names | raw_words | target | words | seq_len |\n",
"+-------------+-----------+--------+-------+---------+\n",
"| is_input | False | False | True | True |\n",
"| is_target | False | True | False | False |\n",
"| ignore_type | | False | False | False |\n",
"| pad_value | | 0 | 0 | 0 |\n",
"+-------------+-----------+--------+-------+---------+\n"
]
},
{
"data": {
"text/plain": [
"<prettytable.PrettyTable at 0x7f0db03d0640>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.print_field_meta()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"from fastNLP import Const\n",
"\n",
"# metrics=AccuracyMetric() 在本例中与下面这行代码等价\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSetIter初探"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],\n",
" [15618, 3204, 5, 1675, 0]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],\n",
" [ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 12])}\n",
"batch_y: {'target': tensor([0, 1])}\n"
]
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import DataSetIter\n",
"\n",
"tmp_data = dev_data[:10]\n",
"# 定义一个Batch传入DataSet规定batch_size和去batch的规则。\n",
"# 顺序Sequential随机Random相似长度组成一个batchBucket\n",
"sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
" -1, -1, -1]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],\n",
" [15618, 3204, 5, 1675, -1]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n",
"batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],\n",
" [ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, -1, -1, -1, -1, -1, -1, -1, -1]]), 'seq_len': tensor([20, 12])}\n",
"batch_y: {'target': tensor([0, 1])}\n"
]
}
],
"source": [
"tmp_data.set_pad_val('words',-1)\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [15618, 3204, 5, 1675, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n"
]
}
],
"source": [
"from fastNLP.core.field import Padder\n",
"import numpy as np\n",
"class FixLengthPadder(Padder):\n",
" def __init__(self, pad_val=0, length=None):\n",
" super().__init__(pad_val=pad_val)\n",
" self.length = length\n",
" assert self.length is not None, \"Creating FixLengthPadder with no specific length!\"\n",
"\n",
" def __call__(self, contents, field_name, field_ele_dtype, dim):\n",
" #计算当前contents中的最大长度\n",
" max_len = max(map(len, contents))\n",
" #如果当前contents中的最大长度大于指定的padder length的话就报错\n",
" assert max_len <= self.length, \"Fixed padder length smaller than actual length! with length {}\".format(max_len)\n",
" array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype)\n",
" for i, content_i in enumerate(contents):\n",
" array[i, :len(content_i)] = content_i\n",
" return array\n",
"\n",
"#设定FixLengthPadder的固定长度为40\n",
"tmp_padder = FixLengthPadder(pad_val=0,length=40)\n",
"#利用dataset的set_padder函数设定words field的padder\n",
"tmp_data.set_padder('words',tmp_padder)\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用DataSetIter自己编写训练过程\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----start training-----\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.68 seconds!\n",
"Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.708716 29307ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.38 seconds!\n",
"Epoch 1 Avg Loss: 0.41 AccuracyMetric: acc=0.770642 52200ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.51 seconds!\n",
"Epoch 2 Avg Loss: 0.16 AccuracyMetric: acc=0.747706 70268ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.96 seconds!\n",
"Epoch 3 Avg Loss: 0.06 AccuracyMetric: acc=0.741972 90349ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.04 seconds!\n",
"Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.740826 114250ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"Epoch 5 Avg Loss: 0.02 AccuracyMetric: acc=0.738532 134742ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.65 seconds!\n",
"Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.731651 154503ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.738532 175397ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.36 seconds!\n",
"Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.733945 192384ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.84 seconds!\n",
"Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.744266 214417ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.04 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.786667\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.786667}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import DataSetIter\n",
"from fastNLP.models import CNNText\n",
"from fastNLP import Tester\n",
"import torch\n",
"import time\n",
"\n",
"embed_dim = 100\n",
"model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)\n",
"\n",
"def train(epoch, data, devdata):\n",
" optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
" lossfunc = torch.nn.CrossEntropyLoss()\n",
" batch_size = 32\n",
"\n",
" # 定义一个Batch传入DataSet规定batch_size和去batch的规则。\n",
" # 顺序Sequential随机Random相似长度组成一个batchBucket\n",
" train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n",
" train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)\n",
"\n",
" start_time = time.time()\n",
" print(\"-\"*5+\"start training\"+\"-\"*5)\n",
" for i in range(epoch):\n",
" loss_list = []\n",
" for batch_x, batch_y in train_batch:\n",
" optimizer.zero_grad()\n",
" output = model(batch_x['words'])\n",
" loss = lossfunc(output['pred'], batch_y['target'])\n",
" loss.backward()\n",
" optimizer.step()\n",
" loss_list.append(loss.item())\n",
"\n",
" #这里verbose如果为0在调用Tester对象的test()函数时不输出任何信息,返回评估信息; 如果为1打印出验证结果返回评估信息\n",
" #在调用过Tester对象的test()函数后调用其_format_eval_results(res)函数,结构化输出验证结果\n",
" tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)\n",
" res=tester_tmp.test()\n",
"\n",
" print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n",
" print(tester_tmp._format_eval_results(res),end=\" \")\n",
" print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n",
" loss_list.clear()\n",
"\n",
"train(10, train_data, dev_data)\n",
"#使用tester进行快速测试\n",
"tester = Tester(test_data, model, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,622 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用 Callback 自定义你的训练过程"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 什么是 Callback\n",
"- 使用 Callback \n",
"- 一些常用的 Callback\n",
"- 自定义实现 Callback"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"什么是Callback\n",
"------\n",
"\n",
"Callback 是与 Trainer 紧密结合的模块,利用 Callback 可以在 Trainer 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。\n",
"\n",
"fastNLP 中提供了很多常用的 Callback ,开箱即用。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"使用 Callback\n",
" ------\n",
"\n",
"使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-17T07:34:46.465871Z",
"start_time": "2019-09-17T07:34:30.648758Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1200 instances.\n",
"\ttrain has 9600 instances.\n",
"\tdev has 1200 instances.\n",
"In total 2 vocabs:\n",
"\tchars has 4409 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"training epochs started 2019-09-17-03-34-34\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on dev at Epoch 1/3. Step:300/900: \n",
"AccuracyMetric: acc=0.863333\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.11 seconds!\n",
"Evaluation on dev at Epoch 2/3. Step:600/900: \n",
"AccuracyMetric: acc=0.886667\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on dev at Epoch 3/3. Step:900/900: \n",
"AccuracyMetric: acc=0.890833\n",
"\n",
"\r\n",
"In Epoch:3/Step:900, got best dev performance:\n",
"AccuracyMetric: acc=0.890833\n",
"Reloaded the best model.\n"
]
}
],
"source": [
"from fastNLP import (Callback, EarlyStopCallback,\n",
" Trainer, CrossEntropyLoss, AccuracyMetric)\n",
"from fastNLP.models import CNNText\n",
"import torch.cuda\n",
"\n",
"# prepare data\n",
"def get_data():\n",
" from fastNLP.io import ChnSentiCorpPipe as pipe\n",
" data = pipe().process_from_file()\n",
" print(data)\n",
" data.rename_field('chars', 'words')\n",
" train_data = data.datasets['train']\n",
" dev_data = data.datasets['dev']\n",
" test_data = data.datasets['test']\n",
" vocab = data.vocabs['words']\n",
" tgt_vocab = data.vocabs['target']\n",
" return train_data, dev_data, test_data, vocab, tgt_vocab\n",
"\n",
"# prepare model\n",
"train_data, dev_data, _, vocab, tgt_vocab = get_data()\n",
"device = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n",
"model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))\n",
"\n",
"# define callback\n",
"callbacks=[EarlyStopCallback(5)]\n",
"\n",
"# pass callbacks to Trainer\n",
"def train_with_callback(cb_list):\n",
" trainer = Trainer(\n",
" device=device,\n",
" n_epochs=3,\n",
" model=model, \n",
" train_data=train_data, \n",
" dev_data=dev_data, \n",
" loss=CrossEntropyLoss(), \n",
" metrics=AccuracyMetric(), \n",
" callbacks=cb_list, \n",
" check_code_level=-1\n",
" )\n",
" trainer.train()\n",
"\n",
"train_with_callback(callbacks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"fastNLP 中的 Callback\n",
"-------\n",
"fastNLP 中提供了很多常用的 Callback如梯度裁剪训练时早停和测试验证集fitlog 等等。具体 Callback 请参考 fastNLP.core.callbacks"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-17T07:35:02.182727Z",
"start_time": "2019-09-17T07:34:49.443863Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training epochs started 2019-09-17-03-34-49\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.13 seconds!\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.12 seconds!\n",
"Evaluation on data-test:\n",
"AccuracyMetric: acc=0.890833\n",
"Evaluation on dev at Epoch 1/3. Step:300/900: \n",
"AccuracyMetric: acc=0.890833\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.09 seconds!\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.09 seconds!\n",
"Evaluation on data-test:\n",
"AccuracyMetric: acc=0.8875\n",
"Evaluation on dev at Epoch 2/3. Step:600/900: \n",
"AccuracyMetric: acc=0.8875\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.11 seconds!\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on data-test:\n",
"AccuracyMetric: acc=0.885\n",
"Evaluation on dev at Epoch 3/3. Step:900/900: \n",
"AccuracyMetric: acc=0.885\n",
"\n",
"\r\n",
"In Epoch:1/Step:300, got best dev performance:\n",
"AccuracyMetric: acc=0.890833\n",
"Reloaded the best model.\n"
]
}
],
"source": [
"from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback\n",
"callbacks = [\n",
" EarlyStopCallback(5),\n",
" GradientClipCallback(clip_value=5, clip_type='value'),\n",
" EvaluateCallback(dev_data)\n",
"]\n",
"\n",
"train_with_callback(callbacks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"自定义 Callback\n",
"------\n",
"\n",
"这里我们以一个简单的 Callback作为例子它的作用是打印每一个 Epoch 平均训练 loss。\n",
"\n",
"#### 创建 Callback\n",
" \n",
"要自定义 Callback我们要实现一个类继承 fastNLP.Callback。\n",
"\n",
"这里我们定义 MyCallBack ,继承 fastNLP.Callback 。\n",
"\n",
"#### 指定 Callback 调用的阶段\n",
" \n",
"Callback 中所有以 on_ 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用on_epoch_end() 会在每个 epoch 结束时调用。 具体有哪些类方法,参见 Callback 文档。\n",
"\n",
"这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录当前 loss ,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。\n",
"\n",
"#### 使用 Callback 的属性访问 Trainer 的内部信息\n",
" \n",
"为了方便使用,可以使用 Callback 的属性,访问 Trainer 中的对应信息,如 optimizer, epoch, n_epochs分别对应训练时的优化器当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见文档 Callback 。\n",
"\n",
"这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步数,可以通过 self.step 属性得到当前训练了多少步。\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-17T07:43:10.907139Z",
"start_time": "2019-09-17T07:42:58.488177Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training epochs started 2019-09-17-03-42-58\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.11 seconds!\n",
"Evaluation on dev at Epoch 1/3. Step:300/900: \n",
"AccuracyMetric: acc=0.883333\n",
"\n",
"Avg loss at epoch 1, 0.100254\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on dev at Epoch 2/3. Step:600/900: \n",
"AccuracyMetric: acc=0.8775\n",
"\n",
"Avg loss at epoch 2, 0.183511\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.13 seconds!\n",
"Evaluation on dev at Epoch 3/3. Step:900/900: \n",
"AccuracyMetric: acc=0.875833\n",
"\n",
"Avg loss at epoch 3, 0.257103\n",
"\r\n",
"In Epoch:1/Step:300, got best dev performance:\n",
"AccuracyMetric: acc=0.883333\n",
"Reloaded the best model.\n"
]
}
],
"source": [
"from fastNLP import Callback\n",
"from fastNLP import logger\n",
"\n",
"class MyCallBack(Callback):\n",
" \"\"\"Print average loss in each epoch\"\"\"\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.total_loss = 0\n",
" self.start_step = 0\n",
" \n",
" def on_backward_begin(self, loss):\n",
" self.total_loss += loss.item()\n",
" \n",
" def on_epoch_end(self):\n",
" n_steps = self.step - self.start_step\n",
" avg_loss = self.total_loss / n_steps\n",
" logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)\n",
" self.start_step = self.step\n",
"\n",
"callbacks = [MyCallBack()]\n",
"train_with_callback(callbacks)"
]
},
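The same attribute mechanism can be used for other bookkeeping. Below is a minimal illustrative sketch (the LogLRCallback name is ad hoc, not part of fastNLP) of a callback that reads the current learning rate from self.optimizer at the end of each epoch, assuming the optimizer follows the torch.optim convention of exposing param_groups:

    from fastNLP import Callback
    from fastNLP import logger

    class LogLRCallback(Callback):
        """Log the optimizer's learning rate(s) at the end of every epoch (sketch)."""
        def on_epoch_end(self):
            # self.optimizer, self.epoch and self.n_epochs are Trainer attributes
            # exposed by fastNLP.Callback; param_groups is assumed to follow torch.optim.
            lrs = [group['lr'] for group in self.optimizer.param_groups]
            logger.info('Epoch %d/%d finished, learning rate(s): %s',
                        self.epoch, self.n_epochs, lrs)

    # It could be combined with the callback above, e.g.:
    # train_with_callback([MyCallBack(), LogLRCallback()])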
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,912 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 序列标注\n",
"\n",
"这一部分的内容主要展示如何使用fastNLP实现序列标注(Sequence labeling)任务。您可以使用fastNLP的各个组件快捷方便地完成序列标注任务达到出色的效果。 在阅读这篇教程前希望您已经熟悉了fastNLP的基础使用尤其是数据的载入以及模型的构建通过这个小任务的能让您进一步熟悉fastNLP的使用。\n",
"\n",
"## 命名实体识别(name entity recognition, NER)\n",
"\n",
"命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。 如下面的例子中\n",
"\n",
"*我来自复旦大学*\n",
"\n",
"其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被 转换为序列标注问题\n",
"\n",
"针对\"我来自复旦大学\"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG]其中O表示out,即不是一个实体B-ORG是ORG( organization的缩写)这个类别的开头(Begin)I-ORG是ORG类别的中间(Inside)。\n",
"\n",
"在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。\n",
"\n",
"## 载入数据\n",
"\n",
"fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的您可以通过《使用Loader和Pipe处理数据》了解如何使用fastNLP提供的数据加载函数。下面我们以微博命名实体任务来演示一下在fastNLP进行序列标注任务。"
]
},
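To make the conversion concrete, here is a small illustrative sketch (the to_bio_tags helper is ad hoc, not part of fastNLP) that turns a character sequence plus one known entity span into the BIO tag sequence described above:

    def to_bio_tags(chars, start, end, entity_type):
        """Build BIO tags for a single entity occupying chars[start:end]."""
        tags = ['O'] * len(chars)
        tags[start] = 'B-' + entity_type
        for i in range(start + 1, end):
            tags[i] = 'I-' + entity_type
        return tags

    chars = list("我来自复旦大学")
    print(to_bio_tags(chars, 3, 7, 'ORG'))
    # ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']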
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n",
"| raw_chars | target | chars | seq_len |\n",
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n",
"| ['科', '技', '全', '方', '位',... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | [792, 1015, 156, 198, 291, 714... | 26 |\n",
"| ['对', '', '输', '给', '一',... | [0, 0, 0, 0, 0, 0, 3, 1, 0, 0,... | [123, 2, 1205, 115, 8, 24, 101... | 15 |\n",
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP.io import WeiboNERPipe\n",
"data_bundle = WeiboNERPipe().process_from_file()\n",
"print(data_bundle.get_dataset('train')[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 模型构建\n",
"\n",
"首先选择需要使用的Embedding类型。关于Embedding的相关说明可以参见《使用Embedding模块将文本转成向量》。 在这里我们使用通过word2vec预训练的中文汉字embedding。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3321 out of 3471 words in the pre-training embedding.\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"embed = StaticEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-char-fastnlp-100d')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"选择好Embedding之后我们可以使用fastNLP中自带的 fastNLP.models.BiLSTMCRF 作为模型。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.models import BiLSTMCRF\n",
"\n",
"data_bundle.rename_field('chars', 'words') # 这是由于BiLSTMCRF模型的forward函数接受的words而不是chars所以需要把这一列重新命名\n",
"model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,\n",
" target_vocab=data_bundle.get_vocab('target'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 进行训练\n",
"下面我们选择用来评估模型的metric以及优化用到的优化函数。"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import SpanFPreRecMetric\n",
"from torch.optim import Adam\n",
"from fastNLP import LossInForward\n",
"\n",
"metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))\n",
"optimizer = Adam(model.parameters(), lr=1e-2)\n",
"loss = LossInForward()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"使用Trainer进行训练, 您可以通过修改 device 的值来选择显卡。"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-13-53-24\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=430.0), HTML(value='')), layout=Layout(di…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.89 seconds!\n",
"\r",
"Evaluation on dev at Epoch 1/10. Step:43/430: \n",
"\r",
"SpanFPreRecMetric: f=0.067797, pre=0.192771, rec=0.041131\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.9 seconds!\n",
"\r",
"Evaluation on dev at Epoch 2/10. Step:86/430: \n",
"\r",
"SpanFPreRecMetric: f=0.344086, pre=0.568047, rec=0.246787\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.88 seconds!\n",
"\r",
"Evaluation on dev at Epoch 3/10. Step:129/430: \n",
"\r",
"SpanFPreRecMetric: f=0.446701, pre=0.653465, rec=0.339332\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.81 seconds!\n",
"\r",
"Evaluation on dev at Epoch 4/10. Step:172/430: \n",
"\r",
"SpanFPreRecMetric: f=0.479871, pre=0.642241, rec=0.383033\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.91 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:215/430: \n",
"\r",
"SpanFPreRecMetric: f=0.486312, pre=0.650862, rec=0.388175\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.87 seconds!\n",
"\r",
"Evaluation on dev at Epoch 6/10. Step:258/430: \n",
"\r",
"SpanFPreRecMetric: f=0.541401, pre=0.711297, rec=0.437018\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.86 seconds!\n",
"\r",
"Evaluation on dev at Epoch 7/10. Step:301/430: \n",
"\r",
"SpanFPreRecMetric: f=0.430335, pre=0.685393, rec=0.313625\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.82 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:344/430: \n",
"\r",
"SpanFPreRecMetric: f=0.477759, pre=0.665138, rec=0.372751\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.81 seconds!\n",
"\r",
"Evaluation on dev at Epoch 9/10. Step:387/430: \n",
"\r",
"SpanFPreRecMetric: f=0.500759, pre=0.611111, rec=0.424165\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:430/430: \n",
"\r",
"SpanFPreRecMetric: f=0.496025, pre=0.65, rec=0.401028\n",
"\n",
"\r\n",
"In Epoch:6/Step:258, got best dev performance:\n",
"SpanFPreRecMetric: f=0.541401, pre=0.711297, rec=0.437018\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'SpanFPreRecMetric': {'f': 0.541401,\n",
" 'pre': 0.711297,\n",
" 'rec': 0.437018}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 258,\n",
" 'seconds': 121.39}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"import torch\n",
"\n",
"device= 0 if torch.cuda.is_available() else 'cpu'\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)\n",
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 进行测试\n",
"训练结束之后过,可以通过 Tester 测试其在测试集上的性能"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.54 seconds!\n",
"[tester] \n",
"SpanFPreRecMetric: f=0.439024, pre=0.685279, rec=0.322967\n"
]
},
{
"data": {
"text/plain": [
"{'SpanFPreRecMetric': {'f': 0.439024, 'pre': 0.685279, 'rec': 0.322967}}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用更强的Bert做序列标注\n",
"\n",
"在fastNLP使用Bert进行任务您只需要把fastNLP.embeddings.StaticEmbedding 切换为 fastNLP.embeddings.BertEmbedding可修改 device 选择显卡)。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-chinese-wwm/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 3384 words out of 3471.\n",
"input fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-13-58-51\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1130.0), HTML(value='')), layout=Layout(d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.7 seconds!\n",
"Evaluation on dev at Epoch 1/10. Step:113/1130: \n",
"SpanFPreRecMetric: f=0.008114, pre=0.019231, rec=0.005141\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.49 seconds!\n",
"Evaluation on dev at Epoch 2/10. Step:226/1130: \n",
"SpanFPreRecMetric: f=0.467866, pre=0.467866, rec=0.467866\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.6 seconds!\n",
"Evaluation on dev at Epoch 3/10. Step:339/1130: \n",
"SpanFPreRecMetric: f=0.566879, pre=0.482821, rec=0.686375\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.56 seconds!\n",
"Evaluation on dev at Epoch 4/10. Step:452/1130: \n",
"SpanFPreRecMetric: f=0.651972, pre=0.59408, rec=0.722365\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.69 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:565/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.640909, pre=0.574338, rec=0.724936\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.52 seconds!\n",
"Evaluation on dev at Epoch 6/10. Step:678/1130: \n",
"SpanFPreRecMetric: f=0.661836, pre=0.624146, rec=0.70437\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.67 seconds!\n",
"Evaluation on dev at Epoch 7/10. Step:791/1130: \n",
"SpanFPreRecMetric: f=0.683429, pre=0.615226, rec=0.768638\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.37 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:904/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.674699, pre=0.634921, rec=0.719794\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.42 seconds!\n",
"Evaluation on dev at Epoch 9/10. Step:1017/1130: \n",
"SpanFPreRecMetric: f=0.693878, pre=0.650901, rec=0.742931\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.46 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:1130/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.686845, pre=0.62766, rec=0.758355\n",
"\n",
"\r\n",
"In Epoch:9/Step:1017, got best dev performance:\n",
"SpanFPreRecMetric: f=0.693878, pre=0.650901, rec=0.742931\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.96 seconds!\n",
"[tester] \n",
"SpanFPreRecMetric: f=0.626561, pre=0.596112, rec=0.660287\n"
]
},
{
"data": {
"text/plain": [
"{'SpanFPreRecMetric': {'f': 0.626561, 'pre': 0.596112, 'rec': 0.660287}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"from fastNLP.io import WeiboNERPipe\n",
"data_bundle = WeiboNERPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"from fastNLP.embeddings import BertEmbedding\n",
"embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn')\n",
"model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,\n",
" target_vocab=data_bundle.get_vocab('target'))\n",
"\n",
"from fastNLP import SpanFPreRecMetric\n",
"from torch.optim import Adam\n",
"from fastNLP import LossInForward\n",
"metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))\n",
"optimizer = Adam(model.parameters(), lr=2e-5)\n",
"loss = LossInForward()\n",
"\n",
"from fastNLP import Trainer\n",
"import torch\n",
"device= 5 if torch.cuda.is_available() else 'cpu'\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, batch_size=12,\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)\n",
"trainer.train()\n",
"\n",
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,564 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 文本分类(Text classification)\n",
"文本分类任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。\n",
"\n",
"Example:: \n",
"1,商务大床房房间很大床有2M宽整体感觉经济实惠不错!\n",
"\n",
"\n",
"其中开头的1是只这条评论的标签表示是正面的情绪。我们将使用到的数据可以通过http://dbcloud.irocn.cn:8989/api/public/dl/dataset/chn_senti_corp.zip 下载并解压当然也可以通过fastNLP自动下载该数据。\n",
"\n",
"数据中的内容如下图所示。接下来我们将用fastNLP在这个数据上训练一个分类网络。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![jupyter](./cn_cls_example.png)"
]
},
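The leading label can be separated from the review text with a plain string split; a minimal sketch (assuming, as in the example line above, that the label and the text are joined by the first comma — the loader used below takes care of the actual file format):

    line = "1,商务大床房房间很大床有2M宽整体感觉经济实惠不错!"
    target, raw_chars = line.split(',', 1)  # split on the first comma only
    print(target)     # '1' -> positive sentiment label
    print(raw_chars)  # the review text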
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤\n",
"一共有以下的几个步骤 \n",
"(1) 读取数据 \n",
"(2) 预处理数据 \n",
"(3) 选择预训练词向量 \n",
"(4) 创建模型 \n",
"(5) 训练模型 "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (1) 读取数据\n",
"fastNLP提供多种数据的自动下载与自动加载功能对于这里我们要用到的数据我们可以用\\ref{Loader}自动下载并加载该数据。更多有关Loader的使用可以参考\\ref{Loader}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import ChnSentiCorpLoader\n",
"\n",
"loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader\n",
"data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回\n",
"data_bundle = loader.load(data_dir) # 这一行代码将从{data_dir}处读取数据至DataBundle"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"DataBundle的相关介绍可以参考\\ref{}。我们可以打印该data_bundle的基本信息。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看出该data_bundle中一个含有三个\\ref{DataSet}。通过下面的代码我们可以查看DataSet的基本情况"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (2) 预处理数据\n",
"在NLP任务中预处理一般包括: (a)将一整句话切分成汉字或者词; (b)将文本转换为index \n",
"\n",
"fastNLP中也提供了多种数据集的处理类这里我们直接使用fastNLP的ChnSentiCorpPipe。更多关于Pipe的说明可以参考\\ref{Pipe}。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import ChnSentiCorpPipe\n",
"\n",
"pipe = ChnSentiCorpPipe()\n",
"data_bundle = pipe.process(data_bundle) # 所有的Pipe都实现了process()方法且输入输出都为DataBundle类型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle) # 打印data_bundle查看其变化"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到除了之前已经包含的3个\\ref{DataSet}, 还新增了两个\\ref{Vocabulary}。我们可以打印DataSet中的内容"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle.get_dataset('train')[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"新增了一列为数字列表的chars以及变为数字的target列。可以看出这两列的名称和刚好与data_bundle中两个Vocabulary的名称是一致的我们可以打印一下Vocabulary看一下里面的内容。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"char_vocab = data_bundle.get_vocab('chars')\n",
"print(char_vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Vocabulary是一个记录着词语与index之间映射关系的类比如"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index = char_vocab.to_index('选')\n",
"print(\"'选'的index是{}\".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的\n",
"print(\"index:{}对应的汉字是{}\".format(index, char_vocab.to_word(index))) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (3) 选择预训练词向量 \n",
"由于Word2vec, Glove, Elmo, Bert等预训练模型可以增强模型的性能所以在训练具体任务前选择合适的预训练词向量非常重要。在fastNLP中我们提供了多种Embedding使得加载这些预训练模型的过程变得更加便捷。更多关于Embedding的说明可以参考\\ref{Embedding}。这里我们先给出一个使用word2vec的中文汉字预训练的示例之后再给出一个使用Bert的文本分类。这里使用的预训练词向量为'cn-fastnlp-100d'fastNLP将自动下载该embedding至本地缓存fastNLP支持使用名字指定的Embedding以及相关说明可以参见\\ref{Embedding}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (4) 创建模型\n",
"这里我们使用到的模型结构如下所示,补图"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from torch import nn\n",
"from fastNLP.modules import LSTM\n",
"import torch\n",
"\n",
"# 定义模型\n",
"class BiLSTMMaxPoolCls(nn.Module):\n",
" def __init__(self, embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3):\n",
" super().__init__()\n",
" self.embed = embed\n",
" \n",
" self.lstm = LSTM(self.embed.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, \n",
" batch_first=True, bidirectional=True)\n",
" self.dropout_layer = nn.Dropout(dropout)\n",
" self.fc = nn.Linear(hidden_size, num_classes)\n",
" \n",
" def forward(self, chars, seq_len): # 这里的名称必须和DataSet中相应的field对应比如之前我们DataSet中有chars这里就必须为chars\n",
" # chars:[batch_size, max_len]\n",
" # seq_len: [batch_size, ]\n",
" chars = self.embed(chars)\n",
" outputs, _ = self.lstm(chars, seq_len)\n",
" outputs = self.dropout_layer(outputs)\n",
" outputs, _ = torch.max(outputs, dim=1)\n",
" outputs = self.fc(outputs)\n",
" \n",
" return {'pred':outputs} # [batch_size,], 返回值必须是dict类型且预测值的key建议设为pred\n",
"\n",
"# 初始化模型\n",
"model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (5) 训练模型\n",
"fastNLP提供了Trainer对象来组织训练过程包括完成loss计算(所以在初始化Trainer的时候需要指定loss类型),梯度更新(所以在初始化Trainer的时候需要提供优化器optimizer)以及在验证集上的性能验证(所以在初始化时需要提供一个Metric)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Trainer\n",
"from fastNLP import CrossEntropyLoss\n",
"from torch.optim import Adam\n",
"from fastNLP import AccuracyMetric\n",
"\n",
"loss = CrossEntropyLoss()\n",
"optimizer = Adam(model.parameters(), lr=0.001)\n",
"metric = AccuracyMetric()\n",
"device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行训练速度会更快\n",
"\n",
"trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n",
" optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=metric, device=device)\n",
"trainer.train() # 开始训练训练完成之后默认会加载在dev上表现最好的模型\n",
"\n",
"# 在测试集上测试一下模型的性能\n",
"from fastNLP import Tester\n",
"print(\"Performance on test is:\")\n",
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 使用Bert进行文本分类"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 只需要切换一下Embedding即可\n",
"from fastNLP.embeddings import BertEmbedding\n",
"\n",
"# 这里为了演示一下效果所以默认Bert不更新权重\n",
"bert_embed = BertEmbedding(char_vocab, model_dir_or_name='cn', auto_truncate=True, requires_grad=False)\n",
"model = BiLSTMMaxPoolCls(bert_embed, len(data_bundle.get_vocab('target')), )\n",
"\n",
"\n",
"import torch\n",
"from fastNLP import Trainer\n",
"from fastNLP import CrossEntropyLoss\n",
"from torch.optim import Adam\n",
"from fastNLP import AccuracyMetric\n",
"\n",
"loss = CrossEntropyLoss()\n",
"optimizer = Adam(model.parameters(), lr=2e-5)\n",
"metric = AccuracyMetric()\n",
"device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行训练速度会更快\n",
"\n",
"trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n",
" optimizer=optimizer, batch_size=16, dev_data=data_bundle.get_dataset('test'),\n",
" metrics=metric, device=device, n_epochs=3)\n",
"trainer.train() # 开始训练训练完成之后默认会加载在dev上表现最好的模型\n",
"\n",
"# 在测试集上测试一下模型的性能\n",
"from fastNLP import Tester\n",
"print(\"Performance on test is:\")\n",
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 基于词进行文本分类"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"由于汉字中没有显示的字与字的边界,一般需要通过分词器先将句子进行分词操作。\n",
"下面的例子演示了如何不基于fastNLP已有的数据读取、预处理代码进行文本分类。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (1) 读取数据"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里我们继续以之前的数据为例但这次我们不使用fastNLP自带的数据读取代码 "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import ChnSentiCorpLoader\n",
"\n",
"loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader\n",
"data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"下面我们先定义一个read_file_to_dataset的函数, 即给定一个文件路径读取其中的内容并返回一个DataSet。然后我们将所有的DataSet放入到DataBundle对象中来方便接下来的预处理"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from fastNLP import DataSet, Instance\n",
"from fastNLP.io import DataBundle\n",
"\n",
"\n",
"def read_file_to_dataset(fp):\n",
" ds = DataSet()\n",
" with open(fp, 'r') as f:\n",
" f.readline() # 第一行是title名称忽略掉\n",
" for line in f:\n",
" line = line.strip()\n",
" target, chars = line.split('\\t')\n",
" ins = Instance(target=target, raw_chars=chars)\n",
" ds.append(ins)\n",
" return ds\n",
"\n",
"data_bundle = DataBundle()\n",
"for name in ['train.tsv', 'dev.tsv', 'test.tsv']:\n",
" fp = os.path.join(data_dir, name)\n",
" ds = read_file_to_dataset(fp)\n",
" data_bundle.set_dataset(name=name.split('.')[0], dataset=ds)\n",
"\n",
"print(data_bundle) # 查看以下数据集的情况\n",
"# In total 3 datasets:\n",
"# train has 9600 instances.\n",
"# dev has 1200 instances.\n",
"# test has 1200 instances."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (2) 数据预处理"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"在这里,我们首先把句子通过 [fastHan](http://gitee.com/fastnlp/fastHan) 进行分词操作,然后创建词表,并将词语转换为序号。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastHan import FastHan\n",
"from fastNLP import Vocabulary\n",
"\n",
"model=FastHan()\n",
"# model.set_device('cuda')\n",
"\n",
"# 定义分词处理操作\n",
"def word_seg(ins):\n",
" raw_chars = ins['raw_chars']\n",
" # 由于有些句子比较长我们只截取前128个汉字\n",
" raw_words = model(raw_chars[:128], target='CWS')[0]\n",
" return raw_words\n",
"\n",
"for name, ds in data_bundle.iter_datasets():\n",
" # apply函数将对内部的instance依次执行word_seg操作并把其返回值放入到raw_words这个field\n",
" ds.apply(word_seg, new_field_name='raw_words')\n",
" # 除了apply函数fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作\n",
" # 同时我们增加一个seq_len的field\n",
" ds.add_seq_len('raw_words')\n",
"\n",
"vocab = Vocabulary()\n",
"\n",
"# 对raw_words列创建词表, 建议把非训练集的dataset放在no_create_entry_dataset参数中\n",
"# 也可以通过add_word(), add_word_lst()等建立词表请参考http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_2_vocabulary.html\n",
"vocab.from_dataset(data_bundle.get_dataset('train'), field_name='raw_words', \n",
" no_create_entry_dataset=[data_bundle.get_dataset('dev'), \n",
" data_bundle.get_dataset('test')]) \n",
"\n",
"# 将建立好词表的Vocabulary用于对raw_words列建立词表并把转为序号的列存入到words列\n",
"vocab.index_dataset(data_bundle.get_dataset('train'), data_bundle.get_dataset('dev'), \n",
" data_bundle.get_dataset('test'), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 建立target的词表target的词表一般不需要padding和unknown\n",
"target_vocab = Vocabulary(padding=None, unknown=None) \n",
"# 一般情况下我们可以只用训练集建立target的词表\n",
"target_vocab.from_dataset(data_bundle.get_dataset('train'), field_name='target') \n",
"# 如果没有传递new_field_name, 则默认覆盖原词表\n",
"target_vocab.index_dataset(data_bundle.get_dataset('train'), data_bundle.get_dataset('dev'), \n",
" data_bundle.get_dataset('test'), field_name='target')\n",
"\n",
"# 我们可以把词表保存到data_bundle中方便之后使用\n",
"data_bundle.set_vocab(field_name='words', vocab=vocab)\n",
"data_bundle.set_vocab(field_name='target', vocab=target_vocab)\n",
"\n",
"# 我们把words和target分别设置为input和target这样它们才会在训练循环中被取出并自动padding, 有关这部分更多的内容参考\n",
"# http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html\n",
"data_bundle.set_target('target')\n",
"data_bundle.set_input('words', 'seq_len') # DataSet也有这两个接口\n",
"# 如果某些field您希望它被设置为target或者input但是不希望fastNLP自动padding或需要使用特定的padding方式请参考\n",
"# http://www.fastnlp.top/docs/fastNLP/fastNLP.core.dataset.html\n",
"\n",
"print(data_bundle.get_dataset('train')[:2]) # 我们可以看一下当前dataset的内容\n",
"\n",
"# 由于之后需要使用之前定义的BiLSTMMaxPoolCls模型所以需要将words这个field修改为chars(因为该模型的forward接受chars参数)\n",
"data_bundle.rename_field('words', 'chars')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (3) 选择预训练词向量"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里我们选择腾讯的预训练中文词向量,可以在 [腾讯词向量](https://ai.tencent.com/ailab/nlp/en/embedding.html) 处下载并解压。这里我们不能直接使用BERT因为BERT是基于中文字进行预训练的。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"word2vec_embed = StaticEmbedding(data_bundle.get_vocab('words'), \n",
" model_dir_or_name='/path/to/Tencent_AILab_ChineseEmbedding.txt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Trainer\n",
"from fastNLP import CrossEntropyLoss\n",
"from torch.optim import Adam\n",
"from fastNLP import AccuracyMetric\n",
"\n",
"# 初始化模型\n",
"model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 开始训练\n",
"loss = CrossEntropyLoss()\n",
"optimizer = Adam(model.parameters(), lr=0.001)\n",
"metric = AccuracyMetric()\n",
"device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行训练速度会更快\n",
"\n",
"trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n",
" optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=metric, device=device)\n",
"trainer.train() # 开始训练训练完成之后默认会加载在dev上表现最好的模型\n",
"\n",
"# 在测试集上测试一下模型的性能\n",
"from fastNLP import Tester\n",
"print(\"Performance on test is:\")\n",
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,27 @@
{%- if current_version %}
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
<span class="rst-current-version" data-toggle="rst-current-version">
<span class="fa fa-book"> Other Versions</span>
{{ current_version.name }}
<span class="fa fa-caret-down"></span>
</span>
<div class="rst-other-versions">
{%- if versions.tags %}
<dl>
<dt>Tags</dt>
{%- for item in versions.tags %}
<dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
{%- endfor %}
</dl>
{%- endif %}
{%- if versions.branches %}
<dl>
<dt>Branches</dt>
{%- for item in versions.branches %}
<dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
{%- endfor %}
</dl>
{%- endif %}
</div>
</div>
{%- endif %}
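The current_version and versions variables used above are injected into the HTML context by sphinx-multiversion. A minimal conf.py sketch of the options that control which tags and branches end up in this list (the whitelist patterns are illustrative assumptions, not taken from this repository):

    # conf.py (sketch) -- illustrative values, not this repository's actual settings
    extensions = ['sphinx_multiversion']

    # sphinx-multiversion whitelists decide which refs are built and hence listed here
    smv_tag_whitelist = r'^v\d+\.\d+.*$'     # e.g. tags such as v1.0.0-alpha
    smv_branch_whitelist = r'^dev.*$'        # e.g. branches such as dev0.8.0
    smv_remote_whitelist = r'^origin$'       # remotes whose refs are considered
    smv_latest_version = 'dev0.8.0'          # the value this project sets further below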

View File

@ -20,13 +20,13 @@ sys.path.insert(0, os.path.abspath('../../'))
# -- Project information -----------------------------------------------------
project = 'fastNLP'
copyright = '2020, xpqiu'
author = 'xpqiu'
copyright = '2022, fastNLP'
author = 'fastNLP'
# The short X.Y version
version = '0.6.0'
version = '1.0'
# The full version, including alpha/beta/rc tags
release = '0.6.0'
release = '1.0.0-alpha'
# -- General configuration ---------------------------------------------------
@ -42,7 +42,10 @@ extensions = [
'sphinx.ext.viewcode',
'sphinx.ext.autosummary',
'sphinx.ext.mathjax',
'sphinx.ext.todo'
'sphinx.ext.todo',
'sphinx_autodoc_typehints',
'sphinx_multiversion',
'nbsphinx',
]
autodoc_default_options = {
@ -51,7 +54,12 @@ autodoc_default_options = {
'undoc-members': False,
}
add_module_names = False
autosummary_ignore_module_all = False
# autodoc_typehints = "description"
autoclass_content = "class"
typehints_fully_qualified = False
typehints_defaults = "comma"
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@ -110,12 +118,16 @@ html_static_path = ['_static']
# 'searchbox.html']``.
#
# html_sidebars = {}
html_sidebars = {
'**': [
'versions.html',
],
}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'fastNLP doc'
htmlhelp_basename = 'fastNLP'
# -- Options for LaTeX output ------------------------------------------------
@ -140,17 +152,14 @@ latex_elements = {
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'fastNLP.tex', 'fastNLP Documentation',
'xpqiu', 'manual'),
]
latex_documents = []
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'fastnlp', 'fastNLP Documentation',
(master_doc, 'fastNLP', 'fastNLP Documentation',
[author], 1)
]
@ -161,10 +170,12 @@ man_pages = [
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'fastNLP', 'fastNLP Documentation',
author, 'fastNLP', 'One line description of project.',
author, 'fastNLP', 'A fast NLP tool for programming.',
'Miscellaneous'),
]
# -- Options for Multiversions ----------------------------------------------
smv_latest_version = 'dev0.8.0'
# -- Extension configuration -------------------------------------------------
def maybe_skip_member(app, what, name, obj, skip, options):
@ -174,7 +185,7 @@ def maybe_skip_member(app, what, name, obj, skip, options):
return False
if name.startswith("_"):
return True
return False
return skip
def setup(app):
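The hunk is cut off after the setup() signature; as an illustration only (not the actual body of this file), a typical way to register such a filter with autodoc is:

    def setup(app):
        # Run maybe_skip_member for every member that autodoc considers documenting.
        app.connect("autodoc-skip-member", maybe_skip_member)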

View File

@ -1,7 +0,0 @@
fastNLP.core.batch
==================
.. automodule:: fastNLP.core.batch
:members: BatchIter, DataSetIter, TorchLoaderIter
:inherited-members:

View File

@ -1,7 +0,0 @@
fastNLP.core.callback
=====================
.. automodule:: fastNLP.core.callback
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError
:inherited-members:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.callback module
======================================
.. automodule:: fastNLP.core.callbacks.callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.callback\_event module
=============================================
.. automodule:: fastNLP.core.callbacks.callback_event
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.callback\_manager module
===============================================
.. automodule:: fastNLP.core.callbacks.callback_manager
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.checkpoint\_callback module
==================================================
.. automodule:: fastNLP.core.callbacks.checkpoint_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.early\_stop\_callback module
===================================================
.. automodule:: fastNLP.core.callbacks.early_stop_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.fitlog\_callback module
==============================================
.. automodule:: fastNLP.core.callbacks.fitlog_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.has\_monitor\_callback module
====================================================
.. automodule:: fastNLP.core.callbacks.has_monitor_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.load\_best\_model\_callback module
=========================================================
.. automodule:: fastNLP.core.callbacks.load_best_model_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.lr\_scheduler\_callback module
=====================================================
.. automodule:: fastNLP.core.callbacks.lr_scheduler_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.more\_evaluate\_callback module
======================================================
.. automodule:: fastNLP.core.callbacks.more_evaluate_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.progress\_callback module
================================================
.. automodule:: fastNLP.core.callbacks.progress_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,36 @@
fastNLP.core.callbacks package
==============================
.. automodule:: fastNLP.core.callbacks
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
fastNLP.core.callbacks.torch_callbacks
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.callbacks.callback
fastNLP.core.callbacks.callback_event
fastNLP.core.callbacks.callback_manager
fastNLP.core.callbacks.checkpoint_callback
fastNLP.core.callbacks.early_stop_callback
fastNLP.core.callbacks.fitlog_callback
fastNLP.core.callbacks.has_monitor_callback
fastNLP.core.callbacks.load_best_model_callback
fastNLP.core.callbacks.lr_scheduler_callback
fastNLP.core.callbacks.more_evaluate_callback
fastNLP.core.callbacks.progress_callback
fastNLP.core.callbacks.timer_callback
fastNLP.core.callbacks.topk_saver
fastNLP.core.callbacks.utils

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.timer\_callback module
=============================================
.. automodule:: fastNLP.core.callbacks.timer_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.topk\_saver module
=========================================
.. automodule:: fastNLP.core.callbacks.topk_saver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,16 @@
fastNLP.core.callbacks.torch\_callbacks package
===============================================
.. automodule:: fastNLP.core.callbacks.torch_callbacks
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.callbacks.torch_callbacks.torch_grad_clip_callback
fastNLP.core.callbacks.torch_callbacks.torch_lr_sched_callback

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.torch\_callbacks.torch\_grad\_clip\_callback module
==========================================================================
.. automodule:: fastNLP.core.callbacks.torch_callbacks.torch_grad_clip_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.torch\_callbacks.torch\_lr\_sched\_callback module
=========================================================================
.. automodule:: fastNLP.core.callbacks.torch_callbacks.torch_lr_sched_callback
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.callbacks.utils module
===================================
.. automodule:: fastNLP.core.callbacks.utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.collator module
======================================
.. automodule:: fastNLP.core.collators.collator
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.packer\_unpacker module
==============================================
.. automodule:: fastNLP.core.collators.packer_unpacker
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.exceptions module
================================================
.. automodule:: fastNLP.core.collators.padders.exceptions
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.get\_padder module
=================================================
.. automodule:: fastNLP.core.collators.padders.get_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.jittor\_padder module
====================================================
.. automodule:: fastNLP.core.collators.padders.jittor_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.numpy\_padder module
===================================================
.. automodule:: fastNLP.core.collators.padders.numpy_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.oneflow\_padder module
=====================================================
.. automodule:: fastNLP.core.collators.padders.oneflow_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.padder module
============================================
.. automodule:: fastNLP.core.collators.padders.padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.paddle\_padder module
====================================================
.. automodule:: fastNLP.core.collators.padders.paddle_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.raw\_padder module
=================================================
.. automodule:: fastNLP.core.collators.padders.raw_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,25 @@
fastNLP.core.collators.padders package
======================================
.. automodule:: fastNLP.core.collators.padders
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.collators.padders.exceptions
fastNLP.core.collators.padders.get_padder
fastNLP.core.collators.padders.jittor_padder
fastNLP.core.collators.padders.numpy_padder
fastNLP.core.collators.padders.oneflow_padder
fastNLP.core.collators.padders.padder
fastNLP.core.collators.padders.paddle_padder
fastNLP.core.collators.padders.raw_padder
fastNLP.core.collators.padders.torch_padder
fastNLP.core.collators.padders.torch_utils
fastNLP.core.collators.padders.utils

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.torch\_padder module
===================================================
.. automodule:: fastNLP.core.collators.padders.torch_padder
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.torch\_utils module
==================================================
.. automodule:: fastNLP.core.collators.padders.torch_utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.collators.padders.utils module
===========================================
.. automodule:: fastNLP.core.collators.padders.utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,24 @@
fastNLP.core.collators package
==============================
.. automodule:: fastNLP.core.collators
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
fastNLP.core.collators.padders
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.collators.collator
fastNLP.core.collators.packer_unpacker

View File

@ -1,7 +0,0 @@
fastNLP.core.const
==================
.. automodule:: fastNLP.core.const
:members: Const
:inherited-members:

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.evaluator module
=========================================
.. automodule:: fastNLP.core.controllers.evaluator
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.loops.evaluate\_batch\_loop module
===========================================================
.. automodule:: fastNLP.core.controllers.loops.evaluate_batch_loop
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.loops.loop module
==========================================
.. automodule:: fastNLP.core.controllers.loops.loop
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,17 @@
fastNLP.core.controllers.loops package
======================================
.. automodule:: fastNLP.core.controllers.loops
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.controllers.loops.evaluate_batch_loop
fastNLP.core.controllers.loops.loop
fastNLP.core.controllers.loops.train_batch_loop

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.loops.train\_batch\_loop module
========================================================
.. automodule:: fastNLP.core.controllers.loops.train_batch_loop
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,25 @@
fastNLP.core.controllers package
================================
.. automodule:: fastNLP.core.controllers
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
fastNLP.core.controllers.loops
fastNLP.core.controllers.utils
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.controllers.evaluator
fastNLP.core.controllers.trainer

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.trainer module
=======================================
.. automodule:: fastNLP.core.controllers.trainer
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,16 @@
fastNLP.core.controllers.utils package
======================================
.. automodule:: fastNLP.core.controllers.utils
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.controllers.utils.state
fastNLP.core.controllers.utils.utils

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.utils.state module
===========================================
.. automodule:: fastNLP.core.controllers.utils.state
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.controllers.utils.utils module
===========================================
.. automodule:: fastNLP.core.controllers.utils.utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.jittor\_dataloader.fdl module
======================================================
.. automodule:: fastNLP.core.dataloaders.jittor_dataloader.fdl
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,15 @@
fastNLP.core.dataloaders.jittor\_dataloader package
===================================================
.. automodule:: fastNLP.core.dataloaders.jittor_dataloader
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataloaders.jittor_dataloader.fdl

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.oneflow\_dataloader.fdl module
=======================================================
.. automodule:: fastNLP.core.dataloaders.oneflow_dataloader.fdl
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,15 @@
fastNLP.core.dataloaders.oneflow\_dataloader package
====================================================
.. automodule:: fastNLP.core.dataloaders.oneflow_dataloader
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataloaders.oneflow_dataloader.fdl

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.paddle\_dataloader.fdl module
======================================================
.. automodule:: fastNLP.core.dataloaders.paddle_dataloader.fdl
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,15 @@
fastNLP.core.dataloaders.paddle\_dataloader package
===================================================
.. automodule:: fastNLP.core.dataloaders.paddle_dataloader
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataloaders.paddle_dataloader.fdl

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.prepare\_dataloader module
===================================================
.. automodule:: fastNLP.core.dataloaders.prepare_dataloader
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,27 @@
fastNLP.core.dataloaders package
================================
.. automodule:: fastNLP.core.dataloaders
:members:
:undoc-members:
:show-inheritance:
Subpackages
-----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataloaders.jittor_dataloader
fastNLP.core.dataloaders.oneflow_dataloader
fastNLP.core.dataloaders.paddle_dataloader
fastNLP.core.dataloaders.torch_dataloader
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataloaders.prepare_dataloader
fastNLP.core.dataloaders.utils

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.torch\_dataloader.fdl module
=====================================================
.. automodule:: fastNLP.core.dataloaders.torch_dataloader.fdl
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.torch\_dataloader.mix\_dataloader module
=================================================================
.. automodule:: fastNLP.core.dataloaders.torch_dataloader.mix_dataloader
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,16 @@
fastNLP.core.dataloaders.torch\_dataloader package
==================================================
.. automodule:: fastNLP.core.dataloaders.torch_dataloader
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataloaders.torch_dataloader.fdl
fastNLP.core.dataloaders.torch_dataloader.mix_dataloader
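Note: the fdl module listed above documents the torch-backed dataloader and its prepare helper. A minimal usage sketch, assuming prepare_torch_dataloader is re-exported at the fastNLP top level with batch_size/shuffle keywords (assumptions from fastNLP 1.0 examples, not from this diff):

    # Sketch only -- verify the import path and keyword names against the generated docs.
    from fastNLP import DataSet, prepare_torch_dataloader

    ds = DataSet({"x": [[1.0, 2.0], [3.0, 4.0]], "y": [0, 1]})
    train_dl = prepare_torch_dataloader(ds, batch_size=2, shuffle=True)

    for batch in train_dl:
        # batches are expected to arrive as dicts keyed by field name
        print(batch["x"], batch["y"])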

View File

@ -0,0 +1,7 @@
fastNLP.core.dataloaders.utils module
=====================================
.. automodule:: fastNLP.core.dataloaders.utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.dataset.dataset module
===================================
.. automodule:: fastNLP.core.dataset.dataset
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.dataset.field module
=================================
.. automodule:: fastNLP.core.dataset.field
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.dataset.instance module
====================================
.. automodule:: fastNLP.core.dataset.instance
:members:
:undoc-members:
:show-inheritance:

View File

@ -1,7 +1,17 @@
fastNLP.core.dataset
====================
fastNLP.core.dataset package
============================
.. automodule:: fastNLP.core.dataset
:members: DataSet
:inherited-members:
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.dataset.dataset
fastNLP.core.dataset.field
fastNLP.core.dataset.instance
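Note: the dataset docs now cover the dataset, field and instance submodules individually rather than only the DataSet class. As a quick reminder of what these modules provide, a minimal sketch of the long-standing DataSet/Instance interface (method names should still be checked against the generated pages):

    # Minimal sketch of the DataSet / Instance API documented by the pages above.
    from fastNLP import DataSet, Instance

    ds = DataSet()
    ds.append(Instance(raw_words="I like this movie .".split(), label=1))
    ds.append(Instance(raw_words="boring plot".split(), label=0))

    # apply_field maps a function over one field and writes the result to a new field
    ds.apply_field(len, field_name="raw_words", new_field_name="seq_len")

    print(ds)             # tabular preview of the instances
    print(ds["seq_len"])  # column access by field name (assumed to return the field array)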

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.choose\_driver module
==========================================
.. automodule:: fastNLP.core.drivers.choose_driver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.driver module
==================================
.. automodule:: fastNLP.core.drivers.driver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.jittor\_driver.initialize\_jittor\_driver module
=====================================================================
.. automodule:: fastNLP.core.drivers.jittor_driver.initialize_jittor_driver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.jittor\_driver.jittor\_driver module
=========================================================
.. automodule:: fastNLP.core.drivers.jittor_driver.jittor_driver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.jittor\_driver.mpi module
==============================================
.. automodule:: fastNLP.core.drivers.jittor_driver.mpi
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,19 @@
fastNLP.core.drivers.jittor\_driver package
===========================================
.. automodule:: fastNLP.core.drivers.jittor_driver
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.drivers.jittor_driver.initialize_jittor_driver
fastNLP.core.drivers.jittor_driver.jittor_driver
fastNLP.core.drivers.jittor_driver.mpi
fastNLP.core.drivers.jittor_driver.single_device
fastNLP.core.drivers.jittor_driver.utils

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.jittor\_driver.single\_device module
=========================================================
.. automodule:: fastNLP.core.drivers.jittor_driver.single_device
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.jittor\_driver.utils module
================================================
.. automodule:: fastNLP.core.drivers.jittor_driver.utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.oneflow\_driver.ddp module
===============================================
.. automodule:: fastNLP.core.drivers.oneflow_driver.ddp
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.oneflow\_driver.dist\_utils module
=======================================================
.. automodule:: fastNLP.core.drivers.oneflow_driver.dist_utils
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.oneflow\_driver.initialize\_oneflow\_driver module
=======================================================================
.. automodule:: fastNLP.core.drivers.oneflow_driver.initialize_oneflow_driver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,7 @@
fastNLP.core.drivers.oneflow\_driver.oneflow\_driver module
===========================================================
.. automodule:: fastNLP.core.drivers.oneflow_driver.oneflow_driver
:members:
:undoc-members:
:show-inheritance:

View File

@ -0,0 +1,20 @@
fastNLP.core.drivers.oneflow\_driver package
============================================
.. automodule:: fastNLP.core.drivers.oneflow_driver
:members:
:undoc-members:
:show-inheritance:
Submodules
----------
.. toctree::
:maxdepth: 4
fastNLP.core.drivers.oneflow_driver.ddp
fastNLP.core.drivers.oneflow_driver.dist_utils
fastNLP.core.drivers.oneflow_driver.initialize_oneflow_driver
fastNLP.core.drivers.oneflow_driver.oneflow_driver
fastNLP.core.drivers.oneflow_driver.single_device
fastNLP.core.drivers.oneflow_driver.utils

Some files were not shown because too many files have changed in this diff.