fastNLP/tutorials/tutorial_1_data_preprocess.ipynb
2020-02-27 21:21:38 +08:00

293 lines
8.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+------------------------------+---------------------------------------------+---------+\n",
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
"+------------------------------+---------------------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
" 'seq_len': [6, 3, 3]}\n",
"dataset = DataSet(data)\n",
"# 传入的dict的每个key的value应该为具有相同长度的list\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的构建"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet()\n",
"instance = Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6)\n",
"dataset.append(instance)\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet([\n",
" Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6),\n",
" Instance(raw_words=\"Second instance .\",\n",
" words=['Second', 'instance', '.'],\n",
" seq_len=3)\n",
" ])\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的删除"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----+---+\n",
"| a | c |\n",
"+----+---+\n",
"| -5 | 0 |\n",
"| -4 | 0 |\n",
"| -3 | 0 |\n",
"| -2 | 0 |\n",
"| -1 | 0 |\n",
"| 0 | 0 |\n",
"| 1 | 0 |\n",
"| 2 | 0 |\n",
"| 3 | 0 |\n",
"| 4 | 0 |\n",
"+----+---+"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+---+\n",
"| c |\n",
"+---+\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"+---+"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 不改变dataset生成一个删除了满足条件的instance的新 DataSet\n",
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
"# 在dataset中删除满足条件的instance\n",
"dataset.drop(lambda ins:ins['a']<0)\n",
"# 删除第3个instance\n",
"dataset.delete_instance(2)\n",
"# 删除名为'a'的field\n",
"dataset.delete_field('a')\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 简单的数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 检查是否存在名为'a'的field\n",
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
"# 将名为'a'的field改名为'b'\n",
"dataset.rename_field('c', 'b')\n",
"# DataSet的长度\n",
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+------------------------------+-------------------------------------------------+\n",
"| raw_words | words |\n",
"+------------------------------+-------------------------------------------------+\n",
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
"| Second instance . | ['Second', 'instance', '.'] |\n",
"| Third instance . | ['Third', 'instance', '.'] |\n",
"+------------------------------+-------------------------------------------------+"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
"dataset = DataSet(data)\n",
"\n",
"# 将句子分成单词形式, 详见DataSet.apply()方法\n",
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
"\n",
"# 或使用DataSet.apply_field()\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 除了匿名函数,也可以定义函数传递进去\n",
"def get_words(instance):\n",
" sentence = instance['raw_words']\n",
" words = sentence.split()\n",
" return words\n",
"dataset.apply(get_words, new_field_name='words')\n",
"dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}