mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-12-04 21:28:01 +08:00
293 lines
8.3 KiB
Plaintext
293 lines
8.3 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# fastNLP中的DataSet"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"+------------------------------+---------------------------------------------+---------+\n",
|
||
"| raw_words | words | seq_len |\n",
|
||
"+------------------------------+---------------------------------------------+---------+\n",
|
||
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
|
||
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
|
||
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
|
||
"+------------------------------+---------------------------------------------+---------+\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP import DataSet\n",
|
||
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
|
||
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
|
||
" 'seq_len': [6, 3, 3]}\n",
|
||
"dataset = DataSet(data)\n",
|
||
"# 传入的dict的每个key的value应该为具有相同长度的list\n",
|
||
"print(dataset)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## DataSet的构建"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"+----------------------------+---------------------------------------------+---------+\n",
|
||
"| raw_words | words | seq_len |\n",
|
||
"+----------------------------+---------------------------------------------+---------+\n",
|
||
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
|
||
"+----------------------------+---------------------------------------------+---------+"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP import DataSet\n",
|
||
"from fastNLP import Instance\n",
|
||
"dataset = DataSet()\n",
|
||
"instance = Instance(raw_words=\"This is the first instance\",\n",
|
||
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
|
||
" seq_len=6)\n",
|
||
"dataset.append(instance)\n",
|
||
"dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"+----------------------------+---------------------------------------------+---------+\n",
|
||
"| raw_words | words | seq_len |\n",
|
||
"+----------------------------+---------------------------------------------+---------+\n",
|
||
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
|
||
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
|
||
"+----------------------------+---------------------------------------------+---------+"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP import DataSet\n",
|
||
"from fastNLP import Instance\n",
|
||
"dataset = DataSet([\n",
|
||
" Instance(raw_words=\"This is the first instance\",\n",
|
||
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
|
||
" seq_len=6),\n",
|
||
" Instance(raw_words=\"Second instance .\",\n",
|
||
" words=['Second', 'instance', '.'],\n",
|
||
" seq_len=3)\n",
|
||
" ])\n",
|
||
"dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## DataSet的删除"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"+----+---+\n",
|
||
"| a | c |\n",
|
||
"+----+---+\n",
|
||
"| -5 | 0 |\n",
|
||
"| -4 | 0 |\n",
|
||
"| -3 | 0 |\n",
|
||
"| -2 | 0 |\n",
|
||
"| -1 | 0 |\n",
|
||
"| 0 | 0 |\n",
|
||
"| 1 | 0 |\n",
|
||
"| 2 | 0 |\n",
|
||
"| 3 | 0 |\n",
|
||
"| 4 | 0 |\n",
|
||
"+----+---+"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP import DataSet\n",
|
||
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
|
||
"dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"+---+\n",
|
||
"| c |\n",
|
||
"+---+\n",
|
||
"| 0 |\n",
|
||
"| 0 |\n",
|
||
"| 0 |\n",
|
||
"| 0 |\n",
|
||
"+---+"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n",
|
||
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
|
||
"# 在dataset中删除满足条件的instance\n",
|
||
"dataset.drop(lambda ins:ins['a']<0)\n",
|
||
"# 删除第3个instance\n",
|
||
"dataset.delete_instance(2)\n",
|
||
"# 删除名为'a'的field\n",
|
||
"dataset.delete_field('a')\n",
|
||
"dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 简单的数据预处理"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"False\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"4"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# 检查是否存在名为'a'的field\n",
|
||
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
|
||
"# 将名为'c'的field改名为'b'\n",
|
||
"dataset.rename_field('c', 'b')\n",
|
||
"# DataSet的长度\n",
|
||
"len(dataset)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"+------------------------------+-------------------------------------------------+\n",
|
||
"| raw_words | words |\n",
|
||
"+------------------------------+-------------------------------------------------+\n",
|
||
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
|
||
"| Second instance . | ['Second', 'instance', '.'] |\n",
|
||
"| Third instance . | ['Third', 'instance', '.'] |\n",
|
||
"+------------------------------+-------------------------------------------------+"
|
||
]
|
||
},
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"from fastNLP import DataSet\n",
|
||
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
|
||
"dataset = DataSet(data)\n",
|
||
"\n",
|
||
"# 将句子分成单词形式, 详见DataSet.apply()方法\n",
|
||
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
|
||
"\n",
|
||
"# 或使用DataSet.apply_field()\n",
|
||
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
|
||
"\n",
|
||
"# 除了匿名函数,也可以定义函数传递进去\n",
|
||
"def get_words(instance):\n",
|
||
" sentence = instance['raw_words']\n",
|
||
" words = sentence.split()\n",
|
||
" return words\n",
|
||
"dataset.apply(get_words, new_field_name='words')\n",
|
||
"dataset"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python Now",
|
||
"language": "python",
|
||
"name": "now"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.0"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|