mirror of
https://gitee.com/fastnlp/fastNLP.git
synced 2024-11-30 11:17:50 +08:00
Merge branch 'dev0.8.0' of github.com:fastnlp/fastNLP into dev0.8.0
This commit is contained in:
commit
b620061ec2
7
tutorials/data/test4dataset.csv
Normal file
7
tutorials/data/test4dataset.csv
Normal file
@ -0,0 +1,7 @@
|
||||
,SentenceId,Sentence,Sentiment
|
||||
0,1,"['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']",negative
|
||||
1,2,"['this', 'quiet', ',', 'introspective', 'and', 'entertaining', 'independent', 'is', 'worth', 'seeking', '.']",positive
|
||||
2,3,"['even', 'fans', 'of', 'ismail', 'merchant', ""'s"", 'work', ',', 'i', 'suspect', ',', 'would', 'have', 'a', 'hard', 'time', 'sitting', 'through', 'this', 'one', '.']",negative
|
||||
3,4,"['a', 'positively', 'thrilling', 'combination', 'of', 'ethnography', 'and', 'all', 'the', 'intrigue', ',', 'betrayal', ',', 'deceit', 'and', 'murder', 'of', 'a', 'shakespearean', 'tragedy', 'or', 'a', 'juicy', 'soap', 'opera', '.']",neutral
|
||||
4,5,"['a', 'comedy-drama', 'of', 'nearly', 'epic', 'proportions', 'rooted', 'in', 'a', 'sincere', 'performance', 'by', 'the', 'title', 'character', 'undergoing', 'midlife', 'crisis', '.']",positive
|
||||
5,6,"['the', 'importance', 'of', 'being', 'earnest', ',', 'so', 'thick', 'with', 'wit', 'it', 'plays', 'like', 'a', 'reading', 'from', 'bartlett', ""'s"", 'familiar', 'quotations']",neutral
|
|
7
tutorials/data/test4dataset.tsv
Normal file
7
tutorials/data/test4dataset.tsv
Normal file
@ -0,0 +1,7 @@
|
||||
SentenceId Sentence Sentiment
|
||||
1 A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . negative
|
||||
2 This quiet , introspective and entertaining independent is worth seeking . positive
|
||||
3 Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . negative
|
||||
4 A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . neutral
|
||||
5 A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . positive
|
||||
6 The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations neutral
|
|
@ -153,7 +153,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1969418794120 1971237588872\n",
|
||||
"2438703969992 2438374526920\n",
|
||||
"+-----+------------------------+------------------------+-----+\n",
|
||||
"| idx | sentence | words | num |\n",
|
||||
"+-----+------------------------+------------------------+-----+\n",
|
||||
@ -198,7 +198,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1971237588872 1971237588872\n",
|
||||
"2438374526920 2438374526920\n",
|
||||
"+-----+------------------------+------------------------+-----+\n",
|
||||
"| idx | sentence | words | num |\n",
|
||||
"+-----+------------------------+------------------------+-----+\n",
|
||||
@ -774,9 +774,9 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d08>,\n",
|
||||
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879d88>,\n",
|
||||
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x1ca8a879e08>}"
|
||||
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d388>,\n",
|
||||
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d408>,\n",
|
||||
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x237ce26d488>}"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
@ -923,7 +923,8 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
|
||||
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n"
|
||||
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
|
||||
"6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -931,7 +932,8 @@
|
||||
"vocab.add_word_lst(['生活', '就像', '海洋'])\n",
|
||||
"print(len(vocab), vocab.word_count)\n",
|
||||
"vocab.add_word('只有')\n",
|
||||
"print(len(vocab), vocab.word_count)"
|
||||
"print(len(vocab), vocab.word_count)\n",
|
||||
"print(len(vocab), vocab.word2idx)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -959,7 +961,6 @@
|
||||
"<pad> 0\n",
|
||||
"<unk> 1\n",
|
||||
"生活 2\n",
|
||||
"只有 5\n",
|
||||
"彼岸 1 False\n"
|
||||
]
|
||||
}
|
||||
@ -968,7 +969,6 @@
|
||||
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
|
||||
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
|
||||
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
|
||||
"print(vocab.to_word(5), vocab.to_index('只有'))\n",
|
||||
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
|
||||
]
|
||||
},
|
||||
@ -979,7 +979,9 @@
|
||||
"source": [
|
||||
"**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`属性看到相应单词被添加的次数**\n",
|
||||
"\n",
|
||||
"  但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询"
|
||||
"  但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n",
|
||||
"\n",
|
||||
"  注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -992,15 +994,19 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"13 Counter({'生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '人': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
|
||||
"彼岸 12 True\n"
|
||||
"生活 2\n",
|
||||
"彼岸 12 True\n",
|
||||
"13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
|
||||
"13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '才', '能', '到达', '彼岸'])\n",
|
||||
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n",
|
||||
"print(vocab.to_word(2), vocab.to_index('生活'))\n",
|
||||
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n",
|
||||
"print(len(vocab), vocab.word_count)\n",
|
||||
"print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
|
||||
"print(len(vocab), vocab.word2idx)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -1082,51 +1088,439 @@
|
||||
"## 3 dataset 和 vocabulary 的组合使用\n",
|
||||
" \n",
|
||||
"### 3.1 从 dataframe 中加载 dataset\n",
|
||||
"\n"
|
||||
"\n",
|
||||
"以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n",
|
||||
"\n",
|
||||
"  介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "3dbd985d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>SentenceId</th>\n",
|
||||
" <th>Sentence</th>\n",
|
||||
" <th>Sentiment</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>A series of escapades demonstrating the adage ...</td>\n",
|
||||
" <td>negative</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>This quiet , introspective and entertaining in...</td>\n",
|
||||
" <td>positive</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n",
|
||||
" <td>negative</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>A positively thrilling combination of ethnogra...</td>\n",
|
||||
" <td>neutral</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>A comedy-drama of nearly epic proportions root...</td>\n",
|
||||
" <td>positive</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>The Importance of Being Earnest , so thick wit...</td>\n",
|
||||
" <td>neutral</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" SentenceId Sentence Sentiment\n",
|
||||
"0 1 A series of escapades demonstrating the adage ... negative\n",
|
||||
"1 2 This quiet , introspective and entertaining in... positive\n",
|
||||
"2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n",
|
||||
"3 4 A positively thrilling combination of ethnogra... neutral\n",
|
||||
"4 5 A comedy-drama of nearly epic proportions root... positive\n",
|
||||
"5 6 The Importance of Being Earnest , so thick wit... neutral"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n",
|
||||
"df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "89059713",
|
||||
"id": "919ab350",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
"source": [
|
||||
"接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3dbd985d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 25,
|
||||
"id": "4f634586",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"+------------+------------------------------+-----------+\n",
|
||||
"| SentenceId | Sentence | Sentiment |\n",
|
||||
"+------------+------------------------------+-----------+\n",
|
||||
"| 1 | ['a', 'series', 'of', 'es... | negative |\n",
|
||||
"| 2 | ['this', 'quiet', ',', 'i... | positive |\n",
|
||||
"| 3 | ['even', 'fans', 'of', 'i... | negative |\n",
|
||||
"| 4 | ['a', 'positively', 'thri... | neutral |\n",
|
||||
"| 5 | ['a', 'comedy-drama', 'of... | positive |\n",
|
||||
"| 6 | ['the', 'importance', 'of... | neutral |\n",
|
||||
"+------------+------------------------------+-----------+\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from fastNLP.core.dataset import DataSet\n",
|
||||
"\n",
|
||||
"dataset = DataSet()\n",
|
||||
"dataset = dataset.from_pandas(df)\n",
|
||||
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n",
|
||||
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n",
|
||||
"print(dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5c1ae192",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"  如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"id": "46722efc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"dataset.to_csv('./data/test4dataset.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5ba13989",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### 3.2 从 dataset 中获取 vocabulary"
|
||||
"### 3.2 从 dataset 中获取 vocabulary\n",
|
||||
"\n",
|
||||
"然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n",
|
||||
"\n",
|
||||
"  获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n",
|
||||
"\n",
|
||||
"  注:**使用`dataset`添加单词**,**不同于`add_word_lst`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "a2de615b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n",
|
||||
"\n",
|
||||
"{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n",
|
||||
"\n",
|
||||
"Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from fastNLP.core.vocabulary import Vocabulary\n",
|
||||
"\n",
|
||||
"vocab = Vocabulary()\n",
|
||||
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n",
|
||||
"print(vocab.word_count, '\\n')\n",
|
||||
"print(vocab.word2idx, '\\n')\n",
|
||||
"print(vocab)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f0857ccb",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n",
|
||||
"\n",
|
||||
"  使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "2f9a04b2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"+------------+------------------------------+-----------+\n",
|
||||
"| SentenceId | Sentence | Sentiment |\n",
|
||||
"+------------+------------------------------+-----------+\n",
|
||||
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n",
|
||||
"| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n",
|
||||
"| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n",
|
||||
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n",
|
||||
"| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n",
|
||||
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n",
|
||||
"+------------+------------------------------+-----------+\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vocab.index_dataset(dataset, field_name='Sentence')\n",
|
||||
"print(dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6b26b707",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "5f5eed18",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'negative': 0, 'positive': 1, 'neutral': 2}\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
|
||||
],
|
||||
"text/plain": []
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"+------------+------------------------------+-----------+\n",
|
||||
"| SentenceId | Sentence | Sentiment |\n",
|
||||
"+------------+------------------------------+-----------+\n",
|
||||
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n",
|
||||
"| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n",
|
||||
"| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n",
|
||||
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n",
|
||||
"| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n",
|
||||
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n",
|
||||
"+------------+------------------------------+-----------+\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
|
||||
"\n",
|
||||
"target_vocab.from_dataset(dataset, field_name='Sentiment')\n",
|
||||
"print(target_vocab.word2idx)\n",
|
||||
"target_vocab.index_dataset(dataset, field_name='Sentiment')\n",
|
||||
"print(dataset)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eed7ea64",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n",
|
||||
"\n",
|
||||
"<img src=\"./figures/T1-fig-dataset-and-vocabulary.png\" width=\"80%\" height=\"80%\" align=\"center\"></img>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a2de615b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5f5eed18",
|
||||
"id": "35b4f0f7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
|
41
tutorials/fastnlp_tutorial_2.ipynb
Normal file
41
tutorials/fastnlp_tutorial_2.ipynb
Normal file
@ -0,0 +1,41 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
BIN
tutorials/figures/T1-fig-dataset-and-vocabulary.png
Normal file
BIN
tutorials/figures/T1-fig-dataset-and-vocabulary.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
Loading…
Reference in New Issue
Block a user