fastNLP/tutorials/tutorial_1_data_preprocess.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# fastNLP中的DataSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+---------------------------------------------+---------+\n",
      "| raw_words                    | words                                       | seq_len |\n",
      "+------------------------------+---------------------------------------------+---------+\n",
      "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6       |\n",
      "| Second instance .            | ['Second', 'instance', '.']                 | 3       |\n",
      "| Third instance .             | ['Third', 'instance', '.']                  | 3       |\n",
      "+------------------------------+---------------------------------------------+---------+\n"
     ]
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "\n",
    "# Build a DataSet from a dict: each key becomes a field, and every value\n",
    "# should be a list of the same length (one entry per instance).\n",
    "data = {\n",
    "    'raw_words': [\n",
    "        \"This is the first instance .\",\n",
    "        \"Second instance .\",\n",
    "        \"Third instance .\",\n",
    "    ],\n",
    "    'words': [\n",
    "        ['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "        ['Second', 'instance', '.'],\n",
    "        ['Third', 'instance', '.'],\n",
    "    ],\n",
    "    'seq_len': [6, 3, 3],\n",
    "}\n",
    "dataset = DataSet(data)\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DataSet的构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+------------------------------+---------------------------------------------+---------+\n",
       "| raw_words                    | words                                       | seq_len |\n",
       "+------------------------------+---------------------------------------------+---------+\n",
       "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6       |\n",
       "+------------------------------+---------------------------------------------+---------+"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "from fastNLP import Instance\n",
    "\n",
    "# Start from an empty DataSet and append Instance objects one at a time.\n",
    "dataset = DataSet()\n",
    "# raw_words ends with ' .' so that it agrees with `words` (6 tokens) and seq_len=6.\n",
    "instance = Instance(raw_words=\"This is the first instance .\",\n",
    "                    words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "                    seq_len=6)\n",
    "dataset.append(instance)\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+------------------------------+---------------------------------------------+---------+\n",
       "| raw_words                    | words                                       | seq_len |\n",
       "+------------------------------+---------------------------------------------+---------+\n",
       "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6       |\n",
       "| Second instance .            | ['Second', 'instance', '.']                 | 3       |\n",
       "+------------------------------+---------------------------------------------+---------+"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "from fastNLP import Instance\n",
    "\n",
    "# A DataSet can also be built directly from a list of Instance objects.\n",
    "dataset = DataSet([\n",
    "    # raw_words ends with ' .' so that it agrees with `words` (6 tokens) and seq_len=6.\n",
    "    Instance(raw_words=\"This is the first instance .\",\n",
    "        words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "        seq_len=6),\n",
    "    Instance(raw_words=\"Second instance .\",\n",
    "        words=['Second', 'instance', '.'],\n",
    "        seq_len=3)\n",
    "    ])\n",
    "dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DataSet的删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----+---+\n",
       "| a  | c |\n",
       "+----+---+\n",
       "| -5 | 0 |\n",
       "| -4 | 0 |\n",
       "| -3 | 0 |\n",
       "| -2 | 0 |\n",
       "| -1 | 0 |\n",
       "| 0  | 0 |\n",
       "| 1  | 0 |\n",
       "| 2  | 0 |\n",
       "| 3  | 0 |\n",
       "| 4  | 0 |\n",
       "+----+---+"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "\n",
    "# Ten instances: field 'a' runs from -5 to 4, field 'c' is 0 everywhere.\n",
    "dataset = DataSet({\n",
    "    'a': range(-5, 5),\n",
    "    'c': [0] * 10,\n",
    "})\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+---+\n",
       "| c |\n",
       "+---+\n",
       "| 0 |\n",
       "| 0 |\n",
       "| 0 |\n",
       "| 0 |\n",
       "+---+"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def is_negative(ins):\n",
    "    \"\"\"Return True when the instance's field 'a' is below zero.\"\"\"\n",
    "    return ins['a'] < 0\n",
    "\n",
    "# Leave `dataset` untouched and build a new DataSet without the matching instances\n",
    "dropped_dataset = dataset.drop(is_negative, inplace=False)\n",
    "# Remove the matching instances from `dataset` itself\n",
    "dataset.drop(is_negative)\n",
    "# Delete the third instance (index 2)\n",
    "dataset.delete_instance(2)\n",
    "# Delete the field named 'a'\n",
    "dataset.delete_field('a')\n",
    "dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 简单的数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check whether a field named 'a' exists (it was deleted above, so this prints False)\n",
    "print(dataset.has_field('a'))  # or: ('a' in dataset)\n",
    "# Rename the field 'c' to 'b'\n",
    "dataset.rename_field('c', 'b')\n",
    "# Length of the DataSet, i.e. the number of instances\n",
    "len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+------------------------------+-------------------------------------------------+\n",
       "| raw_words                    | words                                           |\n",
       "+------------------------------+-------------------------------------------------+\n",
       "| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
       "| Second instance .            | ['Second', 'instance', '.']                     |\n",
       "| Third instance .             | ['Third', 'instance', '.']                      |\n",
       "+------------------------------+-------------------------------------------------+"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "\n",
    "data = {\n",
    "    'raw_words': [\n",
    "        \"This is the first instance .\",\n",
    "        \"Second instance .\",\n",
    "        \"Third instance .\",\n",
    "    ],\n",
    "}\n",
    "dataset = DataSet(data)\n",
    "\n",
    "# Option 1: DataSet.apply() passes each whole instance to the function;\n",
    "# the returned value is stored under new_field_name.\n",
    "dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
    "\n",
    "# Option 2: DataSet.apply_field() passes only the value of `field_name` to the function.\n",
    "dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')\n",
    "\n",
    "# Option 3: a named function can be passed instead of a lambda.\n",
    "def get_words(instance):\n",
    "    \"\"\"Split the instance's 'raw_words' sentence on whitespace and return the tokens.\"\"\"\n",
    "    return instance['raw_words'].split()\n",
    "\n",
    "dataset.apply(get_words, new_field_name='words')\n",
    "dataset"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python Now",
   "language": "python",
   "name": "now"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}