def _process_val(data):
    """Tokenize a batch of validation examples into evaluation features.

    The question/context pairs are passed to the module-level ``tokenizer``
    with ``stride=doc_stride`` and ``max_length=max_length``. Each returned
    feature is then annotated in place.

    Args:
        data: Indexable batch of examples. Every example must provide the
            ``"question"``, ``"context"`` and ``"id"`` keys.

    Returns:
        The list returned by the tokenizer, where every feature has gained
        an ``"example_id"`` entry and its ``"offset_mapping"`` has been
        masked with ``None`` wherever ``token_type_ids`` is not 1.
    """
    contexts = [example["context"] for example in data]
    questions = [example["question"] for example in data]
    features = tokenizer(
        questions,
        contexts,
        stride=doc_stride,
        max_length=max_length,
        return_dict=False,
    )
    for feature in features:
        segment_ids = feature["token_type_ids"]
        # Record the id of the example this feature was produced from.
        origin = data[feature["overflow_to_sample"]]
        feature["example_id"] = origin["id"]
        # Set the offsets that do not belong to the context to None.
        feature["offset_mapping"] = [
            offset if segment_ids[position] == 1 else None
            for position, offset in enumerate(feature["offset_mapping"])
        ]
    return features
val_dataset.map(_process_val, batched=True, num_workers=5)
#### 2.3 DataLoader
最后使用 `PaddleDataLoader` 将数据集包裹起来即可。
+ "from fastNLP.core import PaddleDataLoader\n",
+ "train_dataloader = PaddleDataLoader(train_dataset, batch_size=32, shuffle=True)\n",
+ "val_dataloader = PaddleDataLoader(val_dataset, batch_size=16)"
### 3. 模型训练:自己定义评测用的 Metric 实现更加自由的任务评测

#### 3.1 损失函数
+ "#### 3.1 损失函数\n",
+ "\n",
对于阅读理解任务,我们使用的是 `ErnieForQuestionAnswering` 模型。该模型在接受输入后会返回两个值:`start_logits` 和 `end_logits`,大小均为 `(batch_size, sequence_length)`,分别反映了每条数据中每个词语作为答案起始位置和结束位置的可能性,因此我们需要自定义一个损失函数来计算 `loss`。`CrossEntropyLossForSquad` 会分别对答案起始位置和结束位置的预测值与真实值计算交叉熵,最后返回两者的平均值作为最终的损失。
class CrossEntropyLossForSquad(paddle.nn.Layer):
    """Loss for span extraction: mean of the start and end cross-entropies."""

    def __init__(self):
        super().__init__()

    def forward(self, start_logits, end_logits, start_pos, end_pos):
        """Return the averaged start/end cross-entropy loss.

        Args:
            start_logits: Scores for the answer start position
                (per the accompanying text: ``(batch_size, sequence_length)``).
            end_logits: Scores for the answer end position, same layout.
            start_pos: Gold start positions; a trailing axis is appended
                before they are used as labels.
            end_pos: Gold end positions; handled like ``start_pos``.

        Returns:
            ``(mean(start_loss) + mean(end_loss)) / 2``.
        """
        cross_entropy = paddle.nn.functional.softmax_with_cross_entropy

        # The labels get an extra last axis before being passed as ``label``.
        start_label = paddle.unsqueeze(start_pos, axis=-1)
        end_label = paddle.unsqueeze(end_pos, axis=-1)

        start_loss = paddle.mean(
            cross_entropy(logits=start_logits, label=start_label)
        )
        end_loss = paddle.mean(
            cross_entropy(logits=end_logits, label=end_label)
        )
        return (start_loss + end_loss) / 2
#### 3.2 定义模型
模型的核心则是 `ErnieForQuestionAnswering` 的 `ernie-1.0-base-zh` 预训练模型,同时按照 `FastNLP` 的规定定义 `train_step` 和 `evaluate_step` 函数。这里 `evaluate_step` 函数并没有像文本分类那样直接返回该批次数据的评测结果,这一点我们将在下面为您讲解。
+ "\u001b[32m[2022-06-27 19:00:15,825] [ INFO]\u001b[0m - Already cached /remote-home/shxing/.paddlenlp/models/ernie-1.0-base-zh/ernie_v1_chn_base.pdparams\u001b[0m\n",
+ "W0627 19:00:15.831080 21543 gpu_context.cc:278] Please NOTE: device: 0, GPU Compute Capability: 7.5, Driver API Version: 11.2, Runtime API Version: 11.2\n",
+ "W0627 19:00:15.843276 21543 gpu_context.cc:306] device: 0, cuDNN Version: 8.1.\n"
+ "from paddlenlp.transformers import ErnieForQuestionAnswering\n",
class QAModel(paddle.nn.Layer):
    """Wrap ``ErnieForQuestionAnswering`` with train and evaluate steps.

    ``train_step`` returns the loss under the ``"loss"`` key, while
    ``evaluate_step`` returns the raw start/end logits so that a metric can
    consume them later.
    """

    def __init__(self, model_checkpoint):
        """Load the pretrained QA model named by ``model_checkpoint``."""
        super().__init__()
        self.model = ErnieForQuestionAnswering.from_pretrained(model_checkpoint)
        self.loss_func = CrossEntropyLossForSquad()

    def forward(self, input_ids, token_type_ids):
        """Run the wrapped model and return ``(start_logits, end_logits)``."""
        start, end = self.model(input_ids, token_type_ids)
        return start, end

    def train_step(self, input_ids, token_type_ids, start_pos, end_pos):
        """Compute the training loss for one batch as ``{"loss": loss}``."""
        logits = self(input_ids, token_type_ids)
        return {"loss": self.loss_func(*logits, start_pos, end_pos)}

    def evaluate_step(self, input_ids, token_type_ids):
        """Return the start/end logits of one batch, keyed by name."""
        start, end = self(input_ids, token_type_ids)
        return {"start_logits": start, "end_logits": end}
model = QAModel(MODEL_NAME)
+ "#### 3.3 自定义 Metric 进行数据的评估\n",
+ "`paddlenlp` 为我们提供了评测 `SQuAD` 格式数据集的函数 `compute_prediction` 和 `squad_evaluate`:\n",
+ "- `compute_prediction` 函数要求传入原数据 `examples` 、处理后的数据 `features` 和 `features` 对应的结果 `predictions`(一个包含所有数据 `start_logits` 和 `end_logits` 的元组)\n",
+ "- `squad_evaluate` 要求传入原数据 `examples` 和预测结果 `all_predictions`(通常来自于 `compute_prediction`)\n",
+ "在使用这两个函数的时候,我们需要向其中传入数据集,但显然根据 `fastNLP` 的设计,我们无法在 `evaluate_step` 里实现这一过程,并且 `fastNLP` 也并没有提供计算 `F1` 和 `EM` 的 `Metric`,故我们需要自己定义用于评测的 `Metric`。\n",
+ "\n",
+ "在初始化之外,一个 `Metric` 还需要实现三个函数:\n",
+ "1. `reset` - 该函数会在验证数据集的迭代之前被调用,用于清空数据;在我们自定义的 `Metric` 中,我们需要将 `all_start_logits` 和 `all_end_logits` 清空,重新收集每个 `batch` 的结果。\n",
+ "2. `update` - 该函数会在每个 `batch` 得到结果后被调用,用于更新 `Metric` 的状态;它的参数即为 `evaluate_step` 返回的内容。我们在这里将得到的 `start_logits` 和 `end_logits` 收集起来。\n",
+ "3. `get_metric` - 该函数会在数据集被迭代完毕后调用,用于计算评测的结果。现在我们有了整个验证集的 `all_start_logits` 和 `all_end_logits` ,将它们传入 `compute_prediction` 函数得到预测的结果,并继续使用 `squad_evaluate` 函数得到评测的结果。\n",
+ " - 注:`squad_evaluate` 函数会自己输出评测结果,为了不让其干扰 `FastNLP` 输出,这里我们使用 `contextlib.redirect_stdout(None)` 将函数的标准输出屏蔽掉。\n",
+ "\n",
+ "综上,`SquadEvaluateMetric` 实现的评估过程是:将验证集中所有数据的 `logits` 收集起来,然后统一传入 `compute_prediction` 和 `squad_evaluate` 中进行评估。值得一提的是,`paddlenlp.datasets.load_dataset` 返回的结果是一个 `MapDataset` 类型,其 `data` 成员为加载时的数据,`new_data` 为经过 `map` 函数处理后更新的数据,因此可以分别作为 `examples` 和 `features` 传入。"
+ "from fastNLP.core import Metric\n",
+ "from paddlenlp.metrics.squad import squad_evaluate, compute_prediction\n",
+ "import contextlib\n",
class SquadEvaluateMetric(Metric):
    """Collect start/end logits per batch and score them in SQuAD style.

    Logits are accumulated by ``update`` and handed to
    ``compute_prediction`` and ``squad_evaluate`` once ``get_metric`` is
    called.
    """

    def __init__(self, examples, features, testing=False):
        """Store the evaluation data.

        Args:
            examples: Original (unprocessed) examples.
            features: Processed features matching the collected logits.
            testing: When true, ``get_metric`` also prints a few predictions.
        """
        super().__init__("paddle", False)
        self.examples = examples
        self.features = features
        self.testing = testing
        self.all_start_logits = []
        self.all_end_logits = []

    def reset(self):
        """Drop every logit collected so far."""
        self.all_start_logits = []
        self.all_end_logits = []

    def update(self, start_logits, end_logits):
        """Append the logits of one batch, one item at a time, as numpy."""
        for start_row, end_row in zip(start_logits, end_logits):
            self.all_start_logits.append(start_row.numpy())
            self.all_end_logits.append(end_row.numpy())

    def get_metric(self):
        """Turn the collected logits into predictions and evaluate them.

        Returns:
            Whatever ``squad_evaluate`` returns for the predictions.
        """
        # Only as many features as logits were collected are used, so a run
        # over part of the data (presumably e.g. a sanity check) still
        # lines up.
        collected = len(self.all_start_logits)
        logits = (self.all_start_logits, self.all_end_logits)
        # NOTE(review): the trailing positional arguments are presumably
        # version_2_with_negative, n_best_size and max_answer_length;
        # confirm against the paddlenlp signature.
        all_predictions, _, _ = compute_prediction(
            self.examples, self.features[:collected], logits, False, 20, 30
        )

        # squad_evaluate prints its own report; silence it so that it does
        # not interfere with the surrounding output.
        with contextlib.redirect_stdout(None):
            result = squad_evaluate(
                examples=self.examples,
                preds=all_predictions,
                is_whitespace_splited=False,
            )

        if self.testing:
            self.print_predictions(all_predictions)
        return result

    def print_predictions(self, preds):
        """Print context, question, prediction and gold answer for at most
        the first five examples."""
        for _, example in zip(range(5), self.examples):
            print()
            print("原文:", example["context"])
            print(
                "问题:", example["question"],
                "答案:", preds[example["id"]],
                "正确答案:", example["answers"]["text"],
            )
+ "\n",
+ "metric = SquadEvaluateMetric(\n",
+ " val_dataloader.dataset.data,\n",
+ " val_dataloader.dataset.new_data,\n",
+ ")"
+ "#### 3.4 训练\n",
+ "至此所有的准备工作已经完成,可以使用 `Trainer` 进行训练了。学习率我们依旧采用线性预热策略 `LinearDecayWithWarmup`,优化器为 `AdamW`;回调模块我们选择 `LRSchedCallback` 更新学习率和 `LoadBestModelCallback` 监视评测结果的 `f1` 分数。初始化好 `Trainer` 之后,就将训练的过程交给 `FastNLP` 吧。"
[19:04:54] INFO Running evaluator sanity check for 2 batches.
+ "from fastNLP import Trainer, LRSchedCallback, LoadBestModelCallback\n",
+ "from paddlenlp.transformers import LinearDecayWithWarmup\n",
+ "n_epochs = 1\n",
+ "num_training_steps = len(train_dataloader) * n_epochs\n",
+ "lr_scheduler = LinearDecayWithWarmup(3e-5, num_training_steps, 0.1)\n",
+ "optimizer = paddle.optimizer.AdamW(\n",
+ " learning_rate=lr_scheduler,\n",
+ " parameters=model.parameters(),\n",
+ ")\n",
+ "callbacks=[\n",
+ " LRSchedCallback(lr_scheduler, step_on=\"batch\"),\n",
+ " LoadBestModelCallback(\"f1#squad\", larger_better=True, save_folder=\"fnlp-ernie-squad\")\n",
+ "]\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " train_dataloader=train_dataloader,\n",
+ " evaluate_dataloaders=val_dataloader,\n",
+ " device=1,\n",
+ " optimizers=optimizer,\n",
+ " n_epochs=n_epochs,\n",
+ " callbacks=callbacks,\n",
+ " evaluate_every=100,\n",
+ " metrics={\"squad\": metric},\n",
+ ")\n",
trainer.run()
+ "#### 3.5 测试\n",
+ "\n",
+ "最后,我们可以使用 `Evaluator` 查看我们训练的结果。我们在之前为 `SquadEvaluateMetric` 设置了 `testing` 参数来在测试阶段进行输出,可以看到,训练的结果还是比较不错的。"
+ "from fastNLP import Evaluator\n",
+ "evaluator = Evaluator(\n",
+ " model=model,\n",
+ " dataloaders=val_dataloader,\n",
+ " device=1,\n",
+ " metrics={\n",
+ " \"squad\": SquadEvaluateMetric(\n",
+ " val_dataloader.dataset.data,\n",
+ " val_dataloader.dataset.new_data,\n",
+ " testing=True,\n",
+ " ),\n",
+ " },\n",
+ ")\n",
result = evaluator.run()
