mirror of https://gitee.com/ascend/ModelLink.git (synced 2024-12-05 05:17:40 +08:00)
parent 0ddad3f9c8
commit 19d3b157ff

README.md (55 lines changed)
@@ -305,6 +305,27 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh"> eval </a> </td>
 <td> 【昇腾贡献】 </td>
 </tr>
+<tr>
+<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
+<td>7B</td>
+<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
+<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
+<td> -- </td>
+<td> -- </td>
+<td> -- </td>
+<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
+<td rowspan="1"> 【社区贡献】 </td>
+</tr>
+<td>14B</td>
+<td> <a href="examples/qwen15/pretrain_qwen15_14b_ptd.sh"> pretrain </a> </td>
+<td> <a href="examples/qwen15/generate_qwen15_14b_ptd.sh"> generate </a> </td>
+<td> -- </td>
+<td> -- </td>
+<td> -- </td>
+<td> <a href="examples/qwen15/evaluate_qwen15_14b_ptd.sh"> eval </a> </td>
+<td rowspan="1"> 【社区贡献】 </td>
+<tr>
+</tr>
 <tr>
 <td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
 <td>34B</td>
@@ -337,17 +358,6 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td> -- </td>
 <td> <a href="examples/mistral/evaluate_mistral_7b_ptd.sh"> eval </a> </td>
 <td>【昇腾贡献】</td>
-<tr>
-<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
-<td>7B</td>
-<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
-<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
-<td> -- </td>
-<td> -- </td>
-<td> -- </td>
-<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
-<td> 【社区贡献】 </td>
-</tr>
 <tr>
 <td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
 <td>7B</td>
@@ -587,6 +597,21 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td> 285 </td>
 <td> 345 </td>
 </tr>
+<tr>
+<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
+<td> 7B </td>
+<td> 1x8 </td>
+<td> BF16 </td>
+<td> 2862 </td>
+<td> 2621 </td>
+</tr>
+<tr>
+<td> 14B </td>
+<td> 1x8 </td>
+<td> BF16 </td>
+<td> 1717 </td>
+<td> 1702 </td>
+</tr>
 <tr>
 <td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
 <td>34B</td>
@@ -611,14 +636,6 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 <td> 2806 </td>
 <td> 2734 </td>
 </tr>
-<tr>
-<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
-<td> 7B </td>
-<td> 1x8 </td>
-<td> BF16 </td>
-<td> 2548 </td>
-<td> 2578 </td>
-</tr>
 <tr>
 <td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
 <td>7B</td>
README_en.md (55 lines changed)
@@ -304,6 +304,27 @@ Current ModelLink supports pre-training and fine-tuning for the following models
 <td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh"> eval </a> </td>
 <td> 【Ascend】 </td>
 </tr>
+<tr>
+<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
+<td>7B</td>
+<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
+<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
+<td> -- </td>
+<td> -- </td>
+<td> -- </td>
+<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
+<td rowspan="1"> 【Community】 </td>
+</tr>
+<td>14B</td>
+<td> <a href="examples/qwen15/pretrain_qwen15_14b_ptd.sh"> pretrain </a> </td>
+<td> <a href="examples/qwen15/generate_qwen15_14b_ptd.sh"> generate </a> </td>
+<td> -- </td>
+<td> -- </td>
+<td> -- </td>
+<td> <a href="examples/qwen15/evaluate_qwen15_14b_ptd.sh"> eval </a> </td>
+<td rowspan="1"> 【Community】 </td>
+<tr>
+</tr>
 <tr>
 <td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
 <td>34B</td>
@@ -336,17 +357,6 @@ Current ModelLink supports pre-training and fine-tuning for the following models
 <td> -- </td>
 <td> <a href="examples/mistral/evaluate_mistral_7b_ptd.sh"> eval </a> </td>
 <td>【Ascend】</td>
-<tr>
-<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
-<td>7B</td>
-<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
-<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
-<td> -- </td>
-<td> -- </td>
-<td> -- </td>
-<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
-<td> 【Community】 </td>
-</tr>
 <tr>
 <td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
 <td>7B</td>
@@ -586,6 +596,21 @@ For the supported models listed above, we provide training scripts and readme in
 <td> 285 </td>
 <td> 345 </td>
 </tr>
+<tr>
+<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
+<td> 7B </td>
+<td> 1x8 </td>
+<td> BF16 </td>
+<td> 2862 </td>
+<td> 2621 </td>
+</tr>
+<tr>
+<td> 14B </td>
+<td> 1x8 </td>
+<td> BF16 </td>
+<td> 1717 </td>
+<td> 1702 </td>
+</tr>
 <tr>
 <td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
 <td>34B</td>
@@ -610,14 +635,6 @@ For the supported models listed above, we provide training scripts and readme in
 <td> 2806 </td>
 <td> 2734 </td>
 </tr>
-<tr>
-<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
-<td> 7B </td>
-<td> 1x8 </td>
-<td> BF16 </td>
-<td> 2548 </td>
-<td> 2578 </td>
-</tr>
 <tr>
 <td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
 <td>7B</td>
examples/qwen15/README.md

@@ -15,6 +15,14 @@
 - [推理](#推理)
 - [评估](#评估)
+
+- [Qwen1.5-14B](#qwen15-14b)
+- [训练](#训练)
+- [脚本](#脚本)
+- [性能](#性能)
+- [吞吐](#吞吐)
+- [推理](#推理)
+- [评估](#评估)
 
 # Qwen1.5-7B
 
 ## 训练
@@ -102,9 +110,10 @@ Qwen1.5-7B 训练的硬件配置:
 --target-pipeline-parallel-size 1 \
 --make-vocab-size-divisible-by 16 \
 --load-dir ./model_from_hf/Qwen1.5-7B/ \
 --save-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1/ \
 --tokenizer-model ./model_from_hf/Qwen1.5-7B/tokenizer.json \
---add-qkv-bias
+--add-qkv-bias \
+--param-dtype bf16
 ```
 
 任意并行切分策略的Megatron权重 格式转化为 HuggingFace权重
@@ -122,7 +131,7 @@ Qwen1.5-7B 训练的硬件配置:
 --target-pipeline-parallel-size 1 \
 --add-qkv-bias \
 --load-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1 \
---save-dir ./model_from_hf/Qwen1.5-7B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen-7B/mg2hg/
+--save-dir ./model_from_hf/Qwen1.5-7B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen1.5-7B/mg2hg/
 ```
 
 5. 预训练
@@ -199,8 +208,7 @@ Qwen1.5-7B 训练的硬件配置:
 6.2 全参微调 全参微调的配置脚本基本和预训练脚本一致。
 
 *区别是数据集,以及增加训练参数`--is-instruction-dataset`,增加微调参数`--finetune`,增加预训练权重加载参数`--load`
-,使微调从第一步开始,修改tokenizer参数,去掉`--tokenizer-type Llama2Tokenizer`
-和`--tokenizer-model ${TOKENIZER_MODEL}`。*
+,使微调从第一步开始。*
 
 修改如下:
 
@@ -213,8 +221,6 @@ Qwen1.5-7B 训练的硬件配置:
 --load ${CKPT_LOAD_DIR} \
 --finetune \
 --is-instruction-dataset \
---tokenizer-type PretrainedFromHF \
---tokenizer-name-or-path ${TOKENIZER_PATH} \
 --tokenizer-not-use-fast \
 ```
 
@@ -286,4 +292,274 @@ bash examples/qwen15/evaluate_qwen15_7b_ptd.sh

| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
|:-----:|:----:|:-----:|:-------------------------------------------------------:|:------:|
| MMLU | 57 | 14042 | [61.0](https://qwenlm.github.io/zh/blog/qwen1.5) | 60.3 |

# Qwen1.5-14B

## 训练

Qwen1.5-14B 训练的硬件配置:

| 硬件 | 配置 |
|:---:|:---------------:|
| NPU | 8 x Ascend NPUs |

### 脚本

1. 克隆仓库到本地服务器

```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```

2. 搭建环境

```bash
# python3.8
conda create -n test python=3.8
conda activate test

# 安装 torch 和 torch_npu
pip install torch-2.2.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.2.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl

# 安装加速库
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip install -e .
cd ..

# 安装其余依赖库
pip install -r requirements.txt
```

**注意**:transformer版本要4.37.0以上
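For reference, an editorial sketch (not part of this commit): one way to confirm the installed transformers version satisfies the 4.37.0 requirement, assuming the `test` conda environment created above is active and pip has network access.

```shell
# Upgrade if necessary, then print the version actually installed
pip install "transformers>=4.37.0"
python -c "import transformers; print(transformers.__version__)"
```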
3. 下载 Qwen1.5-14B 的 [预训练权重和词表](https://huggingface.co/Qwen/Qwen1.5-14B/tree/main)

```bash
mkdir ./model_from_hf/Qwen1.5-14B/
cd ./model_from_hf/Qwen1.5-14B/
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/merges.txt
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model.safetensors.index.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/special_tokens_map.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/vocab.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model-00001-of-00008.safetensors
...
cd ../../
```
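One editorial note (not part of this commit): the `/blob/main/` URLs above point at Hugging Face's web viewer pages, while raw files are served from `/resolve/main/` URLs. A sketch of the direct-download form for the same file list, assuming network access to huggingface.co:

```shell
cd ./model_from_hf/Qwen1.5-14B/
# Same files as above, with blob replaced by resolve so wget fetches raw content
wget https://huggingface.co/Qwen/Qwen1.5-14B/resolve/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/resolve/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/resolve/main/model-00001-of-00008.safetensors
# ...repeat for the remaining json and safetensors files
cd ../../
```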
4. 权重转换

将权重从 huggingface 格式转化为 megatron 格式
***(该场景一般用于使能开源的HuggingFace模型在Megatron上进行训练)***

```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh

python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader llama2_hf \
    --saver megatron \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 1 \
    --make-vocab-size-divisible-by 16 \
    --load-dir ./model_from_hf/Qwen1.5-14B/ \
    --save-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/Qwen1.5-14B/tokenizer.json \
    --add-qkv-bias \
    --param-dtype bf16
```

任意并行切分策略的Megatron权重 格式转化为 HuggingFace权重
***(该场景一般用于将训练好的megatron模型重新转回HuggingFace格式)***

```bash
# 请按照您的真实环境修改 set_env.sh 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader megatron \
    --saver megatron \
    --save-model-type save_huggingface_qwen \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 1 \
    --add-qkv-bias \
    --load-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1 \
    --save-dir ./model_from_hf/Qwen1.5-14B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen1.5-14B/mg2hg/
```

5. 预训练

5.1 准备数据集

下载Qwen1.5-14B [数据集](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)

```shell
# 下载数据
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..

# 处理数据
mkdir ./dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B \
    --output-prefix ./dataset/Qwen1.5-14B/alpaca \
    --tokenizer-type PretrainedFromHF \
    --seq-length 8192 \
    --workers 4 \
    --log-interval 1000
```

5.2 预训练

配置Qwen1.5-14B 预训练脚本: examples/qwen15/pretrain_qwen15_14b_ptd.sh

```shell
# 设置 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# 根据实际情况配置词表、数据集、模型参数保存路径
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B"
TOKENIZER_MODEL="./model_from_hf/Qwen1.5-14B"  #词表路径
DATA_PATH="./dataset/Qwen1.5-14B/alpaca_text_document"  #数据集路径
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
```

多机运行增加参数 `--overlap-grad-reduce`。

启动 Qwen1.5-14B 预训练脚本: examples/qwen15/pretrain_qwen15_14b_ptd.sh

```shell
bash examples/qwen15/pretrain_qwen15_14b_ptd.sh
```

**注意**:如果使用多机训练,且没有设置数据共享,需要在训练启动脚本中增加 `--no-shared-storage` 参数,设置此参数之后将会根据分布式参数判断非主节点是否需要load数据,并检查相应缓存和生成数据。
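As an illustration of the two multi-machine notes above, an editorial sketch (not part of this commit): both flags can be appended to the `GPT_ARGS` block defined in examples/qwen15/pretrain_qwen15_14b_ptd.sh before torchrun is invoked; `GPT_ARGS` is the variable used by the script added in this commit.

```shell
# Multi-node extras: overlap gradient reduction with the backward pass, and let
# non-master nodes prepare their own data cache when there is no shared storage
GPT_ARGS="$GPT_ARGS --overlap-grad-reduce --no-shared-storage"
```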
6. 微调

6.1 准备微调数据集

下载微调数据集 [这里](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)

```shell
# 下载数据集
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..

# 处理微调数据集
mkdir ./finetune_dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B/ \
    --output-prefix ./finetune_dataset/Qwen1.5-14B/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF \
    --handler-name GeneralInstructionHandler \
    --append-eod
```

6.2 全参微调 全参微调的配置脚本基本和预训练脚本一致。

*区别是数据集,以及增加训练参数`--is-instruction-dataset`,增加微调参数`--finetune`,增加预训练权重加载参数`--load`,使微调从第一步开始。*

修改如下:

```bash
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
DATA_PATH="./finetune_dataset/Qwen1.5-14B/alpaca"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"

    --load ${CKPT_LOAD_DIR} \
    --finetune \
    --is-instruction-dataset \
    --tokenizer-not-use-fast \
```

启动微调脚本: examples/qwen15/tune_qwen15_14b_ptd.sh

```shell
bash examples/qwen15/tune_qwen15_14b_ptd.sh
```

### 性能

#### 吞吐

Qwen1.5-14B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:

| 设备 | 模型 | tokens吞吐 (tokens/s/p) |
|:--------------:|:-----------:|:---------------------:|
| NPUs | Qwen1.5-14B | 1717.8 |
| 参考 | Qwen1.5-14B | 1702.2 |

## 推理

配置 Qwen1.5-14B 推理脚本:examples/qwen15/generate_qwen15_14b_ptd.sh

```bash
# ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# 修改模型权重路径和词表路径
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B"
```

启动Qwen1.5-14B推理脚本

```bash
bash examples/qwen15/generate_qwen15_14b_ptd.sh
```

推理示例如下:

![Inference](../../sources/images/qwen15/qwen1.5_14b_inference.png)

## 评估

使用[CEval数据集](https://huggingface.co/datasets/ceval/ceval-exam)和[MMLU数据集](https://huggingface.co/datasets/cais/mmlu)评估模型.

配置Qwen1.5-14B评估脚本: examples/qwen15/evaluate_qwen15_14b_ptd.sh

```bash
# ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# 修改模型参数路径和词表路径
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"  #词表路径
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"  #模型路径

# 配置任务和数据集路径
DATA_PATH="./mmlu/data/test/"  # ceval任务配置为 "./ceval/val/"
TASK="mmlu"  # ceval任务配置为 "ceval"
```

启动评估

```bash
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```

| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
|:-----:|:----:|:-----:|:------------------------------------------------:|:------:|
| MMLU | 57 | 14042 | [67.6](https://qwenlm.github.io/zh/blog/qwen1.5) | 67.3 |
examples/qwen15/README_en.md

@@ -14,6 +14,14 @@
 - [Inference](#Inference)
 - [Evaluation](#Evaluation)
+
+- [Qwen1.5-14B](#qwen15-14b)
+- [Training](#training)
+- [Script](#script)
+- [Performance](#performance)
+- [Machine performance](#machine-performance)
+- [Inference](#Inference)
+- [Evaluation](#Evaluation)
 
 # Qwen1.5-7B
 
 ## Training
@@ -105,7 +113,8 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
 --load-dir ./model_from_hf/Qwen1.5-7B/ \
 --save-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1/ \
 --tokenizer-model ./model_from_hf/Qwen1.5-7B/tokenizer.json \
---add-qkv-bias
+--add-qkv-bias \
+--param-dtype bf16
 ```
 
 Any Megatron weights with parallel slicing strategy --> Any Megatron weights with parallel slicing strategy
@@ -123,7 +132,7 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
 --target-pipeline-parallel-size 1 \
 --add-qkv-bias \
 --load-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1 \
---save-dir ./model_from_hf/Qwen1.5-7B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen-7B/mg2hg/
+--save-dir ./model_from_hf/Qwen1.5-7B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen1.5-7B/mg2hg/
 ```
 5. Pre-training
 
@@ -199,7 +208,7 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
 
 The configuration script with the fine-tuning parameters is basically the same as the pre-training script.
 
-*The difference is the dataset, and add the training parameter `--is-instruction dataset`, add the fine-tuning parameter `--finetune`, add the pre-training weight loading parameter `--load`, so that the fine-tuning starts from the first step, modify the tokenizer parameter, Drop `--tokenizer-type Llama2Tokenizer` and `--tokenizer-model ${TOKENIZER_MODEL}`.*
+*The difference is the dataset, and add the training parameter `--is-instruction-dataset`, add the fine-tuning parameter `--finetune`, add the pre-training weight loading parameter `--load`, so that the fine-tuning starts from the first step, modify the tokenizer parameter.*
 
 Modified as follows:
 
@@ -212,8 +221,6 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
 --load ${CKPT_PATH} \
 --finetune \
 --is-instruction-dataset \
---tokenizer-type PretrainedFromHF \
---tokenizer-name-or-path ${TOKENIZER_PATH} \
 --tokenizer-not-use-fast \
 ```
 
@@ -256,7 +263,6 @@ bash examples/qwen15/generate_qwen15_7b_ptd.sh
 **Note**: If using multi machine training, it is necessary to set up multi machine data sharing, and non primary nodes can read the primary node data through data sharing. Alternatively, directly copy the data generated by the master node to non master nodes.
-
 
 Some inference samples are as follows:
 
 ![Inference](../../sources/images/qwen15/qwen1.5_7b_inference.png)
 
 ## Evaluation
@@ -287,3 +293,277 @@ bash examples/qwen15/evaluate_qwen15_7b_ptd.sh

| Task | Subset | Question | OpenSource | NPU |
|:-----:|:------:|:--------:|:-------------------------------------------------------------:|:-------:|
| MMLU | 57 | 14042 | [61.0](https://qwenlm.github.io/zh/blog/qwen1.5) | 60.3 |

# Qwen1.5-14B

## Training

Here's a hardware summary of pre-training Qwen1.5-14B:

| Hardware | Value |
| :------: | :---------------------------------------------: |
| NPU | 8 x Ascend NPUs |

### Script

1. Clone the repository to your local server:

```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```

2. Build environment

```bash
# python3.8
conda create -n test python=3.8
conda activate test

# install torch and torch_npu
pip install torch-2.2.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.2.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl

# install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip install -e .
cd ..

# install other packages
pip install -r requirements.txt
```

**Note**: transformer version 4.37.0 or higher

3. Prepare pretrained weights and tokenizer

Download the Qwen1.5-14B checkpoint from [here](https://huggingface.co/Qwen/Qwen1.5-14B/tree/main)

```bash
mkdir ./model_from_hf/Qwen1.5-14B/
cd ./model_from_hf/Qwen1.5-14B/
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/merges.txt
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model.safetensors.index.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/special_tokens_map.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/vocab.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model-00001-of-00008.safetensors
...
cd ../../
```

4. Weights convert

Convert weights from huggingface format to megatron format
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***

```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh

python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader llama2_hf \
    --saver megatron \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 1 \
    --make-vocab-size-divisible-by 16 \
    --load-dir ./model_from_hf/Qwen1.5-14B/ \
    --save-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/Qwen1.5-14B/tokenizer.json \
    --add-qkv-bias \
    --param-dtype bf16
```

Any Megatron weights with parallel slicing strategy --> Any Megatron weights with parallel slicing strategy
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***

```shell
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader megatron \
    --saver megatron \
    --save-model-type save_huggingface_qwen \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 1 \
    --add-qkv-bias \
    --load-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1 \
    --save-dir ./model_from_hf/Qwen1.5-14B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen1.5-14B/mg2hg/
```

5. Pre-training

5.1 prepare dataset

Download the Qwen1.5-14B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)

```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..

# process datasets
mkdir ./dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B \
    --output-prefix ./dataset/Qwen1.5-14B/alpaca \
    --tokenizer-type PretrainedFromHF \
    --seq-length 8192 \
    --workers 4 \
    --log-interval 1000
```

5.2 pre-training

Config Qwen1.5-14B pre-training script: examples/qwen15/pretrain_qwen15_14b_ptd.sh

```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
TOKENIZER_MODEL="./model_from_hf/Qwen1.5-14B"  #tokenizer path
DATA_PATH="./dataset/Qwen1.5-14B/alpaca_text_document"  #processed dataset
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
```

Multi-machine training requires the addition of parameter `--overlap-grad-reduce`.

Launch Qwen1.5-14B pre-training script: examples/qwen15/pretrain_qwen15_14b_ptd.sh

```shell
bash examples/qwen15/pretrain_qwen15_14b_ptd.sh
```

**Note**: If using multi machine training, and no data sharing configuration on the machines, it's necessary to add the parameter `--no-shared-storage`. This parameter will determine whether non master nodes need to load data based on distributed parameters, and check the corresponding cache and generated data.
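A related editorial sketch (not part of this commit): the multi-node variables already defined at the top of pretrain_qwen15_14b_ptd.sh are the ones to adjust for, say, a 2-node run; the address below is a placeholder and must be replaced with the master node's real IP.

```shell
# Example 2-node settings (run the script on both nodes, each with its own NODE_RANK)
NNODES=2
NODE_RANK=0                 # 0 on the master node, 1 on the second node
NPUS_PER_NODE=8
MASTER_ADDR=192.168.0.10    # placeholder: IP of the master node, reachable from all nodes
MASTER_PORT=6000
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
```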
6. fine-tuning

6.1 Prepare fine-tuning dataset Download the Qwen1.5-14B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)

```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..

# process datasets
mkdir ./finetune_dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B/ \
    --output-prefix ./finetune_dataset/Qwen1.5-14B/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF \
    --handler-name GeneralInstructionHandler \
    --append-eod
```

6.2 Full Parameters Fine-Tuning

The configuration script with the fine-tuning parameters is basically the same as the pre-training script.

*The difference is the dataset, and add the training parameter `--is-instruction-dataset`, add the fine-tuning parameter `--finetune`, add the pre-training weight loading parameter `--load`, so that the fine-tuning starts from the first step, modify the tokenizer parameter.*

Modified as follows:

```bash
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
DATA_PATH="./finetune_dataset/Qwen1.5-14B/alpaca"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"

    --load ${CKPT_LOAD_DIR} \
    --finetune \
    --is-instruction-dataset \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ${TOKENIZER_PATH} \
    --tokenizer-not-use-fast \
```

Launch Qwen1.5-14B fine-tuning script: examples/qwen15/tune_qwen15_14b_ptd.sh

```shell
bash examples/qwen15/tune_qwen15_14b_ptd.sh
```

### Performance

#### Machine performance

The performance of Qwen1.5-14B in **Ascend NPU** and **Reference**:

| Device | Model | throughput rate (tokens/s/p) |
|:---------:|:-----------:|:----------------------------:|
| NPUs | Qwen1.5-14B | 1717.8 |
| Reference | Qwen1.5-14B | 1702.2 |
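For context, an editorial note (not part of this commit): tokens/s/p is throughput per device. With the 1x8 setup above and the seq-length 8192, global batch size 256 used by pretrain_qwen15_14b_ptd.sh, the aggregate throughput and time per iteration work out roughly as follows.

```shell
# Rough conversion of the table above (assumes 8 devices, 8192 x 256 tokens per iteration)
awk 'BEGIN {
    per_npu = 1717.8; npus = 8
    total   = per_npu * npus              # ~13742 tokens/s across the node
    iter    = 8192 * 256                  # tokens consumed per training iteration
    printf "aggregate tokens/s: %.0f, seconds per iteration: %.1f\n", total, iter / total
}'
```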
## Inference

Config Qwen1.5-14B inference script: examples/qwen15/generate_qwen15_14b_ptd.sh

```bash
# ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# modify script model path and tokenizer path
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B"
```

Launch Qwen1.5-14B inference script: examples/qwen15/generate_qwen15_14b_ptd.sh

```bash
bash examples/qwen15/generate_qwen15_14b_ptd.sh
```

**Note**: If using multi machine training, it is necessary to set up multi machine data sharing, and non primary nodes can read the primary node data through data sharing. Alternatively, directly copy the data generated by the master node to non master nodes.

Some inference samples are as follows:

![Inference](../../sources/images/qwen15/qwen1.5_14b_inference.png)

## Evaluation

We use the [CEval benchmark](https://huggingface.co/datasets/ceval/ceval-exam) and [MMLU benchmark](https://huggingface.co/datasets/cais/mmlu) to evaluate our model.

Config Qwen1.5-14B evaluation script: examples/qwen15/evaluate_qwen15_14b_ptd.sh

```bash
# ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh

# Modify the model parameter path and vocabulary path
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"  # vocabulary path
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"  # parameter path

# Configure the task type and dataset path
DATA_PATH="./mmlu/data/test/"  # "./ceval/val/" for ceval task
TASK="mmlu"  # "ceval" for ceval task
```

Launch Qwen1.5-14B evaluation

```bash
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```

| Task | Subset | Question | OpenSource | NPU |
|:-----:|:------:|:--------:|:------------------------------------------------:|:----:|
| MMLU | 57 | 14042 | [67.6](https://qwenlm.github.io/zh/blog/qwen1.5) | 67.3 |
examples/qwen15/evaluate_qwen15_14b_ptd.sh (new file, 68 lines)

@@ -0,0 +1,68 @@
#!/bin/bash

# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1

# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# please fill these path configurations
CHECKPOINT="your model ckpt path"
TOKENIZER_PATH="your tokenizer path"
DATA_PATH="your data path"
TASK="mmlu"

TP=8
PP=1

DISTRIBUTED_ARGS="
    --nproc_per_node $NPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

# Different task needs different max_new_tokens value, please follow the instruction in readme.
torchrun $DISTRIBUTED_ARGS evaluation.py \
    --task-data-path $DATA_PATH \
    --task ${TASK} \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --seq-length 8192 \
    --max-new-tokens 1 \
    --max-position-embeddings 32768 \
    --num-layers 40 \
    --hidden-size 5120 \
    --ffn-hidden-size 13696 \
    --num-attention-heads 40 \
    --disable-bias-linear \
    --swiglu \
    --position-embedding-type rope \
    --load $CHECKPOINT \
    --normalization RMSNorm \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ${TOKENIZER_PATH} \
    --tokenizer-not-use-fast \
    --micro-batch-size 1 \
    --exit-on-missing-checkpoint \
    --no-load-rng \
    --no-load-optim \
    --untie-embeddings-and-output-weights \
    --add-qkv-bias \
    --make-vocab-size-divisible-by 16 \
    --padded-vocab-size 152064 \
    --rotary-base 1000000 \
    --no-gradient-accumulation-fusion \
    --attention-softmax-in-fp32 \
    --seed 42 \
    --bf16 \
    | tee logs/eval_qwen15_14b_${TASK}.log
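A usage sketch for the script above (editorial, not part of this commit), following the path layout used in this page's README changes:

```shell
# Inside examples/qwen15/evaluate_qwen15_14b_ptd.sh, replace the placeholder assignments:
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"
DATA_PATH="./mmlu/data/test/"   # use "./ceval/val/" for the CEval task
TASK="mmlu"                     # use "ceval" for the CEval task

# Then launch; output is written to logs/eval_qwen15_14b_mmlu.log
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```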
examples/qwen15/generate_qwen15_14b_ptd.sh (new file, 64 lines)

@@ -0,0 +1,64 @@
#!/bin/bash

# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1

# please fill these path configurations
CHECKPOINT="your model ckpt path"
TOKENIZER_PATH="your tokenizer path"

# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6010
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

TP=8
PP=1

DISTRIBUTED_ARGS="
    --nproc_per_node $NPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

torchrun $DISTRIBUTED_ARGS inference.py \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --num-layers 40 \
    --hidden-size 5120 \
    --num-attention-heads 40 \
    --ffn-hidden-size 13696 \
    --max-position-embeddings 32768 \
    --seq-length 8192 \
    --make-vocab-size-divisible-by 1 \
    --padded-vocab-size 152064 \
    --rotary-base 1000000 \
    --untie-embeddings-and-output-weights \
    --micro-batch-size 1 \
    --swiglu \
    --disable-bias-linear \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ${TOKENIZER_PATH} \
    --load ${CHECKPOINT} \
    --normalization RMSNorm \
    --position-embedding-type rope \
    --norm-epsilon 1e-6 \
    --hidden-dropout 0 \
    --attention-dropout 0 \
    --tokenizer-not-use-fast \
    --add-qkv-bias \
    --max-new-tokens 256 \
    --no-gradient-accumulation-fusion \
    --exit-on-missing-checkpoint \
    --attention-softmax-in-fp32 \
    --seed 42 \
    --bf16 \
    | tee logs/generate_qwen15_14b.log
examples/qwen15/pretrain_qwen15_14b_ptd.sh (new file, 101 lines)

@@ -0,0 +1,101 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1

NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# please fill these path configurations
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"

TP=4
PP=2

DISTRIBUTED_ARGS="
    --nproc_per_node $NPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

GPT_ARGS="
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --sequence-parallel \
    --num-layers 40 \
    --hidden-size 5120 \
    --ffn-hidden-size 13696 \
    --num-attention-heads 40 \
    --load ${CKPT_LOAD_DIR} \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ${TOKENIZER_MODEL} \
    --seq-length 8192 \
    --max-position-embeddings 32768 \
    --micro-batch-size 1 \
    --global-batch-size 256 \
    --make-vocab-size-divisible-by 1 \
    --padded-vocab-size 152064 \
    --rotary-base 1000000 \
    --lr 1.25e-6 \
    --train-iters 5000 \
    --lr-decay-style cosine \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --attention-dropout 0.0 \
    --init-method-std 0.01 \
    --hidden-dropout 0.0 \
    --position-embedding-type rope \
    --normalization RMSNorm \
    --swiglu \
    --use-flash-attn \
    --use-fused-rmsnorm \
    --use-fused-rotary-pos-emb \
    --use-rotary-position-embeddings \
    --use-fused-swiglu \
    --use-mc2 \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --min-lr 1.25e-7 \
    --weight-decay 1e-1 \
    --lr-warmup-fraction 0.01 \
    --clip-grad 1.0 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --add-qkv-bias \
    --initial-loss-scale 4096 \
    --no-gradient-accumulation-fusion \
    --no-load-optim \
    --no-load-rng \
    --seed 42 \
    --bf16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --split 100,0,0
"

OUTPUT_ARGS="
    --log-interval 1 \
    --save-interval 5000 \
    --eval-interval 5000 \
    --eval-iters 0 \
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save ${CKPT_SAVE_DIR} \
    | tee logs/train_qwen15_14b.log
examples/qwen15/tune_qwen15_14b_ptd.sh (new file, 103 lines)

@@ -0,0 +1,103 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1

MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# please fill these path configurations
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_PATH="your tokenizer path"

TP=4
PP=2

DISTRIBUTED_ARGS="
    --nproc_per_node $NPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

GPT_ARGS="
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --sequence-parallel \
    --num-layers 40 \
    --hidden-size 5120 \
    --ffn-hidden-size 13696 \
    --num-attention-heads 40 \
    --load ${CKPT_LOAD_DIR} \
    --finetune \
    --is-instruction-dataset \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ${TOKENIZER_PATH} \
    --seq-length 8192 \
    --max-position-embeddings 32768 \
    --micro-batch-size 1 \
    --global-batch-size 256 \
    --make-vocab-size-divisible-by 16 \
    --padded-vocab-size 152064 \
    --rotary-base 1000000 \
    --lr 1.25e-6 \
    --train-iters 5000 \
    --lr-decay-style cosine \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --attention-dropout 0.0 \
    --init-method-std 0.01 \
    --hidden-dropout 0.0 \
    --position-embedding-type rope \
    --normalization RMSNorm \
    --use-fused-rmsnorm \
    --swiglu \
    --use-flash-attn \
    --use-fused-rotary-pos-emb \
    --use-rotary-position-embeddings \
    --use-fused-swiglu \
    --use-mc2 \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --min-lr 1.25e-7 \
    --weight-decay 1e-1 \
    --lr-warmup-fraction 0.01 \
    --clip-grad 1.0 \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --add-qkv-bias \
    --initial-loss-scale 4096 \
    --no-gradient-accumulation-fusion \
    --no-load-optim \
    --no-load-rng \
    --seed 42 \
    --bf16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --split 100,0,0
"

OUTPUT_ARGS="
    --log-interval 1 \
    --save-interval 5000 \
    --eval-interval 5000 \
    --eval-iters 0 \
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save ${CKPT_SAVE_DIR} \
    | tee logs/finetune_qwen15_14b.log
sources/images/qwen15/qwen1.5_14b_inference.png (new binary file, 24 KiB, not shown)