!1291 添加Qwen1.5-14B适配

Merge pull request !1291 from 商元义/14B
This commit is contained in:
商元义 2024-05-23 02:57:38 +00:00 committed by i-robot
parent 0ddad3f9c8
commit 19d3b157ff
9 changed files with 978 additions and 52 deletions

View File

@ -305,6 +305,27 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh"> eval </a> </td>
<td> 【昇腾贡献】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td rowspan="1"> 【社区贡献】 </td>
</tr>
<tr>
<td>14B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_14b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_14b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_14b_ptd.sh"> eval </a> </td>
<td> 【社区贡献】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
@ -337,17 +358,6 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> -- </td>
<td> <a href="examples/mistral/evaluate_mistral_7b_ptd.sh"> eval </a> </td>
<td>【昇腾贡献】</td>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td> 【社区贡献】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>
@ -587,6 +597,21 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> 285 </td>
<td> 345 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2862 </td>
<td> 2621 </td>
</tr>
<tr>
<td> 14B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 1717 </td>
<td> 1702 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
@ -611,14 +636,6 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> 2806 </td>
<td> 2734 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2548 </td>
<td> 2578 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>

View File

@ -304,6 +304,27 @@ Current ModelLink supports pre-training and fine-tuning for the following models
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh"> eval </a> </td>
<td> 【Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td rowspan="1"> 【Community】 </td>
</tr>
<tr>
<td>14B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_14b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_14b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_14b_ptd.sh"> eval </a> </td>
<td> 【Community】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
@ -336,17 +357,6 @@ Current ModelLink supports pre-training and fine-tuning for the following models
<td> -- </td>
<td> <a href="examples/mistral/evaluate_mistral_7b_ptd.sh"> eval </a> </td>
<td>【Ascend】</td>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td> 【Community】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>
@ -586,6 +596,21 @@ For the supported models listed above, we provide training scripts and readme in
<td> 285 </td>
<td> 345 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2862 </td>
<td> 2621 </td>
</tr>
<tr>
<td> 14B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 1717 </td>
<td> 1702 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
<td>34B</td>
@ -610,14 +635,6 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2806 </td>
<td> 2734 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2548 </td>
<td> 2578 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>

View File

@ -15,6 +15,14 @@
- [推理](#推理)
- [评估](#评估)
- [Qwen1.5-14B](#qwen15-14b)
- [训练](#训练)
- [脚本](#脚本)
- [性能](#性能)
- [吞吐](#吞吐)
- [推理](#推理)
- [评估](#评估)
# Qwen1.5-7B
## 训练
@ -102,9 +110,10 @@ Qwen1.5-7B 训练的硬件配置:
--target-pipeline-parallel-size 1 \
--make-vocab-size-divisible-by 16 \
--load-dir ./model_from_hf/Qwen1.5-7B/ \
--save-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-7B/tokenizer.json \
--add-qkv-bias
--add-qkv-bias \
--param-dtype bf16
```
任意并行切分策略的Megatron权重 格式转化为 HuggingFace权重
@ -122,7 +131,7 @@ Qwen1.5-7B 训练的硬件配置:
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-7B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen-7B/mg2hg/
--save-dir ./model_from_hf/Qwen1.5-7B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen1.5-7B/mg2hg/
```
5. 预训练
@ -199,8 +208,7 @@ Qwen1.5-7B 训练的硬件配置:
6.2 全参微调
全参微调的配置脚本基本和预训练脚本一致。
*区别是数据集,以及增加训练参数`--is-instruction-dataset`,增加微调参数`--finetune`,增加预训练权重加载参数`--load`
,使微调从第一步开始,修改tokenizer参数,去掉`--tokenizer-type Llama2Tokenizer`
和`--tokenizer-model ${TOKENIZER_MODEL}`。*
,使微调从第一步开始。*
修改如下:
@ -213,8 +221,6 @@ Qwen1.5-7B 训练的硬件配置:
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
@ -286,4 +292,274 @@ bash examples/qwen15/evaluate_qwen15_7b_ptd.sh
| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
|:-----:|:----:|:-----:|:-------------------------------------------------------:|:------:|
| MMLU | 57 | 14042 | [61.0](https://qwenlm.github.io/zh/blog/qwen1.5) | 60.3 |
# Qwen1.5-14B
## 训练
Qwen1.5-14B 训练的硬件配置:
| 硬件 | 配置 |
|:---:|:---------------:|
| NPU | 8 x Ascend NPUs |
### 脚本
1. 克隆仓库到本地服务器
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. 搭建环境
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# 安装 torch 和 torch_npu
pip install torch-2.2.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.2.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# 安装加速库
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip install -e .
cd ..
# 安装其余依赖库
pip install -r requirements.txt
```
**注意**:transformers 版本需为 4.37.0 及以上。
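下面是检查并按需升级 transformers 版本的示例命令(仅为示意,镜像源请按实际环境调整):
```bash
# 查看当前已安装的 transformers 版本
pip show transformers | grep Version
# 若低于 4.37.0,可升级
pip install "transformers>=4.37.0"
```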
3. 下载 Qwen1.5-14B 的 [预训练权重和词表](https://huggingface.co/Qwen/Qwen1.5-14B/tree/main)
```bash
mkdir ./model_from_hf/Qwen1.5-14B/
cd ./model_from_hf/Qwen1.5-14B/
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/merges.txt
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model.safetensors.index.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/special_tokens_map.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/vocab.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model-00001-of-00008.safetensors
...
cd ../../
```
4. 权重转换
将权重从 huggingface 格式转化为 megatron 格式
***该场景一般用于使能开源的HuggingFace模型在Megatron上进行训练***
```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--make-vocab-size-divisible-by 16 \
--load-dir ./model_from_hf/Qwen1.5-14B/ \
--save-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-14B/tokenizer.json \
--add-qkv-bias \
--param-dtype bf16
```
任意并行切分策略的Megatron权重 格式转化为 HuggingFace权重
***该场景一般用于将训练好的megatron模型重新转回HuggingFace格式***
```bash
# 请按照您的真实环境修改 set_env.sh 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_qwen \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-14B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen1.5-14B/mg2hg/
```
5. 预训练
5.1 准备数据集
下载Qwen1.5-14B [数据集](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# 下载数据
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# 处理数据
mkdir ./dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B \
--output-prefix ./dataset/Qwen1.5-14B/alpaca \
--tokenizer-type PretrainedFromHF \
--seq-length 8192 \
--workers 4 \
--log-interval 1000
```
5.2 预训练
配置Qwen1.5-14B 预训练脚本: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
# 设置 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 根据实际情况配置词表、数据集、模型参数保存路径
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B"
TOKENIZER_MODEL="./model_from_hf/Qwen1.5-14B" #词表路径
DATA_PATH="./dataset/Qwen1.5-14B/alpaca_text_document" #数据集路径
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
```
多机运行增加参数 `--overlap-grad-reduce`
启动 Qwen1.5-14B 预训练脚本: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/pretrain_qwen15_14b_ptd.sh
```
**注意**:如果使用多机训练,且没有设置数据共享,需要在训练启动脚本中增加 `--no-shared-storage` 参数。设置此参数之后,将会根据分布式参数判断非主节点是否需要load数据,并检查相应缓存和生成数据。多机相关参数(`--overlap-grad-reduce`、`--no-shared-storage`)的配置示意见下文。
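下面给出一个双机场景的多机配置示意(主节点 IP 为示例值,机器数、端口及共享存储情况请按实际环境修改),对应上文 `--overlap-grad-reduce` 与 `--no-shared-storage` 两条说明:
```shell
# examples/qwen15/pretrain_qwen15_14b_ptd.sh 中的多机配置示意
NPUS_PER_NODE=8
MASTER_ADDR=192.168.0.1      # 主节点 IP(示例值)
MASTER_PORT=6000
NNODES=2                     # 参与训练的机器数
NODE_RANK=0                  # 主节点为 0,其余节点依次为 1、2 ...
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# 在 GPT_ARGS 末尾追加多机参数
GPT_ARGS="$GPT_ARGS --overlap-grad-reduce"
# 没有设置数据共享时再追加
GPT_ARGS="$GPT_ARGS --no-shared-storage"
```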
6. 微调
6.1 准备微调数据集
下载微调数据集 [这里](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# 下载数据集
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# 处理微调数据集
mkdir ./finetune_dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B/ \
--output-prefix ./finetune_dataset/Qwen1.5-14B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 全参微调
全参微调的配置脚本基本和预训练脚本一致。
*区别是数据集,以及增加训练参数`--is-instruction-dataset`,增加微调参数`--finetune`,增加预训练权重加载参数`--load`
,使微调从第一步开始。*
修改如下:
```bash
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
DATA_PATH="./finetune_dataset/Qwen1.5-14B/alpaca"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-not-use-fast \
```
启动微调脚本: examples/qwen15/tune_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/tune_qwen15_14b_ptd.sh
```
### 性能
#### 吞吐
Qwen1.5-14B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:
| 设备 | 模型 | tokens吞吐 (tokens/s/p) |
|:--------------:|:-----------:|:---------------------:|
| NPUs | Qwen1.5-14B | 1717.8 |
| 参考 | Qwen1.5-14B | 1702.2 |
## 推理
配置 Qwen1.5-14B 推理脚本: examples/qwen15/generate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 修改模型权重路径和词表路径
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B"
```
启动Qwen1.5-14B推理脚本
```bash
bash examples/qwen15/generate_qwen15_14b_ptd.sh
```
推理示例如下:
![Inference](../../sources/images/qwen15/qwen1.5_14b_inference.png)
## 评估
使用[CEval数据集](https://huggingface.co/datasets/ceval/ceval-exam)
和[MMLU数据集](https://huggingface.co/datasets/cais/mmlu)评估模型。
配置Qwen1.5-14B评估脚本: examples/qwen15/evaluate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 修改模型参数路径和词表路径
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/" #词表路径
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/" #模型路径
# 配置任务和数据集路径
DATA_PATH="./mmlu/data/test/" # ceval任务配置为 "./ceval/val/"
TASK="mmlu" # ceval任务配置为 "ceval"
```
启动评估
```bash
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```
| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
|:-----:|:----:|:-----:|:------------------------------------------------:|:------:|
| MMLU | 57 | 14042 | [67.6](https://qwenlm.github.io/zh/blog/qwen1.5) | 67.3 |

View File

@ -14,6 +14,14 @@
- [Inference](#Inference)
- [Evaluation](#Evaluation)
- [Qwen1.5-14B](#qwen15-14b)
- [Training](#training)
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Inference](#Inference)
- [Evaluation](#Evaluation)
# Qwen1.5-7B
## Training
@ -105,7 +113,8 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
--load-dir ./model_from_hf/Qwen1.5-7B/ \
--save-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-7B/tokenizer.json \
--add-qkv-bias
--add-qkv-bias \
--param-dtype bf16
```
Any Megatron weights with parallel slicing strategy --> HuggingFace weights
@ -123,7 +132,7 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-7B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen-7B/mg2hg/
--save-dir ./model_from_hf/Qwen1.5-7B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen1.5-7B/mg2hg/
```
5. Pre-training
@ -199,7 +208,7 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
The configuration script with the fine-tuning parameters is basically the same as the pre-training script.
*The difference is the dataset, plus the added training parameter `--is-instruction-dataset`, the fine-tuning parameter `--finetune`, and the pre-training weight loading parameter `--load` so that fine-tuning starts from the first step; also modify the tokenizer parameters: drop `--tokenizer-type Llama2Tokenizer` and `--tokenizer-model ${TOKENIZER_MODEL}`.*
*The difference is the dataset, plus the added training parameter `--is-instruction-dataset`, the fine-tuning parameter `--finetune`, and the pre-training weight loading parameter `--load` so that fine-tuning starts from the first step; also modify the tokenizer parameters.*
Modified as follows:
@ -212,8 +221,6 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
--load ${CKPT_PATH} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
@ -256,7 +263,6 @@ bash examples/qwen15/generate_qwen15_7b_ptd.sh
**Note**: If using multi-machine training, set up data sharing between machines so that non-master nodes can read the data generated by the master node, or directly copy that data from the master node to the non-master nodes.
Some inference samples are as follows:
![Inference](../../sources/images/qwen15/qwen1.5_7b_inference.png)
## Evaluation
@ -287,3 +293,277 @@ bash examples/qwen15/evaluate_qwen15_7b_ptd.sh
| Task | Subset | Question | OpenSource | NPU |
|:-----:|:------:|:--------:|:-------------------------------------------------------------:|:-------:|
| MMLU | 57 | 14042 | [61.0](https://qwenlm.github.io/zh/blog/qwen1.5) | 60.3 |
# Qwen1.5-14B
## Training
Here's a hardware summary of pre-training Qwen1.5-14B:
| Hardware | Value |
| :------: | :---------------------------------------------: |
| NPU | 8 x Ascend NPUs |
### Script
1. Clone the repository to your local server:
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Build environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.2.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.2.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip install -e .
cd ..
# install other packages
pip install -r requirements.txt
```
**Note**: transformers version 4.37.0 or higher is required.
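The following commands are one way to check and, if needed, upgrade the installed version (illustrative; adjust the package index to your environment):
```bash
# show the currently installed transformers version
pip show transformers | grep Version
# upgrade if it is below 4.37.0
pip install "transformers>=4.37.0"
```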
3. Prepare pretrained weights and tokenizer
Download the Qwen1.5-14B checkpoint from [here](https://huggingface.co/Qwen/Qwen1.5-14B/tree/main)
```bash
mkdir ./model_from_hf/Qwen1.5-14B/
cd ./model_from_hf/Qwen1.5-14B/
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/merges.txt
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model.safetensors.index.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/special_tokens_map.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/vocab.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model-00001-of-00008.safetensors
...
cd ../../
```
4. Weights convert
Convert weights from huggingface format to megatron format
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--make-vocab-size-divisible-by 16 \
--load-dir ./model_from_hf/Qwen1.5-14B/ \
--save-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-14B/tokenizer.json \
--add-qkv-bias \
--param-dtype bf16
```
Any Megatron weights with parallel slicing strategy --> HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```shell
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_qwen \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-14B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen1.5-14B/mg2hg/
```
5. Pre-training
5.1 prepare dataset
Download the Qwen1.5-14B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B \
--output-prefix ./dataset/Qwen1.5-14B/alpaca \
--tokenizer-type PretrainedFromHF \
--seq-length 8192 \
--workers 4 \
--log-interval 1000
```
5.2 pre-training
Config Qwen1.5-14B pre-training script: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
TOKENIZER_MODEL="./model_from_hf/Qwen1.5-14B" #tokenizer path
DATA_PATH="./dataset/Qwen1.5-14B/alpaca_text_document" #processed dataset
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
```
Multi-machine training requires the addition of parameter `--overlap-grad-reduce`.
Launch Qwen1.5-14B pre-training script: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/pretrain_qwen15_14b_ptd.sh
```
**Note**: If using multi-machine training without shared storage configured across the machines, add the parameter `--no-shared-storage`. With this parameter set, whether non-master nodes need to load data is determined from the distributed parameters, and the corresponding cache and generated data are checked. A configuration sketch covering both multi-machine notes is shown below.
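The sketch below assumes two machines and a placeholder master IP; adjust the values to your cluster, and add `--no-shared-storage` only when the machines do not share storage:
```shell
# multi-machine settings in examples/qwen15/pretrain_qwen15_14b_ptd.sh (sketch)
NPUS_PER_NODE=8
MASTER_ADDR=192.168.0.1      # master node IP (placeholder)
MASTER_PORT=6000
NNODES=2                     # number of machines
NODE_RANK=0                  # 0 on the master node, 1, 2, ... on the others
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# append the multi-machine flags to GPT_ARGS
GPT_ARGS="$GPT_ARGS --overlap-grad-reduce"
GPT_ARGS="$GPT_ARGS --no-shared-storage"   # only without shared storage
```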
6. fine-tuning
6.1 Prepare fine-tuning dataset
Download the Qwen1.5-14B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./finetune_dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B/ \
--output-prefix ./finetune_dataset/Qwen1.5-14B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 Full Parameters Fine-Tuning
The configuration script with the fine-tuning parameters is basically the same as the pre-training script.
*The difference is the dataset, plus the added training parameter `--is-instruction-dataset`, the fine-tuning parameter `--finetune`, and the pre-training weight loading parameter `--load` so that fine-tuning starts from the first step; also modify the tokenizer parameters.*
Modified as follows:
```bash
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
DATA_PATH="./finetune_dataset/Qwen1.5-14B/alpaca"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
Launch Qwen1.5-14B fine-tuning script: examples/qwen15/tune_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/tune_qwen15_14b_ptd.sh
```
### Performance
#### Machine performance
The performance of Qwen1.5-14B in **Ascend NPU** and **Reference**:
| Device | Model | throughput rate (tokens/s/p) |
|:---------:|:-----------:|:----------------------------:|
| NPUs | Qwen1.5-14B | 1717.8 |
| Reference | Qwen1.5-14B | 1702.2 |
## Inference
Config Qwen1.5-14B inference script: examples/qwen15/generate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify script model path and tokenizer path
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B"
```
Launch Qwen1.5-14B inference script: examples/qwen15/generate_qwen15_14b_ptd.sh
```bash
bash examples/qwen15/generate_qwen15_14b_ptd.sh
```
**Note**: If using multi-machine training, set up data sharing between machines so that non-master nodes can read the data generated by the master node, or directly copy that data from the master node to the non-master nodes.
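Without shared storage, one simple option is to copy the preprocessed data from the master node (a sketch; the host name `worker1` and the destination path are placeholders):
```bash
# run on the master node, once per non-master node
rsync -av ./dataset/Qwen1.5-14B/ worker1:/path/to/ModelLink/dataset/Qwen1.5-14B/
```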
Some inference samples are as follows:
![Inference](../../sources/images/qwen15/qwen1.5_14b_inference.png)
## Evaluation
We use the [CEval benchmark](https://huggingface.co/datasets/ceval/ceval-exam) and [MMLU benchmark](https://huggingface.co/datasets/cais/mmlu) to evaluate our model.
Config Qwen1.5-14B evaluation script: examples/qwen15/evaluate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Modify the model parameter path and vocabulary path
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/" # vocabulary path
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/" # parameter path
# Configure the task type and dataset path
DATA_PATH="./mmlu/data/test/" # "./ceval/val/" for ceval task
TASK="mmlu" # "ceval" for ceval task
```
Launch Qwen1.5-14B evaluation
```bash
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```
| Task | Subset | Question | OpenSource | NPU |
|:-----:|:------:|:--------:|:------------------------------------------------:|:----:|
| MMLU | 57 | 14042 | [67.6](https://qwenlm.github.io/zh/blog/qwen1.5) | 67.3 |

View File

@ -0,0 +1,68 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# please fill these path configurations
CHECKPOINT="your model ckpt path"
TOKENIZER_PATH="your tokenizer path"
DATA_PATH="your data path"
TASK="mmlu"
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
# Different tasks need different max_new_tokens values; please follow the instructions in the readme.
torchrun $DISTRIBUTED_ARGS evaluation.py \
--task-data-path $DATA_PATH \
--task ${TASK} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--seq-length 8192 \
--max-new-tokens 1 \
--max-position-embeddings 32768 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--disable-bias-linear \
--swiglu \
--position-embedding-type rope \
--load $CHECKPOINT \
--normalization RMSNorm \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
--micro-batch-size 1 \
--exit-on-missing-checkpoint \
--no-load-rng \
--no-load-optim \
--untie-embeddings-and-output-weights \
--add-qkv-bias \
--make-vocab-size-divisible-by 16 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--no-gradient-accumulation-fusion \
--attention-softmax-in-fp32 \
--seed 42 \
--bf16 \
| tee logs/eval_qwen15_14b_${TASK}.log

View File

@ -0,0 +1,64 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations
CHECKPOINT="your model ckpt path"
TOKENIZER_PATH="your tokenizer path"
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6010
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS inference.py \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--num-layers 40 \
--hidden-size 5120 \
--num-attention-heads 40 \
--ffn-hidden-size 13696 \
--max-position-embeddings 32768 \
--seq-length 8192 \
--make-vocab-size-divisible-by 1 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--untie-embeddings-and-output-weights \
--micro-batch-size 1 \
--swiglu \
--disable-bias-linear \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--load ${CHECKPOINT} \
--normalization RMSNorm \
--position-embedding-type rope \
--norm-epsilon 1e-6 \
--hidden-dropout 0 \
--attention-dropout 0 \
--tokenizer-not-use-fast \
--add-qkv-bias \
--max-new-tokens 256 \
--no-gradient-accumulation-fusion \
--exit-on-missing-checkpoint \
--attention-softmax-in-fp32 \
--seed 42 \
--bf16 \
| tee logs/generate_qwen15_14b.log

View File

@ -0,0 +1,101 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# please fill these path configurations
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"
TP=4
PP=2
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--load ${CKPT_LOAD_DIR} \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_MODEL} \
--seq-length 8192 \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--global-batch-size 256 \
--make-vocab-size-divisible-by 1 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--lr 1.25e-6 \
--train-iters 5000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--swiglu \
--use-flash-attn \
--use-fused-rmsnorm \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--use-fused-swiglu \
--use-mc2 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--add-qkv-bias \
--initial-loss-scale 4096 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--seed 42 \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 5000 \
--eval-interval 5000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save ${CKPT_SAVE_DIR} \
| tee logs/train_qwen15_14b.log

View File

@ -0,0 +1,103 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# please fill these path configurations
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_PATH="your tokenizer path"
TP=4
PP=2
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--seq-length 8192 \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--global-batch-size 256 \
--make-vocab-size-divisible-by 16 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--lr 1.25e-6 \
--train-iters 5000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rmsnorm \
--swiglu \
--use-flash-attn \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--use-fused-swiglu \
--use-mc2 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--add-qkv-bias \
--initial-loss-scale 4096 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--seed 42 \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 5000 \
--eval-interval 5000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save ${CKPT_SAVE_DIR} \
| tee logs/finetune_qwen15_14b.log

Binary file not shown.
