!1268 Add Yi-34B model adaptation

Merge pull request !1268 from xiongliangcheng/master
This commit is contained in:
xiongliangcheng 2024-05-14 03:54:20 +00:00 committed by i-robot
parent 1d6d2d354d
commit 6ac3959ae0
14 changed files with 1249 additions and 1 deletions

View File

@ -1,4 +1,4 @@
<p align="center"> <img src="sources/images/logo.png" height="90px" width="400px"> </p>
<p align="center"> <img src="sources/images/logo.png" height="90px" width="400px"> </p>
<p align="center">
<a href="https://gitee.com/ascend/AscendSpeed/blob/master/LICENSE">
@ -47,6 +47,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<th>推理</th>
<th>评估</th>
<th>数据集</th>
<th>贡献方</th>
</tr>
</thead>
<tbody>
@ -57,6 +58,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/aquila/generate_aquila_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/aquila/evaluate_aquila_7b_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"> <a href="examples/baichuan/README.md">Baichuan</a> </td>
@ -65,6 +67,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan/generate_baichuan_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan/evaluate_baichuan_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 13B </td>
@ -72,6 +75,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan/generate_baichuan_13b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan/evaluate_baichuan_13B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"> <a href="examples/baichuan2/README.md">Baichuan2</a> </td>
@ -80,6 +84,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan2/generate_baichuan2_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan2/evaluate_baichuan2_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 13B </td>
@ -87,6 +92,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan2/generate_baichuan2_13b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan2/evaluate_baichuan2_13B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
@ -95,6 +101,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/bloom/generate_bloom_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/bloom/evaluate_bloom_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 176B </td>
@ -102,6 +109,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/bloom/generate_bloom_176b_ptd.sh">对话</a> </td>
<td> <a href="examples/bloom/evaluate_bloom_176b_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/intern/README.md">InternLM</a></td>
@ -110,6 +118,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/intern/generate_lnternlm_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/intern/evaluate_internlm_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>65B</td>
@ -117,6 +126,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> -- </td>
<td> -- </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="4"><a href="examples/llama/README.md">LLaMA</a></td>
@ -125,6 +135,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_7b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>13B</td>
@ -132,6 +143,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_13b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_13B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>33B</td>
@ -139,6 +151,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_33b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_33B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td > 65B </td>
@ -146,6 +159,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_65b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_65B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="4"><a href="examples/llama2/README.md">LLaMA2</a></td>
@ -154,6 +168,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_7b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_7B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>13B</td>
@ -161,6 +176,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_13b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_13B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 34B </td>
@ -168,6 +184,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_34B_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_34B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td > 70B </td>
@ -175,6 +192,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_70b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_70B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/llama3/README.md">LLaMA3</a></td>
@ -183,6 +201,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama3/generate_llama3_8b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama3/evaluate_llama3_8b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>70B</td>
@ -190,6 +209,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama3/generate_llama3_70b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama3/evaluate_llama3_70b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="3"><a href="examples/qwen/README.md">Qwen</a></td>
@ -198,6 +218,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/generate_qwen_7b_ptd.sh">对话 </a></td>
<td> <a href="examples/qwen/evaluate_qwen_7b_ptd.sh">评估 </a></td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>14B</td>
@ -205,6 +226,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/generate_qwen_14b_ptd.sh">对话 </a> </td>
<td> <a href="examples/qwen/evaluate_qwen_14b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>72B</td>
@ -212,6 +234,16 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/generate_qwen_72b_ptd.sh">对话 </a> </td>
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
<td> -- </td>
<td> <a href="examples/yi/generate_yi_34b_ptd.sh">对话 </a> </td>
<td> <a href="examples/yi/evaluate_yi_34b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【社区贡献模型】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
@ -220,6 +252,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/mixtral/generate_mixtral_8x7b_ptd.sh">对话 </a> </td>
<td> <a href="examples/mixtral/evaluate_mixtral_8x7b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
</tbody>
</table>
@ -454,6 +487,15 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> 345 </td>
<td> <a href="examples/qwen/pretrain_qwen_72b_ptd.sh">训练</a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
<td>2x8</td>
<td>BF16 </td>
<td> 809 </td>
<td> 730 </td>
<td> <a href="examples/yi/pretrain_yi_34b_ptd_16p.sh">训练</a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
<td>8x7B</td>

View File

@ -213,6 +213,14 @@ Current ModelLink supports pre-training and fine-tuning for the following models
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh">evaluation</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
<td>34B</td>
<td> -- </td>
<td> <a href="examples/yi/generate_yi_34b_ptd.sh">inference</a> </td>
<td> <a href="examples/yi/evaluate_yi_34b_ptd.sh">evaluation</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
<td>8x7B</td>
@ -265,6 +273,7 @@ For the supported models listed above, we provide training scripts and readme in
<th>Performance </th>
<th>Reference Performance </th>
<th>Scripts</th>
<th>Contributor</th>
</tr>
</thead>
<tbody>
@ -276,6 +285,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2849 </td>
<td> 2874 </td>
<td> <a href="examples/aquila/pretrain_aquila_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/baichuan/README.md">Baichuan</a></td>
@ -285,6 +295,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2685 </td>
<td> 2036 </td>
<td> <a href="examples/baichuan/pretrain_baichuan_ptd_7B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -293,6 +304,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1213 </td>
<td> 862 </td>
<td> <a href="examples/baichuan/pretrain_baichuan_ptd_13B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/baichuan2/README.md">Baichuan2</a></td>
@ -302,6 +314,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2664 </td>
<td> 3969 </td>
<td> <a href="examples/baichuan2/pretrain_baichuan2_ptd_7B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -310,6 +323,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1668 </td>
<td> 2062 </td>
<td> <a href="examples/baichuan2/pretrain_baichuan2_ptd_13B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
@ -319,6 +333,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2034 </td>
<td> 2525 </td>
<td> <a href="examples/bloom/pretrain_bloom_ptd_7B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td >176B</td>
@ -327,6 +342,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 100 </td>
<td> 107 </td>
<td> <a href="examples/bloom/pretrain_bloom_176b.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/intern/README.md">InternLM</a></td>
@ -336,6 +352,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2776 </td>
<td> 2854 </td>
<td> <a href="examples/intern/pretrain_internlm_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td >65B</td>
@ -344,6 +361,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 341 </td>
<td> 414 </td>
<td> <a href="examples/intern/pretrain_internlm_65b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="5"><a href="examples/llama/README.md">LLaMA</a></td>
@ -353,6 +371,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 3600 </td>
<td> 3804 </td>
<td> <a href="examples/llama/pretrain_llama_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -361,6 +380,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1895 </td>
<td> 2012 </td>
<td> <a href="examples/llama/pretrain_llama_13b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>33B</td>
@ -369,6 +389,7 @@ For the supported models listed above, we provide training scripts and readme in
<td>621</td>
<td>776</td>
<td><a href="examples/llama/pretrain_llama_33B_ptd_32p.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2">65B</td>
@ -379,6 +400,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 348 </td>
<td> 426 </td>
<td> <a href="examples/llama/pretrain_llama_65b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="4"><a href="examples/llama2/README.md">LLaMA2</a></td>
@ -388,6 +410,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 4200 </td>
<td> 3850 </td>
<td> <a href="examples/llama2/pretrain_llama2_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -396,6 +419,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1990 </td>
<td> 1920 </td>
<td> <a href="examples/llama2/pretrain_llama2_13B_ptd_8p.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>34B</td>
@ -404,6 +428,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 690 </td>
<td> 796 </td>
<td> <a href="examples/llama2/pretrain_llama2_34B_ptd_16p.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>70B</td>
@ -412,6 +437,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 350 </td>
<td> 339 </td>
<td> <a href="examples/llama2/pretrain_llama2_70b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/llama3/README.md">LLaMA3</a></td>
@ -421,6 +447,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2483 </td>
<td> 2674 </td>
<td> <a href="examples/llama3/pretrain_llama3_8b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>70B</td>
@ -438,6 +465,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2499 </td>
<td> 2867 </td>
<td> <a href="examples/qwen/pretrain_qwen_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>14B</td>
@ -446,6 +474,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1560 </td>
<td> 1578 </td>
<td> <a href="examples/qwen/pretrain_qwen_14b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>72B</td>
@ -454,6 +483,17 @@ For the supported models listed above, we provide training scripts and readme in
<td> 285 </td>
<td> 345 </td>
<td> <a href="examples/qwen/pretrain_qwen_72b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
<td>34B</td>
<td>2x8</td>
<td>BF16 </td>
<td> 809 </td>
<td> 730 </td>
<td> <a href="examples/yi/pretrain_yi_34b_ptd_16p.sh">train</a> </td>
<td> 【Model contributed by Community】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
@ -463,6 +503,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1054 </td>
<td> 1139 </td>
<td> <a href="examples/mixtral/pretrain_mixtral_8x7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
</tbody>
</table>

293 examples/yi/README.md Normal file
View File

@ -0,0 +1,293 @@
# Yi $\color{black}{\bf\tiny{【社区贡献模型】}}$
<p align="left">
<b>简体中文</b> |
<b><a href="README_en.md">English</a> </b>
</p>
# 目录
- [Yi-34B](#yi-34b)
- [训练](#训练)
- [脚本](#脚本)
- [性能](#性能)
- [吞吐](#吞吐)
- [推理](#推理)
- [评估](#评估)
# Yi-34B
## 训练
Yi-34B 训练的硬件配置如下:
| 硬件 | 配置 |
|:---:|:---------------:|
| NPU | 16 x Ascend NPUs |
### 脚本
1. 克隆仓库到本地服务器
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. 搭建环境
```bash
# 1).python3.8
conda create -n test python=3.8
conda activate test
# 2).安装 torch 和 torch_npu
pip install torch-2.2.0-cp38-cp38m-linux_aarch64.whl
pip install torch_npu-2.2.0.XXX-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# 3).修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 4).安装加速库
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# 5).安装其余依赖库
pip install -r requirements.txt
```
3. (可选的)准备预训练权重
从 [huggingface](https://huggingface.co/01-ai/Yi-34B/tree/main) 下载预训练权重
```shell
mkdir ./model_from_hf/Yi-34B/
cd ./model_from_hf/Yi-34B/
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/generation_config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00001-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00002-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00003-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00004-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00005-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00006-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00007-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model.bin.index.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.model
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer_config.json
cd ../../
```
4. 权重转换
4.1 将 Yi-34B 模型权重从 huggingface 格式转换为 megatron 格式
***该场景一般用于使能开源的HuggingFace模型在Megatron上进行训练***
```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 2 \
--load-dir ./model_from_hf/Yi-34B/ \
--save-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--tokenizer-model ./model_from_hf/Yi-34B/tokenizer.model \
--params-dtype bf16
```
如果为单机8卡推理或者评估任务,将`--target-pipeline-parallel-size`值设为`1`,并将`--save-dir`值中的`pp2`改为`pp1`。
4.2 将任意并行切分策略的 Megatron 权重转换为 HuggingFace 权重格式
***该场景一般用于将训练好的megatron模型重新转回HuggingFace格式***
```shell
# 请按照您的真实环境修改 set_env.sh 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \
--load-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/Yi-34B/  # <-- 需要填入原始HF模型路径,新权重会存于 ./model_from_hf/Yi-34B/mg2hg/
```
5. 预训练
5.1 准备数据集
下载 Yi-34B [数据集](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
cd dataset/
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
mkdir ./dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
5.2 预训练
配置 Yi-34B 训练脚本: examples/yi/pretrain_yi_34b_ptd_16p.sh
```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./dataset/Yi-34B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Yi-34B/tokenizer.model"
CKPT_LOAD_DIR="./model_weights/Yi-34B-Base-v0.1-tp8-pp2/"
```
启动 Yi-34B 训练脚本: examples/yi/pretrain_yi_34b_ptd_16p.sh
```bash
bash examples/yi/pretrain_yi_34b_ptd_16p.sh
```
**注意**:如果使用多机训练,需要设置多机数据共享,非主节点通过数据共享读取主节点数据。或者,直接将主节点生成的数据复制到非主节点。
6. 微调
6.1 准备微调数据集
下载微调数据集 [这里](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# 下载数据集
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# 处理微调数据集
mkdir ./finetune_dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./finetune_dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 全参微调
全参微调的配置脚本基本和预训练脚本一致。*区别是数据集,以及增加训练参数 `--is-instruction-dataset`。*
增加微调参数 `--finetune`,使微调从第一步开始。修改 tokenizer 参数:去掉 `--tokenizer-type Llama2Tokenizer` 和 `--tokenizer-model ${TOKENIZER_MODEL}`,更改为以下参数:
```bash
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./finetune_dataset/Yi-34B/alpaca"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
CKPT_LOAD_DIR="./model_weights/Yi-34B-Base-v0.1-tp8-pp2/"
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
### 性能
#### 吞吐
Yi-34B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:
| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/s) | token吞吐 (tokens/p/s) | 单步迭代时间 (s/step) |
|:----:|:------------:|:----:|:------------------:|:--------------------:|:---------------:|
| NPUs | Yi-34B | - | 3.16| 809| 324 |
| 参考 | Yi-34B | - | 2.85 | 732 | 359 |
## 推理
配置Yi-34B的推理脚本: examples/yi/generate_yi_34b_ptd.sh
```bash
# 根据您自己的 ascend-toolkit 路径执行set_env.sh
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 修改模型权重路径和词表路径
CHECKPOINT="./model_weights/Yi-34B-Base-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
```
然后可直接启动generate_yi_34b_ptd.sh
```bash
bash examples/yi/generate_yi_34b_ptd.sh
```
推理的示例如下:
![Inference](../../sources/images/yi/yi-34b-generate.png)
## 评估
我们使用MMLU基准来评估我们的模型。基准[下载](https://huggingface.co/datasets/cais/mmlu).
```shell
# 配置原始权重与词表的路径
CHECKPOINT="./model_weights/Yi-34B-Base-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
# 配置任务以及数据路径
DATA_PATH="./mmlu/test"
TASK="mmlu"
```
```shell
bash ./examples/yi/evaluate_yi_34b_ptd.sh
```
<table>
<thead>
<tr>
<th>任务</th>
<th>模型</th>
<th>昇腾值</th>
<th>社区值</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/datasets/cais/mmlu">MMLU</a></td>
<th>Yi 34B</th>
<td>75.8</td>
<td><a href="https://hub.opencompass.org.cn/dataset-detail/MMLU">76.3</a></td>
</tr>
</tbody>
</table>

295 examples/yi/README_en.md Normal file
View File

@ -0,0 +1,295 @@
# Yi $\color{black}{\rm\tiny{【Model}}$ $\color{black}{\rm\tiny{contributed}}$ $\color{black}{\rm\tiny{by}}$ $\color{black}{\rm\tiny{community】}}$
<p align="left">
<b><a href="README.md">简体中文</a></b> |
<b>English</b>
</p>
# Contents
- [Yi-34B](#yi-34b)
- [Training](#training)
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Inference](#inference)
- [Evaluation](#evaluation)
# Yi-34B
## Training
Here's a hardware summary of pre-training Yi-34B:
| Hardware | Value |
| :------: | :---------------------------------------------: |
| NPU | 16 x Ascend NPUs |
### Script
1. Clone the repository to your local server:
```bash
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Build environment
```bash
# 1).python3.8
conda create -n test python=3.8
conda activate test
# 2).install torch and torch_npu
pip install torch-2.2.0-cp38-cp38m-linux_aarch64.whl
pip install torch_npu-2.2.0.XXX-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# 3).modify the path according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 4).install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# 5).install other packages
pip install -r requirements.txt
```
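Before continuing, it can help to confirm that the NPU stack is usable from Python. This is only an optional sanity check (not part of the original instructions) and assumes `torch_npu` exposes `torch_npu.npu.is_available()` as in the Ascend PyTorch releases installed above:
```shell
# optional quick check that torch and torch_npu were installed correctly (assumed API)
python -c "import torch, torch_npu; print(torch.__version__, torch_npu.npu.is_available())"
```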
3. Prepare pretrained weights
Download the Yi-34B checkpoint from [here](https://huggingface.co/01-ai/Yi-34B/tree/main)
```shell
mkdir ./model_from_hf/Yi-34B/
cd ./model_from_hf/Yi-34B/
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/generation_config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00001-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00002-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00003-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00004-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00005-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00006-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00007-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model.bin.index.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.model
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer_config.json
cd ../../
```
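If `git-lfs` is available, the same weights can equivalently be fetched with a single clone; this is only an alternative sketch, not part of the original instructions:
```shell
# alternative: clone the whole Yi-34B repository (assumes git-lfs is installed)
git lfs install
git clone https://huggingface.co/01-ai/Yi-34B ./model_from_hf/Yi-34B
```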
4. Weights convert
4.1 Convert the Yi-34B model weights from HuggingFace format to Megatron format.
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```shell
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 2 \
--load-dir ./model_from_hf/Yi-34B/ \
--save-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--tokenizer-model ./model_from_hf/Yi-34B/tokenizer.model \
--params-dtype bf16
```
For single-node (8-NPU) inference or evaluation tasks, set the `--target-pipeline-parallel-size` value to `1` and change `pp2` to `pp1` in the `--save-dir` value.
4.2 Convert Megatron weights with any parallel slicing strategy into HuggingFace weights
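Concretely, the step-4.1 command adjusted for that case might look like the following sketch (only the pipeline-parallel size and the save path change):
```shell
# hypothetical tp8/pp1 variant of the step-4.1 conversion, for single-node inference/evaluation
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader llama2_hf \
    --saver megatron \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 1 \
    --load-dir ./model_from_hf/Yi-34B/ \
    --save-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/Yi-34B/tokenizer.model \
    --params-dtype bf16
```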
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```shell
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \
--load-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/Yi-34B/ # <-- Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Yi-34B/mg2hg/
```
5. Pre-training
5.1 Prepare dataset
Download the Yi-34B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
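After preprocessing, the tokenized dataset should appear under `./dataset/Yi-34B/` with the `_text_document` suffix that the training script's `DATA_PATH` refers to; roughly (file names assumed from the `--output-prefix` above):
```shell
ls ./dataset/Yi-34B/
# alpaca_text_document.bin  alpaca_text_document.idx   (assumed output of preprocess_data.py)
```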
5.2 Pre-training
Config Yi-34B pre-training script : examples/yi/pretrain_yi_34b_ptd_16p.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./dataset/Yi-34B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Yi-34B/tokenizer.model"
CKPT_LOAD_DIR="./model_weights/Yi-34B-v0.1-tp8-pp2/"
```
Launch Yi-34B pre-training script: examples/yi/pretrain_yi_34b_ptd_16p.sh
```shell
bash examples/yi/pretrain_yi_34b_ptd_16p.sh
```
**Note**: For multi-machine training, set up data sharing so that non-primary nodes can read the data generated on the primary node, or simply copy that data from the primary node to the other nodes.
6. Fine-tuning
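As a sketch of what the two-node launch involves (the host IP placeholder is not taken from the original document): the script already sets `NNODES=2`, so each node runs the same script after its `MASTER_ADDR` and `NODE_RANK` variables have been edited.
```shell
# hypothetical two-node setup: edit these variables inside pretrain_yi_34b_ptd_16p.sh on each node
#   primary node:  MASTER_ADDR=<primary-node-ip>  NODE_RANK=0
#   second node:   MASTER_ADDR=<primary-node-ip>  NODE_RANK=1
# then launch the same script on both nodes
bash examples/yi/pretrain_yi_34b_ptd_16p.sh
```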
6.1 Prepare fine-tuning dataset
Download the fine-tuning datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./finetune_dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./finetune_dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 Full-Parameter Fine-Tuning of Yi-34B
The configuration script for full-parameter fine-tuning is basically the same as pretrain_yi_34b_ptd_16p.sh. *The differences are the dataset and the additional training parameter `--is-instruction-dataset`.*
Add the fine-tuning parameter `--finetune` so that fine-tuning starts from the first step. Remove `--tokenizer-type Llama2Tokenizer` and `--tokenizer-model ${TOKENIZER_MODEL}` from the tokenizer arguments and use the following parameters instead:
```bash
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./finetune_dataset/Yi-34B/alpaca"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
CKPT_LOAD_DIR="./model_weights/Yi-34B-Base-v0.1-tp8-pp2/"
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
### Performance
#### Machine performance
The performance of Yi-34B on **Ascend NPUs** compared with the **Reference** device:
| Device | Model | total Iterations | throughput rate (samples/s) | throughput rate (tokens/s/p) | single-step time (s/step) |
|:----:|:---------:|:----:|:---------------------:|:---------------:|:----------------:|
| NPUs | Yi-34B | - | 3.16| 809| 324 |
| Reference | Yi-34B | - | 2.85 | 732 | 359 |
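The token throughput in the table follows from the sample throughput, given the 4096-token sequence length and the 16 NPUs used by the training script; for the NPU row, for example:

$$\frac{3.16\ \text{samples/s} \times 4096\ \text{tokens/sample}}{16\ \text{NPUs}} \approx 809\ \text{tokens/s/p}$$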
## Inference
Config Yi-34B inference script: examples/yi/generate_yi_34b_ptd.sh
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify script model path and tokenizer path
CHECKPOINT="./model_weights/Yi-34B-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
```
Launch Yi-34B inference script: examples/yi/generate_yi_34b_ptd.sh
```bash
bash examples/yi/generate_yi_34b_ptd.sh
```
Some inference samples are as follows:
![Inference](../../sources/images/yi/yi-34b-generate.png)
## Evaluation
We use the MMLU benchmark to evaluate our model. Benchmark [Download](https://huggingface.co/datasets/cais/mmlu).
```shell
# config origin weight and vocab file path
CHECKPOINT="./model_weights/Yi-34B-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
# config tasks and dataset path
DATA_PATH="./mmlu/"
TASK="mmlu"
```
```shell
bash ./examples/yi/evaluate_yi_34b_ptd.sh
```
<table>
<thead>
<tr>
<th>Task</th>
<th>Model</th>
<th>NPU</th>
<th>OpenSource</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/datasets/cais/mmlu">MMLU</a></td>
<th>Yi 34B</th>
<td>75.8</td>
<td><a href="https://hub.opencompass.org.cn/dataset-detail/MMLU">76.3</a></td>
</tr>
</tbody>
</table>

View File

@ -0,0 +1,57 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
CHECKPOINT="Your ckpt file path"
TOKENIZER_PATH="Your tokenizer path"
DATA_PATH="./mmlu/test"
TASK="mmlu"
# Different tasks need different max_new_tokens values; please follow the instructions in the README.
torchrun $DISTRIBUTED_ARGS evaluation.py \
--task-data-path $DATA_PATH \
--task $TASK \
--seq-length 4096 \
--max-new-tokens 1 \
--max-position-embeddings 4096 \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 60 \
--hidden-size 7168 \
--ffn-hidden-size 20480 \
--num-attention-heads 56 \
--disable-bias-linear \
--swiglu \
--position-embedding-type rope \
--load ${CHECKPOINT} \
--normalization RMSNorm \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
--fp16 \
--micro-batch-size 1 \
--use-fused-rmsnorm \
--exit-on-missing-checkpoint \
--no-load-rng \
--no-load-optim \
--untie-embeddings-and-output-weights \
--no-masked-softmax-fusion \
--make-vocab-size-divisible-by 1 \
--group-query-attention \
--num-query-groups 8 \
--rotary-base 5000000 \
--seed 42 | tee logs/evaluation_yi_34b_${TASK}.log

View File

@ -0,0 +1,57 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations
CHECKPOINT="your model directory path"
TOKENIZER_PATH="your tokenizer directory path"
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS inference.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 60 \
--hidden-size 7168 \
--ffn-hidden-size 20480 \
--position-embedding-type rope \
--seq-length 4096 \
--max-new-tokens 256 \
--micro-batch-size 1 \
--global-batch-size 16 \
--num-attention-heads 56 \
--max-position-embeddings 4096 \
--swiglu \
--load "${CHECKPOINT}" \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path "${TOKENIZER_PATH}" \
--tokenizer-not-use-fast \
--fp16 \
--normalization RMSNorm \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-softmax-in-fp32 \
--no-load-optim \
--no-load-rng \
--no-masked-softmax-fusion \
--no-gradient-accumulation-fusion \
--exit-on-missing-checkpoint \
--make-vocab-size-divisible-by 1 \
--vocab-size 64000 \
--group-query-attention \
--num-query-groups 8 \
--rotary-base 5000000 | tee logs/generate_yi_34b.log

View File

@ -0,0 +1,97 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=2
NODE_RANK=0
WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"
CKPT_LOAD_DIR="your model ckpt path"
TP=8
PP=2
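# 8-way tensor parallelism within a node, 2-way pipeline parallelism across the two nodes (2 x 8 NPUs in total)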
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 60 \
--hidden-size 7168 \
--ffn-hidden-size 20480 \
--num-attention-heads 56 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 2 \
--global-batch-size 1024 \
--make-vocab-size-divisible-by 1 \
--lr 1.0e-7 \
--train-iters 2000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rmsnorm \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--swiglu \
--use-mc2 \
--use-flash-attn \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.0e-8 \
--weight-decay 1e-2 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.999 \
--initial-loss-scale 8188.0 \
--no-gradient-accumulation-fusion \
--load ${CKPT_LOAD_DIR} \
--no-load-optim \
--no-load-rng \
--rotary-base 5000000 \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CKPT_SAVE_DIR \
| tee logs/train_yi_34b.log

Binary file not shown. (new image file, 92 KiB)

View File

@ -0,0 +1,69 @@
{
"CONVERT_CKPT_PARAM": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--load-dir", "/home/dataset/yi-34B-hf",
"--save-dir", "/home/dataset/yi-34B-mt-t8p1",
"--target-tensor-parallel-size", "8",
"--target-pipeline-parallel-size", "1",
"--tokenizer-model", "None"
],
"NETWORK_SIZE": [
"--num-layers", "60",
"--hidden-size", "7168",
"--ffn-hidden-size", "20480",
"--num-attention-heads", "56",
"--max-position-embeddings", "4096",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "1",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--load", "/home/dataset/yi-34B-mt-t8p1"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/yi-34B-hf"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint",
"--attention-softmax-in-fp32"
],
"EVALUATION_PARAM": [
"--tokenizer-not-use-fast",
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test",
"--task", "mmlu",
"--max-new-tokens", "1",
"--exit-on-missing-checkpoint"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "16",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--seed", "42",
"--use-fused-rmsnorm",
"--group-query-attention",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "4096",
"--num-query-groups", "8",
"--vocab-size", "64000",
"--rotary-base", "5000000"
]
}

View File

@ -0,0 +1,61 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch
from ascendspeed import megatron_adaptor
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exist(self):
"""
Test if the files in the `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.bin"))
self.assertEqual(len(bin_file), 7)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "pytorch_model.bin.index.json")))
def test_convert_weights_form_huggingface(self):
"""
Test whether the weight to be converted as we want in `--save-dir`. We will check the model layer name,
including embedding, final_norm, output and encoder. In the encoder, there will be some different layers
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model']  # extract common content
# embedding, encoder and output_layer are the three top-level parts
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
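# with --target-tensor-parallel-size 8, the 64000-token vocabulary is split into 64000 / 8 = 8000 rows per rank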
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([8000, 7168]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([7168]))
# the encoder has a shared final_norm, and each transformer layer contributes the following six weights
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(len(weight_common_content['encoder']) / 6, 60)
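# per tensor-parallel rank: 56 / 8 = 7 query heads plus 1 key and 1 value head (8 query groups), each of dim 7168 / 56 = 128, so (7 + 2) * 128 = 1152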
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1152, 7168]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([7168, 896]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([5120, 7168]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([7168, 2560]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([7168]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([7168]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([8000, 7168]))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,95 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from ascendspeed import megatron_adaptor
from common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
config.evaluation_param + config.tokenizer_param
from megatron.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron import get_args
self.args = get_args()
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
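# MMLU test files are named "<subject>_test.csv"; strip the 9-character "_test.csv" suffix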
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df[0:10]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0].strip()
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.803) < 0.01)

View File

@ -0,0 +1,100 @@
import sys
import os
import torch
import torch_npu
from common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
from megatron.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron import get_args
self.args = get_args()
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["春夏秋冬,四个季节"]
output = model.generate(instruction, detokenize=False)
expected_output1 = [101, 6001, 15831, 5074, 6435, 35308, 101, 31179, 44445, 60820,
11098, 60721, 8203, 61293, 60583, 35308, 102, 18024, 101, 59647,
60721, 60690, 60452, 4452, 59706, 60207, 24212, 1075, 61759, 60942,
63958, 60585, 59599, 21639, 101, 24212, 1075, 61287, 62566, 60632,
63011, 59599, 37835, 60408, 17664, 102, 60566, 9299, 49085, 101]
expected_output2 = [101, 6001, 15831, 5074, 2045, 7753, 101, 5074, 2045, 8511,
102, 144, 18417, 101, 24018, 25592, 101, 59722, 60575, 59823,
39464, 60630, 59676, 59936, 59670, 101, 55550, 59688, 60686, 59801,
7292, 101, 60319, 60502, 60687, 61529, 101, 59722, 61418, 59632,
61441, 59936, 534, 448, 494, 534, 448, 494, 534, 455]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim1 = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
cos_sim2 = similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
cos_sim = torch.max(cos_sim1, cos_sim2)
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "北京奥运会"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [39047, 59654, 101, 79, 77, 77, 85, 59867, 10536, 60397,
536, 493, 487, 59732, 60516, 101, 62731, 62131, 59757, 59637,
59635, 60382, 59689, 35444, 59670, 101, 59630, 61004, 60159, 60475,
59638, 101, 6919, 59678, 2349, 11923, 17463, 60243, 60034, 59652,
22740, 59599, 9034, 102, 144, 79, 77, 77, 85, 59867]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)

View File

@ -0,0 +1,35 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
convert_ckpt_param,
network_size,
tokenizer_param,
distributed_param,
inference_param,
evaluation_param,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
network_size = config_file["NETWORK_SIZE"]
tokenizer_param = config_file["TOKENIZER_PARAM"]
distributed_param = config_file["DISTRIBUTED_PARAM"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -0,0 +1,6 @@
# Provide uniform access for pipeline.
python tests/pipeline/yi-34B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/yi-34B/test_generation.py
pytest -s tests/pipeline/yi-34B/test_evaluation.py