!1268 Add Yi-34B model adaptation

Merge pull request !1268 from xiongliangcheng/master
This commit is contained in:
xiongliangcheng 2024-05-14 03:54:20 +00:00 committed by i-robot
parent 1d6d2d354d
commit 6ac3959ae0
14 changed files with 1249 additions and 1 deletions

View File

@ -1,4 +1,4 @@
<p align="center"> <img src="sources/images/logo.png" height="90px" width="400px"> </p>
<p align="center"> <img src="sources/images/logo.png" height="90px" width="400px"> </p>
<p align="center">
<a href="https://gitee.com/ascend/AscendSpeed/blob/master/LICENSE">
@ -47,6 +47,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<th>推理</th>
<th>评估</th>
<th>数据集</th>
<th>贡献方</th>
</tr>
</thead>
<tbody>
@ -57,6 +58,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/aquila/generate_aquila_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/aquila/evaluate_aquila_7b_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"> <a href="examples/baichuan/README.md">Baichuan</a> </td>
@ -65,6 +67,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan/generate_baichuan_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan/evaluate_baichuan_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 13B </td>
@ -72,6 +75,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan/generate_baichuan_13b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan/evaluate_baichuan_13B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"> <a href="examples/baichuan2/README.md">Baichuan2</a> </td>
@ -80,6 +84,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan2/generate_baichuan2_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan2/evaluate_baichuan2_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 13B </td>
@ -87,6 +92,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/baichuan2/generate_baichuan2_13b_ptd.sh">对话</a> </td>
<td> <a href="examples/baichuan2/evaluate_baichuan2_13B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
@ -95,6 +101,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/bloom/generate_bloom_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/bloom/evaluate_bloom_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 176B </td>
@ -102,6 +109,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/bloom/generate_bloom_176b_ptd.sh">对话</a> </td>
<td> <a href="examples/bloom/evaluate_bloom_176b_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/intern/README.md">InternLM</a></td>
@ -110,6 +118,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/intern/generate_lnternlm_7b_ptd.sh">对话</a> </td>
<td> <a href="examples/intern/evaluate_internlm_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>65B</td>
@ -117,6 +126,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> -- </td>
<td> -- </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="4"><a href="examples/llama/README.md">LLaMA</a></td>
@ -125,6 +135,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_7b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_7B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>13B</td>
@ -132,6 +143,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_13b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_13B_ptd.sh">评估</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>33B</td>
@ -139,6 +151,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_33b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_33B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td > 65B </td>
@ -146,6 +159,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama/generate_llama_65b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama/evaluate_llama_65B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="4"><a href="examples/llama2/README.md">LLaMA2</a></td>
@ -154,6 +168,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_7b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_7B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>13B</td>
@ -161,6 +176,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_13b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_13B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td> 34B </td>
@ -168,6 +184,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_34B_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_34B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td > 70B </td>
@ -175,6 +192,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama2/generate_llama2_70b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama2/evaluate_llama2_70B_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/llama3/README.md">LLaMA3</a></td>
@ -183,6 +201,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama3/generate_llama3_8b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama3/evaluate_llama3_8b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>70B</td>
@ -190,6 +209,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/llama3/generate_llama3_70b_ptd.sh">对话 </a> </td>
<td> <a href="examples/llama3/evaluate_llama3_70b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json</a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="3"><a href="examples/qwen/README.md">Qwen</a></td>
@ -198,6 +218,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/generate_qwen_7b_ptd.sh">对话 </a></td>
<td> <a href="examples/qwen/evaluate_qwen_7b_ptd.sh">评估 </a></td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>14B</td>
@ -205,6 +226,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/generate_qwen_14b_ptd.sh">对话 </a> </td>
<td> <a href="examples/qwen/evaluate_qwen_14b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td>72B</td>
@ -212,6 +234,16 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/generate_qwen_72b_ptd.sh">对话 </a> </td>
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
<td> -- </td>
<td> <a href="examples/yi/generate_yi_34b_ptd.sh">对话 </a> </td>
<td> <a href="examples/yi/evaluate_yi_34b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【社区贡献模型】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
@ -220,6 +252,7 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/mixtral/generate_mixtral_8x7b_ptd.sh">对话 </a> </td>
<td> <a href="examples/mixtral/evaluate_mixtral_8x7b_ptd.sh">评估 </a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
<td> 【昇腾贡献模型】 </td>
</tr>
</tbody>
</table>
@ -454,6 +487,15 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> 345 </td>
<td> <a href="examples/qwen/pretrain_qwen_72b_ptd.sh">训练</a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
<td>2x8</td>
<td>BF16 </td>
<td> 809 </td>
<td> 730 </td>
<td> <a href="examples/yi/pretrain_yi_34b_ptd_16p.sh">训练</a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
<td>8x7B</td>

View File

@ -213,6 +213,14 @@ Current ModelLink supports pre-training and fine-tuning for the following models
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh">evaluation</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
<td>34B</td>
<td> -- </td>
<td> <a href="examples/yi/generate_yi_34b_ptd.sh">inference</a> </td>
<td> <a href="examples/yi/evaluate_yi_34b_ptd.sh">evaluation</a> </td>
<td> <a href="https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json">alpaca_data.json </a> </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
<td>8x7B</td>
@ -265,6 +273,7 @@ For the supported models listed above, we provide training scripts and readme in
<th>Performance </th>
<th>Reference Performance </th>
<th>Scripts</th>
<th>Contributor</th>
</tr>
</thead>
<tbody>
@ -276,6 +285,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2849 </td>
<td> 2874 </td>
<td> <a href="examples/aquila/pretrain_aquila_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/baichuan/README.md">Baichuan</a></td>
@ -285,6 +295,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2685 </td>
<td> 2036 </td>
<td> <a href="examples/baichuan/pretrain_baichuan_ptd_7B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -293,6 +304,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1213 </td>
<td> 862 </td>
<td> <a href="examples/baichuan/pretrain_baichuan_ptd_13B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/baichuan2/README.md">Baichuan2</a></td>
@ -302,6 +314,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2664 </td>
<td> 3969 </td>
<td> <a href="examples/baichuan2/pretrain_baichuan2_ptd_7B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -310,6 +323,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1668 </td>
<td> 2062 </td>
<td> <a href="examples/baichuan2/pretrain_baichuan2_ptd_13B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/bloom/README.md">Bloom</a></td>
@ -319,6 +333,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2034 </td>
<td> 2525 </td>
<td> <a href="examples/bloom/pretrain_bloom_ptd_7B.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td >176B</td>
@ -327,6 +342,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 100 </td>
<td> 107 </td>
<td> <a href="examples/bloom/pretrain_bloom_176b.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/intern/README.md">InternLM</a></td>
@ -336,6 +352,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2776 </td>
<td> 2854 </td>
<td> <a href="examples/intern/pretrain_internlm_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td >65B</td>
@ -344,6 +361,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 341 </td>
<td> 414 </td>
<td> <a href="examples/intern/pretrain_internlm_65b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="5"><a href="examples/llama/README.md">LLaMA</a></td>
@ -353,6 +371,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 3600 </td>
<td> 3804 </td>
<td> <a href="examples/llama/pretrain_llama_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -361,6 +380,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1895 </td>
<td> 2012 </td>
<td> <a href="examples/llama/pretrain_llama_13b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>33B</td>
@ -369,6 +389,7 @@ For the supported models listed above, we provide training scripts and readme in
<td>621</td>
<td>776</td>
<td><a href="examples/llama/pretrain_llama_33B_ptd_32p.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2">65B</td>
@ -379,6 +400,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 348 </td>
<td> 426 </td>
<td> <a href="examples/llama/pretrain_llama_65b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="4"><a href="examples/llama2/README.md">LLaMA2</a></td>
@ -388,6 +410,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 4200 </td>
<td> 3850 </td>
<td> <a href="examples/llama2/pretrain_llama2_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>13B</td>
@ -396,6 +419,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1990 </td>
<td> 1920 </td>
<td> <a href="examples/llama2/pretrain_llama2_13B_ptd_8p.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>34B</td>
@ -404,6 +428,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 690 </td>
<td> 796 </td>
<td> <a href="examples/llama2/pretrain_llama2_34B_ptd_16p.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>70B</td>
@ -412,6 +437,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 350 </td>
<td> 339 </td>
<td> <a href="examples/llama2/pretrain_llama2_70b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/llama3/README.md">LLaMA3</a></td>
@ -421,6 +447,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2483 </td>
<td> 2674 </td>
<td> <a href="examples/llama3/pretrain_llama3_8b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>70B</td>
@ -438,6 +465,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2499 </td>
<td> 2867 </td>
<td> <a href="examples/qwen/pretrain_qwen_7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>14B</td>
@ -446,6 +474,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1560 </td>
<td> 1578 </td>
<td> <a href="examples/qwen/pretrain_qwen_14b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td>72B</td>
@ -454,6 +483,17 @@ For the supported models listed above, we provide training scripts and readme in
<td> 285 </td>
<td> 345 </td>
<td> <a href="examples/qwen/pretrain_qwen_72b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
<td>34B</td>
<td>2x8</td>
<td>BF16 </td>
<td> 809 </td>
<td> 730 </td>
<td> <a href="examples/yi/pretrain_yi_34b_ptd_16p.sh">train</a> </td>
<td> 【Model contributed by Community】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/mixtral/README.md">Mixtral</a></td>
@ -463,6 +503,7 @@ For the supported models listed above, we provide training scripts and readme in
<td> 1054 </td>
<td> 1139 </td>
<td> <a href="examples/mixtral/pretrain_mixtral_8x7b_ptd.sh">train</a> </td>
<td> 【Model contributed by Ascend】 </td>
</tr>
</tbody>
</table>

293 examples/yi/README.md Normal file
View File

@ -0,0 +1,293 @@
# Yi $\color{black}{\bf\tiny{【社区贡献模型】}}$
<p align="left">
<b>简体中文</b> |
<b><a href="README_en.md">English</a> </b>
</p>
# 目录
- [Yi-34B](#yi-34b)
- [训练](#训练)
- [脚本](#脚本)
- [性能](#性能)
- [吞吐](#吞吐)
- [推理](#推理)
- [评估](#评估)
# Yi-34B
## 训练
Yi-34B 训练的硬件配置如下:
| 硬件 | 配置 |
|:---:|:---------------:|
| NPU | 16 x Ascend NPUs |
### 脚本
1. 克隆仓库到本地服务器
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. 搭建环境
```bash
# 1).python3.8
conda create -n test python=3.8
conda activate test
# 2).安装 torch 和 torch_npu
pip install torch-2.2.0-cp38-cp38m-linux_aarch64.whl
pip install torch_npu-2.2.0.XXX-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# 3).修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 4).安装加速库
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# 5).安装其余依赖库
pip install -r requirements.txt
```
3. (可选的)准备预训练权重
从 [huggingface](https://huggingface.co/01-ai/Yi-34B/tree/main) 下载预训练权重
```shell
mkdir ./model_from_hf/Yi-34B/
cd ./model_from_hf/Yi-34B/
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/generation_config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00001-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00002-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00003-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00004-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00005-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00006-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00007-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model.bin.index.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.model
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer_config.json
cd ../../
```
4. 权重转换
4.1 将 Yi-34B 模型权重从 huggingface 格式转换为 megatron 格式
***该场景一般用于使能开源的HuggingFace模型在Megatron上进行训练***
```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 2 \
--load-dir ./model_from_hf/Yi-34B/ \
--save-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--tokenizer-model ./model_from_hf/Yi-34B/tokenizer.model \
--params-dtype bf16
```
如果为单机8卡推理或者评估任务,将`--target-pipeline-parallel-size`值设为`1`,并将`--save-dir`值中的`pp2`改为`pp1`。
4.2 将任意并行切分策略的 Megatron 权重转换为 HuggingFace 权重格式
***该场景一般用于将训练好的megatron模型重新转回HuggingFace格式***
```shell
# 请按照您的真实环境修改 set_env.sh 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \
--load-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/Yi-34B/  # <-- 需要填入原始HF模型路径,新权重会存于 ./model_from_hf/Yi-34B/mg2hg/
```
5. 预训练
5.1 准备数据集
下载 Yi-34B [数据集](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
cd dataset/
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
mkdir ./dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
5.2 预训练
配置 Yi-34B 训练脚本: examples/yi/pretrain_yi_34b_ptd_16p.sh
```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./dataset/Yi-34B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Yi-34B/tokenizer.model"
CKPT_LOAD_DIR="./model_weights/Yi-34B-Base-v0.1-tp8-pp2/"
```
启动 Yi-34B 训练脚本: examples/yi/pretrain_yi_34b_ptd_16p.sh
```bash
bash examples/yi/pretrain_yi_34b_ptd_16p.sh
```
**注意**:如果使用多机训练,需要设置多机数据共享,非主节点通过数据共享读取主节点数据。或者,直接将主节点生成的数据复制到非主节点。
6. 微调
6.1 准备微调数据集
下载微调数据集 [这里](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# 下载数据集
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# 处理微调数据集
mkdir ./finetune_dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./finetune_dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 全参微调
全参微调的配置脚本基本和预训练脚本一致。*区别是数据集,以及增加训练参数 `--is-instruction-dataset`。*
增加微调参数 `--finetune`,使微调从第一步开始。修改 tokenizer 参数:去掉 `--tokenizer-type Llama2Tokenizer` 和 `--tokenizer-model ${TOKENIZER_MODEL}`,更改为以下参数:
```bash
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./finetune_dataset/Yi-34B/alpaca"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
CKPT_LOAD_DIR="./model_weights/Yi-34B-Base-v0.1-tp8-pp2/"
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
### 性能
#### 吞吐
Yi-34B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:
| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/s) | token吞吐 (tokens/p/s) | 单步迭代时间 (s/step) |
|:----:|:------------:|:----:|:------------------:|:--------------------:|:---------------:|
| NPUs | Yi-34B | - | 3.16| 809| 324 |
| 参考 | Yi-34B | - | 2.85 | 732 | 359 |
## 推理
配置Yi-34B的推理脚本: examples/yi/generate_yi_34b_ptd.sh
```bash
# 根据您自己的 ascend-toolkit 路径执行set_env.sh
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 修改模型权重路径和词表路径
CHECKPOINT="./model_weights/Yi-34B-Base-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
```
然后可直接启动generate_yi_34b_ptd.sh
```bash
bash examples/yi/generate_yi_34b_ptd.sh
```
推理的示例如下:
![Inference](../../sources/images/yi/yi-34b-generate.png)
## 评估
我们使用MMLU基准来评估我们的模型。基准[下载](https://huggingface.co/datasets/cais/mmlu).
```shell
# 配置原始权重与词表的路径
CHECKPOINT="./model_weights/Yi-34B-Base-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
# 配置任务以及数据路径
DATA_PATH="./mmlu/test"
TASK="mmlu"
```
```shell
bash ./examples/yi/evaluate_yi_34b_ptd.sh
```
<table>
<thead>
<tr>
<th>任务</th>
<th>模型</th>
<th>昇腾值</th>
<th>社区值</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/datasets/cais/mmlu">MMLU</a></td>
<th>Yi 34B</th>
<td>75.8</td>
<td><a href="https://hub.opencompass.org.cn/dataset-detail/MMLU">76.3</a></td>
</tr>
</tbody>
</table>

295 examples/yi/README_en.md Normal file
View File

@ -0,0 +1,295 @@
# Yi $\color{black}{\rm\tiny{【Model}}$ $\color{black}{\rm\tiny{contributed}}$ $\color{black}{\rm\tiny{by}}$ $\color{black}{\rm\tiny{community】}}$
<p align="left">
<b><a href="README.md">简体中文</a></b> |
<b>English</b>
</p>
# Contents
- [Yi-34B](#yi-34b)
- [Training](#training)
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Inference](#inference)
- [Evaluation](#evaluation)
# Yi-34B
## Training
Here's a hardware summary of pre-training Yi-34B:
| Hardware | Value |
| :------: | :---------------------------------------------: |
| NPU | 16 x Ascend NPUs |
### Script
1. Clone the repository to your local server:
```bash
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Build environment
```bash
# 1).python3.8
conda create -n test python=3.8
conda activate test
# 2).install torch and torch_npu
pip install torch-2.2.0-cp38-cp38m-linux_aarch64.whl
pip install torch_npu-2.2.0.XXX-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# 3).modify the path according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 4).install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# 5).install other packages
pip install -r requirements.txt
```
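Before continuing, it can help to confirm that the NPU stack is usable from Python. This is only an optional sanity check (not part of the original instructions) and assumes `torch_npu` exposes `torch_npu.npu.is_available()` as in the Ascend PyTorch releases installed above:
```shell
# optional quick check that torch and torch_npu were installed correctly (assumed API)
python -c "import torch, torch_npu; print(torch.__version__, torch_npu.npu.is_available())"
```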
3. Prepare pretrained weights
Download the Yi-34B checkpoint from [here](https://huggingface.co/01-ai/Yi-34B/tree/main)
```shell
mkdir ./model_from_hf/Yi-34B/
cd ./model_from_hf/Yi-34B/
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/generation_config.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00001-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00002-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00003-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00004-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00005-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00006-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model-00007-of-00007.bin
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/pytorch_model.bin.index.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.json
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer.model
wget https://huggingface.co/01-ai/Yi-34B/resolve/main/tokenizer_config.json
cd ../../
```
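If `git-lfs` is available, the same weights can equivalently be fetched with a single clone; this is only an alternative sketch, not part of the original instructions:
```shell
# alternative: clone the whole Yi-34B repository (assumes git-lfs is installed)
git lfs install
git clone https://huggingface.co/01-ai/Yi-34B ./model_from_hf/Yi-34B
```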
4. Weights convert
4.1 Convert the Yi-34B model weights from HuggingFace format to Megatron format.
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```shell
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 2 \
--load-dir ./model_from_hf/Yi-34B/ \
--save-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--tokenizer-model ./model_from_hf/Yi-34B/tokenizer.model \
--params-dtype bf16
```
For single-node (8-NPU) inference or evaluation tasks, set the `--target-pipeline-parallel-size` value to `1` and change `pp2` to `pp1` in the `--save-dir` value.
4.2 Convert Megatron weights with any parallel slicing strategy into HuggingFace weights
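Concretely, the step-4.1 command adjusted for that case might look like the following sketch (only the pipeline-parallel size and the save path change):
```shell
# hypothetical tp8/pp1 variant of the step-4.1 conversion, for single-node inference/evaluation
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader llama2_hf \
    --saver megatron \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 1 \
    --load-dir ./model_from_hf/Yi-34B/ \
    --save-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/Yi-34B/tokenizer.model \
    --params-dtype bf16
```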
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```shell
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \
--load-dir ./model_weights/Yi-34B-Base-v0.1-tp8-pp2/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/Yi-34B/ # <-- Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Yi-34B/mg2hg/
```
5. Pre-training
5.1 Prepare dataset
Download the Yi-34B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
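After preprocessing, the tokenized dataset should appear under `./dataset/Yi-34B/` with the `_text_document` suffix that the training script's `DATA_PATH` refers to; roughly (file names assumed from the `--output-prefix` above):
```shell
ls ./dataset/Yi-34B/
# alpaca_text_document.bin  alpaca_text_document.idx   (assumed output of preprocess_data.py)
```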
5.2 Pre-training
Config Yi-34B pre-training script : examples/yi/pretrain_yi_34b_ptd_16p.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./dataset/Yi-34B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Yi-34B/tokenizer.model"
CKPT_LOAD_DIR="./model_weights/Yi-34B-v0.1-tp8-pp2/"
```
Launch Yi-34B pre-training script: examples/yi/pretrain_yi_34b_ptd_16p.sh
```shell
bash examples/yi/pretrain_yi_34b_ptd_16p.sh
```
**Note**: For multi-machine training, set up data sharing so that non-primary nodes can read the data generated on the primary node, or simply copy that data from the primary node to the other nodes.
6. Fine-tuning
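As a sketch of what the two-node launch involves (the host IP placeholder is not taken from the original document): the script already sets `NNODES=2`, so each node runs the same script after its `MASTER_ADDR` and `NODE_RANK` variables have been edited.
```shell
# hypothetical two-node setup: edit these variables inside pretrain_yi_34b_ptd_16p.sh on each node
#   primary node:  MASTER_ADDR=<primary-node-ip>  NODE_RANK=0
#   second node:   MASTER_ADDR=<primary-node-ip>  NODE_RANK=1
# then launch the same script on both nodes
bash examples/yi/pretrain_yi_34b_ptd_16p.sh
```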
6.1 Prepare fine-tuning dataset
Download the fine-tuning datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./finetune_dataset/Yi-34B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Yi-34B/ \
--output-prefix ./finetune_dataset/Yi-34B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 Full-Parameter Fine-Tuning of Yi-34B
The configuration script for full-parameter fine-tuning is basically the same as pretrain_yi_34b_ptd_16p.sh. *The differences are the dataset and the additional training parameter `--is-instruction-dataset`.*
Add the fine-tuning parameter `--finetune` so that fine-tuning starts from the first step. Remove `--tokenizer-type Llama2Tokenizer` and `--tokenizer-model ${TOKENIZER_MODEL}` from the tokenizer arguments and use the following parameters instead:
```bash
CKPT_SAVE_DIR="./ckpt/Yi-34B/"
DATA_PATH="./finetune_dataset/Yi-34B/alpaca"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
CKPT_LOAD_DIR="./model_weights/Yi-34B-Base-v0.1-tp8-pp2/"
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
### Performance
#### Machine performance
The performance of Yi-34B on **Ascend NPUs** compared with the **Reference** device:
| Device | Model | total Iterations | throughput rate (samples/s) | throughput rate (tokens/s/p) | single-step time (s/step) |
|:----:|:---------:|:----:|:---------------------:|:---------------:|:----------------:|
| NPUs | Yi-34B | - | 3.16| 809| 324 |
| Reference | Yi-34B | - | 2.85 | 732 | 359 |
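The token throughput in the table follows from the sample throughput, given the 4096-token sequence length and the 16 NPUs used by the training script; for the NPU row, for example:

$$\frac{3.16\ \text{samples/s} \times 4096\ \text{tokens/sample}}{16\ \text{NPUs}} \approx 809\ \text{tokens/s/p}$$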
## Inference
Config Yi-34B inference script: examples/yi/generate_yi_34b_ptd.sh
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify script model path and tokenizer path
CHECKPOINT="./model_weights/Yi-34B-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
```
Launch Yi-34B inference script: examples/yi/generate_yi_34b_ptd.sh
```bash
bash examples/yi/generate_yi_34b_ptd.sh
```
Some inference samples are as follows:
![Inference](../../sources/images/yi/yi-34b-generate.png)
## Evaluation
We use the MMLU benchmark to evaluate our model. Benchmark [Download](https://huggingface.co/datasets/cais/mmlu).
```shell
# config origin weight and vocab file path
CHECKPOINT="./model_weights/Yi-34B-v0.1-tp8-pp1/"
TOKENIZER_PATH="./model_from_hf/Yi-34B/"
# config tasks and dataset path
DATA_PATH="./mmlu/"
TASK="mmlu"
```
```shell
bash ./examples/yi/evaluate_yi_34b_ptd.sh
```
<table>
<thead>
<tr>
<th>Task</th>
<th>Model</th>
<th>NPU</th>
<th>OpenSource</th>
</tr>
</thead>
<tbody>
<tr>
<td><a href="https://huggingface.co/datasets/cais/mmlu">MMLU</a></td>
<th>Yi 34B</th>
<td>75.8</td>
<td><a href="https://hub.opencompass.org.cn/dataset-detail/MMLU">76.3</a></td>
</tr>
</tbody>
</table>

View File

@ -0,0 +1,57 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
CHECKPOINT="Your ckpt file path"
TOKENIZER_PATH="Your tokenizer path"
DATA_PATH="./mmlu/test"
TASK="mmlu"
# Different tasks need different max_new_tokens values; please follow the instructions in the README.
torchrun $DISTRIBUTED_ARGS evaluation.py \
--task-data-path $DATA_PATH \
--task $TASK \
--seq-length 4096 \
--max-new-tokens 1 \
--max-position-embeddings 4096 \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 60 \
--hidden-size 7168 \
--ffn-hidden-size 20480 \
--num-attention-heads 56 \
--disable-bias-linear \
--swiglu \
--position-embedding-type rope \
--load ${CHECKPOINT} \
--normalization RMSNorm \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
--fp16 \
--micro-batch-size 1 \
--use-fused-rmsnorm \
--exit-on-missing-checkpoint \
--no-load-rng \
--no-load-optim \
--untie-embeddings-and-output-weights \
--no-masked-softmax-fusion \
--make-vocab-size-divisible-by 1 \
--group-query-attention \
--num-query-groups 8 \
--rotary-base 5000000 \
--seed 42 | tee logs/evaluation_yi_34b_${TASK}.log

View File

@ -0,0 +1,57 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations
CHECKPOINT="your model directory path"
TOKENIZER_PATH="your tokenizer directory path"
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS inference.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 60 \
--hidden-size 7168 \
--ffn-hidden-size 20480 \
--position-embedding-type rope \
--seq-length 4096 \
--max-new-tokens 256 \
--micro-batch-size 1 \
--global-batch-size 16 \
--num-attention-heads 56 \
--max-position-embeddings 4096 \
--swiglu \
--load "${CHECKPOINT}" \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path "${TOKENIZER_PATH}" \
--tokenizer-not-use-fast \
--fp16 \
--normalization RMSNorm \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-softmax-in-fp32 \
--no-load-optim \
--no-load-rng \
--no-masked-softmax-fusion \
--no-gradient-accumulation-fusion \
--exit-on-missing-checkpoint \
--make-vocab-size-divisible-by 1 \
--vocab-size 64000 \
--group-query-attention \
--num-query-groups 8 \
--rotary-base 5000000 | tee logs/generate_yi_34b.log

View File

@ -0,0 +1,97 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=2
NODE_RANK=0
WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"
CKPT_LOAD_DIR="your model ckpt path"
TP=8
PP=2
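# 8-way tensor parallelism within a node, 2-way pipeline parallelism across the two nodes (2 x 8 NPUs in total)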
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 60 \
--hidden-size 7168 \
--ffn-hidden-size 20480 \
--num-attention-heads 56 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 2 \
--global-batch-size 1024 \
--make-vocab-size-divisible-by 1 \
--lr 1.0e-7 \
--train-iters 2000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rmsnorm \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--swiglu \
--use-mc2 \
--use-flash-attn \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.0e-8 \
--weight-decay 1e-2 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.999 \
--initial-loss-scale 8188.0 \
--no-gradient-accumulation-fusion \
--load ${CKPT_LOAD_DIR} \
--no-load-optim \
--no-load-rng \
--rotary-base 5000000 \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save $CKPT_SAVE_DIR \
| tee logs/train_yi_34b.log

Binary file not shown. (new image file, 92 KiB)

View File

@ -0,0 +1,69 @@
{
"CONVERT_CKPT_PARAM": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--load-dir", "/home/dataset/yi-34B-hf",
"--save-dir", "/home/dataset/yi-34B-mt-t8p1",
"--target-tensor-parallel-size", "8",
"--target-pipeline-parallel-size", "1",
"--tokenizer-model", "None"
],
"NETWORK_SIZE": [
"--num-layers", "60",
"--hidden-size", "7168",
"--ffn-hidden-size", "20480",
"--num-attention-heads", "56",
"--max-position-embeddings", "4096",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "1",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--load", "/home/dataset/yi-34B-mt-t8p1"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/yi-34B-hf"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint",
"--attention-softmax-in-fp32"
],
"EVALUATION_PARAM": [
"--tokenizer-not-use-fast",
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test",
"--task", "mmlu",
"--max-new-tokens", "1",
"--exit-on-missing-checkpoint"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "16",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--seed", "42",
"--use-fused-rmsnorm",
"--group-query-attention",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "4096",
"--num-query-groups", "8",
"--vocab-size", "64000",
"--rotary-base", "5000000"
]
}

View File

@ -0,0 +1,61 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch
from ascendspeed import megatron_adaptor
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exist(self):
"""
Test if the files in the `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.bin"))
self.assertEqual(len(bin_file), 7)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "pytorch_model.bin.index.json")))
def test_convert_weights_form_huggingface(self):
"""
Test whether the weight to be converted as we want in `--save-dir`. We will check the model layer name,
including embedding, final_norm, output and encoder. In the encoder, there will be some different layers
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model']  # extract common content
# embedding, encoder and output_layer are the three top-level parts
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
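# with --target-tensor-parallel-size 8, the 64000-token vocabulary is split into 64000 / 8 = 8000 rows per rank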
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([8000, 7168]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([7168]))
# the encoder has a shared final_norm, and each transformer layer contributes the following six weights
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(len(weight_common_content['encoder']) / 6, 60)
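# per tensor-parallel rank: 56 / 8 = 7 query heads plus 1 key and 1 value head (8 query groups), each of dim 7168 / 56 = 128, so (7 + 2) * 128 = 1152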
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1152, 7168]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([7168, 896]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([5120, 7168]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([7168, 2560]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([7168]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([7168]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([8000, 7168]))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,95 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from ascendspeed import megatron_adaptor
from common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
config.evaluation_param + config.tokenizer_param
from megatron.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron import get_args
self.args = get_args()
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
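# MMLU test files are named "<subject>_test.csv"; strip the 9-character "_test.csv" suffix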
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df[0:10]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0].strip()
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.803) < 0.01)

View File

@ -0,0 +1,100 @@
import sys
import os
import torch
import torch_npu
from common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
from megatron.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron import get_args
self.args = get_args()
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["春夏秋冬,四个季节"]
output = model.generate(instruction, detokenize=False)
expected_output1 = [101, 6001, 15831, 5074, 6435, 35308, 101, 31179, 44445, 60820,
11098, 60721, 8203, 61293, 60583, 35308, 102, 18024, 101, 59647,
60721, 60690, 60452, 4452, 59706, 60207, 24212, 1075, 61759, 60942,
63958, 60585, 59599, 21639, 101, 24212, 1075, 61287, 62566, 60632,
63011, 59599, 37835, 60408, 17664, 102, 60566, 9299, 49085, 101]
expected_output2 = [101, 6001, 15831, 5074, 2045, 7753, 101, 5074, 2045, 8511,
102, 144, 18417, 101, 24018, 25592, 101, 59722, 60575, 59823,
39464, 60630, 59676, 59936, 59670, 101, 55550, 59688, 60686, 59801,
7292, 101, 60319, 60502, 60687, 61529, 101, 59722, 61418, 59632,
61441, 59936, 534, 448, 494, 534, 448, 494, 534, 455]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim1 = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
cos_sim2 = similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
cos_sim = torch.max(cos_sim1, cos_sim2)
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "北京奥运会"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [39047, 59654, 101, 79, 77, 77, 85, 59867, 10536, 60397,
536, 493, 487, 59732, 60516, 101, 62731, 62131, 59757, 59637,
59635, 60382, 59689, 35444, 59670, 101, 59630, 61004, 60159, 60475,
59638, 101, 6919, 59678, 2349, 11923, 17463, 60243, 60034, 59652,
22740, 59599, 9034, 102, 144, 79, 77, 77, 85, 59867]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)

View File

@ -0,0 +1,35 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
convert_ckpt_param,
network_size,
tokenizer_param,
distributed_param,
inference_param,
evaluation_param,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
network_size = config_file["NETWORK_SIZE"]
tokenizer_param = config_file["TOKENIZER_PARAM"]
distributed_param = config_file["DISTRIBUTED_PARAM"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -0,0 +1,6 @@
# Provide uniform access for pipeline.
python tests/pipeline/yi-34B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/yi-34B/test_generation.py
pytest -s tests/pipeline/yi-34B/test_evaluation.py