!1291 添加Qwen1.5-14B适配

Merge pull request !1291 from 商元义/14B
This commit is contained in:
商元义 2024-05-23 02:57:38 +00:00 committed by i-robot
parent 0ddad3f9c8
commit 19d3b157ff
9 changed files with 978 additions and 52 deletions

View File

@ -305,6 +305,27 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh"> eval </a> </td>
<td> 【昇腾贡献】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td rowspan="1"> 【社区贡献】 </td>
</tr>
<tr>
<td>14B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_14b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_14b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_14b_ptd.sh"> eval </a> </td>
<td> 【社区贡献】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
@ -337,17 +358,6 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> -- </td>
<td> <a href="examples/mistral/evaluate_mistral_7b_ptd.sh"> eval </a> </td>
<td>【昇腾贡献】</td>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td> 【社区贡献】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>
@ -587,6 +597,21 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> 285 </td>
<td> 345 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2862 </td>
<td> 2621 </td>
</tr>
<tr>
<td> 14B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 1717 </td>
<td> 1702 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
@ -611,14 +636,6 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td> 2806 </td>
<td> 2734 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2548 </td>
<td> 2578 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>

View File

@ -304,6 +304,27 @@ Current ModelLink supports pre-training and fine-tuning for the following models
<td> <a href="examples/qwen/evaluate_qwen_72b_ptd.sh"> eval </a> </td>
<td> 【Ascend】 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td rowspan="1"> 【Community】 </td>
</tr>
<tr>
<td>14B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_14b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_14b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_14b_ptd.sh"> eval </a> </td>
<td> 【Community】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README.md">Yi</a></td>
<td>34B</td>
@ -336,17 +357,6 @@ Current ModelLink supports pre-training and fine-tuning for the following models
<td> -- </td>
<td> <a href="examples/mistral/evaluate_mistral_7b_ptd.sh"> eval </a> </td>
<td>【Ascend】</td>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td>7B</td>
<td> <a href="examples/qwen15/pretrain_qwen15_7b_ptd.sh"> pretrain </a> </td>
<td> <a href="examples/qwen15/generate_qwen15_7b_ptd.sh"> generate </a> </td>
<td> -- </td>
<td> -- </td>
<td> -- </td>
<td> <a href="examples/qwen15/evaluate_qwen15_7b_ptd.sh"> eval </a> </td>
<td> 【Community】 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>
@ -586,6 +596,21 @@ For the supported models listed above, we provide training scripts and readme in
<td> 285 </td>
<td> 345 </td>
</tr>
<tr>
<td rowspan="2"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2862 </td>
<td> 2621 </td>
</tr>
<tr>
<td> 14B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 1717 </td>
<td> 1702 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/yi/README_en.md">Yi</a></td>
<td>34B</td>
@ -610,14 +635,6 @@ For the supported models listed above, we provide training scripts and readme in
<td> 2806 </td>
<td> 2734 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/qwen15/README.md">Qwen1.5</a></td>
<td> 7B </td>
<td> 1x8 </td>
<td> BF16 </td>
<td> 2548 </td>
<td> 2578 </td>
</tr>
<tr>
<td rowspan="1"><a href="examples/gemma/README.md">Gemma</a></td>
<td>7B</td>

View File

@ -15,6 +15,14 @@
- [推理](#推理)
- [评估](#评估)
- [Qwen1.5-14B](#qwen15-14b)
- [训练](#训练)
- [脚本](#脚本)
- [性能](#性能)
- [吞吐](#吞吐)
- [推理](#推理)
- [评估](#评估)
# Qwen1.5-7B
## 训练
@ -102,9 +110,10 @@ Qwen1.5-7B 训练的硬件配置:
--target-pipeline-parallel-size 1 \
--make-vocab-size-divisible-by 16 \
--load-dir ./model_from_hf/Qwen1.5-7B/ \
--save-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-7B/tokenizer.json \
--add-qkv-bias
--add-qkv-bias \
--param-dtype bf16
```
任意并行切分策略的Megatron权重 格式转化为 HuggingFace权重
@ -122,7 +131,7 @@ Qwen1.5-7B 训练的硬件配置:
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-7B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen-7B/mg2hg/
--save-dir ./model_from_hf/Qwen1.5-7B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen1.5-7B/mg2hg/
```
5. 预训练
@ -199,8 +208,7 @@ Qwen1.5-7B 训练的硬件配置:
6.2 全参微调
全参微调的配置脚本基本和预训练脚本一致。
*区别是数据集,以及增加训练参数`--is-instruction-dataset`,增加微调参数`--finetune`,增加预训练权重加载参数`--load`
,使微调从第一步开始,修改tokenizer参数,去掉`--tokenizer-type Llama2Tokenizer`
和`--tokenizer-model ${TOKENIZER_MODEL}`。*
,使微调从第一步开始。*
修改如下:
@ -213,8 +221,6 @@ Qwen1.5-7B 训练的硬件配置:
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
@ -286,4 +292,274 @@ bash examples/qwen15/evaluate_qwen15_7b_ptd.sh
| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
|:-----:|:----:|:-----:|:-------------------------------------------------------:|:------:|
| MMLU | 57 | 14042 | [61.0](https://qwenlm.github.io/zh/blog/qwen1.5) | 60.3 |
# Qwen1.5-14B
## 训练
Qwen1.5-14B 训练的硬件配置:
| 硬件 | 配置 |
|:---:|:---------------:|
| NPU | 8 x Ascend NPUs |
### 脚本
1. 克隆仓库到本地服务器
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. 搭建环境
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# 安装 torch 和 torch_npu
pip install torch-2.2.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.2.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# 安装加速库
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip install -e .
cd ..
# 安装其余依赖库
pip install -r requirements.txt
```
**注意**:transformers 版本需为 4.37.0 及以上。
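下面是检查并按需升级 transformers 版本的示例命令(仅为示意,镜像源请按实际环境调整):
```bash
# 查看当前已安装的 transformers 版本
pip show transformers | grep Version
# 若低于 4.37.0,可升级
pip install "transformers>=4.37.0"
```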
3. 下载 Qwen1.5-14B 的 [预训练权重和词表](https://huggingface.co/Qwen/Qwen1.5-14B/tree/main)
```bash
mkdir ./model_from_hf/Qwen1.5-14B/
cd ./model_from_hf/Qwen1.5-14B/
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/merges.txt
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model.safetensors.index.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/special_tokens_map.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/vocab.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model-00001-of-00008.safetensors
...
cd ../../
```
4. 权重转换
将权重从 huggingface 格式转化为 megatron 格式
***该场景一般用于使能开源的HuggingFace模型在Megatron上进行训练***
```shell
# 修改 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--make-vocab-size-divisible-by 16 \
--load-dir ./model_from_hf/Qwen1.5-14B/ \
--save-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-14B/tokenizer.json \
--add-qkv-bias \
--param-dtype bf16
```
任意并行切分策略的Megatron权重 格式转化为 HuggingFace权重
***该场景一般用于将训练好的megatron模型重新转回HuggingFace格式***
```bash
# 请按照您的真实环境修改 set_env.sh 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_qwen \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-14B # 需要填入原始HF模型路径,新权重会存于./model_from_hf/Qwen1.5-14B/mg2hg/
```
5. 预训练
5.1 准备数据集
下载Qwen1.5-14B [数据集](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# 下载数据
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# 处理数据
mkdir ./dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B \
--output-prefix ./dataset/Qwen1.5-14B/alpaca \
--tokenizer-type PretrainedFromHF \
--seq-length 8192 \
--workers 4 \
--log-interval 1000
```
5.2 预训练
配置Qwen1.5-14B 预训练脚本: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
# 设置 ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 根据实际情况配置词表、数据集、模型参数保存路径
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B"
TOKENIZER_MODEL="./model_from_hf/Qwen1.5-14B" #词表路径
DATA_PATH="./dataset/Qwen1.5-14B/alpaca_text_document" #数据集路径
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
```
多机运行增加参数 `--overlap-grad-reduce`
启动 Qwen1.5-14B 预训练脚本: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/pretrain_qwen15_14b_ptd.sh
```
**注意**:如果使用多机训练,且没有设置数据共享,需要在训练启动脚本中增加 `--no-shared-storage` 参数。设置此参数之后,将会根据分布式参数判断非主节点是否需要load数据,并检查相应缓存和生成数据。多机相关参数(`--overlap-grad-reduce`、`--no-shared-storage`)的配置示意见下文。
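下面给出一个双机场景的多机配置示意(主节点 IP 为示例值,机器数、端口及共享存储情况请按实际环境修改),对应上文 `--overlap-grad-reduce` 与 `--no-shared-storage` 两条说明:
```shell
# examples/qwen15/pretrain_qwen15_14b_ptd.sh 中的多机配置示意
NPUS_PER_NODE=8
MASTER_ADDR=192.168.0.1      # 主节点 IP(示例值)
MASTER_PORT=6000
NNODES=2                     # 参与训练的机器数
NODE_RANK=0                  # 主节点为 0,其余节点依次为 1、2 ...
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# 在 GPT_ARGS 末尾追加多机参数
GPT_ARGS="$GPT_ARGS --overlap-grad-reduce"
# 没有设置数据共享时再追加
GPT_ARGS="$GPT_ARGS --no-shared-storage"
```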
6. 微调
6.1 准备微调数据集
下载微调数据集 [这里](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# 下载数据集
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# 处理微调数据集
mkdir ./finetune_dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B/ \
--output-prefix ./finetune_dataset/Qwen1.5-14B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 全参微调
全参微调的配置脚本基本和预训练脚本一致。
*区别是数据集,以及增加训练参数`--is-instruction-dataset`,增加微调参数`--finetune`,增加预训练权重加载参数`--load`
,使微调从第一步开始。*
修改如下:
```bash
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
DATA_PATH="./finetune_dataset/Qwen1.5-14B/alpaca"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-not-use-fast \
```
启动微调脚本: examples/qwen15/tune_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/tune_qwen15_14b_ptd.sh
```
### 性能
#### 吞吐
Qwen1.5-14B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:
| 设备 | 模型 | tokens吞吐 (tokens/s/p) |
|:--------------:|:-----------:|:---------------------:|
| NPUs | Qwen1.5-14B | 1717.8 |
| 参考 | Qwen1.5-14B | 1702.2 |
## 推理
配置 Qwen1.5-14B 推理脚本: examples/qwen15/generate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 修改模型权重路径和词表路径
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B"
```
启动Qwen1.5-14B推理脚本
```bash
bash examples/qwen15/generate_qwen15_14b_ptd.sh
```
推理示例如下:
![Inference](../../sources/images/qwen15/qwen1.5_14b_inference.png)
## 评估
使用[CEval数据集](https://huggingface.co/datasets/ceval/ceval-exam)
和[MMLU数据集](https://huggingface.co/datasets/cais/mmlu)评估模型。
配置Qwen1.5-14B评估脚本: examples/qwen15/evaluate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit 路径
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# 修改模型参数路径和词表路径
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/" #词表路径
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/" #模型路径
# 配置任务和数据集路径
DATA_PATH="./mmlu/data/test/" # ceval任务配置为 "./ceval/val/"
TASK="mmlu" # ceval任务配置为 "ceval"
```
启动评估
```bash
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```
| 数据集 | 总学科数 | 总问题数 | 参考准确率 | NPU准确率 |
|:-----:|:----:|:-----:|:------------------------------------------------:|:------:|
| MMLU | 57 | 14042 | [67.6](https://qwenlm.github.io/zh/blog/qwen1.5) | 67.3 |

View File

@ -14,6 +14,14 @@
- [Inference](#Inference)
- [Evaluation](#Evaluation)
- [Qwen1.5-14B](#qwen15-14b)
- [Training](#training)
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Inference](#Inference)
- [Evaluation](#Evaluation)
# Qwen1.5-7B
## Training
@ -105,7 +113,8 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
--load-dir ./model_from_hf/Qwen1.5-7B/ \
--save-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-7B/tokenizer.json \
--add-qkv-bias
--add-qkv-bias \
--param-dtype bf16
```
Any Megatron weights with parallel slicing strategy --> HuggingFace weights
@ -123,7 +132,7 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-7B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-7B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen-7B/mg2hg/
--save-dir ./model_from_hf/Qwen1.5-7B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen1.5-7B/mg2hg/
```
5. Pre-training
@ -199,7 +208,7 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
The configuration script with the fine-tuning parameters is basically the same as the pre-training script.
*The difference is the dataset, plus the added training parameter `--is-instruction-dataset`, the fine-tuning parameter `--finetune`, and the pre-training weight loading parameter `--load` so that fine-tuning starts from the first step; also modify the tokenizer parameters: drop `--tokenizer-type Llama2Tokenizer` and `--tokenizer-model ${TOKENIZER_MODEL}`.*
*The difference is the dataset, plus the added training parameter `--is-instruction-dataset`, the fine-tuning parameter `--finetune`, and the pre-training weight loading parameter `--load` so that fine-tuning starts from the first step; also modify the tokenizer parameters.*
Modified as follows:
@ -212,8 +221,6 @@ Here's a hardware summary of pre-training Qwen1.5-7B:
--load ${CKPT_PATH} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
@ -256,7 +263,6 @@ bash examples/qwen15/generate_qwen15_7b_ptd.sh
**Note**: If using multi-machine training, set up data sharing between machines so that non-master nodes can read the data generated by the master node, or directly copy that data from the master node to the non-master nodes.
Some inference samples are as follows:
![Inference](../../sources/images/qwen15/qwen1.5_7b_inference.png)
## Evaluation
@ -287,3 +293,277 @@ bash examples/qwen15/evaluate_qwen15_7b_ptd.sh
| Task | Subset | Question | OpenSource | NPU |
|:-----:|:------:|:--------:|:-------------------------------------------------------------:|:-------:|
| MMLU | 57 | 14042 | [61.0](https://qwenlm.github.io/zh/blog/qwen1.5) | 60.3 |
# Qwen1.5-14B
## Training
Here's a hardware summary of pre-training Qwen1.5-14B:
| Hardware | Value |
| :------: | :---------------------------------------------: |
| NPU | 8 x Ascend NPUs |
### Script
1. Clone the repository to your local server:
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Build environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.2.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.2.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip install -e .
cd ..
# install other packages
pip install -r requirements.txt
```
**Note**: transformers version 4.37.0 or higher is required.
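The following commands are one way to check and, if needed, upgrade the installed version (illustrative; adjust the package index to your environment):
```bash
# show the currently installed transformers version
pip show transformers | grep Version
# upgrade if it is below 4.37.0
pip install "transformers>=4.37.0"
```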
3. Prepare pretrained weights and tokenizer
Download the Qwen1.5-14B checkpoint from [here](https://huggingface.co/Qwen/Qwen1.5-14B/tree/main)
```bash
mkdir ./model_from_hf/Qwen1.5-14B/
cd ./model_from_hf/Qwen1.5-14B/
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/generation_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/merges.txt
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model.safetensors.index.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/special_tokens_map.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/tokenizer_config.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/vocab.json
wget https://huggingface.co/Qwen/Qwen1.5-14B/blob/main/model-00001-of-00008.safetensors
...
cd ../../
```
4. Weights convert
Convert weights from huggingface format to megatron format
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--make-vocab-size-divisible-by 16 \
--load-dir ./model_from_hf/Qwen1.5-14B/ \
--save-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/Qwen1.5-14B/tokenizer.json \
--add-qkv-bias \
--param-dtype bf16
```
Any Megatron weights with parallel slicing strategy --> HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```shell
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_qwen \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--add-qkv-bias \
--load-dir ./model_weights/Qwen1.5-14B-v0.1-tp8-pp1 \
--save-dir ./model_from_hf/Qwen1.5-14B # Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Qwen1.5-14B/mg2hg/
```
5. Pre-training
5.1 prepare dataset
Download the Qwen1.5-14B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B \
--output-prefix ./dataset/Qwen1.5-14B/alpaca \
--tokenizer-type PretrainedFromHF \
--seq-length 8192 \
--workers 4 \
--log-interval 1000
```
5.2 pre-training
Config Qwen1.5-14B pre-training script: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
TOKENIZER_MODEL="./model_from_hf/Qwen1.5-14B" #tokenizer path
DATA_PATH="./dataset/Qwen1.5-14B/alpaca_text_document" #processed dataset
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
```
Multi-machine training requires the addition of parameter `--overlap-grad-reduce`.
Launch Qwen1.5-14B pre-training script: examples/qwen15/pretrain_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/pretrain_qwen15_14b_ptd.sh
```
**Note**: If using multi-machine training without shared storage configured across the machines, add the parameter `--no-shared-storage`. With this parameter set, whether non-master nodes need to load data is determined from the distributed parameters, and the corresponding cache and generated data are checked. A configuration sketch covering both multi-machine notes is shown below.
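The sketch below assumes two machines and a placeholder master IP; adjust the values to your cluster, and add `--no-shared-storage` only when the machines do not share storage:
```shell
# multi-machine settings in examples/qwen15/pretrain_qwen15_14b_ptd.sh (sketch)
NPUS_PER_NODE=8
MASTER_ADDR=192.168.0.1      # master node IP (placeholder)
MASTER_PORT=6000
NNODES=2                     # number of machines
NODE_RANK=0                  # 0 on the master node, 1, 2, ... on the others
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))

# append the multi-machine flags to GPT_ARGS
GPT_ARGS="$GPT_ARGS --overlap-grad-reduce"
GPT_ARGS="$GPT_ARGS --no-shared-storage"   # only without shared storage
```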
6. fine-tuning
6.1 Prepare fine-tuning dataset
Download the Qwen1.5-14B datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./finetune_dataset/Qwen1.5-14B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Qwen1.5-14B/ \
--output-prefix ./finetune_dataset/Qwen1.5-14B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralInstructionHandler \
--append-eod
```
6.2 Full Parameters Fine-Tuning
The configuration script with the fine-tuning parameters is basically the same as the pre-training script.
*The difference is the dataset, plus the added training parameter `--is-instruction-dataset`, the fine-tuning parameter `--finetune`, and the pre-training weight loading parameter `--load` so that fine-tuning starts from the first step; also modify the tokenizer parameters.*
Modified as follows:
```bash
CKPT_LOAD_DIR="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/"
CKPT_SAVE_DIR="./ckpt/Qwen1.5-14B/"
DATA_PATH="./finetune_dataset/Qwen1.5-14B/alpaca"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/"
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
```
Launch Qwen1.5-14B fine-tuning script: examples/qwen15/tune_qwen15_14b_ptd.sh
```shell
bash examples/qwen15/tune_qwen15_14b_ptd.sh
```
### Performance
#### Machine performance
The performance of Qwen1.5-14B in **Ascend NPU** and **Reference**:
| Device | Model | throughput rate (tokens/s/p) |
|:---------:|:-----------:|:----------------------------:|
| NPUs | Qwen1.5-14B | 1717.8 |
| Reference | Qwen1.5-14B | 1702.2 |
## Inference
Config Qwen1.5-14B inference script: examples/qwen15/generate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify script model path and tokenizer path
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1"
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B"
```
Launch Qwen1.5-14B inference script: examples/qwen15/generate_qwen15_14b_ptd.sh
```bash
bash examples/qwen15/generate_qwen15_14b_ptd.sh
```
**Note**: If using multi-machine training, set up data sharing between machines so that non-master nodes can read the data generated by the master node, or directly copy that data from the master node to the non-master nodes.
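Without shared storage, one simple option is to copy the preprocessed data from the master node (a sketch; the host name `worker1` and the destination path are placeholders):
```bash
# run on the master node, once per non-master node
rsync -av ./dataset/Qwen1.5-14B/ worker1:/path/to/ModelLink/dataset/Qwen1.5-14B/
```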
Some inference samples are as follows:
![Inference](../../sources/images/qwen15/qwen1.5_14b_inference.png)
## Evaluation
We use the [CEval benchmark](https://huggingface.co/datasets/ceval/ceval-exam) and [MMLU benchmark](https://huggingface.co/datasets/cais/mmlu) to evaluate our model.
Config Qwen1.5-14B evaluation script: examples/qwen15/evaluate_qwen15_14b_ptd.sh
```bash
# ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Modify the model parameter path and vocabulary path
TOKENIZER_PATH="./model_from_hf/Qwen1.5-14B/" # vocabulary path
CHECKPOINT="./model_weights/Qwen1.5-14B-v0.1-tp8-pp1/" # parameter path
# Configure the task type and dataset path
DATA_PATH="./mmlu/data/test/" # "./ceval/val/" for ceval task
TASK="mmlu" # "ceval" for ceval task
```
Launch Qwen1.5-14B evaluation
```bash
bash examples/qwen15/evaluate_qwen15_14b_ptd.sh
```
| Task | Subset | Question | OpenSource | NPU |
|:-----:|:------:|:--------:|:------------------------------------------------:|:----:|
| MMLU | 57 | 14042 | [67.6](https://qwenlm.github.io/zh/blog/qwen1.5) | 67.3 |

View File

@ -0,0 +1,68 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# please fill these path configurations
CHECKPOINT="your model ckpt path"
TOKENIZER_PATH="your tokenizer path"
DATA_PATH="your data path"
TASK="mmlu"
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
# Different tasks need different max_new_tokens values; please follow the instructions in the readme.
torchrun $DISTRIBUTED_ARGS evaluation.py \
--task-data-path $DATA_PATH \
--task ${TASK} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--seq-length 8192 \
--max-new-tokens 1 \
--max-position-embeddings 32768 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--disable-bias-linear \
--swiglu \
--position-embedding-type rope \
--load $CHECKPOINT \
--normalization RMSNorm \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
--micro-batch-size 1 \
--exit-on-missing-checkpoint \
--no-load-rng \
--no-load-optim \
--untie-embeddings-and-output-weights \
--add-qkv-bias \
--make-vocab-size-divisible-by 16 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--no-gradient-accumulation-fusion \
--attention-softmax-in-fp32 \
--seed 42 \
--bf16 \
| tee logs/eval_qwen15_14b_${TASK}.log

View File

@ -0,0 +1,64 @@
#!/bin/bash
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations
CHECKPOINT="your model ckpt path"
TOKENIZER_PATH="your tokenizer path"
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6010
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS inference.py \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--num-layers 40 \
--hidden-size 5120 \
--num-attention-heads 40 \
--ffn-hidden-size 13696 \
--max-position-embeddings 32768 \
--seq-length 8192 \
--make-vocab-size-divisible-by 1 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--untie-embeddings-and-output-weights \
--micro-batch-size 1 \
--swiglu \
--disable-bias-linear \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--load ${CHECKPOINT} \
--normalization RMSNorm \
--position-embedding-type rope \
--norm-epsilon 1e-6 \
--hidden-dropout 0 \
--attention-dropout 0 \
--tokenizer-not-use-fast \
--add-qkv-bias \
--max-new-tokens 256 \
--no-gradient-accumulation-fusion \
--exit-on-missing-checkpoint \
--attention-softmax-in-fp32 \
--seed 42 \
--bf16 \
| tee logs/generate_qwen15_14b.log

View File

@ -0,0 +1,101 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# please fill these path configurations
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"
TP=4
PP=2
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--load ${CKPT_LOAD_DIR} \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_MODEL} \
--seq-length 8192 \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--global-batch-size 256 \
--make-vocab-size-divisible-by 1 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--lr 1.25e-6 \
--train-iters 5000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--swiglu \
--use-flash-attn \
--use-fused-rmsnorm \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--use-fused-swiglu \
--use-mc2 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--add-qkv-bias \
--initial-loss-scale 4096 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--seed 42 \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 5000 \
--eval-interval 5000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save ${CKPT_SAVE_DIR} \
| tee logs/train_qwen15_14b.log

View File

@ -0,0 +1,103 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
export WITHOUT_JIT_COMPILE=1
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# please fill these path configurations
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_PATH="your tokenizer path"
TP=4
PP=2
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--load ${CKPT_LOAD_DIR} \
--finetune \
--is-instruction-dataset \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--seq-length 8192 \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--global-batch-size 256 \
--make-vocab-size-divisible-by 16 \
--padded-vocab-size 152064 \
--rotary-base 1000000 \
--lr 1.25e-6 \
--train-iters 5000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rmsnorm \
--swiglu \
--use-flash-attn \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--use-fused-swiglu \
--use-mc2 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--add-qkv-bias \
--initial-loss-scale 4096 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--seed 42 \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 5000 \
--eval-interval 5000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save ${CKPT_SAVE_DIR} \
| tee logs/finetune_qwen15_14b.log

Binary file not shown.
