!1265 Unify README format (llama3 + mixtral)

Merge pull request !1265 from wucong/dev9
wucong 2024-05-07 02:16:49 +00:00 committed by i-robot
parent 2ae8749f4a
commit 297fe8b01b
4 changed files with 569 additions and 569 deletions


@@ -38,69 +38,69 @@ Hardware configuration for LLAMA3-8B training:
1. Clone the repository to the local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install the AscendSpeed acceleration library
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install the remaining dependencies
pip install -r requirements.txt
```
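Before moving on to the model weights, it can help to confirm that torch and torch_npu import cleanly and can see the NPU devices. A minimal sanity check, not part of the original guide (the exact query API may differ between torch_npu releases):
```shell
# optional: verify that torch_npu is installed and an NPU device is visible
python -c "import torch, torch_npu; print(torch_npu.npu.is_available(), torch_npu.npu.device_count())"
```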
3. Download the LLAMA3-8B [pre-trained weights and tokenizer](https://huggingface.co/unsloth/llama-3-8B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-8b-hf/
cd ./model_from_hf/llama-3-8b-hf/
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/config.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/generation_config.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00001-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00002-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00003-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00004-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model.safetensors.index.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/special_tokens_map.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/tokenizer.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/tokenizer_config.json
cd ../../
```
4. Weight conversion
4.1 Convert the weights from huggingface format to megatron format
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -114,53 +114,53 @@ Hardware configuration for LLAMA3-8B training:
    --load-dir ./model_from_hf/llama-3-8b-hf/ \
    --save-dir ./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/llama-3-8b-hf/tokenizer.json
```
4.2 Convert Megatron weights with any parallel slicing strategy into HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```shell
# modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader megatron \
    --saver megatron \
    --save-model-type save_huggingface_llama \
    --load-dir ./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/ \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 1 \
    --save-dir ./model_from_hf/llama-3-8b-hf/   # <-- fill in the original HF model path here; the new weights will be saved in ./model_from_hf/llama-3-8b-hf/mg2hg/
```
Weight conversion applies to pre-training, fine-tuning, inference and evaluation; adjust the parameters `target-tensor-parallel-size` and `target-pipeline-parallel-size` according to the task.
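For example, if a downstream task needs a layout other than the tp8-pp1 checkpoint produced above, the converter can in principle re-slice the Megatron weights by changing only the target sizes. A hypothetical sketch, reusing the flags shown above and assuming `convert_ckpt.py` supports Megatron-to-Megatron re-slicing; the tp4-pp2 layout and output path are only an illustration:
```shell
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader megatron \
    --saver megatron \
    --load-dir ./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/ \
    --target-tensor-parallel-size 4 \
    --target-pipeline-parallel-size 2 \
    --save-dir ./model_weights/llama-3-8b-hf-v0.1-tp4-pp2/   # hypothetical target layout
```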
5. Pre-training
5.1 Prepare the dataset
Download the LLaMA3-8B [dataset](https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download the data
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process the data
mkdir ./dataset/llama-3-8b-hf/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/llama-3-8b-hf/ \
    --output-prefix ./dataset/llama-3-8b-hf/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
```
5.2 Pre-training
Configure the LLaMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
# set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -169,17 +169,17 @@ Hardware configuration for LLAMA3-8B training:
TOKENIZER_MODEL="./model_from_hf/llama-3-8b-hf/" #词表路径 TOKENIZER_MODEL="./model_from_hf/llama-3-8b-hf/" #词表路径
DATA_PATH="./dataset/llama-3-8b-hf/alpaca_text_document" #数据集路径 DATA_PATH="./dataset/llama-3-8b-hf/alpaca_text_document" #数据集路径
CKPT_LOAD_DIR="./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/" #权重路径 CKPT_LOAD_DIR="./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/" #权重路径
``` ```
多机运行增加参数--overlap-grad-reduce 多机运行增加参数--overlap-grad-reduce
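Where exactly the flag goes depends on how the pre-training script assembles its arguments; a sketch, assuming the script collects its training flags in a `GPT_ARGS`-style variable (the variable name is an assumption, not taken from the script):
```shell
# hypothetical: append the flag to the training argument list inside pretrain_llama3_8b_ptd.sh
GPT_ARGS="$GPT_ARGS \
    --overlap-grad-reduce"
```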
Launch the LLaMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_8b_ptd.sh
```
**Note**: For multi-machine training, set up data sharing across machines so that non-primary nodes read the data generated by the primary node through the shared path, or copy that data directly to the non-primary nodes.
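If shared storage is not available, copying the processed dataset from the primary node is usually enough. A sketch, with `worker-1` standing in for a non-primary node's hostname and the destination path assumed to mirror the primary node's layout:
```shell
# run on the primary node after data preprocessing; repeat for each non-primary node
rsync -av ./dataset/llama-3-8b-hf/ worker-1:/path/to/ModelLink/dataset/llama-3-8b-hf/
```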
### Performance
@@ -293,71 +293,71 @@ Hardware configuration for LLAMA3-70B training:
1. Clone the repository to the local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install the AscendSpeed acceleration library
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install the remaining dependencies
pip install -r requirements.txt
```
3. Download the LLAMA3-70B [pre-trained weights and tokenizer](https://huggingface.co/v2ray/Llama-3-70B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-70b-hf/
cd ./model_from_hf/llama-3-70b-hf/
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/generation_config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00001-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00002-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00003-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00004-of-00030.safetensors
...
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00030-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model.safetensors.index.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/special_tokens_map.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer_config.json
cd ../../
```
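The shard list above is abbreviated; the 30 safetensors shards can equivalently be fetched in a loop (a sketch that mirrors the individual wget calls):
```shell
# download model-00001-of-00030.safetensors through model-00030-of-00030.safetensors
for i in $(seq -f "%05g" 1 30); do
    wget "https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-${i}-of-00030.safetensors"
done
```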
4. Weight conversion
4.1 Convert the weights from huggingface format to megatron format
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -371,52 +371,52 @@ Hardware configuration for LLAMA3-70B training:
    --load-dir ./model_from_hf/llama-3-70b-hf/ \
    --save-dir ./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/ \
    --tokenizer-model ./model_from_hf/llama-3-70b-hf/tokenizer.json
```
4.2 Convert Megatron weights with any parallel slicing strategy into HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```shell
# modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader megatron \
    --saver megatron \
    --save-model-type save_huggingface_llama \
    --load-dir ./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/ \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 1 \
    --save-dir ./model_from_hf/llama-3-70b-hf/   # <-- fill in the original HF model path here; the new weights will be saved in ./model_from_hf/llama-3-70b-hf/mg2hg/
```
Weight conversion applies to pre-training, fine-tuning, inference and evaluation; adjust the parameters `target-tensor-parallel-size` and `target-pipeline-parallel-size` according to the task.
5. Pre-training
5.1 Prepare the dataset
Download the LLaMA3-70B [dataset](https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download the data
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process the data
mkdir ./dataset/llama-3-70b-hf/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/llama-3-70b-hf/ \
    --output-prefix ./dataset/llama-3-70b-hf/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
```
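After preprocessing, it is worth confirming that the indexed dataset landed where `DATA_PATH` expects it. A small check; the `.bin`/`.idx` pair is the usual Megatron preprocessing output and is listed here as an assumption:
```shell
ls ./dataset/llama-3-70b-hf/
# expected output (assumption): alpaca_text_document.bin  alpaca_text_document.idx
```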
5.2 Pre-training
Configure the LLaMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
# set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -425,17 +425,17 @@ Hardware configuration for LLAMA3-70B training:
TOKENIZER_MODEL="./model_from_hf/llama-3-70b-hf/" #词表路径 TOKENIZER_MODEL="./model_from_hf/llama-3-70b-hf/" #词表路径
DATA_PATH="./dataset/llama-3-70b-hf/alpaca_text_document" #数据集路径 DATA_PATH="./dataset/llama-3-70b-hf/alpaca_text_document" #数据集路径
CKPT_LOAD_DIR="./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/" #权重路径 CKPT_LOAD_DIR="./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/" #权重路径
``` ```
多机运行增加参数--overlap-grad-reduce 多机运行增加参数--overlap-grad-reduce
启动 LLaMA3-70B 预训练脚本: examples/llama3/pretrain_llama3_70b_ptd.sh 启动 LLaMA3-70B 预训练脚本: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell ```shell
bash examples/llama3/pretrain_llama3_70b_ptd.sh bash examples/llama3/pretrain_llama3_70b_ptd.sh
``` ```
**注意**:如果使用多机训练,需要设置多机数据共享,非主节点通过数据共享读取主节点数据。或者,直接将主节点生成的数据复制到非主节点。 **注意**:如果使用多机训练,需要设置多机数据共享,非主节点通过数据共享读取主节点数据。或者,直接将主节点生成的数据复制到非主节点。
### 性能 ### 性能


@@ -97,26 +97,26 @@ Here's a hardware summary of pre-training LLAMA3-8B:
Download the LLAMA3-8B checkpoint from [here](https://huggingface.co/unsloth/llama-3-8B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-8b-hf/
cd ./model_from_hf/llama-3-8b-hf/
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/config.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/generation_config.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00001-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00002-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00003-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00004-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model.safetensors.index.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/special_tokens_map.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/tokenizer.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/tokenizer_config.json
cd ../../
```
4. weight conversion in ptd mode
*Note that if you want to use the weight from huggingface, please run the weight conversion script first. The following uses llama-3-8b model weight conversion in ptd as an example.*
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -164,26 +164,26 @@ Here's a hardware summary of pre-training LLAMA3-8B:
# process datasets
mkdir ./dataset/llama-3-8b-hf/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/llama-3-8b-hf/ \
    --output-prefix ./dataset/llama-3-8b-hf/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
```
5.2 pre-training using ptd mode
Config LLAMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/llama-3-8b-hf/"
TOKENIZER_MODEL="./model_from_hf/llama-3-8b-hf/"  # tokenizer path
DATA_PATH="./dataset/llama-3-8b-hf/alpaca_text_document"  # processed dataset
CKPT_LOAD_DIR="./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/"  # weight path
```
Multi-machine training requires the addition of the parameter `--overlap-grad-reduce`.
@@ -191,7 +191,7 @@ Here's a hardware summary of pre-training LLAMA3-8B:
Launch LLAMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_8b_ptd.sh
```
**Note**: If using multi-machine training, you need to set up multi-machine data sharing so that non-primary nodes can read the primary node's data through the shared path. Alternatively, directly copy the data generated by the primary node to the non-primary nodes.
@@ -366,28 +366,28 @@ Here's a hardware summary of pre-training LLAMA3-70B:
Download the LLAMA3-70B checkpoint from [here](https://huggingface.co/v2ray/Llama-3-70B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-70b-hf/
cd ./model_from_hf/llama-3-70b-hf/
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/generation_config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00001-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00002-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00003-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00004-of-00030.safetensors
...
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00030-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model.safetensors.index.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/special_tokens_map.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer_config.json
cd ../../
```
4. weight conversion in ptd mode
*Note that if you want to use the weight from huggingface, please run the weight conversion script first. The following uses llama-3-70b model weight conversion in ptd as an example.*
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -435,26 +435,26 @@ Here's a hardware summary of pre-training LLAMA3-70B:
# process datasets
mkdir ./dataset/llama-3-70b-hf/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/llama-3-70b-hf/ \
    --output-prefix ./dataset/llama-3-70b-hf/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
```
5.2 pre-training using ptd mode
Config LLAMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/llama-3-70b-hf/"
TOKENIZER_MODEL="./model_from_hf/llama-3-70b-hf/"  # tokenizer path
DATA_PATH="./dataset/llama-3-70b-hf/alpaca_text_document"  # processed dataset
CKPT_LOAD_DIR="./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/"  # weight path
```
Multi-machine training requires the addition of the parameter `--overlap-grad-reduce`.
@@ -462,7 +462,7 @@ Here's a hardware summary of pre-training LLAMA3-70B:
Launch LLAMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_70b_ptd.sh
```
**Note**: If using multi-machine training, you need to set up multi-machine data sharing so that non-primary nodes can read the primary node's data through the shared path. Alternatively, directly copy the data generated by the primary node to the non-primary nodes.


@@ -38,208 +38,208 @@
1. Clone the repository to the local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install the AscendSpeed acceleration library
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install the remaining dependencies
pip install -r requirements.txt
```
3. Download the Mixtral-8x7B [pre-trained weights and tokenizer](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/tree/main). *(It is recommended to download and use only the weights in safetensors format.)*
```shell
#!/bin/bash
cd ./model_from_hf/
git lfs install
git clone https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
mv Mixtral-8x7B-v0.1 Mixtral-8x7B
cd ..
```
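Because the Mixtral weights are large LFS objects, it can be worth checking that the clone pulled the actual safetensors shards rather than LFS pointer stubs. A sketch using generic git-lfs commands, not part of the original guide:
```shell
cd ./model_from_hf/Mixtral-8x7B
git lfs ls-files   # a '*' next to a file means its content was actually downloaded
du -sh .           # the total should be on the order of the full bf16 checkpoint, not a few KB
cd ../..
```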
4. Weight conversion
HuggingFace weights --> Megatron weights with any parallel slicing strategy
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# HF to tp1-pp8-ep2
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_hf \
    --saver mixtral \
    --load-dir ./model_from_hf/Mixtral-8x7B/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
    --tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 8 \
    --target-expert-parallel-size 2
```
Megatron weights with any parallel slicing strategy --> Megatron weights with any parallel slicing strategy
***(This scenario is generally used to reconfigure the sliced model weights, for example after training on a dual-node 16-card EP2-PP8 strategy and then wanting to run inference on a single-node 8-card TP8 setup)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to tp1-pp8-ep1
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_mg \
    --saver mixtral \
    --load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/ \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 8 \
    --target-expert-parallel-size 1
```
Megatron weights with any parallel slicing strategy --> HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to HF
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_mg \
    --saver mixtral \
    --save-model-type huggingface \
    --load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
    --save-dir ./model_from_hf/Mixtral-8x7B/   # <-- fill in the original HF model path here; the new weights will be saved in ./model_from_hf/Mixtral-8x7B/mg2hg/
```
## Model Training
1. Prepare the dataset
Download the Mixtral-8x7B [dataset](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download the data
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process the data
mkdir ./dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
    --output-prefix ./dataset/Mixtral-8x7B/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
```
Configure the Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
# set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# configure the tokenizer, dataset, and checkpoint save paths according to your setup
DATA_PATH="./dataset/Mixtral-8x7B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Mixtral-8x7B/"
CKPT_SAVE_DIR="./ckpt/Mixtral-8x7B/"
# configure the distributed parameters according to your cluster
GPUS_PER_NODE=8
MASTER_ADDR="your master node IP"
MASTER_PORT=6000
NNODES=2
NODE_RANK="current node id"
WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
# training parallel strategy
TP=1
PP=8
EP=2
```
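Of the distributed variables above, only `NODE_RANK` needs to differ between the two machines of a 16-card run; `MASTER_ADDR` points at the same master node on both. A sketch with a placeholder IP:
```shell
# on the master node
MASTER_ADDR=192.168.0.1   # placeholder: the master node's IP, identical on both machines
NODE_RANK=0
# on the second node
MASTER_ADDR=192.168.0.1
NODE_RANK=1
```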
Launch the Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
bash examples/mixtral/pretrain_mixtral_8x7b_ptd.sh
```
**Note**: For multi-machine training, set up data sharing across machines so that non-primary nodes read the data generated by the primary node through the shared path, or copy that data directly to the non-primary nodes.
2. Fine-tuning
Download the fine-tuning dataset from [here](https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl)
```shell
# download the dataset
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl
cd ..
# process the fine-tuning dataset
mkdir ./finetune_dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
    --input ./finetune_dataset/Alpaca_data_gpt4_zh.jsonl \
    --output-prefix ./finetune_dataset/Mixtral-8x7B/alpaca \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
    --append-eod \
    --tokenizer-not-use-fast \
    --handler-name GeneralInstructionHandler \
    --workers 4
```
3. Instruction fine-tuning
The fine-tuning configuration script is basically the same as the pre-training script. *The differences are the dataset and the added training parameter `--is-instruction-dataset`.*
Add the fine-tuning parameter `--finetune` and the pretrained-weight load parameter `--load` so that fine-tuning starts from the first step.
```bash
DATA_PATH="./finetune_dataset/Mixtral-8x7B/alpaca"
CKPT_PATH="./ckpt/Mixtral-8x7B/"
--load ${CKPT_PATH} \
--finetune \
--is-instruction-dataset
```
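How these fragments are wired in depends on how the script builds its argument list; one possible reading, assuming the pre-training script gathers its flags into a single variable (the `GPT_ARGS` name is an assumption, not taken from the script):
```shell
# hypothetical: inside a copy of pretrain_mixtral_8x7b_ptd.sh adapted for fine-tuning
DATA_PATH="./finetune_dataset/Mixtral-8x7B/alpaca"
CKPT_PATH="./ckpt/Mixtral-8x7B/"
GPT_ARGS="$GPT_ARGS \
    --load ${CKPT_PATH} \
    --finetune \
    --is-instruction-dataset"
```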
## Model Performance


@@ -38,209 +38,209 @@ Recommended hardware configuration for inference:
1. Clone the code repository to the local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# Install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the path according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install other packages
pip install -r requirements.txt
```
3. Download the pre-trained weights and vocabulary for Mixtral-8x7B from [here](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/tree/main). (It is recommended to only download weights in safetensors format)
```shell
#!/bin/bash
cd ./model_from_hf/
git lfs install
git clone https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
mv Mixtral-8x7B-v0.1 Mixtral-8x7B
cd ..
```
4. Weight conversion
HuggingFace weights --> Megatron weights with any parallel slicing strategy
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# HF to tp1-pp8-ep2
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_hf \
    --saver mixtral \
    --load-dir ./model_from_hf/Mixtral-8x7B/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
    --tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 8 \
    --target-expert-parallel-size 2
```
Any Megatron weights with parallel slicing strategy --> Any Megatron weights with parallel slicing strategy
***(This scenario is generally used to reconfigure the sliced model weights, such as training on a dual-node 16-card EP2-PP8 strategy, and then wanting to infer on a single-node 8-card TP8)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to tp1-pp8-ep1
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_mg \
    --saver mixtral \
    --load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/ \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 8 \
    --target-expert-parallel-size 1
```
Any Megatron weights with parallel slicing strategy --> HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to HF
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_mg \
    --saver mixtral \
    --save-model-type huggingface \
    --load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
    --save-dir ./model_from_hf/Mixtral-8x7B/   # <-- Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Mixtral-8x7B/mg2hg/
```
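A quick way to confirm that the converted HuggingFace checkpoint in mg2hg/ is readable is to load its config with transformers (a minimal check, assuming transformers is installed in the environment):
```shell
python -c "from transformers import AutoConfig; print(AutoConfig.from_pretrained('./model_from_hf/Mixtral-8x7B/mg2hg/'))"
```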
## Model-Training
1. Prepare dataset
Download the datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet) and save them to the ModelLink/dataset/ directory.
```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
    --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
    --tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
    --output-prefix ./dataset/Mixtral-8x7B/alpaca \
    --workers 4 \
    --log-interval 1000 \
    --tokenizer-type PretrainedFromHF
```
Configure Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
# Set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Configure according to the actual vocabulary, dataset, and model parameter save path
DATA_PATH="./dataset/Mixtral-8x7B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Mixtral-8x7B/"
CKPT_SAVE_DIR="./ckpt/Mixtral-8x7B/"
# Configure distributed parameters according to the actual distributed cluster
GPUS_PER_NODE=8
MASTER_ADDR="your master node IP"
MASTER_PORT=6000
NNODES=2
NODE_RANK="current node id"
WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
# Training parallel strategy
TP=1
PP=8
EP=2
```
Start Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
bash examples/mixtral/pretrain_mixtral_8x7b_ptd.sh
```
**Note**: If using multi-machine training, you need to set up multi-machine data sharing so that non-primary nodes can read the primary node's data through the shared path. Alternatively, directly copy the data generated by the primary node to the non-primary nodes.
2. Fine-Tuning
Prepare fine-tuning dataset
Download the fine-tuning datasets from [here](https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl)
```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl
cd ..
# process datasets
mkdir ./finetune_dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
    --input ./finetune_dataset/Alpaca_data_gpt4_zh.jsonl \
    --output-prefix ./finetune_dataset/Mixtral-8x7B/alpaca \
    --tokenizer-type PretrainedFromHF \
    --tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
    --append-eod \
    --tokenizer-not-use-fast \
    --handler-name GeneralInstructionHandler \
    --workers 4
```
3. Supervised Fine-Tuning
The configuration script for full-parameter fine-tuning is basically the same as the pre-training script. *The difference is the dataset and the added training parameter `--is-instruction-dataset`.*
Add the fine-tuning parameter `--finetune` and the pretrained-weight load parameter `--load`, so that fine-tuning starts from the first step.
```shell
DATA_PATH="./finetune_dataset/Mixtral-8x7B/alpaca"
CKPT_PATH="./ckpt/Mixtral-8x7B/"
--load ${CKPT_PATH} \
--finetune \
--is-instruction-dataset
```
## Model-Performance