!1265 Unify readme format (llama3 + mixtral)

Merge pull request !1265 from wucong/dev9
wucong 2024-05-07 02:16:49 +00:00 committed by i-robot
parent 2ae8749f4a
commit 297fe8b01b
4 changed files with 569 additions and 569 deletions


@@ -38,69 +38,69 @@ Hardware configuration for LLAMA3-8B training:
1. Clone the repository to your local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install the AscendSpeed acceleration library
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install the remaining dependencies
pip install -r requirements.txt
```
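Before moving on, it can help to confirm that the NPU stack is importable from Python. This is an optional sanity check, a minimal sketch that assumes the wheels above installed cleanly and that torch_npu exposes the usual `torch.npu` device interface:
```shell
# optional check (sketch): torch and torch_npu import, and at least one NPU is visible
python -c "import torch, torch_npu; print(torch.npu.is_available(), torch.npu.device_count())"
```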
3. Download the LLAMA3-8B [pre-trained weights and vocabulary](https://huggingface.co/unsloth/llama-3-8B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-8b-hf/
cd ./model_from_hf/llama-3-8b-hf/
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/config.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/generation_config.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00001-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00002-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00003-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model-00004-of-00004.safetensors
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/model.safetensors.index.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/special_tokens_map.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/tokenizer.json
wget https://huggingface.co/unsloth/llama-3-8B/blob/main/tokenizer_config.json
cd ../../
```
4. Weight conversion
4.1 Convert the weights from HuggingFace format to Megatron format
***(This scenario is typically used to train an open-source HuggingFace model on Megatron)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -114,53 +114,53 @@ Hardware configuration for LLAMA3-8B training:
--load-dir ./model_from_hf/llama-3-8b-hf/ \
--save-dir ./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/llama-3-8b-hf/tokenizer.json
```
4.2 Convert Megatron weights with any parallel slicing strategy to HuggingFace weights
***(This scenario is typically used to convert a trained Megatron model back to HuggingFace format)***
```shell
# modify the set_env.sh path according to your environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \
--load-dir ./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/llama-3-8b-hf/  # <-- Fill in the original HF model path here; the new weights will be saved in ./model_from_hf/llama-3-8b-hf/mg2hg/
```
Weight conversion applies to pre-training, fine-tuning, inference, and evaluation; adjust the parameters `target-tensor-parallel-size` and `target-pipeline-parallel-size` according to the task.
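As an illustrative aside (not from the upstream docs): the checkpoint directory suffix encodes the slicing, and the product of the two sizes is the number of NPU ranks one model replica occupies, so the values you choose must fit the hardware available for the target task:
```shell
# tp8-pp1, as produced above for LLAMA3-8B: 8 x 1 = 8 ranks per model replica, i.e. one 8-NPU node
TP=8; PP=1
echo "model-parallel ranks per replica: $(( TP * PP ))"   # 8
```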
5. Pre-training
5.1 Prepare the dataset
Download the LLaMA3-8B [dataset](https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download the data
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process the data
mkdir ./dataset/llama-3-8b-hf/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/llama-3-8b-hf/ \
--output-prefix ./dataset/llama-3-8b-hf/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
5.2 Pre-training
Configure the LLaMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
# set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -169,17 +169,17 @@ Hardware configuration for LLAMA3-8B training:
TOKENIZER_MODEL="./model_from_hf/llama-3-8b-hf/" #tokenizer path
DATA_PATH="./dataset/llama-3-8b-hf/alpaca_text_document" #dataset path
CKPT_LOAD_DIR="./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/" #weight path
```
For multi-node runs, add the parameter --overlap-grad-reduce.
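A hedged sketch of where that flag goes, assuming the pre-training script collects its training options in a `GPT_ARGS` variable as the ModelLink example scripts typically do:
```shell
# append the flag to the training arguments inside examples/llama3/pretrain_llama3_8b_ptd.sh (sketch)
GPT_ARGS="${GPT_ARGS} --overlap-grad-reduce"
```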
Launch the LLaMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_8b_ptd.sh
```
**Note**: For multi-node training, set up data sharing across nodes so that non-master nodes can read the data generated by the master node through the shared storage, or simply copy the master node's generated data to the non-master nodes.
### Performance
@@ -293,71 +293,71 @@ Hardware configuration for LLAMA3-70B training:
1. Clone the repository to your local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install the AscendSpeed acceleration library
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install the remaining dependencies
pip install -r requirements.txt
```
3. Download the LLAMA3-70B [pre-trained weights and vocabulary](https://huggingface.co/v2ray/Llama-3-70B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-70b-hf/
cd ./model_from_hf/llama-3-70b-hf/
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/generation_config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00001-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00002-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00003-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00004-of-00030.safetensors
...
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00030-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model.safetensors.index.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/special_tokens_map.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer_config.json
cd ../../
```
4. Weight conversion
4.1 Convert the weights from HuggingFace format to Megatron format
***(This scenario is typically used to train an open-source HuggingFace model on Megatron)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -371,52 +371,52 @@ Hardware configuration for LLAMA3-70B training:
--load-dir ./model_from_hf/llama-3-70b-hf/ \
--save-dir ./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/ \
--tokenizer-model ./model_from_hf/llama-3-70b-hf/tokenizer.json
```
4.2 Convert Megatron weights with any parallel slicing strategy to HuggingFace weights
***(This scenario is typically used to convert a trained Megatron model back to HuggingFace format)***
```shell
# modify the set_env.sh path according to your environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \
--load-dir ./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/llama-3-70b-hf/  # <-- Fill in the original HF model path here; the new weights will be saved in ./model_from_hf/llama-3-70b-hf/mg2hg/
```
Weight conversion applies to pre-training, fine-tuning, inference, and evaluation; adjust the parameters `target-tensor-parallel-size` and `target-pipeline-parallel-size` according to the task.
5. Pre-training
5.1 Prepare the dataset
Download the LLaMA3-70B [dataset](https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download the data
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/blob/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process the data
mkdir ./dataset/llama-3-70b-hf/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/llama-3-70b-hf/ \
--output-prefix ./dataset/llama-3-70b-hf/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
5.2 Pre-training
Configure the LLaMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
# set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -425,17 +425,17 @@ Hardware configuration for LLAMA3-70B training:
TOKENIZER_MODEL="./model_from_hf/llama-3-70b-hf/" #tokenizer path
DATA_PATH="./dataset/llama-3-70b-hf/alpaca_text_document" #dataset path
CKPT_LOAD_DIR="./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/" #weight path
```
For multi-node runs, add the parameter --overlap-grad-reduce.
Launch the LLaMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_70b_ptd.sh
```
**Note**: For multi-node training, set up data sharing across nodes so that non-master nodes can read the data generated by the master node through the shared storage, or simply copy the master node's generated data to the non-master nodes.
### Performance


@@ -97,26 +97,26 @@ Here's a hardware summary of pre-training LLAMA3-8B:
Download the LLAMA3-8B checkpoint from [here](https://huggingface.co/unsloth/llama-3-8B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-8b-hf/
cd ./model_from_hf/llama-3-8b-hf/
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/config.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/generation_config.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00001-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00002-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00003-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model-00004-of-00004.safetensors
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/model.safetensors.index.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/special_tokens_map.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/tokenizer.json
wget https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/tokenizer_config.json
cd ../../
```
4. weight conversion in ptd mode
*Note that if you want to use the weight from huggingface, please run the weight conversion script first. The following uses llama-3-8b model weight conversion in ptd as an example.*
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -164,26 +164,26 @@ Here's a hardware summary of pre-training LLAMA3-8B:
# process datasets
mkdir ./dataset/llama-3-8b-hf/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/llama-3-8b-hf/ \
--output-prefix ./dataset/llama-3-8b-hf/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
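The preprocessing step above writes an indexed dataset whose prefix is exactly what `DATA_PATH` points to in the next step (the `_text_document` suffix comes from the `text` field being packed at document level), so a quick check is:
```shell
ls ./dataset/llama-3-8b-hf/
# expected: alpaca_text_document.bin  alpaca_text_document.idx
```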
5.2 pre-training using ptd mode
Config LLAMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/llama-3-8b-hf/"
TOKENIZER_MODEL="./model_from_hf/llama-3-8b-hf/" #tokenizer path
DATA_PATH="./dataset/llama-3-8b-hf/alpaca_text_document" #processed dataset
CKPT_LOAD_DIR="./model_weights/llama-3-8b-hf-v0.1-tp8-pp1/" #weight path
```
Multi-machine training requires the addition of parameter --overlap-grad-reduce
@@ -191,7 +191,7 @@ Here's a hardware summary of pre-training LLAMA3-8B:
Launch LLAMA3-8B pre-training script: examples/llama3/pretrain_llama3_8b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_8b_ptd.sh
```
**Note**: If using multi-machine training, set up data sharing across machines so that non-primary nodes can read the primary node's data through the shared storage. Alternatively, directly copy the data generated by the master node to the non-master nodes.
@@ -366,28 +366,28 @@ Here's a hardware summary of pre-training LLAMA3-70B:
Download the LLAMA3-70B checkpoint from [here](https://huggingface.co/v2ray/Llama-3-70B/tree/main)
```shell
#!/bin/bash
mkdir ./model_from_hf/llama-3-70b-hf/
cd ./model_from_hf/llama-3-70b-hf/
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/generation_config.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00001-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00002-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00003-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00004-of-00030.safetensors
...
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model-00030-of-00030.safetensors
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/model.safetensors.index.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/special_tokens_map.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer.json
wget https://huggingface.co/v2ray/Llama-3-70B/blob/main/tokenizer_config.json
cd ../../
```
4. weight conversion in ptd mode
*Note that if you want to use the weight from huggingface, please run the weight conversion script first. The following uses llama-3-70b model weight conversion in ptd as an example.*
```bash
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
@@ -435,26 +435,26 @@ Here's a hardware summary of pre-training LLAMA3-70B:
# process datasets
mkdir ./dataset/llama-3-70b-hf/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/llama-3-70b-hf/ \
--output-prefix ./dataset/llama-3-70b-hf/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
5.2 pre-training using ptd mode
Config LLAMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
# modify the script according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
CKPT_SAVE_DIR="./ckpt/llama-3-70b-hf/"
TOKENIZER_MODEL="./model_from_hf/llama-3-70b-hf/" #tokenizer path
DATA_PATH="./dataset/llama-3-70b-hf/alpaca_text_document" #processed dataset
CKPT_LOAD_DIR="./model_weights/llama-3-70b-hf-v0.1-tp8-pp8/" #weight path
```
Multi-machine training requires the addition of parameter --overlap-grad-reduce
@@ -462,7 +462,7 @@ Here's a hardware summary of pre-training LLAMA3-70B:
Launch LLAMA3-70B pre-training script: examples/llama3/pretrain_llama3_70b_ptd.sh
```shell
bash examples/llama3/pretrain_llama3_70b_ptd.sh
```
**Note**: If using multi-machine training, set up data sharing across machines so that non-primary nodes can read the primary node's data through the shared storage. Alternatively, directly copy the data generated by the master node to the non-master nodes.


@@ -38,208 +38,208 @@
1. Clone the repository to your local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install the AscendSpeed acceleration library
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install the remaining dependencies
pip install -r requirements.txt
```
3. Download the Mixtral-8x7B [pre-trained weights and vocabulary](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/tree/main). *(It is recommended to download only the weights in safetensors format.)*
```shell
#!/bin/bash
cd ./model_from_hf/
git lfs install
git clone https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
mv Mixtral-8x7B-v0.1 Mixtral-8x7B
cd ..
```
4. Weight conversion
HuggingFace weights --> Megatron weights with any parallel slicing strategy
***(This scenario is typically used to train an open-source HuggingFace model on Megatron)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# HF to tp1-pp8-ep2
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_hf \
--saver mixtral \
--load-dir ./model_from_hf/Mixtral-8x7B/ \
--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
--tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 8 \
--target-expert-parallel-size 2
```
Megatron weights with any parallel slicing strategy --> Megatron weights with another parallel slicing strategy
***(This scenario is typically used to re-slice the model weights, for example after training on two nodes with 16 NPUs under an EP2-PP8 strategy and then running inference on a single node with 8 NPUs under TP8)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to tp1-pp8-ep1
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
--saver mixtral \
--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 8 \
--target-expert-parallel-size 1
```
Megatron weights with any parallel slicing strategy --> HuggingFace weights
***(This scenario is typically used to convert a trained Megatron model back to HuggingFace format)***
```bash
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to HF
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
--saver mixtral \
--save-model-type huggingface \
--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
--save-dir ./model_from_hf/Mixtral-8x7B/  # <-- Fill in the original HF model path here; the new weights will be saved in ./model_from_hf/Mixtral-8x7B/mg2hg/
```
## Model Training
1. Prepare the dataset
Download the Mixtral-8x7B [dataset](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet)
```shell
# download the data
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process the data
mkdir ./dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
--output-prefix ./dataset/Mixtral-8x7B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
Configure the Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
# set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# configure the vocabulary, dataset, and checkpoint save paths according to your setup
DATA_PATH="./dataset/Mixtral-8x7B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Mixtral-8x7B/"
CKPT_SAVE_DIR="./ckpt/Mixtral-8x7B/"
# configure the distributed parameters according to your cluster
GPUS_PER_NODE=8
MASTER_ADDR="your master node IP"
MASTER_PORT=6000
NNODES=2
NODE_RANK="current node id"
WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
# training parallel strategy
TP=1
PP=8
EP=2
```
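As a rough sanity check on those values (a sketch; the divisibility rules are the usual Megatron constraints rather than anything specific to this script): two nodes of eight NPUs give 16 ranks, the data-parallel size is what remains after TP and PP, and EP has to fit inside it.
```shell
# sketch: DP = WORLD_SIZE / (TP * PP) = 16 / 8 = 2, and EP=2 must divide DP
TP=1; PP=8; EP=2; WORLD_SIZE=16
DP=$(( WORLD_SIZE / (TP * PP) ))
echo "DP=${DP}"                                   # 2
[ $(( DP % EP )) -eq 0 ] && echo "EP=${EP} fits"
```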
Launch the Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
bash examples/mixtral/pretrain_mixtral_8x7b_ptd.sh
```
**Note**: For multi-node training, set up data sharing across nodes so that non-master nodes can read the data generated by the master node through the shared storage, or simply copy the master node's generated data to the non-master nodes.
2. Fine-tuning
Download the fine-tuning dataset from [here](https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl)
```shell
# download the dataset
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl
cd ..
# process the fine-tuning dataset
mkdir ./finetune_dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
--input ./finetune_dataset/Alpaca_data_gpt4_zh.jsonl \
--output-prefix ./finetune_dataset/Mixtral-8x7B/alpaca \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
--append-eod \
--tokenizer-not-use-fast \
--handler-name GeneralInstructionHandler \
--workers 4
```
3. Instruction fine-tuning
The fine-tuning configuration script is basically the same as the pre-training script. *The differences are the dataset and the added training parameter --is-instruction-dataset.*
Add the fine-tuning parameter --finetune and the pre-trained weight loading parameter --load so that fine-tuning starts from the first step.
```bash
DATA_PATH="./finetune_dataset/Mixtral-8x7B/alpaca"
CKPT_PATH="./ckpt/Mixtral-8x7B/"
--load ${CKPT_PATH} \
--finetune \
--is-instruction-dataset
```
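A hedged sketch of how these overrides fit into a copy of the pre-training script, assuming (as in the ModelLink example scripts) that the training options are collected in a `GPT_ARGS` variable; adjust to the script you actually copy:
```shell
# sketch only: in a copy of examples/mixtral/pretrain_mixtral_8x7b_ptd.sh
DATA_PATH="./finetune_dataset/Mixtral-8x7B/alpaca"
CKPT_PATH="./ckpt/Mixtral-8x7B/"
GPT_ARGS="${GPT_ARGS} --load ${CKPT_PATH} --finetune --is-instruction-dataset"
```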
## Model Performance


@@ -38,209 +38,209 @@ Recommended hardware configuration for inference:
1. Clone the code repository to the local server
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout -f bcce6f
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir model_from_hf
mkdir dataset
mkdir ckpt
```
2. Set up the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# Install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the path according to your own ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install AscendSpeed
git clone https://gitee.com/ascend/AscendSpeed.git
cd AscendSpeed
git checkout 224ae35e8fc96778f957029d1371ddb623452a50
pip install -r requirements.txt
pip3 install -e .
cd ..
# install other packages
pip install -r requirements.txt
```
3. Download the pre-trained weights and vocabulary for Mixtral-8x7B from [here](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/tree/main). (It is recommended to only download weights in safetensors format)
```shell
#!/bin/bash
cd ./model_from_hf/
git lfs install
git clone https://huggingface.co/mistralai/Mixtral-8x7B-v0.1
mv Mixtral-8x7B-v0.1 Mixtral-8x7B
cd ..
```
4. Weight conversion
HuggingFace weights --> Megatron weights with any parallel slicing strategy
***(This scenario is generally used to train open-source HuggingFace models on Megatron)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# HF to tp1-pp8-ep2
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_hf \
--saver mixtral \
--load-dir ./model_from_hf/Mixtral-8x7B/ \
--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
--tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 8 \
--target-expert-parallel-size 2
```
Any Megatron weights with parallel slicing strategy --> Any Megatron weights with parallel slicing strategy
***(This scenario is generally used to reconfigure the sliced model weights, such as training on a dual-node 16-card EP2-PP8 strategy, and then wanting to infer on a single-node 8-card TP8)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to tp1-pp8-ep1
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
--saver mixtral \
--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/ \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 8 \
--target-expert-parallel-size 1
```
Any Megatron weights with parallel slicing strategy --> HuggingFace weights
***(This scenario is generally used to convert the trained megatron model back to the HuggingFace format)***
```bash
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# tp1-pp8-ep2 to HF
python tools/checkpoint/convert_ckpt.py \
--model-type GPT \
--loader mixtral_mg \
--saver mixtral \
--save-model-type huggingface \
--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
--save-dir ./model_from_hf/Mixtral-8x7B/ # <-- Fill in the original HF model path here, new weights will be saved in ./model_from_hf/Mixtral-8x7B/mg2hg/
```
## Model-Training
1. Prepare dataset
Download the datasets from [here](https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet), save to ModelLink/dataset/ directory.
```shell
# download datasets
cd ./dataset
wget https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
cd ..
# process datasets
mkdir ./dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
--output-prefix ./dataset/Mixtral-8x7B/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF
```
Configure Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
# Set the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Configure according to the actual vocabulary, dataset, and model parameter save path
DATA_PATH="./dataset/Mixtral-8x7B/alpaca_text_document"
TOKENIZER_MODEL="./model_from_hf/Mixtral-8x7B/"
CKPT_SAVE_DIR="./ckpt/Mixtral-8x7B/"
# Configure distributed parameters according to the actual distributed cluster
GPUS_PER_NODE=8
MASTER_ADDR="your master node IP"
MASTER_PORT=6000
NNODES=2
NODE_RANK="current node id"
WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
# Training parallel strategy
TP=1
PP=8
EP=2
```
Start Mixtral-8x7B pre-training script: ***examples/mixtral/pretrain_mixtral_8x7b_ptd.sh***
```shell
bash examples/mixtral/pretrain_mixtral_8x7b_ptd.sh
```
**Note**: If using multi-machine training, set up data sharing across machines so that non-primary nodes can read the primary node's data through the shared storage. Alternatively, directly copy the data generated by the master node to the non-master nodes.
2. Fine-Tuning
Prepare fine-tuning dataset
Download the fine-tuning datasets from [here](https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl)
```shell
# download datasets
mkdir finetune_dataset
cd ./finetune_dataset
wget https://huggingface.co/datasets/silk-road/alpaca-data-gpt4-chinese/blob/main/Alpaca_data_gpt4_zh.jsonl
cd ..
# process datasets
mkdir ./finetune_dataset/Mixtral-8x7B/
python ./tools/preprocess_data.py \
--input ./finetune_dataset/Alpaca_data_gpt4_zh.jsonl \
--output-prefix ./finetune_dataset/Mixtral-8x7B/alpaca \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ./model_from_hf/Mixtral-8x7B/ \
--append-eod \
--tokenizer-not-use-fast \
--handler-name GeneralInstructionHandler \
--workers 4
```
3. Supervised Fine-Tuning
The configuration script for full-parameter fine-tuning is basically the same as the pre-training script. *The differences are the dataset and the added training parameter --is-instruction-dataset.*
Add the fine-tuning parameter `--finetune` and the pretrained-weight load parameter `--load`, so that fine-tuning starts from the first step.
```shell
DATA_PATH="./finetune_dataset/Mixtral-8x7B/alpaca"
CKPT_PATH="./ckpt/Mixtral-8x7B/"
--load ${CKPT_PATH} \
--finetune \
--is-instruction-dataset
```
## Model-Performance