mirror of https://gitee.com/ascend/ModelLink.git
synced 2024-12-05 05:17:40 +08:00
!1285 Update the mixtral-moe model to 32K
Merge pull request !1285 from fengliangjun/master
This commit is contained in:
parent cf6e8f4a9c
commit 4a683f8dbe
821 README_en.md
@@ -24,9 +24,9 @@
 
 Minimum hardware configuration for training:
 
-| Hardware | Configuration |
-| :--: | :--------------: |
-| NPU | 16 x Ascend NPUs |
+| Hardware | Configuration |
+| :--: |:----------------:|
+| NPU | 32 x Ascend NPUs |
 
 Recommended hardware configuration for inference:

@@ -104,15 +104,15 @@
 --loader mixtral_hf \
 --saver mixtral \
 --load-dir ./model_from_hf/Mixtral-8x7B/ \
---save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
+--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
 --tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
---target-tensor-parallel-size 1 \
---target-pipeline-parallel-size 8 \
---target-expert-parallel-size 2
+--target-tensor-parallel-size 8 \
+--target-pipeline-parallel-size 4 \
+--target-expert-parallel-size 1
 ```
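For readers skimming the diff without the full file, this fragment belongs to a checkpoint-conversion invocation roughly like the sketch below. The `convert_ckpt.py` entry-point name is an assumption (only the flags appear in the hunk); the flag values are the new tp8-pp4-ep1 side.

```bash
# Hedged sketch of the complete HF -> Megatron conversion edited above.
# The script name is assumed; the flags and values come from the diff.
python convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_hf \
    --saver mixtral \
    --load-dir ./model_from_hf/Mixtral-8x7B/ \
    --save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
    --tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
    --target-tensor-parallel-size 8 \
    --target-pipeline-parallel-size 4 \
    --target-expert-parallel-size 1
```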
 
 Megatron weights with any parallel slicing strategy --> Megatron weights with any parallel slicing strategy
-***(This scenario is typically used to re-slice already-converted model weights, e.g. you finished training on 2 nodes / 16 cards with the EP2-PP8 strategy and want to run inference on a single node / 8 cards with TP8)***
+***(This scenario is typically used to re-slice already-converted model weights, e.g. you finished training on 4 nodes / 32 cards with the TP8-PP4 strategy and want to run inference on a single node / 8 cards with TP8)***
 
 ```bash
 # Modify the ascend-toolkit path
@@ -123,10 +123,10 @@
 --model-type GPT \
 --loader mixtral_mg \
 --saver mixtral \
---load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
---save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/ \
---target-tensor-parallel-size 1 \
---target-pipeline-parallel-size 8 \
+--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
+--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1/ \
+--target-tensor-parallel-size 8 \
+--target-pipeline-parallel-size 1 \
 --target-expert-parallel-size 1
 ```

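Before running inference on re-sliced weights, the target layout has to tile the device count. A minimal sanity-check sketch, assuming the standard Megatron relation WORLD_SIZE = TP × PP × DP:

```bash
# Sketch: verify the target slicing tiles the available devices.
TP=8; PP=1                  # target layout produced above (tp8-pp1-ep1)
WORLD_SIZE=8                # one node with 8 cards
if (( WORLD_SIZE % (TP * PP) != 0 )); then
    echo "error: TP*PP=$((TP * PP)) does not divide WORLD_SIZE=${WORLD_SIZE}" >&2
    exit 1
fi
echo "data-parallel size: $((WORLD_SIZE / (TP * PP)))"   # -> 1
```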
@@ -143,7 +143,7 @@
 --loader mixtral_mg \
 --saver mixtral \
 --save-model-type huggingface \
---load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
+--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
 --save-dir ./model_from_hf/Mixtral-8x7B/ # <-- fill in the original HF model path; the new weights will be stored in ./model_from_hf/Mixtral-8x7B/mg2hg/
 ```

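Assembled the same way, the Megatron -> HF export that this hunk edits reads roughly as below; as before, the entry-point name is an assumption and the flags come from the diff.

```bash
# Hedged sketch of the full Megatron -> HF export.
python convert_ckpt.py \
    --model-type GPT \
    --loader mixtral_mg \
    --saver mixtral \
    --save-model-type huggingface \
    --load-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
    --save-dir ./model_from_hf/Mixtral-8x7B/   # output lands in ./model_from_hf/Mixtral-8x7B/mg2hg/
```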
@@ -184,14 +184,14 @@
 GPUS_PER_NODE=8
 MASTER_ADDR="your master node IP"
 MASTER_PORT=6000
-NNODES=2
+NNODES=4
 NODE_RANK="current node id"
 WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
 
 # Training parallel strategy
-TP=1
-PP=8
-EP=2
+TP=8
+PP=4
+EP=1
 ```
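An editorial note on what the new values imply, assuming the usual relation WORLD_SIZE = TP × PP × DP: the tp8-pp4 strategy devotes all 32 cards to model parallelism, leaving a single data-parallel replica, which is consistent with the much smaller global batch size set in the script diff further down.

```bash
# Arithmetic behind the new strategy (values taken from the hunk above):
GPUS_PER_NODE=8; NNODES=4; TP=8; PP=4
WORLD_SIZE=$((GPUS_PER_NODE * NNODES))    # 32 cards in total
echo "DP = $((WORLD_SIZE / (TP * PP)))"   # -> 1: every card holds a model shard
```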
 
 Start the Mixtral-8x7B pre-training script: ***examples/pretrain_mixtral_8x7b_ptd.sh***
@@ -245,13 +245,12 @@
 
 ### Throughput
 
-Performance comparison of Mixtral-8x7B on **Ascend chips** and **reference chips**, on 2 nodes / 16 cards (ep2 pp8):
-*(With enough nodes, a larger EP gives higher throughput; this is not peak performance, for reference only)*
+Performance comparison of Mixtral-8x7B on **Ascend chips** and **reference chips**, on 4 nodes / 32 cards (tp8 pp4):
 
 | Device | Model | Iterations | Sample throughput (samples/step) | Token throughput (tokens/s/p) | Per-step iteration time (s/step) |
-| :--: | :----------: | :----: | :---------------------: | :---------------------: | :-------------------: |
-| NPUs | Mixtral-8x7B | 1000 | 4.11 | 1053.6 | 31.13 |
-| Reference | Mixtral-8x7B | 1000 | 4.45 | 1139.3 | 28.76 |
+| :--: | :----------: | :----: |:-------------------:|:---------------------:|:---------------:|
+| NPUs | Mixtral-8x7B | 1000 | 0.47 | 487 | 16.81 |
+| Reference | Mixtral-8x7B | 1000 | 0.59 | 610 | 13.41 |
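A consistency check on the new rows (editorial; it assumes the sample-throughput column is effectively samples per second, and takes the 32K sequence length and global batch size 8 from the script diffs below):

$$
\frac{0.59 \times 32768}{32} \approx 604 \;\text{tokens/s/p}, \qquad \frac{8}{0.59} \approx 13.6 \;\text{s/step},
$$

both within rounding of the reference row (610 tokens/s/p, 13.41 s/step); the NPU row reconciles the same way.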
 
 ## Model inference
 
@@ -301,7 +300,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
 # Modify the model parameter path and tokenizer path
 TOKENIZER_PATH="./model_from_hf/Mixtral-8x7B/" # tokenizer path
-CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1" # model path
+CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1" # model path
 # Configure the task and dataset paths
 DATA_PATH="./mmlu/test/"
 TASK="mmlu"
@@ -24,9 +24,9 @@
 
 Minimum hardware requirements for training:
 
-| Hardware | Configuration |
-| :------: | :--------------: |
-| NPU | 16 x Ascend NPUs |
+| Hardware | Configuration |
+| :------: |:----------------:|
+| NPU | 32 x Ascend NPUs |
 
 Recommended hardware configuration for inference:

@@ -105,11 +105,11 @@ Recommended hardware configuration for inference:
 --loader mixtral_hf \
 --saver mixtral \
 --load-dir ./model_from_hf/Mixtral-8x7B/ \
---save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
+--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
 --tokenizer-model ./model_from_hf/Mixtral-8x7B/tokenizer.model \
---target-tensor-parallel-size 1 \
---target-pipeline-parallel-size 8 \
---target-expert-parallel-size 2
+--target-tensor-parallel-size 8 \
+--target-pipeline-parallel-size 4 \
+--target-expert-parallel-size 1
 ```

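One constraint worth keeping in mind when choosing `--target-expert-parallel-size` (an editorial aside, assuming the usual MoE sharding rule that the expert-parallel degree must divide the per-layer expert count, which is 8 for Mixtral-8x7B):

```bash
# Sketch: EP must evenly divide the number of experts per layer.
NUM_EXPERTS=8
for EP in 1 2 4 8; do
    (( NUM_EXPERTS % EP == 0 )) && echo "EP=${EP} is a valid expert-parallel degree"
done
```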
 Any Megatron weights with parallel slicing strategy --> Any Megatron weights with parallel slicing strategy

@@ -124,10 +124,10 @@ Recommended hardware configuration for inference:
 --model-type GPT \
 --loader mixtral_mg \
 --saver mixtral \
---load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
---save-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/ \
---target-tensor-parallel-size 1 \
---target-pipeline-parallel-size 8 \
+--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
+--save-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1/ \
+--target-tensor-parallel-size 8 \
+--target-pipeline-parallel-size 1 \
 --target-expert-parallel-size 1
 ```

@@ -144,7 +144,7 @@ Recommended hardware configuration for inference:
 --loader mixtral_mg \
 --saver mixtral \
 --save-model-type huggingface \
---load-dir ./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep2/ \
+--load-dir ./model_weights/Mixtral-8x7B-v0.1-tp8-pp4-ep1/ \
 --save-dir ./model_from_hf/Mixtral-8x7B/ # <-- Fill in the original HF model path here; new weights will be saved in ./model_from_hf/Mixtral-8x7B/mg2hg/
 ```

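After the export finishes, the mg2hg folder should be usable as an ordinary HF model directory. A quick, hedged check (exact file names depend on the converter, so treat this as an expectation rather than a guarantee):

```bash
# Expect an HF-style layout: a config plus tokenizer files and weight shards.
ls ./model_from_hf/Mixtral-8x7B/mg2hg/
# e.g. config.json and *.bin or *.safetensors shards
```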
@@ -185,14 +185,14 @@ Recommended hardware configuration for inference:
 GPUS_PER_NODE=8
 MASTER_ADDR="your master node IP"
 MASTER_PORT=6000
-NNODES=2
+NNODES=4
 NODE_RANK="current node id"
 WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
 
 # Training parallel strategy
-TP=1
-PP=8
-EP=2
+TP=8
+PP=4
+EP=1
 ```
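These variables typically feed a `torchrun` launch inside the script; a hedged sketch of the wiring (the real entry point and model arguments live in examples/pretrain_mixtral_8x7b_ptd.sh, and `pretrain_gpt.py` here is a placeholder):

```bash
# Hedged sketch: how the distributed variables are commonly consumed.
DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py ...   # placeholder for the real arguments
```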
 
 Start Mixtral-8x7B pre-training script: ***examples/pretrain_mixtral_8x7b_ptd.sh***
@@ -246,13 +246,12 @@ Recommended hardware configuration for inference:
 
 ### Throughput
 
-Comparison of Mixtral-8x7B performance on 2 nodes with 16 chips (ep2 pp8):
-**(With enough nodes, a larger EP yields higher throughput; this is not peak performance, just for reference)**
+Comparison of Mixtral-8x7B performance on 4 nodes with 32 chips (tp8 pp4):
 
 | Device | Model | Iterations | Sample Throughput (samples/step) | Tokens Throughput (tokens/s/p) | Single Step Iteration Time (s/step) |
-| :-------: | :----------: | :--------: | :------------------------------: | :----------------------------: | :---------------------------------: |
-| NPUs | Mixtral-8x7B | 1000 | 3.13 | 1053.63 | 31.13 |
-| Reference | Mixtral-8x7B | 1000 | 4.45 | 1139.3 | 28.76 |
+| :-------: | :----------: | :--------: |:--------------------------------:|:------------------------------:|:-----------------------------------:|
+| NPUs | Mixtral-8x7B | 1000 | 0.47 | 487 | 16.81 |
+| Reference | Mixtral-8x7B | 1000 | 0.59 | 610 | 13.41 |
 
 ## Model-Inference
 
@@ -263,7 +262,7 @@ First, configure the inference script: ***examples/mixtral/generate_mixtral_8x7b
 source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
 # Modify the model weight path and tokenizer path
-CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1/"
+CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1/"
 TOKENIZER_MODEL="./model_from_hf/Mixtral-8x7B/"
 
 # Modify the parallel configuration according to the actually loaded model weights
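The last comment above means the script's parallel settings must mirror the slicing encoded in the checkpoint directory name; a minimal sketch for the tp8-pp1-ep1 weights loaded here (the variable names are assumptions about the script's internals):

```bash
# Parallel settings matching the tp8-pp1-ep1 checkpoint above (sketch).
TP=8
PP=1
EP=1
# A mismatch typically fails at load time with a shape or rank-count error.
```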
@@ -302,7 +301,7 @@ source /usr/local/Ascend/ascend-toolkit/set_env.sh
 
 # Modify the model parameter path and tokenizer path
 TOKENIZER_PATH="./model_from_hf/Mixtral-8x7B/" # tokenizer path
-CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp1-pp8-ep1" # model path
+CHECKPOINT="./model_weights/Mixtral-8x7B-v0.1-tp8-pp1-ep1" # model path
 
 # Configure tasks and dataset paths
 DATA_PATH="./mmlu/data/test/"
@@ -41,7 +41,7 @@ GPT_ARGS="
 --num-query-groups 8 \
 --tokenizer-type PretrainedFromHF \
 --tokenizer-name-or-path ${TOKENIZER_PATH} \
---seq-length 4096 \
+--seq-length 32768 \
 --max-position-embeddings 32768 \
 --micro-batch-size 1 \
 --make-vocab-size-divisible-by 1 \
@@ -38,7 +38,7 @@ GPT_ARGS="
 --num-query-groups 8 \
 --tokenizer-type PretrainedFromHF \
 --tokenizer-name-or-path ${TOKENIZER_MODEL} \
---seq-length 4096 \
+--seq-length 32768 \
 --max-position-embeddings 32768 \
 --micro-batch-size 1 \
 --make-vocab-size-divisible-by 1 \
@@ -6,7 +6,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 GPUS_PER_NODE=8
 MASTER_ADDR="your master node IP"
 MASTER_PORT=6000
-NNODES=2
+NNODES=4
 NODE_RANK=0
 WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES))
 
@@ -26,9 +26,9 @@ CKPT_SAVE_DIR="your model save ckpt path"
 CKPT_LOAD_DIR="your model ckpt path"
 
 
-TP=1
-PP=8
-EP=2
+TP=8
+PP=4
+EP=1
 NUM_LAYERS=32
 
 MOE_ARGS="
@@ -56,13 +56,13 @@ GPT_ARGS="
 --num-query-groups 8 \
 --tokenizer-type PretrainedFromHF \
 --tokenizer-name-or-path ${TOKENIZER_MODEL} \
---seq-length 4096 \
+--seq-length 32768 \
 --max-position-embeddings 32768 \
 --micro-batch-size 1 \
---global-batch-size 128 \
+--global-batch-size 8 \
 --make-vocab-size-divisible-by 1 \
 --lr 1.25e-6 \
---train-iters 1000 \
+--train-iters 2000 \
 --lr-decay-style cosine \
 --untie-embeddings-and-output-weights \
 --disable-bias-linear \
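A side observation grounded purely in this hunk's numbers: the sequence-length and batch-size changes halve the token budget per optimizer step, while the doubled `--train-iters` keeps the total training-token budget unchanged.

```bash
# Tokens per optimizer step, old vs new (values from this hunk):
echo $((4096 * 128))    # -> 524288 tokens/step before
echo $((32768 * 8))     # -> 262144 tokens/step after (half as many)
# --train-iters goes 1000 -> 2000, so the totals match:
echo $((524288 * 1000)) $((262144 * 2000))   # -> 524288000 524288000
```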
Binary image file changed (not shown): 14 KiB before, 19 KiB after.