From bf0ebba09d58a9f5e11ab12bb655f27a6fe0c8b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=95=86=E5=85=83=E4=B9=89?= Date: Tue, 24 Sep 2024 14:40:52 +0000 Subject: [PATCH] =?UTF-8?q?!1707=20=E6=B7=BB=E5=8A=A0=E6=96=B0=E6=A8=A1?= =?UTF-8?q?=E5=9E=8BQwen2-57B-A14B=20Merge=20pull=20request=20!1707=20from?= =?UTF-8?q?=20=E5=95=86=E5=85=83=E4=B9=89/master?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 21 ++- convert_ckpt.py | 2 +- .../qwen2/ckpt_convert_qwen2_hf2mcore.sh | 2 +- .../mcore/qwen2/pretrain_qwen2_72b_4K_ptd.sh | 1 + .../ckpt_convert_qwen2_moe_hf2mcore.sh | 19 +++ .../ckpt_convert_qwen2_moe_mcore2hf.sh | 17 ++ .../data_convert_qwen2_moe_pretrain.sh | 13 ++ .../data_convert_qwen2_moe_pretrain_32K.sh | 12 ++ .../data_convert_qwen2_moe_pretrain_4K.sh | 11 ++ .../qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh | 85 ++++++++++ .../qwen2_moe/generate_qwen2_57b_a14b_ptd.sh | 88 +++++++++++ .../pretrain_qwen2_57b_a14b_32K_ptd.sh | 146 ++++++++++++++++++ .../pretrain_qwen2_57b_a14b_4K_ptd.sh | 140 +++++++++++++++++ modellink/arguments.py | 11 +- modellink/checkpointing.py | 3 +- modellink/core/transformer/moe/moe_layer.py | 15 ++ .../core/transformer/transformer_layer.py | 3 + modellink/tasks/checkpoint/loader_hf.py | 4 + modellink/tasks/checkpoint/loader_mg.py | 4 + modellink/tasks/checkpoint/model_cfg.json | 41 +++++ modellink/tasks/checkpoint/models.py | 11 +- modellink/tasks/checkpoint/saver.py | 5 + 22 files changed, 642 insertions(+), 12 deletions(-) create mode 100644 examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_hf2mcore.sh create mode 100644 examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_mcore2hf.sh create mode 100644 examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain.sh create mode 100644 examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_32K.sh create mode 100644 examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_4K.sh create mode 100644 examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh create mode 100644 examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh create mode 100644 examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32K_ptd.sh create mode 100644 examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4K_ptd.sh diff --git a/README.md b/README.md index be8f59fd..8f908398 100644 --- a/README.md +++ b/README.md @@ -596,8 +596,7 @@ ModelLink 通过模型并行与数据并行来训练大语言模型,为了演 【GTS】 【Test】 - - Qwen2 + Qwen2 0.5B 4K Mcore @@ -655,6 +654,16 @@ ModelLink 通过模型并行与数据并行来训练大语言模型,为了演 【GTS】 【Test】 + 57B-A14B + 4K + Mcore + 8x8 + 3380 + -- + -- + 【GTS】 + 【Test】 + 72B 4K Mcore @@ -664,7 +673,7 @@ ModelLink 通过模型并行与数据并行来训练大语言模型,为了演 -- 【GTS】 【Test】 - + Yi 34B @@ -1509,9 +1518,9 @@ ModelLink已支持模型的评估数据统计如下: | QWen1.5-7B | MMLU | 60.3% | [61.0%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-14B | MMLU | 67.3% | [67.6%](https://qwenlm.github.io/zh/blog/qwen1.5) | | QWen1.5-32B | MMLU | 72.5% | [73.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-72B | MMLU | 76.4% | [77.5%](https://qwenlm.github.io/zh/blog/qwen1.5) | | Qwen1.5-110B | MMLU | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Yi-34B | MMLU | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | -| Qwen2-0.5B | MMLU | 44.6% | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/) | Qwen2-1.5B | MMLU | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) | -| QWen2-7B | MMLU | 70.3% | [70.3%](https://qwenlm.github.io/zh/blog/qwen2/) | Qwen2-72B | MMLU | 83.6% | [84.2%](https://qwenlm.github.io/zh/blog/qwen2/) | -| MiniCPM-2B | MMLU 
| 51.6% | [53.4%](https://github.com/OpenBMB/MiniCPM?tab=readme-ov-file#3) | -- | -- | -- | -- +| QWen2-0.5B | MMLU | 44.6% | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/) | QWen2-1.5B | MMLU | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) | +| QWen2-7B | MMLU | 70.3% | [70.3%](https://qwenlm.github.io/zh/blog/qwen2/) | QWen2-57B-A14B |MMLU|75.6% | [76.5%](https://qwenlm.github.io/zh/blog/qwen2/)| +| QWen2-72B | MMLU | 83.6% | [84.2%](https://qwenlm.github.io/zh/blog/qwen2/)| MiniCPM-2B | MMLU | 51.6% | [53.4%](https://github.com/OpenBMB/MiniCPM?tab=readme-ov-file#3) | | DeepSeek-V2-Lite-16B | MMLU | 57.4% | [58.3%](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite) | -- | -- | -- | -- | diff --git a/convert_ckpt.py b/convert_ckpt.py index 5083b2ae..555812a9 100644 --- a/convert_ckpt.py +++ b/convert_ckpt.py @@ -73,7 +73,7 @@ def main(): dest='checking') parser.add_argument('--model-type-hf', type=str, default="llama2", choices=['baichuan', 'baichuan2', 'llama2', 'mixtral', 'chatglm3', 'gemma', 'gemma2', 'bloom', - 'qwen', 'internlm2', 'deepseek2', 'minicpm', 'minicpm-moe', 'deepseek2-lite'], + 'qwen', 'internlm2', 'deepseek2', 'minicpm', 'minicpm-moe', 'deepseek2-lite', 'qwen2-moe'], help='model type of huggingface') known_args, _ = parser.parse_known_args() diff --git a/examples/mcore/qwen2/ckpt_convert_qwen2_hf2mcore.sh b/examples/mcore/qwen2/ckpt_convert_qwen2_hf2mcore.sh index 1060114d..09ccaee4 100644 --- a/examples/mcore/qwen2/ckpt_convert_qwen2_hf2mcore.sh +++ b/examples/mcore/qwen2/ckpt_convert_qwen2_hf2mcore.sh @@ -14,4 +14,4 @@ python convert_ckpt.py \ --save-dir ./model_weights/qwen2_mcore/ \ --tokenizer-model ./model_from_hf/qwen2_hf/tokenizer.json \ --model-type-hf llama2 \ - --params-dtype bf16 + --params-dtype bf16 # --num-layers-per-virtual-pipeline-stage 2 等参数根据模型需要添加 diff --git a/examples/mcore/qwen2/pretrain_qwen2_72b_4K_ptd.sh b/examples/mcore/qwen2/pretrain_qwen2_72b_4K_ptd.sh index 57a12540..79806d1f 100644 --- a/examples/mcore/qwen2/pretrain_qwen2_72b_4K_ptd.sh +++ b/examples/mcore/qwen2/pretrain_qwen2_72b_4K_ptd.sh @@ -30,6 +30,7 @@ GPT_ARGS=" --use-mcore-models \ --tensor-model-parallel-size ${TP} \ --pipeline-model-parallel-size ${PP} \ + --num-layers-per-virtual-pipeline-stage 2 --sequence-parallel \ --num-layers 80 \ --hidden-size 8192 \ diff --git a/examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_hf2mcore.sh b/examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_hf2mcore.sh new file mode 100644 index 00000000..5859a082 --- /dev/null +++ b/examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_hf2mcore.sh @@ -0,0 +1,19 @@ +# 请按照您的真实环境修改 set_env.sh 路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +# 设置需要的权重转换参数 +python convert_ckpt.py \ + --use-mcore-models \ + --model-type GPT \ + --load-model-type hf \ + --save-model-type mg \ + --target-tensor-parallel-size 1 \ + --target-pipeline-parallel-size 4 \ + --target-expert-parallel-size 1 \ + --add-qkv-bias \ + --load-dir ./model_from_hf/qwen2_moe_hf/ \ + --save-dir ./model_weights/qwen2_moe_mcore/ \ + --tokenizer-model ./model_from_hf/qwen2_moe_hf/tokenizer.json \ + --model-type-hf qwen2-moe \ + --moe-grouped-gemm \ + --params-dtype bf16 \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_mcore2hf.sh b/examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_mcore2hf.sh new file mode 100644 index 00000000..e3485c29 --- /dev/null +++ b/examples/mcore/qwen2_moe/ckpt_convert_qwen2_moe_mcore2hf.sh @@ -0,0 +1,17 @@ +## 修改 ascend-toolkit 路径 +#source 
/usr/local/Ascend/ascend-toolkit/set_env.sh + +# 设置需要的权重转换参数 +python convert_ckpt.py \ + --use-mcore-models \ + --model-type GPT \ + --model-type-hf qwen2-moe \ + --load-model-type mg \ + --save-model-type hf \ + --target-tensor-parallel-size 1 \ + --target-pipeline-parallel-size 1 \ + --add-qkv-bias \ + --moe-grouped-gemm \ + --params-dtype bf16 \ + --load-dir ./model_weights/qwen2_moe_mcore/ \ + --save-dir ./model_from_hf/qwen2_moe_hf/ # 需要填入原始HF模型路径,新权重会存于./model_from_hf/qwen2_moe_hf/mg2hf/ \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain.sh b/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain.sh new file mode 100644 index 00000000..73dc5d81 --- /dev/null +++ b/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain.sh @@ -0,0 +1,13 @@ +# 修改 ascend-toolkit 路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +mkdir ./dataset + +python ./preprocess_data.py \ + --input ./dataset/train-00000-of-00042-d964455e17e96d5a.parquet \ + --tokenizer-name-or-path ./model_from_hf/qwen2_moe_hf/ \ + --tokenizer-type PretrainedFromHF \ + --handler-name GeneralPretrainHandler \ + --output-prefix ./dataset/enwiki \ + --json-keys text \ + --workers 4 \ + --log-interval 1000 \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_32K.sh b/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_32K.sh new file mode 100644 index 00000000..03e5e60f --- /dev/null +++ b/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_32K.sh @@ -0,0 +1,12 @@ +source /usr/local/Ascend/ascend-toolkit/set_env.sh +mkdir ./dataset + +python ./preprocess_data.py \ + --input ./dataset/train-00000-of-00042-d964455e17e96d5a.parquet \ + --tokenizer-name-or-path ./model_from_hf/qwen2_moe_hf/ \ + --tokenizer-type PretrainedFromHF \ + --handler-name GeneralPretrainHandler \ + --output-prefix ./dataset/enwiki \ + --json-keys text \ + --workers 4 \ + --log-interval 1000 \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_4K.sh b/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_4K.sh new file mode 100644 index 00000000..91169857 --- /dev/null +++ b/examples/mcore/qwen2_moe/data_convert_qwen2_moe_pretrain_4K.sh @@ -0,0 +1,11 @@ +# 请按照您的真实环境修改 set_env.sh 路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +mkdir ./dataset + +python ./preprocess_data.py \ + --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \ + --tokenizer-name-or-path ./model_from_hf/qwen2_moe_hf \ + --output-prefix ./dataset/alpaca \ + --tokenizer-type PretrainedFromHF \ + --workers 4 \ + --log-interval 1000 \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh b/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh new file mode 100644 index 00000000..91dfe73d --- /dev/null +++ b/examples/mcore/qwen2_moe/evaluate_qwen2_57b_a14b_ptd.sh @@ -0,0 +1,85 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# please fill these path configurations +CHECKPOINT="Your ckpt file path" +TOKENIZER_PATH="Your vocab file path" +DATA_PATH="Your data path (such as ./mmlu/test/)" +TASK="mmlu" + +# distributed config +MASTER_ADDR=localhost +MASTER_PORT=6014 +NNODES=1 +NODE_RANK=0 +NPUS_PER_NODE=4 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +TP=1 +PP=4 +EP=1 +SEQ_LENGTH=4096 +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT 
+" + +MOE_ARGS=" + --num-experts 64 \ + --moe-router-topk 8 \ + --n-shared-experts 8 \ + --shared-expert-gate \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 2560 \ + --moe-grouped-gemm \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather +" + +torchrun $DISTRIBUTED_ARGS evaluation.py \ + $MOE_ARGS \ + --use-mcore-models \ + --task-data-path $DATA_PATH \ + --task ${TASK} \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --seq-length ${SEQ_LENGTH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --max-new-tokens 1 \ + --num-layers 28 \ + --hidden-size 3584 \ + --ffn-hidden-size 18944 \ + --num-attention-heads 28 \ + --disable-bias-linear \ + --swiglu \ + --position-embedding-type rope \ + --load ${CHECKPOINT} \ + --normalization RMSNorm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --tokenizer-not-use-fast \ + --micro-batch-size 1 \ + --exit-on-missing-checkpoint \ + --no-load-rng \ + --no-load-optim \ + --untie-embeddings-and-output-weights \ + --add-qkv-bias \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --no-gradient-accumulation-fusion \ + --attention-softmax-in-fp32 \ + --input-layernorm-in-fp32 \ + --no-masked-softmax-fusion \ + --seed 42 \ + --group-query-attention \ + --num-query-groups 4 \ + --no-chat-template \ + --seed 42 \ + --bf16 \ + | tee logs/eval_mcore_qwen2_57b_a14b.log \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh b/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh new file mode 100644 index 00000000..b736b828 --- /dev/null +++ b/examples/mcore/qwen2_moe/generate_qwen2_57b_a14b_ptd.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# The number of parameters is not aligned +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# please fill these path configurations +TOKENIZER_PATH="your tokenizer directory path" +CHECKPOINT="your model directory path" + +# Change for multinode config +MASTER_ADDR=localhost +MASTER_PORT=6015 +NNODES=1 +NODE_RANK=0 +NPUS_PER_NODE=4 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +TP=1 +PP=4 +EP=1 +SEQ_LENGTH=4096 +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 64 \ + --moe-router-topk 8 \ + --n-shared-experts 8 \ + --shared-expert-gate \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 2560 \ + --moe-grouped-gemm \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + +torchrun $DISTRIBUTED_ARGS inference.py \ + $MOE_ARGS \ + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --load ${CHECKPOINT} \ + --num-layers 28 \ + --hidden-size 3584 \ + --use-rotary-position-embeddings \ + --num-attention-heads 28 \ + --ffn-hidden-size 18944 \ + --max-position-embeddings ${SEQ_LENGTH} \ + --seq-length ${SEQ_LENGTH} \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --micro-batch-size 1 \ + --disable-bias-linear \ + --swiglu \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --use-rotary-position-embeddings \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path 
${TOKENIZER_PATH} \ + --normalization RMSNorm \ + --position-embedding-type rope \ + --norm-epsilon 1e-6 \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --tokenizer-not-use-fast \ + --add-qkv-bias \ + --max-new-tokens 256 \ + --no-gradient-accumulation-fusion \ + --exit-on-missing-checkpoint \ + --attention-softmax-in-fp32 \ + --input-layernorm-in-fp32 \ + --no-masked-softmax-fusion \ + --group-query-attention \ + --num-query-groups 4 \ + --seed 42 \ + --bf16 \ + | tee logs/generate_mcore_qwen2_57b_a14b.log \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32K_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32K_ptd.sh new file mode 100644 index 00000000..3423926a --- /dev/null +++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_32K_ptd.sh @@ -0,0 +1,146 @@ +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=1800 +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=28 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_PATH="your tokenizer path" +CKPT_LOAD_DIR="your model ckpt path" + +TP=1 +PP=28 +EP=2 +CP=4 +SEQ_LENGTH=32768 +TRAIN_ITERS=5000 +CP_TYPE='ulysses_cp_algo' +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 64 \ + --moe-router-topk 8 \ + --n-shared-experts 8 \ + --shared-expert-gate \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 2560 \ + --moe-grouped-gemm \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + +ROPE_ARGS=" + --rope-scaling-type yarn \ + --rope-scaling-factor 16 \ + --rope-scaling-original-max-position-embeddings 4096 \ +" + +OPTIMIZE_ARGS=" + --use-mc2 \ + --use-flash-attn \ + --use-fused-rotary-pos-emb \ + --use-rotary-position-embeddings \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --use-distributed-optimizer +" + +TRAIN_ARGS=" + --micro-batch-size 1 \ + --global-batch-size 64 \ + --lr 1.25e-7 \ + --lr-decay-style cosine \ + --min-lr 1.25e-8 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --initial-loss-scale 4096 \ + --seed 42 \ + --bf16 \ + --train-iters ${TRAIN_ITERS} \ + --seq-length ${SEQ_LENGTH} \ + --no-shared-storage +" + +MODEL_PARALLEL_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --context-parallel-size ${CP} \ + --context-parallel-algo ${CP_TYPE} \ +" + +GPT_ARGS=" + --use-mcore-models \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --num-layers 28 \ + --hidden-size 3584 \ + --ffn-hidden-size 18944 \ + --num-attention-heads 28 \ + --tokenizer-type PretrainedFromHF \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --swiglu \ + --attention-softmax-in-fp32 \ + --add-qkv-bias \ + --no-gradient-accumulation-fusion \ + --group-query-attention \ + 
--num-query-groups 4 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval ${TRAIN_ITERS} \ + --eval-interval ${TRAIN_ITERS} \ + --eval-iters 0 \ + --no-load-optim \ + --no-load-rng \ +" + +torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + $MOE_ARGS \ + $ROPE_ARGS \ + $OPTIMIZE_ARGS \ + $TRAIN_ARGS \ + $MODEL_PARALLEL_ARGS \ + --load ${CKPT_LOAD_DIR} \ + --save ${CKPT_SAVE_DIR} \ + --distributed-backend nccl \ + | tee logs/train_mcore_qwen2_57b_a14b_32k.log \ No newline at end of file diff --git a/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4K_ptd.sh b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4K_ptd.sh new file mode 100644 index 00000000..52b77eea --- /dev/null +++ b/examples/mcore/qwen2_moe/pretrain_qwen2_57b_a14b_4K_ptd.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +export HCCL_CONNECT_TIMEOUT=1800 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NPU_ASD_ENABLE=0 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=8 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_PATH="your tokenizer path" +CKPT_LOAD_DIR="your model ckpt path" + +TP=1 +PP=4 +EP=4 +CP=4 +SEQ_LENGTH=4096 +TRAIN_ITERS=5000 +CP_TYPE='ulysses_cp_algo' +ROUTER_BALANCING_TYPE='softmax_topk' + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +MOE_ARGS=" + --num-experts 64 \ + --moe-router-topk 8 \ + --n-shared-experts 8 \ + --shared-expert-gate \ + --moe-router-load-balancing-type ${ROUTER_BALANCING_TYPE} \ + --moe-intermediate-size 2560 \ + --moe-grouped-gemm \ + --moe-permutation-async-comm \ + --moe-token-dispatcher-type allgather \ + --moe-aux-loss-coeff 0.001 +" + +OPTIMIZE_ARGS=" + --use-mc2 \ + --use-flash-attn \ + --use-fused-rotary-pos-emb \ + --use-rotary-position-embeddings \ + --use-fused-swiglu \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --use-distributed-optimizer +" + +TRAIN_ARGS=" + --micro-batch-size 1 \ + --global-batch-size 64 \ + --lr 1.25e-6 \ + --lr-decay-style cosine \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --initial-loss-scale 4096 \ + --seed 42 \ + --bf16 \ + --train-iters ${TRAIN_ITERS} \ + --seq-length ${SEQ_LENGTH} \ + --no-shared-storage +" + +MODEL_PARALLEL_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --expert-model-parallel-size ${EP} \ + --context-parallel-size ${CP} \ + --context-parallel-algo ${CP_TYPE} \ +" + +GPT_ARGS=" + --use-mcore-models \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --max-position-embeddings ${SEQ_LENGTH} \ + --num-layers 28 \ + --hidden-size 3584 \ + --ffn-hidden-size 18944 \ + --num-attention-heads 28 \ + --tokenizer-type PretrainedFromHF \ + --make-vocab-size-divisible-by 1 \ + --padded-vocab-size 151936 \ + --rotary-base 1000000 \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --swiglu \ + --attention-softmax-in-fp32 \ + --add-qkv-bias \ + --no-gradient-accumulation-fusion \ + --group-query-attention \ + --num-query-groups 4 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + 
--split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval ${TRAIN_ITERS} \ + --eval-interval ${TRAIN_ITERS} \ + --eval-iters 0 \ + --no-load-optim \ + --no-load-rng +" + +torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $MOE_ARGS \ + $OUTPUT_ARGS \ + $OPTIMIZE_ARGS \ + $TRAIN_ARGS \ + $MODEL_PARALLEL_ARGS \ + --load ${CKPT_LOAD_DIR} \ + --save ${CKPT_SAVE_DIR} \ + --distributed-backend nccl \ + | tee logs/train_mcore_qwen2_57b_a14b_4k.log \ No newline at end of file diff --git a/modellink/arguments.py b/modellink/arguments.py index cc64ca66..21985d28 100644 --- a/modellink/arguments.py +++ b/modellink/arguments.py @@ -268,7 +268,11 @@ def _add_moe_args(parser): group.add_argument('--output-multiplier-scale', type=float, default=None, help='Add scale for logits output.') group.add_argument("--moe-permutation-async-comm", action='store_true', help="overlap moe permutation 3 all gather communications") - + group.add_argument("--shared-expert-gate", action='store_true', + help="moe model has shared expert gate") + group.add_argument("--shared-expert-gate-output-dimension", type=int, default=1, + help="moe model shared expert gate output dimension for qwen2 moe, this parameter can only configured with" + "1 or hidden_state") return parser @@ -295,7 +299,8 @@ def _add_data_args(parser): help="Name or path of the huggingface tokenizer.") group.add_argument("--tokenizer-not-use-fast", action='store_false', help="HuggingFace tokenizer not use the fast version.") - + group.add_argument("--input-layernorm-in-fp32", action='store_true', + help="Convert input-layernorm to fp32") return parser @@ -639,6 +644,8 @@ def _validate_moe_args(args): raise ValueError(f'moe_expert_capacity_factor only works with aux_loss or none load balancing') if args.moe_expert_capacity_factor is None and args.moe_pad_expert_input_to_capacity: raise ValueError(f'moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity') + if args.shared_expert_gate_output_dimension != 1 and args.shared_expert_gate_output_dimension != args.hidden_size: + raise AssertionError('shared expert gate output dimension can only be configured with 1 or hidden_size') def _validate_mla(args): diff --git a/modellink/checkpointing.py b/modellink/checkpointing.py index 62d464ab..979ecd22 100644 --- a/modellink/checkpointing.py +++ b/modellink/checkpointing.py @@ -94,7 +94,8 @@ def load_args_from_checkpoint_wrapper(fn): _set_arg('q_lora_rank', force=True) _set_arg('kv_lora_rank', force=True) _set_arg('v_head_dim', force=True) - + _set_arg('shared_expert_gate', force=True) + state_dict, checkpoint_name, release = _load_base_checkpoint( getattr(args, kwargs.get('load_arg', 'load')), rank0=True, diff --git a/modellink/core/transformer/moe/moe_layer.py b/modellink/core/transformer/moe/moe_layer.py index 7d80ab04..d9fca079 100644 --- a/modellink/core/transformer/moe/moe_layer.py +++ b/modellink/core/transformer/moe/moe_layer.py @@ -4,9 +4,11 @@ import types from copy import deepcopy from functools import wraps import torch +import torch.nn.functional as F from mindspeed.moe.utils import MoEAuxLossAutoScaler from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer import build_module from megatron.core.transformer.mlp import MLPSubmodules, MLP from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP from megatron.core.transformer.moe.moe_utils import save_to_aux_losses_tracker @@ -36,6 +38,17 @@ def 
moe_layer_init_wrapper(init_func): linear_fc2=RowParallelLinear,)) # For using layer_number when recompute activation function is enabled. self.shared_experts.layer_number = self.layer_number + if global_args.shared_expert_gate: + self.shared_expert_gate = build_module( + RowParallelLinear, + config.hidden_size, + global_args.shared_expert_gate_output_dimension, + config=config, + init_method=config.output_layer_init_method, + bias=None, + input_is_parallel=True, + skip_bias_add=True + ) return moe_layer_init @@ -83,6 +96,8 @@ def moe_layer_forward(self, hidden_states: torch.Tensor): if args.n_shared_experts: share_experts_output, share_experts_bias = self.shared_experts(hidden_states) + if args.shared_expert_gate: + share_experts_output = F.sigmoid(self.shared_expert_gate(hidden_states)[0]) * share_experts_output output = output + share_experts_output if self.token_dispatcher.add_bias: diff --git a/modellink/core/transformer/transformer_layer.py b/modellink/core/transformer/transformer_layer.py index c51d7f34..eb3c2f1e 100644 --- a/modellink/core/transformer/transformer_layer.py +++ b/modellink/core/transformer/transformer_layer.py @@ -107,6 +107,9 @@ def transformer_layer_forward(self, hidden_states, attention_mask, context=None, # Optional Input Layer norm input_layernorm_output = self.input_layernorm(hidden_states) + if args.input_layernorm_in_fp32: + input_layernorm_output = input_layernorm_output.float() + # Self attention. attention_output_with_bias = self.self_attention( input_layernorm_output, diff --git a/modellink/tasks/checkpoint/loader_hf.py b/modellink/tasks/checkpoint/loader_hf.py index c61448a4..646cab78 100644 --- a/modellink/tasks/checkpoint/loader_hf.py +++ b/modellink/tasks/checkpoint/loader_hf.py @@ -239,6 +239,7 @@ def get_message_layer_mlp(message, model, layer_idx, md=None, tp_size=1): margs = model.get_args() first_k_dense_replace = getattr(margs, 'first_k_dense_replace', None) moe_layer_freq = getattr(margs, 'moe_layer_freq', None) + shared_expert_gate = getattr(margs, 'shared_expert_gate', None) if ( margs.num_experts and first_k_dense_replace is not None @@ -248,6 +249,9 @@ def get_message_layer_mlp(message, model, layer_idx, md=None, tp_size=1): message["mlp_moe"] = {} mlp_router_weight = model.get_layers_mlp_router_weight(layer_idx=layer_idx) message["mlp_moe"]["mlp router weight"] = mlp_router_weight + if shared_expert_gate: + shared_expert_gate = model.get_layers_mlp_shared_expert_gate_weight(layer_idx=layer_idx) + message["mlp_moe"]["mlp shared_expert_gate weight"] = shared_expert_gate if getattr(margs, "n_shared_experts", None) is not None: fc1_weight = model.get_layers_mlp_shared_experts_linear_fc1_weight(layer_idx=layer_idx) fc2_weight = model.get_layers_mlp_shared_experts_linear_fc2_weight(layer_idx=layer_idx) diff --git a/modellink/tasks/checkpoint/loader_mg.py b/modellink/tasks/checkpoint/loader_mg.py index b9a3ea08..57f0f473 100644 --- a/modellink/tasks/checkpoint/loader_mg.py +++ b/modellink/tasks/checkpoint/loader_mg.py @@ -231,6 +231,7 @@ def get_message_layer_mlp(message, model, md=None, **kwargs): layer_idx = kwargs["layer_idx"] + kwargs["pp_rank"] * len(model.get_layers_module(**kwargs)) first_k_dense_replace = getattr(margs, 'first_k_dense_replace', None) moe_layer_freq = getattr(margs, 'moe_layer_freq', None) + shared_expert_gate = getattr(margs, 'shared_expert_gate', None) if ( margs.num_experts and first_k_dense_replace is not None @@ -241,6 +242,9 @@ def get_message_layer_mlp(message, model, md=None, **kwargs): mlp_router_weight = 
model.get_layers_mlp_router_weight(**kwargs) num_experts_local = margs.num_experts // margs.expert_model_parallel_size message["mlp_moe"]["mlp router weight"] = mlp_router_weight + if shared_expert_gate: + shared_expert_gate = model.get_layers_mlp_shared_expert_gate_weight(**kwargs) + message["mlp_moe"]["mlp shared_expert_gate weight"] = shared_expert_gate weight1 = [] weight2 = [] for ep_rank in range(margs.expert_model_parallel_size): diff --git a/modellink/tasks/checkpoint/model_cfg.json b/modellink/tasks/checkpoint/model_cfg.json index 13f777d2..b194a732 100644 --- a/modellink/tasks/checkpoint/model_cfg.json +++ b/modellink/tasks/checkpoint/model_cfg.json @@ -45,6 +45,47 @@ "final_layernorm": "model.norm", "output_layer": "lm_head" } + }, + "qwen2-moe": { + "__base__": "base", + "config_set_value": { + "seq_length": 4096, + "global_batch_size": 64, + "qkv_type": "unpack", + "mlp_experts_flag": true, + "n_shared_experts": 8, + "shared_expert_gate": true, + "first_k_dense_replace": 0, + "moe_layer_freq": 1 + }, + "config_hf_key_mapping": { + "num_layers": "num_hidden_layers", + "norm_epsilon": "rms_norm_eps", + "rotary_base": "rope_theta" + }, + "model_hf_key_mapping": { + "model": "module[0]", + "embedding_word_embeddings": "model.embed_tokens", + "embedding_word_embeddings_norm": "model.embedding.word_embeddings.norm", + "layers": "model.layers", + "layers_input_layernorm": "model.layers[layer_idx].input_layernorm", + "layers_self_attention_linear_proj": "model.layers[layer_idx].self_attn.o_proj", + "layers_self_attention_linear_q_proj": "model.layers[layer_idx].self_attn.q_proj", + "layers_self_attention_linear_k_proj": "model.layers[layer_idx].self_attn.k_proj", + "layers_self_attention_linear_v_proj": "model.layers[layer_idx].self_attn.v_proj", + "layers_self_attention_pre_mlp_layernorm": "model.layers[layer_idx].post_attention_layernorm", + "layers_mlp_router": "model.layers[layer_idx].mlp.gate", + "layers_mlp_experts_gate_proj": "model.layers[layer_idx].mlp.experts[expert_idx].gate_proj", + "layers_mlp_experts_up_proj": "model.layers[layer_idx].mlp.experts[expert_idx].up_proj", + "layers_mlp_experts_linear_fc2": "model.layers[layer_idx].mlp.experts[expert_idx].down_proj", + + "layers_mlp_shared_expert_gate": "model.layers[layer_idx].mlp.shared_expert_gate", + "layers_mlp_shared_experts_gate_proj": "model.layers[layer_idx].mlp.shared_expert.gate_proj", + "layers_mlp_shared_experts_up_proj": "model.layers[layer_idx].mlp.shared_expert.up_proj", + "layers_mlp_shared_experts_linear_fc2": "model.layers[layer_idx].mlp.shared_expert.down_proj", + "final_layernorm": "model.norm", + "output_layer": "lm_head" + } }, "llama2": { "__base__": "base" diff --git a/modellink/tasks/checkpoint/models.py b/modellink/tasks/checkpoint/models.py index 21edda50..1798f62a 100644 --- a/modellink/tasks/checkpoint/models.py +++ b/modellink/tasks/checkpoint/models.py @@ -254,6 +254,7 @@ class ModelBase(abc.ABC): num_experts = getattr(args, 'num_experts', None) or getattr(args, 'num_local_experts', None) first_k_dense_replace = getattr(args, 'first_k_dense_replace', None) moe_layer_freq = getattr(args, 'moe_layer_freq', None) + shared_expert_gate = getattr(args, 'shared_expert_gate', False) if (num_experts and first_k_dense_replace is not None and moe_layer_freq is not None @@ -261,6 +262,9 @@ class ModelBase(abc.ABC): if layer_idx >= first_k_dense_replace and layer_idx % moe_layer_freq == 0: router_weight = src_model.get_layers_mlp_router_weight(**kwargs) self.set_layers_mlp_router_weight(**kwargs, 
data=router_weight) + if shared_expert_gate: + shared_expert_gate_weight = src_model.get_layers_mlp_shared_expert_gate_weight(**kwargs) + self.set_layers_mlp_shared_expert_gate_weight(**kwargs, data=shared_expert_gate_weight) if getattr(self.args, "n_shared_experts", None) is not None: self._set_mlp_shared_experts_state(src_model, **kwargs) if args.moe_grouped_gemm: @@ -728,6 +732,7 @@ class MegatronModel(ModelBase): self.args.moe_grouped_gemm = hf_args.moe_grouped_gemm self.args.num_experts = getattr(hf_args, "num_experts", None) self.args.n_shared_experts = getattr(hf_args, "n_shared_experts", None) + self.args.shared_expert_gate = getattr(hf_args, "shared_expert_gate", None) self.args.qk_layernorm = getattr(hf_args, "qk_layernorm", False) self.args.moe_intermediate_size = getattr(hf_args, "moe_intermediate_size", None) self.args.first_k_dense_replace = getattr(hf_args, "first_k_dense_replace", None) @@ -1095,7 +1100,11 @@ class MegatronMCoreModel(MegatronModel): "layers_mlp_shared_experts_linear_fc1"] = module_layer + "mlp.shared_experts.linear_fc1" self.module_mapping[ "layers_mlp_shared_experts_linear_fc2"] = module_layer + "mlp.shared_experts.linear_fc2" - + + # shared experts gate + if config_value.get('shared_expert_gate', False): + self.module_mapping["layers_mlp_shared_expert_gate"] = module_layer + "mlp.shared_expert_gate" + # moe grouped gemm self.module_mapping[ "layers_mlp_experts_weight1"] = module_layer + "mlp.experts.weight1" diff --git a/modellink/tasks/checkpoint/saver.py b/modellink/tasks/checkpoint/saver.py index 914be18c..1ffa49c5 100644 --- a/modellink/tasks/checkpoint/saver.py +++ b/modellink/tasks/checkpoint/saver.py @@ -289,6 +289,7 @@ def _set_set_model_layer_mlp(model_mg, msg, md, pop_flag=True, is_moe_mlp=False, def set_model_layer_mlp(model_mg, msg, md, total_layer_num, **kwargs): margs = model_mg.get_args() first_k_dense_replace = getattr(margs, 'first_k_dense_replace', None) + shared_expert_gate = getattr(margs, 'shared_expert_gate', None) moe_layer_freq = getattr(margs, 'moe_layer_freq', None) if ( margs.num_experts @@ -299,6 +300,8 @@ def set_model_layer_mlp(model_mg, msg, md, total_layer_num, **kwargs): num_experts_local = margs.num_experts // margs.expert_model_parallel_size mlp_moe = msg.pop("mlp_moe") mlp_router_weight = mlp_moe.pop("mlp router weight") + if shared_expert_gate: + mlp_shared_expert_gate_weights = mlp_moe.pop("mlp shared_expert_gate weight") if getattr(margs, "n_shared_experts", None) is not None: shared_experts_linear_fc1_weight = mlp_moe.pop("mlp shared experts linear fc1 weight") shared_experts_linear_fc2_weight = mlp_moe.pop("mlp shared experts linear fc2 weight") @@ -313,6 +316,8 @@ def set_model_layer_mlp(model_mg, msg, md, total_layer_num, **kwargs): for tp_rank in range(margs.tensor_model_parallel_size): kwargs['tp_rank'] = tp_rank model_mg.set_layers_mlp_router_weight(**kwargs, data=mlp_router_weight) + if shared_expert_gate: + model_mg.set_layers_mlp_shared_expert_gate_weight(**kwargs, data=mlp_shared_expert_gate_weights) if getattr(margs, "n_shared_experts", None) is not None: model_mg.set_layers_mlp_shared_experts_linear_fc1_weight(**kwargs, data=shared_experts_linear_fc1_weight)
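
Not part of the patch itself: below is a minimal, hypothetical PyTorch sketch of what the new `--shared-expert-gate` path computes, intended only to make the `moe_layer_forward` change above easier to follow. The module name `SharedExpertBlock` and all sizes are illustrative placeholders, not the real Megatron/ModelLink classes or the Qwen2-57B-A14B dimensions; the actual code builds the gate with `build_module(RowParallelLinear, ...)` and adds the gated shared-expert output to the routed-expert output, as shown in the diff.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SharedExpertBlock(nn.Module):
    """Toy shared-expert path with a sigmoid gate, mirroring the idea behind
    --shared-expert-gate / --shared-expert-gate-output-dimension in this patch.
    All names and sizes here are illustrative, not the production values."""

    def __init__(self, hidden_size: int, shared_ffn_size: int, gate_output_dim: int = 1):
        super().__init__()
        # Same constraint as _validate_moe_args in the diff: 1 or hidden_size.
        if gate_output_dim not in (1, hidden_size):
            raise ValueError("gate output dimension must be 1 or hidden_size")
        # SwiGLU-style shared expert: gate_proj/up_proj -> down_proj.
        self.gate_proj = nn.Linear(hidden_size, shared_ffn_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, shared_ffn_size, bias=False)
        self.down_proj = nn.Linear(shared_ffn_size, hidden_size, bias=False)
        # Gate over the shared-expert output, computed from the layer input.
        self.shared_expert_gate = nn.Linear(hidden_size, gate_output_dim, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        shared_out = self.down_proj(
            F.silu(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)
        )
        # sigmoid(gate(x)) scales the shared-expert output before it is added
        # to the routed-expert output, as in moe_layer_forward above.
        gate = torch.sigmoid(self.shared_expert_gate(hidden_states))
        return gate * shared_out


if __name__ == "__main__":
    block = SharedExpertBlock(hidden_size=64, shared_ffn_size=128, gate_output_dim=1)
    x = torch.randn(2, 8, 64)
    routed_expert_output = torch.zeros_like(x)  # stand-in for the routed MoE path
    y = routed_expert_output + block(x)
    print(y.shape)  # torch.Size([2, 8, 64])
```

With `--shared-expert-gate-output-dimension 1` the gate produces a single scalar per token that is broadcast over the hidden dimension; setting it to the hidden size gives a per-channel gate instead, which is why `_validate_moe_args` in this patch accepts only those two values.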