From 4d00313fbd0671f48a92192f36b45e2bc22400d6 Mon Sep 17 00:00:00 2001
From: changlei
Date: Sat, 31 Aug 2024 01:58:39 +0000
Subject: [PATCH] !1582 Add Qwen2-0.5B model. Merge pull request !1582 from changlei/master
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                     |  23 +++-
 examples/README.md                            |   5 +-
 .../qwen2/evaluate_qwen2_0point5b_ptd.sh      |  69 +++++++++++
 .../qwen2/generate_qwen2_0point5b_ptd.sh      |  63 ++++++++++
 .../qwen2/pretrain_qwen2_0point5b_32K_ptd.sh  | 113 ++++++++++++++++++
 .../qwen2/pretrain_qwen2_0point5b_4K_ptd.sh   | 106 ++++++++++++++++
 6 files changed, 375 insertions(+), 4 deletions(-)
 create mode 100644 examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh
 create mode 100644 examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh
 create mode 100644 examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh
 create mode 100644 examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh

diff --git a/README.md b/README.md
index 215458515..49c250bc4 100644
--- a/README.md
+++ b/README.md
@@ -544,7 +544,26 @@ ModelLink trains large language models with model parallelism and data parallelism; to demonstrate
       【Test】
-      Qwen2
+      Qwen2
+      0.5B
+      4K
+      Mcore
+      1x8
+      BF16
+      28618
+      34859
+      【GTS】
+      【Test】
+
+      32K
+      Mcore
+      1x8
+      BF16
+      11338
+      --
+      【GTS】
+      【Test】
       1.5B
       4K
       Mcore
       1x8
       BF16
       7281
-      --
+      8538
       【GTS】
       【Test】

diff --git a/examples/README.md b/examples/README.md
index bd16ed266..4e7f3f5fe 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -797,8 +797,9 @@ The evaluation scores of models already supported by ModelLink are as follows:
 | QWen1.5-1.8b | [MMLU](https://paperswithcode.com/dataset/mmlu) | 46.2% | [46.8%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-4B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 55.0% | [0.561](https://qwenlm.github.io/zh/blog/qwen1.5) |
 | QWen1.5-7B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 60.3% | [61.0%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-14B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 67.3% | [67.6%](https://qwenlm.github.io/zh/blog/qwen1.5) |
 | QWen1.5-32B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 72.6% | [73.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-72B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 77.5% | [77.5%](https://qwenlm.github.io/zh/blog/qwen1.5) |
-| Yi-34B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | Qwen2-72B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 83.6% | [84.2](https://qwenlm.github.io/zh/blog/qwen2/) |
-| Qwen1.5-110B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Qwen2-1.5B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) |
+| Qwen1.5-110B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Yi-34B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) |
+| Qwen2-0.5B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 44.6% | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/) | Qwen2-1.5B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) |
+| Qwen2-72B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 83.6% | [84.2](https://qwenlm.github.io/zh/blog/qwen2/) |

diff --git a/examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh b/examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh
new file mode 100644
index 000000000..3689de691
--- /dev/null
+++ b/examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6012
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CHECKPOINT="Your ckpt file path"
+TOKENIZER_PATH="Your vocab file path"
+DATA_PATH="Your data path (such as ./mmlu/test/)"
+TASK="mmlu"
+
+TP=1
+PP=1
+MBS=2
+SEQ_LEN=4096
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# Different tasks need different max_new_tokens values; please follow the instructions in the readme.
+torchrun $DISTRIBUTED_ARGS evaluation.py \
+    --use-mcore-models \
+    --task-data-path ${DATA_PATH} \
+    --task ${TASK} \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --micro-batch-size ${MBS} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --max-new-tokens 1 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --swiglu \
+    --rotary-base 1000000 \
+    --position-embedding-type rope \
+    --load ${CHECKPOINT} \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-6 \
+    --tokenizer-not-use-fast \
+    --exit-on-missing-checkpoint \
+    --no-load-rng \
+    --no-load-optim \
+    --no-gradient-accumulation-fusion \
+    --attention-softmax-in-fp32 \
+    --seed 42 \
+    --bf16 \
+    --no-chat-template \
+    | tee logs/eval_mcore_qwen2_0point5b_${TASK}.log

diff --git a/examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh b/examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh
new file mode 100644
index 000000000..61aaf07bc
--- /dev/null
+++ b/examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# please fill these path configurations
+CHECKPOINT="your model ckpt path"
+TOKENIZER_PATH="your tokenizer path"
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6011
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+TP=1
+PP=1
+MBS=2
+SEQ_LEN=4096
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+torchrun $DISTRIBUTED_ARGS inference.py \
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --ffn-hidden-size 4864 \
+    --max-position-embeddings ${SEQ_LEN} \
+    --seq-length ${SEQ_LEN} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --micro-batch-size ${MBS} \
+    --swiglu \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --load ${CHECKPOINT} \
+    --normalization RMSNorm \
+    --position-embedding-type rope \
+    --norm-epsilon 1e-6 \
+    --hidden-dropout 0 \
+    --attention-dropout 0 \
+    --tokenizer-not-use-fast \
+    --max-new-tokens 256 \
+    --no-gradient-accumulation-fusion \
+    --exit-on-missing-checkpoint \
+    --attention-softmax-in-fp32 \
+    --seed 42 \
+    --bf16 \
+    | tee logs/generate_mcore_qwen2_0point5b.log

diff --git a/examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh b/examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh
new file mode 100644
index 000000000..ca94f1fa2
--- /dev/null
+++ b/examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6011
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+CP=2
+MBS=1
+GBS=16
+SEQ_LEN=32768
+CP_ALGO=megatron_cp_algo
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ${CP_ALGO} \
+    --sequence-parallel \
+    --use-distributed-optimizer \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --rotary-base 1000000 \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-06 \
+    --swiglu \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --lr 1.25e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --lr-warmup-fraction 0.01 \
+    --init-method-std 0.01 \
+    --position-embedding-type rope \
+    --use-fused-rmsnorm \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-mc2 \
+    --overlap-grad-reduce \
+    --use-flash-attn \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --initial-loss-scale 4096 \
+    --no-gradient-accumulation-fusion \
+    --rope-scaling-factor 8 \
+    --rope-scaling-original-max-position-embeddings 4096 \
+    --rope-scaling-type yarn \
+    --no-load-optim \
+    --no-load-rng \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen2_0point5b_32k.log

diff --git a/examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh b/examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh
new file mode 100644
index 000000000..5459758ef
--- /dev/null
+++ b/examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+MBS=2
+GBS=128
+SEQ_LEN=4096
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --use-distributed-optimizer \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --lr 1.25e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-06 \
+    --swiglu \
+    --use-fused-swiglu \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-rmsnorm \
+    --use-mc2 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --initial-loss-scale 4096 \
+    --overlap-grad-reduce \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen2_0point5b_4k.log
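Editor's usage sketch (not part of the patch): the two pretraining scripts above use different parallel layouts, and the 32K variant adds context parallelism (CP=2) on top of TP=PP=1. Megatron-style launchers typically assert that the global batch size is a multiple of micro batch size times data-parallel size, so a quick check of the layout before launching can save a failed run. The snippet below copies the values from pretrain_qwen2_0point5b_32K_ptd.sh; edit them to match whichever script you actually run.

#!/bin/bash
# Sketch only: sanity-check the parallel layout of the 32K pretraining script.
NPUS_PER_NODE=8
NNODES=1
TP=1; PP=1; CP=2          # tensor / pipeline / context parallel sizes from the 32K script
MBS=1; GBS=16             # micro and global batch sizes from the 32K script

WORLD_SIZE=$((NPUS_PER_NODE * NNODES))
DP=$((WORLD_SIZE / (TP * PP * CP)))   # data-parallel degree: 8 / (1 * 1 * 2) = 4
echo "world size ${WORLD_SIZE}, data parallel ${DP}"

# Megatron-style training expects GBS to be a multiple of MBS * DP (16 % 4 == 0 here).
if (( GBS % (MBS * DP) != 0 )); then
    echo "invalid config: GBS=${GBS} is not divisible by MBS*DP=$((MBS * DP))" >&2
    exit 1
fi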