From 4d00313fbd0671f48a92192f36b45e2bc22400d6 Mon Sep 17 00:00:00 2001
From: changlei
Date: Sat, 31 Aug 2024 01:58:39 +0000
Subject: [PATCH] !1582 Add Qwen2-0.5B model. Merge pull request !1582 from changlei/master
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                     |  23 +++-
 examples/README.md                            |   5 +-
 .../qwen2/evaluate_qwen2_0point5b_ptd.sh      |  69 +++++++++++
 .../qwen2/generate_qwen2_0point5b_ptd.sh      |  63 ++++++++++
 .../qwen2/pretrain_qwen2_0point5b_32K_ptd.sh  | 113 ++++++++++++++++++
 .../qwen2/pretrain_qwen2_0point5b_4K_ptd.sh   | 106 ++++++++++++++++
 6 files changed, 375 insertions(+), 4 deletions(-)
 create mode 100644 examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh
 create mode 100644 examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh
 create mode 100644 examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh
 create mode 100644 examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh

diff --git a/README.md b/README.md
index 215458515..49c250bc4 100644
--- a/README.md
+++ b/README.md
@@ -544,7 +544,26 @@ ModelLink trains large language models with model parallelism and data parallelism; to demonstrate
       【Test】
-      Qwen2
+      Qwen2
+      0.5B
+      4K
+      Mcore
+      1x8
+      BF16
+      28618
+      34859
+      【GTS】
+      【Test】
+
+      32K
+      Mcore
+      1x8
+      BF16
+      11338
+      --
+      【GTS】
+      【Test】
       1.5B
       4K
       Mcore
       1x8
       BF16
       7281
-      --
+      8538
       【GTS】
       【Test】

diff --git a/examples/README.md b/examples/README.md
index bd16ed266..4e7f3f5fe 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -797,8 +797,9 @@ The evaluation scores of models already supported by ModelLink are as follows:
 | QWen1.5-1.8b | [MMLU](https://paperswithcode.com/dataset/mmlu) | 46.2% | [46.8%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-4B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 55.0% | [0.561](https://qwenlm.github.io/zh/blog/qwen1.5) |
 | QWen1.5-7B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 60.3% | [61.0%](https://qwenlm.github.io/zh/blog/qwen1.5/) | QWen1.5-14B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 67.3% | [67.6%](https://qwenlm.github.io/zh/blog/qwen1.5) |
 | QWen1.5-32B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 72.6% | [73.4%](https://huggingface.co/Qwen/Qwen-72B) | QWen1.5-72B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 77.5% | [77.5%](https://qwenlm.github.io/zh/blog/qwen1.5) |
-| Yi-34B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) | Qwen2-72B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 83.6% | [84.2](https://qwenlm.github.io/zh/blog/qwen2/) |
-| Qwen1.5-110B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Qwen2-1.5B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) |
+| Qwen1.5-110B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 80.4% | [80.4%](https://qwenlm.github.io/zh/blog/qwen1.5-110b/) | Yi-34B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 76.3% | [75.8%](https://hub.opencompass.org.cn/dataset-detail/MMLU) |
+| Qwen2-0.5B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 44.6% | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/) | Qwen2-1.5B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 54.7% | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/) |
+| Qwen2-72B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 83.6% | [84.2](https://qwenlm.github.io/zh/blog/qwen2/) |

diff --git a/examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh b/examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh
new file mode 100644
index 000000000..3689de691
--- /dev/null
+++ b/examples/mcore/qwen2/evaluate_qwen2_0point5b_ptd.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6012
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CHECKPOINT="Your ckpt file path"
+TOKENIZER_PATH="Your vocab file path"
+DATA_PATH="Your data path (such as ./mmlu/test/)"
+TASK="mmlu"
+
+TP=1
+PP=1
+MBS=2
+SEQ_LEN=4096
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# Different tasks need different max_new_tokens values; please follow the instructions in the readme.
+torchrun $DISTRIBUTED_ARGS evaluation.py \
+    --use-mcore-models \
+    --task-data-path ${DATA_PATH} \
+    --task ${TASK} \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --micro-batch-size ${MBS} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --max-new-tokens 1 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --swiglu \
+    --rotary-base 1000000 \
+    --position-embedding-type rope \
+    --load ${CHECKPOINT} \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-6 \
+    --tokenizer-not-use-fast \
+    --exit-on-missing-checkpoint \
+    --no-load-rng \
+    --no-load-optim \
+    --no-gradient-accumulation-fusion \
+    --attention-softmax-in-fp32 \
+    --seed 42 \
+    --bf16 \
+    --no-chat-template \
+    | tee logs/eval_mcore_qwen2_0point5b_${TASK}.log

diff --git a/examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh b/examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh
new file mode 100644
index 000000000..61aaf07bc
--- /dev/null
+++ b/examples/mcore/qwen2/generate_qwen2_0point5b_ptd.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# please fill these path configurations
+CHECKPOINT="your model ckpt path"
+TOKENIZER_PATH="your tokenizer path"
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6011
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+TP=1
+PP=1
+MBS=2
+SEQ_LEN=4096
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+torchrun $DISTRIBUTED_ARGS inference.py \
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --ffn-hidden-size 4864 \
+    --max-position-embeddings ${SEQ_LEN} \
+    --seq-length ${SEQ_LEN} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --micro-batch-size ${MBS} \
+    --swiglu \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --load ${CHECKPOINT} \
+    --normalization RMSNorm \
+    --position-embedding-type rope \
+    --norm-epsilon 1e-6 \
+    --hidden-dropout 0 \
+    --attention-dropout 0 \
+    --tokenizer-not-use-fast \
+    --max-new-tokens 256 \
+    --no-gradient-accumulation-fusion \
+    --exit-on-missing-checkpoint \
+    --attention-softmax-in-fp32 \
+    --seed 42 \
+    --bf16 \
+    | tee logs/generate_mcore_qwen2_0point5b.log

diff --git a/examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh b/examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh
new file mode 100644
index 000000000..ca94f1fa2
--- /dev/null
+++ b/examples/mcore/qwen2/pretrain_qwen2_0point5b_32K_ptd.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6011
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+CP=2
+MBS=1
+GBS=16
+SEQ_LEN=32768
+CP_ALGO=megatron_cp_algo
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ${CP_ALGO} \
+    --sequence-parallel \
+    --use-distributed-optimizer \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --rotary-base 1000000 \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-06 \
+    --swiglu \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --lr 1.25e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --lr-warmup-fraction 0.01 \
+    --init-method-std 0.01 \
+    --position-embedding-type rope \
+    --use-fused-rmsnorm \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-mc2 \
+    --overlap-grad-reduce \
+    --use-flash-attn \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --initial-loss-scale 4096 \
+    --no-gradient-accumulation-fusion \
+    --rope-scaling-factor 8 \
+    --rope-scaling-original-max-position-embeddings 4096 \
+    --rope-scaling-type yarn \
+    --no-load-optim \
+    --no-load-rng \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen2_0point5b_32k.log

diff --git a/examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh b/examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh
new file mode 100644
index 000000000..5459758ef
--- /dev/null
+++ b/examples/mcore/qwen2/pretrain_qwen2_0point5b_4K_ptd.sh
@@ -0,0 +1,106 @@
+#!/bin/bash
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+NPUS_PER_NODE=8
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NNODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# please fill these path configurations
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+CKPT_LOAD_DIR="your model ckpt path"
+
+TP=1
+PP=1
+MBS=2
+GBS=128
+SEQ_LEN=4096
+
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --sequence-parallel \
+    --use-distributed-optimizer \
+    --num-layers 24 \
+    --hidden-size 896 \
+    --ffn-hidden-size 4864 \
+    --num-attention-heads 14 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --lr 1.25e-6 \
+    --train-iters 2000 \
+    --lr-decay-style cosine \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-06 \
+    --swiglu \
+    --use-fused-swiglu \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-rmsnorm \
+    --use-mc2 \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --initial-loss-scale 4096 \
+    --overlap-grad-reduce \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 0 \
+"
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load ${CKPT_LOAD_DIR} \
+    --save ${CKPT_SAVE_DIR} \
+    | tee logs/train_mcore_qwen2_0point5b_4k.log
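Editor's usage sketch (not part of the patch): the two pretraining scripts above use different parallel layouts, and the 32K variant adds context parallelism (CP=2) on top of TP=PP=1. Megatron-style launchers typically assert that the global batch size is a multiple of micro batch size times data-parallel size, so a quick check of the layout before launching can save a failed run. The snippet below copies the values from pretrain_qwen2_0point5b_32K_ptd.sh; edit them to match whichever script you actually run.

#!/bin/bash
# Sketch only: sanity-check the parallel layout of the 32K pretraining script.
NPUS_PER_NODE=8
NNODES=1
TP=1; PP=1; CP=2          # tensor / pipeline / context parallel sizes from the 32K script
MBS=1; GBS=16             # micro and global batch sizes from the 32K script

WORLD_SIZE=$((NPUS_PER_NODE * NNODES))
DP=$((WORLD_SIZE / (TP * PP * CP)))   # data-parallel degree: 8 / (1 * 1 * 2) = 4
echo "world size ${WORLD_SIZE}, data parallel ${DP}"

# Megatron-style training expects GBS to be a multiple of MBS * DP (16 % 4 == 0 here).
if (( GBS % (MBS * DP) != 0 )); then
    echo "invalid config: GBS=${GBS} is not divisible by MBS*DP=$((MBS * DP))" >&2
    exit 1
fi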