!1793 添加Qwen2.5-1.5B模型

Merge pull request !1793 from caoruichao/master
2024-12-05 05:17:40 +08:00 · 2024-10-26 06:38:36 +00:00 · 2024-10-26 06:38:36 +00:00 · 45ca70b2aa
commit 45ca70b2aa
parent bf92daf093
8 changed files with 255 additions and 6 deletions
--- a/docs/models/evaluation.md
+++ b/docs/models/evaluation.md
@ -42,8 +42,8 @@ MindSpeed-LLM 已支持的大模型评估数据统计如下：
 | QWen2-0.5B    | MMLU   | 44.6%     | [45.4%](https://qwenlm.github.io/zh/blog/qwen2/)                     | QWen2-1.5B       | MMLU   | 54.7%     | [56.5%](https://qwenlm.github.io/zh/blog/qwen2/)                                  |
 | QWen2-7B      | MMLU   | 70.3%     | [70.3%](https://qwenlm.github.io/zh/blog/qwen2/)                     | QWen2-57B-A14B   |MMLU|75.6% | [76.5%](https://qwenlm.github.io/zh/blog/qwen2/)|
 | QWen2-72B     | MMLU   | 83.6%     | [84.2%](https://qwenlm.github.io/zh/blog/qwen2/)| MiniCPM-2B       | MMLU   | 51.6%     | [53.4%](https://github.com/OpenBMB/MiniCPM?tab=readme-ov-file#3)     |                                                                               
-| DeepSeek-V2-Lite-16B   | MMLU   | 57.4%     | [58.3%](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)      |
-| QWen2.5-3B            | MMLU   | 65.6%     | [65.6%](https://qwenlm.github.io/blog/qwen2.5-llm/)                   | QWen2.5-7B      | MMLU   | 73.8%     | [74.2%](https://qwenlm.github.io/blog/qwen2.5-llm/)                     | 
+| DeepSeek-V2-Lite-16B   | MMLU   | 57.4%     | [58.3%](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite)      | QWen2.5-1.5B     | MMLU   | 59.4%     | [60.9%](https://qwenlm.github.io/blog/qwen2.5-llm/)                     |
+| QWen2.5-3B            | MMLU   | 65.6%     | [65.6%](https://qwenlm.github.io/blog/qwen2.5-llm/)                   | QWen2.5-7B       | MMLU   | 73.8%     | [74.2%](https://qwenlm.github.io/blog/qwen2.5-llm/)                     | 
 | QWen2.5-14B           | MMLU   | 79.4%   | [79.7%](https://qwenlm.github.io/blog/qwen2.5-llm/)                   | 

 具体的权重转换功能命令介绍见 [MindSpeed-LLM 大模型使用指南分布式评估](../USER_GUIDE.md/#大模型分布式评估)章节.
--- a/docs/models/pretrain.md
+++ b/docs/models/pretrain.md
@ -449,7 +449,15 @@
      <td>【Test】</td>
    </tr>
    <tr>
-      <td rowspan="4"><a href="https://huggingface.co/Qwen">Qwen2.5</a></td>
+      <td rowspan="5"><a href="https://huggingface.co/Qwen">Qwen2.5</a></td>
+      <td><a href="https://huggingface.co/Qwen/Qwen2.5-1.5B/tree/main">1.5B</a></td>
+      <td> 32K </td>
+      <th>Mcore</th>
+      <td>1x8</td>
+      <td>【GTS】</td>
+      <td>【Test】</td>
+    </tr>
+    <tr>
      <td><a href="https://huggingface.co/Qwen/Qwen2.5-3B/tree/main">3B</a></td>
      <td> 32K </td>
      <th>Mcore</th>
--- a/examples/mcore/qwen25/evaluate_qwen25_1point5b_ptd.sh
+++ b/examples/mcore/qwen25/evaluate_qwen25_1point5b_ptd.sh
@ -0,0 +1,77 @@
+#!/bin/bash
+# Evaluate a Qwen2.5-1.5B Mcore checkpoint on the task named by TASK by launching
+# evaluation.py through torchrun, mirroring all output into a log file under logs/.
+# Fill in CHECKPOINT, TOKENIZER_PATH and DATA_PATH below before running.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6003
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+# NPUS_PER_NODE * NNODES; not referenced again in this script.
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# Please fill in these path configurations.
+CHECKPOINT="Your ckpt file path"
+TOKENIZER_PATH="Your vocab file path"
+DATA_PATH="Your data path (such as ./mmlu/test/)"
+TASK="mmlu"
+
+TP=1
+PP=1
+MBS=1
+SEQ_LEN=32768
+
+# Expanded unquoted on the torchrun line so that it splits into separate options.
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# tee cannot create the log file if the logs directory does not exist yet.
+mkdir -p logs
+
+# Different tasks need different --max-new-tokens values; please follow the instructions in the readme.
+# Path-like values are quoted so that paths containing spaces stay a single argument.
+torchrun $DISTRIBUTED_ARGS evaluation.py \
+       --use-mcore-models \
+       --task-data-path "$DATA_PATH" \
+       --task "${TASK}" \
+       --tensor-model-parallel-size ${TP} \
+       --pipeline-model-parallel-size ${PP} \
+       --micro-batch-size ${MBS}  \
+       --seq-length ${SEQ_LEN} \
+       --max-position-embeddings ${SEQ_LEN} \
+       --tokenizer-type PretrainedFromHF  \
+       --tokenizer-name-or-path "${TOKENIZER_PATH}" \
+       --max-new-tokens 1 \
+       --make-vocab-size-divisible-by 1 \
+       --padded-vocab-size 151936 \
+       --rotary-base 1000000 \
+       --num-layers 28  \
+       --hidden-size 1536  \
+       --ffn-hidden-size 8960 \
+       --num-attention-heads 12 \
+       --group-query-attention \
+       --num-query-groups 2 \
+       --add-qkv-bias \
+       --disable-bias-linear \
+       --swiglu \
+       --position-embedding-type rope \
+       --load "${CHECKPOINT}" \
+       --normalization RMSNorm \
+       --norm-epsilon 1e-06 \
+       --tokenizer-not-use-fast \
+       --exit-on-missing-checkpoint \
+       --no-load-rng \
+       --no-load-optim \
+       --no-gradient-accumulation-fusion \
+       --attention-softmax-in-fp32 \
+       --seed 42 \
+       --bf16 \
+       --no-chat-template \
+       | tee "logs/eval_mcore_qwen25_1point5b_${TASK}.log"
--- a/examples/mcore/qwen25/evaluate_qwen25_32b_ptd.sh
+++ b/examples/mcore/qwen25/evaluate_qwen25_32b_ptd.sh
@ -36,7 +36,7 @@ torchrun $DISTRIBUTED_ARGS evaluation.py \
       --seq-length ${SEQ_LENGTH} \
       --max-position-embeddings ${SEQ_LENGTH} \
       --max-new-tokens 1 \
-       --num-layers 60  \
+       --num-layers 64  \
       --hidden-size 5120  \
       --ffn-hidden-size 27648 \
       --num-attention-heads 40  \
--- a/examples/mcore/qwen25/generate_qwen25_1point5b_ptd.sh
+++ b/examples/mcore/qwen25/generate_qwen25_1point5b_ptd.sh
@ -0,0 +1,71 @@
+#!/bin/bash
+# Run text generation with a Qwen2.5-1.5B Mcore checkpoint by launching inference.py
+# through torchrun, mirroring all output into logs/generate_mcore_qwen25_1point5b.log.
+# Fill in CHECKPOINT and TOKENIZER_PATH below before running.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6002
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=1
+# NPUS_PER_NODE * NNODES; not referenced again in this script.
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# Please fill in these path configurations.
+CHECKPOINT="your model ckpt path"
+TOKENIZER_PATH="your tokenizer path"
+
+TP=1
+PP=1
+MBS=1
+SEQ_LEN=32768
+
+# Expanded unquoted on the torchrun line so that it splits into separate options.
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+# tee cannot create the log file if the logs directory does not exist yet.
+mkdir -p logs
+
+# Path-like values are quoted so that paths containing spaces stay a single argument.
+torchrun $DISTRIBUTED_ARGS inference.py \
+       --use-mcore-models \
+       --tensor-model-parallel-size ${TP} \
+       --pipeline-model-parallel-size ${PP} \
+       --num-layers 28 \
+       --hidden-size 1536  \
+       --ffn-hidden-size 8960 \
+       --num-attention-heads 12 \
+       --group-query-attention \
+       --num-query-groups 2 \
+       --tokenizer-type PretrainedFromHF  \
+       --tokenizer-name-or-path "${TOKENIZER_PATH}" \
+       --max-position-embeddings ${SEQ_LEN} \
+       --seq-length ${SEQ_LEN} \
+       --make-vocab-size-divisible-by 1 \
+       --padded-vocab-size 151936 \
+       --rotary-base 1000000 \
+       --micro-batch-size ${MBS} \
+       --swiglu \
+       --add-qkv-bias \
+       --disable-bias-linear \
+       --load "${CHECKPOINT}"  \
+       --normalization RMSNorm \
+       --norm-epsilon 1e-6 \
+       --position-embedding-type rope \
+       --hidden-dropout 0 \
+       --attention-dropout 0 \
+       --tokenizer-not-use-fast \
+       --max-new-tokens 256 \
+       --no-gradient-accumulation-fusion \
+       --exit-on-missing-checkpoint \
+       --attention-softmax-in-fp32 \
+       --seed 42 \
+       --bf16 \
+       | tee logs/generate_mcore_qwen25_1point5b.log
--- a/examples/mcore/qwen25/generate_qwen25_32b_ptd.sh
+++ b/examples/mcore/qwen25/generate_qwen25_32b_ptd.sh
@ -30,7 +30,7 @@ torchrun $DISTRIBUTED_ARGS inference.py \
       --tensor-model-parallel-size ${TP} \
       --pipeline-model-parallel-size ${PP} \
       --load ${CHECKPOINT} \
-       --num-layers 60 \
+       --num-layers 64 \
       --hidden-size 5120  \
       --num-attention-heads 40  \
       --group-query-attention \
--- a/examples/mcore/qwen25/pretrain_qwen25_1point5b_32K_ptd.sh
+++ b/examples/mcore/qwen25/pretrain_qwen25_1point5b_32K_ptd.sh
@ -0,0 +1,119 @@
+#!/bin/bash
+# Launch pretrain_gpt.py through torchrun for Qwen2.5-1.5B (Mcore) with a 32768-token
+# sequence length, mirroring all output into logs/train_mcore_qwen25_1point5b_32k.log.
+# Fill in CKPT_LOAD_DIR, CKPT_SAVE_DIR, DATA_PATH and TOKENIZER_PATH below before running.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6001
+NNODES=1
+NODE_RANK=0
+NPUS_PER_NODE=8
+# NPUS_PER_NODE * NNODES; not referenced again in this script.
+WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
+
+# Please fill in these path configurations.
+CKPT_LOAD_DIR="your model ckpt path"
+CKPT_SAVE_DIR="your model save ckpt path"
+DATA_PATH="your data path"
+TOKENIZER_PATH="your tokenizer path"
+
+TP=1
+PP=1
+CP=4
+MBS=1
+GBS=16
+SEQ_LEN=32768
+CP_ALGO=megatron_cp_algo
+
+# The *_ARGS strings below are expanded unquoted on the torchrun line so that they split
+# into separate options; values embedded in them (e.g. TOKENIZER_PATH) must not contain whitespace.
+DISTRIBUTED_ARGS="
+    --nproc_per_node $NPUS_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $NODE_RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT
+"
+
+GPT_ARGS="
+    --use-mcore-models \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size ${PP} \
+    --context-parallel-size ${CP} \
+    --context-parallel-algo ${CP_ALGO} \
+    --sequence-parallel \
+    --num-layers 28 \
+    --hidden-size 1536 \
+    --ffn-hidden-size 8960 \
+    --num-attention-heads 12 \
+    --group-query-attention \
+    --num-query-groups 2 \
+    --tokenizer-type PretrainedFromHF \
+    --tokenizer-name-or-path ${TOKENIZER_PATH} \
+    --seq-length ${SEQ_LEN} \
+    --max-position-embeddings ${SEQ_LEN} \
+    --micro-batch-size ${MBS} \
+    --global-batch-size ${GBS} \
+    --make-vocab-size-divisible-by 1 \
+    --padded-vocab-size 151936 \
+    --rotary-base 1000000 \
+    --train-iters 2000 \
+    --lr 1.25e-6 \
+    --min-lr 1.25e-7 \
+    --weight-decay 1e-1 \
+    --lr-decay-style cosine \
+    --lr-warmup-fraction 0.01 \
+    --clip-grad 1.0 \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --add-qkv-bias \
+    --disable-bias-linear \
+    --attention-dropout 0.0 \
+    --init-method-std 0.01 \
+    --hidden-dropout 0.0 \
+    --position-embedding-type rope \
+    --normalization RMSNorm \
+    --norm-epsilon 1e-06 \
+    --swiglu \
+    --use-distributed-optimizer \
+    --use-flash-attn \
+    --use-fused-rotary-pos-emb \
+    --use-rotary-position-embeddings \
+    --use-fused-swiglu \
+    --use-fused-rmsnorm \
+    --overlap-grad-reduce \
+    --no-masked-softmax-fusion \
+    --attention-softmax-in-fp32 \
+    --initial-loss-scale 4096 \
+    --no-gradient-accumulation-fusion \
+    --no-load-optim \
+    --no-load-rng \
+    --seed 42 \
+    --bf16
+"
+
+DATA_ARGS="
+    --data-path $DATA_PATH \
+    --split 100,0,0
+"
+
+OUTPUT_ARGS="
+    --log-interval 1 \
+    --save-interval 1000 \
+    --eval-interval 1000 \
+    --eval-iters 0
+"
+
+# tee cannot create the log file if the logs directory does not exist yet.
+mkdir -p logs
+
+torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --distributed-backend nccl \
+    --load "${CKPT_LOAD_DIR}" \
+    --save "${CKPT_SAVE_DIR}" \
+    | tee logs/train_mcore_qwen25_1point5b_32k.log
--- a/examples/mcore/qwen25/pretrain_qwen25_3b_32K_ptd.sh
+++ b/examples/mcore/qwen25/pretrain_qwen25_3b_32K_ptd.sh
@ -18,7 +18,7 @@ TOKENIZER_PATH="your tokenizer path"
 TP=1
 PP=2
 CP=4
-MBS=4
+MBS=1
 GBS=16
 SEQ_LEN=32768
 CP_ALGO=megatron_cp_algo