!1580 add gpt4 moe drop

Merge pull request !1580 from LeiZhenzhen/master
LeiZhenzhen 2024-09-03 06:15:24 +00:00 committed by i-robot
parent 06acd01f78
commit 01b71a2a2e
6 changed files with 500 additions and 3 deletions

View File

@@ -796,6 +796,31 @@ ModelLink trains large language models through model parallelism and data parallelism; to demonstrate
<td><center>【Ascend】</td>
<td>【Test】</td>
</tr>
<tr>
<td rowspan="2">GPT4</td>
<td> 4x13B </td>
<td> 128K </td>
<th> Mcore </th>
<td> 8x8 </td>
<td> BF16 </td>
<td> 424 </td>
<td> 1066 </td>
<td> -- </td>
<td><center>【NAIE】</td>
<td>【Test】</td>
</tr>
<tr>
<td> 4x16B </td>
<td> 128K </td>
<th>Mcore</th>
<td> 8x8 </td>
<td> BF16 </td>
<td> 351 </td>
<td> 918 </td>
<td> -- </td>
<td><center>【Ascend】</td>
<td>【Test】</td>
</tr>
<tr>
<td rowspan="1"><a href="https://github.com/xai-org/grok-1">Grok1</a></td>
<td><a href="https://github.com/xai-org/grok-1">8x5B</a></td>

View File

@@ -0,0 +1,130 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=8
NODE_RANK=0
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
# Fill in the dataset, tokenizer, and checkpoint paths before launching
DATA_PATH="your data path"
VOCAB_FILE="your vocab file path"
MERGE_FILE="your merge file path"
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your save ckpt path"
TP=8
PP=2
EP=1
CP=4
CP_TYPE='megatron_cp_algo'
NUM_LAYERS=10
SEQ_LEN=131072
MBS=1
GBS=16
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
--node_rank $NODE_RANK
"
MOE_ARGS="
--num-experts 4 \
--expert-model-parallel-size ${EP} \
--moe-router-topk 2 \
--moe-router-load-balancing-type aux_loss \
--moe-aux-loss-coeff 0.01 \
--moe-permutation-async-comm \
--disable-bias-linear \
--moe-expert-capacity-factor 1.1 \
--moe-token-dispatcher-type alltoall \
--moe-pad-expert-input-to-capacity
"
GPT_ARGS="
--use-mcore-models \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--context-parallel-algo ${CP_TYPE} \
--use-cp-send-recv-overlap \
--sequence-parallel \
--num-layers ${NUM_LAYERS} \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--position-embedding-type rope \
--use-fused-rotary-pos-emb \
--tokenizer-type GPT2BPETokenizer \
--use-flash-attn \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--overlap-grad-reduce \
--overlap-param-gather \
--use-distributed-optimizer \
--recompute-granularity full \
--recompute-method block \
--recompute-num-layers ${NUM_LAYERS} \
--train-iters 2000 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--initial-loss-scale 4096 \
--init-method-std 0.006 \
--clip-grad 1.0 \
--lr 6.0e-5 \
--lr-decay-style cosine \
--min-lr 6.0e-6 \
--lr-warmup-fraction .001 \
--lr-decay-iters 430000 \
--no-gradient-accumulation-fusion \
--no-masked-softmax-fusion \
--no-bias-gelu-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--no-shared-storage \
--bf16
"
DATA_ARGS="
--data-path ${DATA_PATH} \
--vocab-file ${VOCAB_FILE} \
--merge-file ${MERGE_FILE} \
--split 949,50,1
"
CKPT_ARGS="
--load ${CKPT_LOAD_DIR} \
--no-load-optim \
--no-load-rng \
--no-save-optim \
--no-save-rng \
--seed 1234 \
--save ${CKPT_SAVE_DIR}
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 1 \
--log-throughput
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$MOE_ARGS \
$DATA_ARGS \
$CKPT_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
| tee logs/pretrain_gpt4_mcore_moe_drop_tp${TP}_pp${PP}_ep${EP}_cp${CP}_layer${NUM_LAYERS}.log
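
The "drop" in this feature comes from --moe-expert-capacity-factor combined with --moe-pad-expert-input-to-capacity: each expert keeps at most a fixed token budget per batch, overflow tokens are dropped, and under-filled experts are padded up to the budget. A minimal sketch of that budget, assuming the common formula capacity = ceil(tokens_routed / num_experts * capacity_factor) and ignoring how tokens are sharded across TP/CP ranks (the exact expression inside Megatron-Core/ModelLink may differ):

#!/bin/bash
# Illustrative sketch only, not part of the training script above.
SEQ_LEN=131072; MBS=1; TOPK=2; NUM_EXPERTS=4; CAPACITY_FACTOR=1.1
TOKENS_ROUTED=$((SEQ_LEN * MBS * TOPK))
CAPACITY=$(awk -v t="${TOKENS_ROUTED}" -v e="${NUM_EXPERTS}" -v f="${CAPACITY_FACTOR}" \
    'BEGIN { c = t / e * f; print int(c) + (c > int(c)) }')   # ceil(t / e * f)
echo "tokens routed per micro-batch: ${TOKENS_ROUTED}"
echo "per-expert capacity (kept tokens; overflow dropped, shortfall padded): ${CAPACITY}"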

View File

@@ -0,0 +1,129 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=8
NODE_RANK=0
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DATA_PATH="your data path"
VOCAB_FILE="your vocab file path"
MERGE_FILE="your merge file path"
CKPT_LOAD_DIR="your model ckpt path"
CKPT_SAVE_DIR="your save ckpt path"
TP=8
PP=2
EP=1
CP=8
CP_TYPE='megatron_cp_algo'
NUM_LAYERS=10
SEQ_LEN=131072
MBS=1
GBS=16
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
--node_rank $NODE_RANK
"
MOE_ARGS="
--num-experts 4 \
--expert-model-parallel-size ${EP} \
--moe-router-topk 2 \
--moe-router-load-balancing-type aux_loss \
--moe-aux-loss-coeff 0.01 \
--moe-permutation-async-comm \
--disable-bias-linear \
--moe-expert-capacity-factor 1.1 \
--moe-token-dispatcher-type alltoall \
--moe-pad-expert-input-to-capacity
"
GPT_ARGS="
--use-mcore-models \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--context-parallel-algo ${CP_TYPE} \
--use-cp-send-recv-overlap \
--sequence-parallel \
--num-layers ${NUM_LAYERS} \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--position-embedding-type rope \
--use-fused-rotary-pos-emb \
--tokenizer-type GPT2BPETokenizer \
--use-flash-attn \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--overlap-grad-reduce \
--overlap-param-gather \
--use-distributed-optimizer \
--reuse-fp32-param \
--train-iters 2000 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--initial-loss-scale 4096 \
--init-method-std 0.006 \
--clip-grad 1.0 \
--lr 6.0e-5 \
--lr-decay-style cosine \
--min-lr 6.0e-6 \
--lr-warmup-fraction .001 \
--lr-decay-iters 430000 \
--no-gradient-accumulation-fusion \
--no-masked-softmax-fusion \
--no-bias-gelu-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--no-shared-storage \
--bf16
"
DATA_ARGS="
--data-path ${DATA_PATH} \
--vocab-file ${VOCAB_FILE} \
--merge-file ${MERGE_FILE} \
--split 949,50,1
"
CKPT_ARGS="
--load ${CKPT_LOAD_DIR} \
--no-load-optim \
--no-load-rng \
--no-save-optim \
--no-save-rng \
--seed 1234 \
--save ${CKPT_SAVE_DIR}
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 1 \
--log-throughput
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$MOE_ARGS \
$DATA_ARGS \
$CKPT_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
| tee logs/pretrain_gpt4_mcore_moe_drop_tp${TP}_pp${PP}_ep${EP}_cp${CP}_layer${NUM_LAYERS}_C.log
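
This second launcher raises CP from 4 to 8 and replaces the full-recompute flags with --reuse-fp32-param; everything else mirrors the first script. Before launching either one, it is worth confirming that the requested parallel sizes fit the device count, since Megatron requires the world size to be divisible by TP * PP * CP, with the quotient becoming the data-parallel size. A small sanity-check sketch reusing the variables defined above (illustrative only, not part of the committed scripts):

#!/bin/bash
# Assumes TP, PP, CP, NPUS_PER_NODE and NNODES are set as in the launchers above.
WORLD_SIZE=$((NPUS_PER_NODE * NNODES))
MODEL_PARALLEL=$((TP * PP * CP))
if [ $((WORLD_SIZE % MODEL_PARALLEL)) -ne 0 ]; then
    echo "ERROR: WORLD_SIZE=${WORLD_SIZE} is not divisible by TP*PP*CP=${MODEL_PARALLEL}" >&2
    exit 1
fi
echo "data-parallel size: $((WORLD_SIZE / MODEL_PARALLEL))"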

View File

@@ -13,8 +13,8 @@
<th>Memory</th>
</tr>
<tr>
<td rowspan="7">ST</td>
<td rowspan="5">Pretrain</td>
<td rowspan="8">ST</td>
<td rowspan="6">Pretrain</td>
<td>Mcore</td>
<td>TP, PP, VPP, recompute, enable_recompute_layers_per_pp_rank</td>
<td><a href="st/shell_scripts/llama2_tp2_pp4_vpp2_ptd.sh">llama2_tp2_pp4_vpp2.sh</a></td>
@@ -33,7 +33,7 @@
<tr>
<td>Mcore</td>
<td>partial_rope</td>
<td><a href="st/shell_scripts//shell_scripts/chatglm3_tp1_pp2_rope.sh">chatglm3_tp1_pp2_rope.sh</a></td>
<td><a href="st/shell_scripts/shell_scripts/chatglm3_tp1_pp2_rope.sh">chatglm3_tp1_pp2_rope.sh</a></td>
<td>Y</td>
<td>Y</td>
<td>Y</td>
@@ -46,6 +46,14 @@
<td>Y</td>
<td>Y</td>
</tr>
<tr>
<td>Mcore</td>
<td>moe_expert_capacity_factor, moe_alltoall, pad_to_capacity, topk_softmax_with_capacity</td>
<td><a href="st/shell_scripts/gpt4_mcore_tp4_cp2_32k_moe_drop.sh">gpt4_mcore_tp4_cp2_32k_moe_drop.sh</a></td>
<td>Y</td>
<td>Y</td>
<td>Y</td>
</tr>
<tr>
<td>Legacy</td>
<td>TP, PP, VPP, SP, full recompute, fused_rmsnorm, fused_swiglu, fused_rope, overlap_grad_reduce</td>

View File

@@ -0,0 +1,78 @@
{
"lm loss": [
7.827757,
7.758149,
7.755681,
7.775759,
7.767005,
7.752036,
7.75515,
7.749176,
7.642195,
7.922398,
7.814493,
7.83521,
7.743257,
7.770623,
7.806659
],
"throughput": [
79.3,
121.8,
122.0,
122.3,
122.2,
121.9,
122.5,
122.4,
119.0,
121.2,
122.4,
122.4,
122.6,
122.1,
122.3
],
"memo info": [
{
"rank": 0,
"allocated memory": 33469.275390625,
"max allocated memory": 33756.32958984375
},
{
"rank": 1,
"allocated memory": 33469.275390625,
"max allocated memory": 33756.32958984375
},
{
"rank": 2,
"allocated memory": 33469.275390625,
"max allocated memory": 33756.32958984375
},
{
"rank": 3,
"allocated memory": 33469.275390625,
"max allocated memory": 33756.32958984375
},
{
"rank": 4,
"allocated memory": 33469.2548828125,
"max allocated memory": 33838.30322265625
},
{
"rank": 5,
"allocated memory": 33469.2548828125,
"max allocated memory": 33838.30322265625
},
{
"rank": 6,
"allocated memory": 33469.2548828125,
"max allocated memory": 33838.30322265625
},
{
"rank": 7,
"allocated memory": 33469.2548828125,
"max allocated memory": 33838.30322265625
}
]
}
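
The JSON above records the reference lm loss, throughput, and per-rank memory used as the baseline for the new ST case. As a rough illustration of how such a baseline can be consumed, the following sketch compares a measured final loss against the last baseline entry with a relative tolerance; the file name, tolerance, and jq-based parsing are assumptions for illustration, not the repository's actual ST comparison logic:

#!/bin/bash
# Illustrative only; requires jq.
BASELINE="gpt4_mcore_tp4_cp2_32k_moe_drop.json"   # hypothetical local copy of the baseline above
MEASURED_LOSS=7.81                                # e.g. taken from the new run's training log
EXPECTED_LOSS=$(jq -r '.["lm loss"] | last' "${BASELINE}")
if awk -v m="${MEASURED_LOSS}" -v e="${EXPECTED_LOSS}" \
    'BEGIN { d = (m > e ? m - e : e - m) / e; exit !(d <= 0.02) }'; then
    echo "lm loss within 2% of baseline (${EXPECTED_LOSS})"
else
    echo "lm loss deviates from baseline: expected ${EXPECTED_LOSS}, got ${MEASURED_LOSS}" >&2
fi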

View File

@@ -0,0 +1,127 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
NPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
WORLD_SIZE=$((NPUS_PER_NODE*$NNODES))
basepath=$(cd `dirname $0`; cd ../../../; pwd)
DATA_PATH=/data/gpt/gpt-wiki_text_document
VOCAB_FILE=/data/gpt/gpt2-vocab.json
MERGE_FILE=/data/gpt/gpt2-merges.txt
CKPT_LOAD_DIR=/data/gpt/gpt_drop_moe_32k_base
CKPT_SAVE_DIR=/data/gpt/gpt_drop_moe_32k_test
TP=4
PP=1
EP=1
CP=2
CP_TYPE='megatron_cp_algo'
NUM_LAYERS=2
SEQ_LEN=32768
MBS=1
GBS=4
DISTRIBUTED_ARGS="
--nproc_per_node $NPUS_PER_NODE \
--nnodes $NNODES \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
--node_rank $NODE_RANK
"
MOE_ARGS="
--num-experts 4 \
--expert-model-parallel-size ${EP} \
--moe-router-topk 2 \
--moe-router-load-balancing-type aux_loss \
--moe-aux-loss-coeff 0.01 \
--moe-permutation-async-comm \
--disable-bias-linear \
--moe-expert-capacity-factor 1.1 \
--moe-token-dispatcher-type alltoall \
--moe-pad-expert-input-to-capacity
"
GPT_ARGS="
--use-mcore-models \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--context-parallel-size ${CP} \
--context-parallel-algo ${CP_TYPE} \
--use-cp-send-recv-overlap \
--sequence-parallel \
--num-layers ${NUM_LAYERS} \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--position-embedding-type rope \
--use-fused-rotary-pos-emb \
--tokenizer-type GPT2BPETokenizer \
--use-flash-attn \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--overlap-grad-reduce \
--overlap-param-gather \
--use-distributed-optimizer \
--train-iters 65 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--initial-loss-scale 4096 \
--init-method-std 0.006 \
--clip-grad 1.0 \
--lr 6.0e-5 \
--lr-decay-style cosine \
--min-lr 6.0e-6 \
--lr-warmup-fraction .001 \
--lr-decay-iters 430000 \
--no-gradient-accumulation-fusion \
--no-masked-softmax-fusion \
--no-bias-gelu-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--no-shared-storage \
--bf16
"
DATA_ARGS="
--data-path ${DATA_PATH} \
--vocab-file ${VOCAB_FILE} \
--merge-file ${MERGE_FILE} \
--split 949,50,1
"
CKPT_ARGS="
--load ${CKPT_LOAD_DIR} \
--no-load-optim \
--no-load-rng \
--no-save-optim \
--no-save-rng \
--seed 1234 \
--save ${CKPT_SAVE_DIR}
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 500 \
--eval-interval 500 \
--eval-iters 1 \
--log-throughput
"
torchrun $DISTRIBUTED_ARGS $basepath/pretrain_gpt.py \
$GPT_ARGS \
$MOE_ARGS \
$DATA_ARGS \
$CKPT_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl
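
For reference, the batch layout this ST configuration implies follows from the values above: Megatron requires the global batch size to be divisible by the micro-batch size times the data-parallel size, and the quotient is the number of gradient-accumulation micro-steps per iteration. A small sketch reusing the script's variables (illustrative only):

#!/bin/bash
# With NPUS_PER_NODE=8, NNODES=1, TP=4, PP=1, CP=2, MBS=1, GBS=4 as set above.
DP=$((NPUS_PER_NODE * NNODES / (TP * PP * CP)))   # 8 / (4*1*2) = 1
ACC_STEPS=$((GBS / (MBS * DP)))                   # 4 / (1*1)   = 4
echo "data-parallel size: ${DP}"
echo "gradient-accumulation micro-batches per global batch: ${ACC_STEPS}"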