!1828 mcore adaptation for the llama3.1 model

Merge pull request !1828 from njupt_sjj/master
njupt_sjj 2024-10-30 11:02:56 +00:00 committed by i-robot
parent 9711128526
commit ee59b5208d
2 changed files with 180 additions and 0 deletions

@@ -0,0 +1,64 @@
#!/bin/bash
# The number of parameters is not aligned
export CUDA_DEVICE_MAX_CONNECTIONS=1
# please fill these path configurations
CHECKPOINT="your model directory path"
CHECKPOINT_LORA="your lora model directory path"
TOKENIZER_PATH="your tokenizer directory path"
TOKENIZER_MODEL="your tokenizer.model file path"
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS inference.py \
--use-mcore-models \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 8 \
--num-layers 32 \
--hidden-size 4096 \
    --ffn-hidden-size 14336 \
--position-embedding-type rope \
--seq-length 8192 \
--max-new-tokens 256 \
--micro-batch-size 1 \
--global-batch-size 8 \
--num-attention-heads 32 \
--max-position-embeddings 8192 \
--swiglu \
--load "${CHECKPOINT}" \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path "${TOKENIZER_PATH}" \
--tokenizer-model "${TOKENIZER_MODEL}" \
--tokenizer-not-use-fast \
--bf16 \
--normalization RMSNorm \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-softmax-in-fp32 \
--no-load-optim \
--no-load-rng \
--no-masked-softmax-fusion \
--no-gradient-accumulation-fusion \
--exit-on-missing-checkpoint \
--lora-load ${CHECKPOINT_LORA} \
--lora-r 8 \
--lora-alpha 16 \
--lora-fusion \
--lora-target-modules linear_qkv linear_proj linear_fc1 linear_fc2 \
--make-vocab-size-divisible-by 1 \
--vocab-size 128256 \
--padded-vocab-size 128256 \
--rotary-base 500000.0 \
--group-query-attention \
--num-query-groups 8 \
| tee logs/generate_llama31_8b_lora.log
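
Note: the four placeholder variables at the top of this script must be edited in place before launching, and the logs/ directory must exist for the tee target. A minimal sketch of plausible values, assuming an already converted mcore base checkpoint (TP=1, PP=8), a LoRA checkpoint produced by the tuning script below, and a local Hugging Face tokenizer directory; all paths are illustrative assumptions, not part of this commit:

# Illustrative values only -- edit the variables inside the script.
CHECKPOINT="./model_weights/llama-3.1-8b-mcore/"            # converted mcore base weights (TP=1, PP=8)
CHECKPOINT_LORA="./ckpt/llama-3.1-8b-lora/"                 # LoRA weights saved by the tuning script below
TOKENIZER_PATH="./model_from_hf/llama-3.1-8b-hf/"           # Hugging Face tokenizer directory
TOKENIZER_MODEL="./model_from_hf/llama-3.1-8b-hf/tokenizer.json"
mkdir -p logs                                               # tee writes to logs/generate_llama31_8b_lora.log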

@@ -0,0 +1,116 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6024
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
CKPT_LOAD_DIR="your model ckpt path"
TOKENIZER_MODEL="your tokenizer path"
TP=1
PP=8
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT \
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--use-mcore-models \
--micro-batch-size 1 \
--global-batch-size 8 \
--sequence-parallel \
--use-flash-attn \
--prompt-type llama3 \
--variable-seq-lengths \
--use-rotary-position-embeddings \
--rope-scaling-type llama3 \
--rope-scaling-factor 8.0 \
--rotary-percent 1.0 \
--low-freq-factor 1.0 \
--high-freq-factor 4.0 \
--original-max-position-embeddings 8192 \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_MODEL} \
--tokenizer-not-use-fast \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--make-vocab-size-divisible-by 1 \
--padded-vocab-size 128256 \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.02 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--rotary-base 500000.0 \
--normalization RMSNorm \
--norm-epsilon 1e-5 \
--swiglu \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--lr 1e-6 \
--train-iters 2000 \
--lr-decay-style cosine \
--weight-decay 0.0 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.999 \
--initial-loss-scale 4096 \
--finetune \
--is-instruction-dataset \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--bf16 \
--seed 42 \
--vocab-size 128256 \
"
FINETUNE_ARGS="
--lora-r 8 \
--lora-alpha 16 \
--lora-fusion \
--lora-target-modules linear_qkv linear_proj linear_fc1 linear_fc2 \
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0 \
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 10 \
--load ${CKPT_LOAD_DIR} \
--save ${CKPT_SAVE_DIR} \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$FINETUNE_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
| tee logs/tune_llama31_8b_lora_ptd.log
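
The two scripts pair up: the LoRA weights written to CKPT_SAVE_DIR here are what the inference script above expects in CHECKPOINT_LORA, and both use the same TP=1/PP=8 layout, so no re-sharding is needed in between. A minimal sketch of plausible values, assuming a converted mcore base checkpoint and an already preprocessed instruction dataset; the dataset prefix and directory names are illustrative assumptions, not part of this commit:

# Illustrative values only -- edit the variables inside the script.
DATA_PATH="./finetune_dataset/alpaca"                       # prefix of a preprocessed instruction dataset (assumption)
TOKENIZER_MODEL="./model_from_hf/llama-3.1-8b-hf/"          # Hugging Face tokenizer directory
CKPT_LOAD_DIR="./model_weights/llama-3.1-8b-mcore/"         # converted mcore base weights (TP=1, PP=8)
CKPT_SAVE_DIR="./ckpt/llama-3.1-8b-lora/"                   # reused as CHECKPOINT_LORA by the generate script above
mkdir -p logs                                               # tee writes to logs/tune_llama31_8b_lora_ptd.log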