diff --git a/README.md b/README.md
index 3140a02a8..a7ac748eb 100644
--- a/README.md
+++ b/README.md
@@ -384,16 +384,16 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
 7B
 1x8
 BF16
- 2884
- 2884
+ 4200
+ 3850
 训练
 13B
 1x8
 BF16
- 1550
- 1750
+ 1990
+ 1920
 训练
diff --git a/README_en.md b/README_en.md
index e2612ebe9..4ca12f9f6 100644
--- a/README_en.md
+++ b/README_en.md
@@ -385,16 +385,16 @@ For the supported models listed above, we provide training scripts and readme in
 7B
 1x8
 BF16
- 2884
- 2884
+ 4200
+ 3850
 train
 13B
 1x8
 BF16
- 1550
- 1750
+ 1990
+ 1920
 train
diff --git a/examples/llama2/README.md b/examples/llama2/README.md
index fa1cb38e5..15feb6544 100755
--- a/examples/llama2/README.md
+++ b/examples/llama2/README.md
@@ -115,8 +115,8 @@ LLAMA2-7B 训练的硬件配置:
     --model-type GPT \
     --loader llama2_hf \
     --saver megatron \
-    --target-tensor-parallel-size 8 \
-    --target-pipeline-parallel-size 1 \
+    --target-tensor-parallel-size 1 \
+    --target-pipeline-parallel-size 2 \
     --load-dir ./model_from_hf/llama-2-7b-hf/ \
     --save-dir ./model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
     --tokenizer-model ./model_from_hf/llama-2-7b-hf/tokenizer.model
@@ -253,10 +253,10 @@ python tools/checkpoint/convert_ckpt.py \

 LLaMA2-7B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:

-| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/step) | tokens吞吐 (tokens/s/p) | 单步迭代时间 (s/step) | 浮点计算数 (TFLOPs/s) |
+| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/p/s) | tokens吞吐 (tokens/s/p) | 单步迭代时间 (s/step) | 浮点计算数 (TFLOPs/s) |
 | :--: | :-------: | :----: | :---------------------: | :---------------------: | :-------------------: | :-------------------: |
-| NPUs | LLaMA2-7B | 1024 | 5.63 | 2730 | 2.84 | 131.96 |
-| 参考 | LLaMA2-7B | 1024 | 5.63 | 2884 | 2.84 | 131.96 |
+| NPUs | LLaMA2-7B | 1024 | 1.03 | 4241 | 30.9 | 131.96 |
+| 参考 | LLaMA2-7B | 1024 | 0.939 | 3850 | 34.06 | 131.96 |

 ## 推理-7B
@@ -603,8 +603,8 @@ LLaMA2-13B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:

 | 设备 | 模型 | 迭代数 | 样本吞吐 (samples/p/s) | tokens吞吐 (tokens/s/p) | 单步迭代时间 (s/step) | 浮点计算数 (TFLOPs/s) |
 | :--: | :--------: | :----: | :--------------------: | :---------------------: | :-------------------: | :-------------------: |
-| NPUs | LLaMA2-13B | 5000 | 3.027 | 1550 | 5.285 | 133.77 |
-| 参考 | LLaMA2-13B | -- | -- | 1750 | -- | -- |
+| NPUs | LLaMA2-13B | 5000 | -- | 1990 | 131.71 | -- |
+| 参考 | LLaMA2-13B | -- | -- | 1920 | -- | -- |

 ## 推理
diff --git a/examples/llama2/README_en.md b/examples/llama2/README_en.md
index eed8b57bb..29990a28f 100644
--- a/examples/llama2/README_en.md
+++ b/examples/llama2/README_en.md
@@ -128,8 +128,8 @@ Here's a hardware summary of pre-training LLAMA2-7B:
     --model-type GPT \
     --loader llama2_hf \
     --saver megatron \
-    --target-tensor-parallel-size 8 \
-    --target-pipeline-parallel-size 1 \
+    --target-tensor-parallel-size 1 \
+    --target-pipeline-parallel-size 2 \
     --load-dir ./model_from_hf/llama-2-7b-hf/ \
     --save-dir ./model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
     --tokenizer-model ./model_from_hf/llama-2-7b-hf/tokenizer.model
@@ -274,8 +274,8 @@ The performance of LLaMA2-7B in **Ascend NPU** and **Reference**:

 | Device | Model | total Iterations | throughput rate (samples/s/p) | throughput rate (tokens/s/p) | single-step time (s/step) | floating point operation (TFLOPs/s) |
 | :------: | :-----------: | :----------------: | :-----------------------------: | :----------------------------: | :-------------------------: | :-----------------------------------: |
-| NPUs | LLaMA2-7B | 1024 | 5.19 | 2730 | 3.08 | 122.39 |
-| Reference | LLaMA2-7B | 1024 | 5.63 | 2884 | 2.84 | 131.96 |
+| NPUs | LLaMA2-7B | 1024 | 1.03 | 4241 | 30.9 | 122.39 |
+| Reference | LLaMA2-7B | 1024 | 0.939 | 3850 | 34.06 | 131.96 |
@@ -618,8 +618,8 @@ The performance of LLaMA2-13B in **Ascend NPU** and **Reference**:

 | Device | Model | total Iterations | throughput rate (samples/s/p) | throughput rate (tokens/s/p) | single-step time (s/step) | floating point operation (TFLOPs/s) |
 | :-------: | :--------: | :--------------: | :---------------------------: | :--------------------------: | :-----------------------: | :---------------------------------: |
-| NPUs | LLaMA2-13B | 5000 | 3.027 | 1550 | 5.285 | 133.77 |
-| Reference | LLaMA2-13B | -- | -- | 1750 | -- | -- |
+| NPUs | LLaMA2-13B | 5000 | -- | 1990 | 65.870 | 133.77 |
+| Reference | LLaMA2-13B | -- | -- | 1920 | 68.267 | -- |

 ## Inference
diff --git a/examples/llama2/pretrain_llama2_13B_ptd_8p.sh b/examples/llama2/pretrain_llama2_13B_ptd_8p.sh
index fd31a0287..4264d4b6b 100644
--- a/examples/llama2/pretrain_llama2_13B_ptd_8p.sh
+++ b/examples/llama2/pretrain_llama2_13B_ptd_8p.sh
@@ -1,5 +1,4 @@
 #!/bin/bash
-
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NPU_ASD_ENABLE=0
@@ -37,8 +36,8 @@ GPT_ARGS="
     --tokenizer-model ${TOKENIZER_MODEL} \
     --seq-length 4096 \
     --max-position-embeddings 4096 \
-    --micro-batch-size 2 \
-    --global-batch-size 16 \
+    --micro-batch-size 4 \
+    --global-batch-size 512 \
     --make-vocab-size-divisible-by 1 \
     --lr 1e-6 \
     --train-iters 5000 \
@@ -66,6 +65,9 @@ GPT_ARGS="
     --load ${CKPT_LOAD_DIR} \
     --no-load-optim \
     --no-load-rng \
+    --use-fused-swiglu \
+    --use-fused-rotary-pos-emb \
+    --use-mc2 \
     --bf16
 "
diff --git a/examples/llama2/pretrain_llama2_7b_ptd.sh b/examples/llama2/pretrain_llama2_7b_ptd.sh
index 43044d608..955f68555 100644
--- a/examples/llama2/pretrain_llama2_7b_ptd.sh
+++ b/examples/llama2/pretrain_llama2_7b_ptd.sh
@@ -14,8 +14,8 @@ CKPT_SAVE_DIR="your model save ckpt path"
 DATA_PATH="your data path"
 TOKENIZER_MODEL="your tokenizer path"
 CKPT_LOAD_DIR="your model ckpt path"
-TP=8
-PP=1
+TP=1
+PP=2

 DISTRIBUTED_ARGS="
     --nproc_per_node $GPUS_PER_NODE \
@@ -37,8 +37,8 @@ GPT_ARGS="
     --tokenizer-model ${TOKENIZER_MODEL} \
     --seq-length 4096 \
     --max-position-embeddings 4096 \
-    --micro-batch-size 4 \
-    --global-batch-size 16 \
+    --micro-batch-size 1 \
+    --global-batch-size 256 \
     --make-vocab-size-divisible-by 1 \
     --lr 1.25e-6 \
     --train-iters 5000 \
@@ -65,6 +65,11 @@ GPT_ARGS="
     --no-gradient-accumulation-fusion \
     --no-load-optim \
     --no-load-rng \
+    --use-distributed-optimizer \
+    --use-fused-swiglu \
+    --use-fused-rotary-pos-emb \
+    --overlap-grad-reduce \
+    --overlap-param-gather \
     --bf16
 "
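Taken together, the `examples/llama2/README.md` and `README_en.md` hunks move the 7B weight conversion from TP=8/PP=1 to TP=1/PP=2. For reference, a minimal sketch of the full conversion command as it reads after this patch (paths are the README's own placeholders; the `--save-dir` name comes from an unchanged context line and still carries the old `tp8-pp1` suffix):

```bash
# Sketch of the patched conversion step: split the HuggingFace llama-2-7b
# checkpoint into Megatron format with tensor parallel 1 / pipeline parallel 2.
python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader llama2_hf \
    --saver megatron \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 2 \
    --load-dir ./model_from_hf/llama-2-7b-hf/ \
    --save-dir ./model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/llama-2-7b-hf/tokenizer.model
```

Since the directory name no longer matches the new parallel layout, renaming it (and the corresponding `CKPT_LOAD_DIR`) to something like `llama-2-7b-hf-v0.1-tp1-pp2` would be a reasonable follow-up.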
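The 7B performance tables also switch the first throughput column from `samples/step` to a per-device rate. A quick cross-check, assuming the columns relate as tokens/s/p = samples/s/p × seq-length and s/step = global-batch-size / (samples/s/p × world size), using the 7B script's patched `--seq-length 4096` / `--global-batch-size 256` on a 1x8 setup:

```bash
# Hypothetical sanity check of the new 7B NPU row
# (1.03 samples/s/p, 4241 tokens/s/p, 30.9 s/step).
awk 'BEGIN {
    sps  = 1.03    # samples/s/p from the patched table
    seq  = 4096    # --seq-length
    gbs  = 256     # --global-batch-size
    npus = 8       # 1x8 cluster
    printf "tokens/s/p ~ %.0f  (table: 4241)\n", sps * seq           # ~4219
    printf "s/step     ~ %.1f  (table: 30.9)\n", gbs / (sps * npus)  # ~31.1
}'
```

Both land within rounding of the table values, so the new units are internally consistent; the 13B row's 131.71 s/step likewise matches the 13B script's new `--global-batch-size 512` at 1990 tokens/s/p.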