Mirror of https://gitee.com/ascend/ModelLink.git (synced 2024-12-05 05:17:40 +08:00)
!1238 Update the optimal performance configuration for LLaMA2 7B/13B
Merge pull request !1238 from wwzhuo/master
parent a9f905b63f
commit b2915bd2ab
@@ -384,16 +384,16 @@ ModelLink旨在为华为 [昇腾芯片](https://open.codehub.huawei.com/OpenBaiz
<td>7B</td>
<td>1x8</td>
<td>BF16 </td>
<td> 2884 </td>
<td> 2884 </td>
<td> 4200 </td>
<td> 3850 </td>
<td> <a href="examples/llama2/pretrain_llama2_7b_ptd.sh">训练</a> </td>
</tr>
<tr>
<td>13B</td>
<td>1x8</td>
<td>BF16 </td>
<td> 1550 </td>
<td> 1750 </td>
<td> 1990 </td>
<td> 1920 </td>
<td> <a href="examples/llama2/pretrain_llama2_13B_ptd_8p.sh">训练</a> </td>
</tr>
<tr>

@@ -385,16 +385,16 @@ For the supported models listed above, we provide training scripts and readme in
<td>7B</td>
<td>1x8</td>
<td>BF16 </td>
<td> 2884 </td>
<td> 2884 </td>
<td> 4200 </td>
<td> 3850 </td>
<td> <a href="examples/llama2/pretrain_llama2_7b_ptd.sh">train</a> </td>
</tr>
<tr>
<td>13B</td>
<td>1x8</td>
<td>BF16 </td>
<td> 1550 </td>
<td> 1750 </td>
<td> 1990 </td>
<td> 1920 </td>
<td> <a href="examples/llama2/pretrain_llama2_13B_ptd_8p.sh">train</a> </td>
</tr>
<tr>

@@ -115,8 +115,8 @@ LLAMA2-7B 训练的硬件配置:
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 2 \
--load-dir ./model_from_hf/llama-2-7b-hf/ \
--save-dir ./model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/llama-2-7b-hf/tokenizer.model

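For reference, a minimal sketch of the full conversion call with the updated parallel sizes, taking the second pair (TP=1, PP=2) as the new values since the training script further down makes the same TP/PP change. Every flag and path is copied from the hunk above plus the python tools/checkpoint/convert_ckpt.py context line below; the --save-dir name is kept exactly as it appears in the diff, so rename it if your layout encodes TP/PP differently:

python tools/checkpoint/convert_ckpt.py \
    --model-type GPT \
    --loader llama2_hf \
    --saver megatron \
    --target-tensor-parallel-size 1 \
    --target-pipeline-parallel-size 2 \
    --load-dir ./model_from_hf/llama-2-7b-hf/ \
    --save-dir ./model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
    --tokenizer-model ./model_from_hf/llama-2-7b-hf/tokenizer.model
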
@@ -253,10 +253,10 @@ python tools/checkpoint/convert_ckpt.py \

LLaMA2-7B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:

| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/step) | tokens吞吐 (tokens/s/p) | 单步迭代时间 (s/step) | 浮点计算数 (TFLOPs/s) |
| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/p/s) | tokens吞吐 (tokens/s/p) | 单步迭代时间 (s/step) | 浮点计算数 (TFLOPs/s) |
| :--: | :-------: | :----: | :---------------------: | :---------------------: | :-------------------: | :-------------------: |
| NPUs | LLaMA2-7B | 1024 | 5.63 | 2730 | 2.84 | 131.96 |
| 参考 | LLaMA2-7B | 1024 | 5.63 | 2884 | 2.84 | 131.96 |
| NPUs | LLaMA2-7B | 1024 | 1.03 | 4241 | 30.9 | 131.96 |
| 参考 | LLaMA2-7B | 1024 | 0.939 | 3850 | 34.06 | 131.96 |

## 推理-7B

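A rough consistency check on the updated rows, assuming the 4096-token sequence length set in the training scripts below: tokens/s/p ≈ samples/p/s × 4096, i.e. 1.03 × 4096 ≈ 4219 and 0.939 × 4096 ≈ 3846, which lines up with the reported 4241 and 3850 tokens/s/p.
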
@@ -603,8 +603,8 @@ LLaMA2-13B 在 **昇腾芯片** 和 **参考芯片** 上的性能对比:

| 设备 | 模型 | 迭代数 | 样本吞吐 (samples/p/s) | tokens吞吐 (tokens/s/p) | 单步迭代时间 (s/step) | 浮点计算数 (TFLOPs/s) |
| :--: | :--------: | :----: | :--------------------: | :---------------------: | :-------------------: | :-------------------: |
| NPUs | LLaMA2-13B | 5000 | 3.027 | 1550 | 5.285 | 133.77 |
| 参考 | LLaMA2-13B | -- | -- | 1750 | -- | -- |
| NPUs | LLaMA2-13B | 5000 | -- | 1990 | 131.71 | -- |
| 参考 | LLaMA2-13B | -- | -- | 1920 | -- | -- |

## 推理

@@ -128,8 +128,8 @@ Here's a hardware summary of pre-training LLAMA2-7B:
--model-type GPT \
--loader llama2_hf \
--saver megatron \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 2 \
--load-dir ./model_from_hf/llama-2-7b-hf/ \
--save-dir ./model_weights/llama-2-7b-hf-v0.1-tp8-pp1/ \
--tokenizer-model ./model_from_hf/llama-2-7b-hf/tokenizer.model

@@ -274,8 +274,8 @@ The performance of LLaMA2-7B in **Ascend NPU** and **Reference**:

| Device | Model | total Iterations | throughput rate (samples/s/p) | throughput rate (tokens/s/p) | single-step time (s/step) | floating point operation (TFLOPs/s) |
| :------: | :-----------: | :----------------: | :-----------------------------: | :----------------------------: | :-------------------------: | :-----------------------------------: |
| NPUs | LLaMA2-7B | 1024 | 5.19 | 2730 | 3.08 | 122.39 |
| Reference | LLaMA2-7B | 1024 | 5.63 | 2884 | 2.84 | 131.96 |
| NPUs | LLaMA2-7B | 1024 | 1.03 | 4241 | 30.9 | 122.39 |
| Reference | LLaMA2-7B | 1024 | 0.939 | 3850 | 34.06 | 131.96 |

@@ -618,8 +618,8 @@ The performance of LLaMA2-13B in **Ascend NPU** and **Reference**:

| Device | Model | total Iterations | throughput rate (samples/s/p) | throughput rate (tokens/s/p) | single-step time (s/step) | floating point operation (TFLOPs/s) |
| :-------: | :--------: | :--------------: | :---------------------------: | :--------------------------: | :-----------------------: | :---------------------------------: |
| NPUs | LLaMA2-13B | 5000 | 3.027 | 1550 | 5.285 | 133.77 |
| Reference | LLaMA2-13B | -- | -- | 1750 | -- | -- |
| NPUs | LLaMA2-13B | 5000 | -- | 1990 | 65.870 | 133.77 |
| Reference | LLaMA2-13B | -- | -- | 1920 | 68.267 | -- |

## Inference

@@ -1,5 +1,4 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0

@@ -37,8 +36,8 @@ GPT_ARGS="
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 2 \
--global-batch-size 16 \
--micro-batch-size 4 \
--global-batch-size 512 \
--make-vocab-size-divisible-by 1 \
--lr 1e-6 \
--train-iters 5000 \

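The batch-size change is easier to read with Megatron's usual bookkeeping: the global batch is split across data-parallel ranks and accumulated over micro-batches. A small illustrative bash sketch (not part of the commit's scripts; WORLD_SIZE, TP and PP are placeholder values to fill in from the script this hunk belongs to):

WORLD_SIZE=8    # one 1x8 node, per the README tables above
TP=1; PP=2      # placeholder values; use the TP/PP actually set in this script
MBS=4; GBS=512  # the new --micro-batch-size / --global-batch-size from this hunk
DP=$(( WORLD_SIZE / (TP * PP) ))   # data-parallel size
ACC=$(( GBS / (MBS * DP) ))        # gradient-accumulation steps per iteration
echo "DP=${DP}, grad_accum=${ACC}" # with these placeholders: DP=4, grad_accum=32
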
@@ -66,6 +65,9 @@ GPT_ARGS="
--load ${CKPT_LOAD_DIR} \
--no-load-optim \
--no-load-rng \
--use-fused-swiglu \
--use-fused-rotary-pos-emb \
--use-mc2 \
--bf16
"

@@ -14,8 +14,8 @@ CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"
CKPT_LOAD_DIR="your model ckpt path"
TP=8
PP=1
TP=1
PP=2

DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \

@@ -37,8 +37,8 @@ GPT_ARGS="
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 4 \
--global-batch-size 16 \
--micro-batch-size 1 \
--global-batch-size 256 \
--make-vocab-size-divisible-by 1 \
--lr 1.25e-6 \
--train-iters 5000 \

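Applying the same batch bookkeeping as in the sketch above to this script's new values (assuming Megatron's standard relation, which the hunk itself does not spell out): on one 1x8 node with TP=1 and PP=2 the data-parallel size is 8 / (1 × 2) = 4, so --global-batch-size 256 with --micro-batch-size 1 implies 256 / (1 × 4) = 64 gradient-accumulation steps per iteration.
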
@@ -65,6 +65,11 @@ GPT_ARGS="
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--use-distributed-optimizer \
--use-fused-swiglu \
--use-fused-rotary-pos-emb \
--overlap-grad-reduce \
--overlap-param-gather \
--bf16
"