mirror of
https://gitee.com/ascend/ModelLink.git
synced 2024-12-05 05:17:40 +08:00
!236 修改LLama7B/13B 书生模型超参,提升性能
Merge pull request !236 from Kingsleyandher/master
This commit is contained in:
commit
02cb18d70d
@@ -1117,7 +1117,7 @@ def _add_activation_checkpoint_args(parser):
|
||||
help='does a synchronize at the beginning and end of each checkpointed layer.')
|
||||
group.add_argument('--profile-backward', action='store_true',
|
||||
help='Enables backward pass profiling for checkpointed layers.')
|
||||
-group.add_argument('--checkpoint_policy', type=str, default='full', choices=['full', 'block', 'custom'],
|
||||
+group.add_argument('--checkpoint-policy', type=str, default='full', choices=['full', 'block', 'custom'],
|
||||
help="activation checkpoint policy")
|
||||
group.add_argument('--checkpoint_block_layer', type=int, default=25,
|
||||
help="activation checkpoint block layer number")
|
||||
|
@@ -109,7 +109,7 @@ deepspeed pretrain_llama.py \
|
||||
--tokenizer-name-or-path $TOKENIZER_PATH \
|
||||
--tokenizer-not-use-fast \
|
||||
--checkpoint-activations \
|
||||
---checkpoint_policy block \
|
||||
+--checkpoint-policy block \
|
||||
--checkpoint_block_layer 8 \
|
||||
--data-impl mmap \
|
||||
--split 949,50,1 \
|
||||
|
@@ -100,7 +100,7 @@ deepspeed pretrain_baichuan.py \
|
||||
--eval-interval 1000 \
|
||||
--eval-iters 1 \
|
||||
--checkpoint-activations \
|
||||
---checkpoint_policy block \
|
||||
+--checkpoint-policy block \
|
||||
--checkpoint_block_layer 30 \
|
||||
--triangle-attn \
|
||||
$ds_args \
|
||||
|
@@ -89,5 +89,6 @@ deepspeed pretrain_intern.py \
|
||||
--eval-interval 1000 \
|
||||
--eval-iters 10 \
|
||||
--triangle-attn \
|
||||
--use-fused-rotary-pos-emb \
|
||||
$ds_args \
|
||||
--fp16 | tee logs/train.log
|
@@ -53,4 +53,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
|
||||
--checkpoint-policy custom \
|
||||
--recomputation-layer-num 3 2 1 0 0 0 0 0 \
|
||||
--triangle-attn \
|
||||
--use-fused-rotary-pos-emb \
|
||||
--fp16 | tee logs/train_13B.log
|
||||
|
@@ -93,5 +93,7 @@ deepspeed pretrain_llama.py \
|
||||
--save-interval 10000 \
|
||||
--eval-interval 1000 \
|
||||
--eval-iters 10 \
|
||||
--triangle-attn \
|
||||
--use-fused-rotary-pos-emb \
|
||||
$ds_args \
|
||||
--fp16 | tee logs/train_13B.log
|
||||
|
@@ -92,5 +92,7 @@ deepspeed pretrain_llama.py \
|
||||
--save-interval 10000 \
|
||||
--eval-interval 1000 \
|
||||
--eval-iters 10 \
|
||||
--triangle-attn \
|
||||
--use-fused-rotary-pos-emb \
|
||||
$ds_args \
|
||||
--fp16 | tee logs/train_7B.log
|
||||
|
Loading…
Reference in New Issue
Block a user