!236 Tune LLaMA 7B/13B and InternLM (书生) model hyperparameters to improve performance

Merge pull request !236 from Kingsleyandher/master
i-robot 2023-10-31 08:15:54 +00:00 committed by Gitee
commit 02cb18d70d
7 changed files with 9 additions and 3 deletions


@@ -1117,7 +1117,7 @@ def _add_activation_checkpoint_args(parser):
 help='does a synchronize at the beginning and end of each checkpointed layer.')
 group.add_argument('--profile-backward', action='store_true',
 help='Enables backward pass profiling for checkpointed layers.')
-group.add_argument('--checkpoint_policy', type=str, default='full', choices=['full', 'block', 'custom'],
+group.add_argument('--checkpoint-policy', type=str, default='full', choices=['full', 'block', 'custom'],
 help="activation checkpoint policy")
 group.add_argument('--checkpoint_block_layer', type=int, default=25,
 help="activation checkpoint block layer number")


@@ -109,7 +109,7 @@ deepspeed pretrain_llama.py \
 --tokenizer-name-or-path $TOKENIZER_PATH \
 --tokenizer-not-use-fast \
 --checkpoint-activations \
---checkpoint_policy block \
+--checkpoint-policy block \
 --checkpoint_block_layer 8 \
 --data-impl mmap \
 --split 949,50,1 \
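
The training scripts pair `--checkpoint-policy block` with `--checkpoint_block_layer N`. Below is a minimal PyTorch sketch of that kind of selective activation checkpointing, under the assumption that the "block" policy recomputes only the first N transformer layers and keeps activations for the rest; the repo's actual implementation may differ in detail:

```python
import torch
from torch.utils.checkpoint import checkpoint

def forward_layers(hidden_states, layers, checkpoint_block_layer=8):
    """Run transformer layers, recomputing activations for the first
    `checkpoint_block_layer` layers only (assumed 'block' policy)."""
    for idx, layer in enumerate(layers):
        if idx < checkpoint_block_layer:
            # Activations of these layers are not stored; they are
            # recomputed during the backward pass to save memory.
            hidden_states = checkpoint(layer, hidden_states, use_reentrant=False)
        else:
            # Later layers run normally and keep their activations.
            hidden_states = layer(hidden_states)
    return hidden_states
```

Compared with the "full" policy, fewer layers are recomputed, so the memory saving is smaller but the backward pass is cheaper.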


@@ -100,7 +100,7 @@ deepspeed pretrain_baichuan.py \
 --eval-interval 1000 \
 --eval-iters 1 \
 --checkpoint-activations \
---checkpoint_policy block \
+--checkpoint-policy block \
 --checkpoint_block_layer 30 \
 --triangle-attn \
 $ds_args \


@@ -89,5 +89,6 @@ deepspeed pretrain_intern.py \
 --eval-interval 1000 \
 --eval-iters 10 \
 --triangle-attn \
+--use-fused-rotary-pos-emb \
 $ds_args \
 --fp16 | tee logs/train.log
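
`--use-fused-rotary-pos-emb` presumably routes the rotary position embedding through a fused operator. As a reference for the math such an operator computes, here is a plain, unfused rotary-embedding sketch in PyTorch; the fused op's name and signature are hardware/repo specific and are not shown:

```python
import torch

def rotate_half(x):
    # Split the last dimension in two and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin):
    """Unfused rotary position embedding; cos/sin have shape
    [seq_len, head_dim] and broadcast over batch and head dimensions."""
    q_rot = q * cos + rotate_half(q) * sin
    k_rot = k * cos + rotate_half(k) * sin
    return q_rot, k_rot

def build_rope_cache(seq_len, head_dim, base=10000.0):
    # Per-dimension-pair frequencies, then per-position rotation angles.
    inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
    t = torch.arange(seq_len).float()
    freqs = torch.outer(t, inv_freq)          # [seq_len, head_dim // 2]
    emb = torch.cat((freqs, freqs), dim=-1)   # [seq_len, head_dim]
    return emb.cos(), emb.sin()
```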


@@ -53,4 +53,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
 --checkpoint-policy custom \
 --recomputation-layer-num 3 2 1 0 0 0 0 0 \
 --triangle-attn \
+--use-fused-rotary-pos-emb \
 --fp16 | tee logs/train_13B.log
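
`--checkpoint-policy custom` is used here together with `--recomputation-layer-num 3 2 1 0 0 0 0 0`. A plausible reading (an assumption, not confirmed by this diff) is that the eight integers are per-layer-group or per-pipeline-stage recompute counts parsed as a multi-value flag, e.g.:

```python
import argparse

# Hypothetical, standalone parsing of the custom-policy flag; the real
# definition lives in the repo's argument parser and may differ.
parser = argparse.ArgumentParser()
parser.add_argument('--recomputation-layer-num', type=int, nargs='+',
                    help='recompute count per layer group / pipeline stage '
                         '(assumed interpretation)')

args = parser.parse_args('--recomputation-layer-num 3 2 1 0 0 0 0 0'.split())
# Under the assumed reading, the first group recomputes 3 layers, the next 2,
# then 1, and the remaining groups keep all of their activations.
print(args.recomputation_layer_num)  # -> [3, 2, 1, 0, 0, 0, 0, 0]
```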


@@ -93,5 +93,7 @@ deepspeed pretrain_llama.py \
 --save-interval 10000 \
 --eval-interval 1000 \
 --eval-iters 10 \
+--triangle-attn \
+--use-fused-rotary-pos-emb \
 $ds_args \
 --fp16 | tee logs/train_13B.log


@@ -92,5 +92,7 @@ deepspeed pretrain_llama.py \
 --save-interval 10000 \
 --eval-interval 1000 \
 --eval-iters 10 \
+--triangle-attn \
+--use-fused-rotary-pos-emb \
 $ds_args \
 --fp16 | tee logs/train_7B.log
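
`--triangle-attn`, newly enabled for the LLaMA 7B/13B scripts, presumably selects an optimized causal ("triangular"-masked) attention path. For reference, the plain PyTorch computation it would replace looks like this; an optimized kernel produces the same result without materializing the full mask:

```python
import math
import torch

def causal_attention(q, k, v):
    """Reference scaled dot-product attention with a lower-triangular
    (causal) mask; q, k, v: [batch, heads, seq_len, head_dim]."""
    head_dim = q.size(-1)
    seq_len = q.size(-2)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)
    # Lower-triangular mask: position i may only attend to positions <= i.
    causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool,
                                        device=q.device))
    scores = scores.masked_fill(~causal_mask, float('-inf'))
    probs = torch.softmax(scores, dim=-1)
    return torch.matmul(probs, v)
```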