llama 13b: add custom selective recomputation (user-chosen number of recomputed layers per pipeline stage), compatible with all previous logic.

l00619700 2023-10-20 11:05:38 +08:00
parent 5e27f2e9b6
commit a511221b5c
5 changed files with 32 additions and 91 deletions

View File

@@ -9,10 +9,10 @@ AscendSpeed provides end-to-end test scripts for large language models on [Ascen
Currently, the pre-training of the following models has been supported:
* <a href="https://github.com/baichuan-inc" style="color:green">Baichuan</a>-[[README: 7B/13B]](examples/baichuan/README.md)
* <a href="https://arxiv.org/pdf/2108.12409.pdf" style="color:green">Bloom</a>-[[README: 7B1/176B]](examples/bloom/README.md)
* <a href="https://internlm.intern-ai.org.cn/" style="color:green">InternLM</a>-7B
* <a href="https://huggingface.co/docs/transformers/main/model_doc/llama" style="color:green">LLaMA</a>-7B/13B/[[README: 65B]](examples/llama/README.md)
* <a href="https://huggingface.co/docs/transformers/main/model_doc/llama2" style="color:green">LLaMA2</a>-7B
* <a href="https://arxiv.org/pdf/2108.12409.pdf" style="color:green">Bloom</a>-[[README: 7B/176B]](examples/bloom/README.md)
* <a href="https://internlm.intern-ai.org.cn/" style="color:green">InternLM</a>-[[README: 7B]](examples/intern/README.md)
* <a href="https://huggingface.co/docs/transformers/main/model_doc/llama" style="color:green">LLaMA</a>-[[README: 7B/13B/65B]](examples/llama/README.md)
* <a href="https://huggingface.co/docs/transformers/main/model_doc/llama2" style="color:green">LLaMA2</a>-[[README: 7B]](examples/llama2/README.md)
LLaMA-33B, LLaMA2-13B/70B, Aquila-7B are coming soon ...
@@ -103,7 +103,7 @@ The plan for more tasks, like RLHF and RM, is under way ...
<td> <a href="examples/bloom/pretrain_bloom_176b.sh">Train</a> </td>
</tr>
<tr>
<td>InternLM</td>
<td><a href="examples/intern/README.md">InternLM</td>
<td>7B</td>
<td>1x8</td>
<td>FP16</td>
@@ -113,7 +113,7 @@ The plan for more tasks, like RLHF and RM, is under way ...
<td> <a href="examples/intern/pretrain_internlm_7b_zero.sh">Train</a> </td>
</tr>
<tr>
<td rowspan="4">LLaMA</td>
<td rowspan="4"><a href="examples/llama/README.md">LLaMA</td>
<td>7B</td>
<td>1x8</td>
<td>FP16</td>
@@ -143,7 +143,7 @@ The plan for more tasks, like RLHF and RM, is under way ...
<td> <a href="examples/llama/pretrain_llama_65B_ptd_32p.sh">Train</a> </td>
</tr>
<tr>
<td>LLaMA2</td>
<td><a href="examples/llama2/README.md">LLaMA2</td>
<td>7B</td>
<td>1x8</td>
<td>FP16 </td>

View File

@@ -557,6 +557,11 @@ def _add_training_args(parser):
group.add_argument('--checkpoint-activations', action='store_true',
help='Checkpoint activation to allow for training '
'with larger models, sequences, and batch sizes.')
group.add_argument('--recomputation-layer-num', nargs='+',
type=int, help='Number of layers to recompute in each pipeline-parallel stage. '
'Defaults to None. For example, with pp=4 and 8 layers per stage, '
'passing 4 4 4 4 recomputes only 4 layers in each stage.')
group.add_argument('--distribute-checkpointed-activations',
action='store_true',
help='If set, distribute checkpointed activations '
@@ -1110,7 +1115,7 @@ def _add_activation_checkpoint_args(parser):
help='does a synchronize at the beginning and end of each checkpointed layer.')
group.add_argument('--profile-backward', action='store_true',
help='Enables backward pass profiling for checkpointed layers.')
group.add_argument('--checkpoint_policy', type=str, default='full', choices=['full', 'block'],
group.add_argument('--checkpoint_policy', type=str, default='full', choices=['full', 'block', 'custom'],
help="activation checkpoint policy")
group.add_argument('--checkpoint_block_layer', type=int, default=25,
help="activation checkpoint block layer number")

View File

@@ -779,6 +779,19 @@ class LlamaParallelTransformer(MegatronModule):
check_divisible(args.num_layers, parallel_state.get_pipeline_model_parallel_world_size(), error_info)
self.num_layers = args.num_layers // parallel_state.get_pipeline_model_parallel_world_size()
if self.checkpoint_policy == "block":
self.recomputation_layer_num = [self.checkpoint_block_layer] * \
parallel_state.get_pipeline_model_parallel_world_size()
elif self.checkpoint_policy == "custom":
if args.recomputation_layer_num is not None and len(args.recomputation_layer_num) == \
parallel_state.get_pipeline_model_parallel_world_size():
self.recomputation_layer_num = args.recomputation_layer_num
else:
raise ValueError("`--recomputation-layer-num` must supply exactly one value per pipeline-parallel stage.")
else:
self.recomputation_layer_num = [self.num_layers] * \
parallel_state.get_pipeline_model_parallel_world_size()
# Transformer layers.
def build_layer(layer_number):
return LlamaParallelTransformerLayer(
@@ -834,7 +847,6 @@ class LlamaParallelTransformer(MegatronModule):
def _checkpointed_forward(self, hidden_states, attention_mask):
"""Forward method with activation checkpointing."""
def custom(start, end):
def custom_forward(*inputs):
x_ = inputs[0]
@@ -848,32 +860,11 @@ class LlamaParallelTransformer(MegatronModule):
# Make sure memory is freed.
tensor_parallel.reset_checkpointed_activations_memory_buffer()
l = 0
while l < self.num_layers:
hidden_states = tensor_parallel.checkpoint(
custom(l, l + self.checkpoint_num_layers),
self.distribute_saved_activations,
hidden_states, attention_mask)
l += self.checkpoint_num_layers
return hidden_states
def _checkpointed_forward_block(self, hidden_states, attention_mask):
"""Forward method with activation checkpointing."""
def custom(start, end):
def custom_forward(*inputs):
x_ = inputs[0]
attention_mask = inputs[1]
for index in range(start, end):
layer = self._get_layer(index)
x_ = layer(x_, attention_mask=attention_mask)
return x_
return custom_forward
# Make sure memory is freed.
for idx in range(self.num_layers):
if idx < self.checkpoint_block_layer:
if idx < self.recomputation_layer_num[
parallel_state.get_pipeline_model_parallel_rank()]:
hidden_states = tensor_parallel.checkpoint(
custom(idx, idx + 1),
self.distribute_saved_activations,
@@ -921,10 +912,8 @@ class LlamaParallelTransformer(MegatronModule):
# See set_input_tensor()
hidden_states = self.input_tensor
if self.checkpoint_activations and self.checkpoint_policy == 'full':
if self.checkpoint_activations:
hidden_states = self._checkpointed_forward(hidden_states, attention_mask)
elif self.checkpoint_activations and self.checkpoint_policy == 'block':
hidden_states = self._checkpointed_forward_block(hidden_states, attention_mask)
else:
if get_key_value:
presents = []
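The net effect in this file: _checkpointed_forward_block is removed, and a single _checkpointed_forward consults a per-stage recomputation_layer_num list built once from the chosen policy. A condensed sketch of that flow, with the Megatron-style dependencies (parallel_state, tensor_parallel.checkpoint) stubbed out; the helper name below is illustrative and not taken from the repository:

```python
# Condensed sketch of the recomputation selection added above; names other than
# the policy strings are illustrative.

def resolve_recomputation_layers(policy, pp_world_size, layers_per_stage,
                                 block_layer=25, custom_layers=None):
    """Return how many layers each pipeline stage should recompute."""
    if policy == "block":
        # Same cap for every stage, taken from --checkpoint_block_layer.
        return [block_layer] * pp_world_size
    if policy == "custom":
        # One user-chosen value per stage, from --recomputation-layer-num.
        if custom_layers is None or len(custom_layers) != pp_world_size:
            raise ValueError("one value per pipeline-parallel stage is required")
        return list(custom_layers)
    # "full" (the previous default): recompute every layer in every stage.
    return [layers_per_stage] * pp_world_size


# The example from the argument help text: pp=4, 8 layers per stage, plan 4 4 4 4.
plan = resolve_recomputation_layers("custom", pp_world_size=4,
                                    layers_per_stage=8, custom_layers=[4, 4, 4, 4])

pp_rank = 0  # this process's pipeline-parallel rank
for idx in range(8):  # layers owned by this stage
    if idx < plan[pp_rank]:
        print(f"layer {idx}: checkpointed, recomputed in backward")
    else:
        print(f"layer {idx}: activations kept in memory")
```

With the 'full' policy every layer is still checkpointed, which is how the change stays compatible with the previous behaviour, although checkpointing now proceeds one layer at a time rather than in checkpoint_num_layers chunks.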

View File

@@ -50,4 +50,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--eval-iters 10 \
--initial-loss-scale 4096.0 \
--checkpoint-activations \
--checkpoint-policy custom \
--recomputation-layer-num 3 2 1 0 0 0 0 0 \
--triangle-attn \
--fp16 | tee logs/train_13B.log
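The eight values passed to --recomputation-layer-num imply eight pipeline-parallel stages (the script's --pipeline-model-parallel-size setting is outside this hunk, so that is an assumption); a quick check of the resulting per-stage plan:

```python
# Sanity check for the plan used in the 13B script above; assumes eight
# pipeline-parallel stages, which is not visible in this hunk.
recomputation_layer_num = [3, 2, 1, 0, 0, 0, 0, 0]
pipeline_stages = 8

assert len(recomputation_layer_num) == pipeline_stages, \
    "one value is required per pipeline-parallel stage"

for stage, n in enumerate(recomputation_layer_num):
    print(f"pipeline stage {stage}: recompute {n} layer(s), keep the rest resident")
```

Front-loading recomputation onto the first stages is consistent with pipeline schedules in which the earliest stages hold activations for the most in-flight microbatches.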

View File

@@ -1,56 +0,0 @@
# This is an example: train llama using PTD.
# The number of parameters is not aligned
export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib:/root/miniconda3/lib:$LD_LIBRARY_PATH
export HCCL_CONNECT_TIMEOUT=1200
export COMBINED_ENABLE=1
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DATA_PATH=./dataset/llama_text_document
CHECKPOINT_PATH=./ckpt
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
# Main script
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_llama.py \
--DDP-impl local \
--use-distributed-optimizer \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 2 \
--num-layers 30 \
--hidden-size 6656 \
--ffn-hidden-size 17920 \
--num-attention-heads 52 \
--micro-batch-size 1 \
--global-batch-size 32 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-iters 500000 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tokenizer-name-or-path $TOKENIZER_PATH \
--tokenizer-not-use-fast \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--initial-loss-scale 4096.0 \
--fp16 | tee logs/train.log