Mirror of https://gitee.com/ascend/ModelLink.git (synced 2024-12-05 05:17:40 +08:00)
!155 Inspection rectification: codecheck blank-line and indentation fixes
Merge pull request !155 from chenzhihong/codecheck
Commit bb09ab2962
@@ -40,6 +40,7 @@ from ascendspeed.model.lora_utils import is_enable_lora, get_lora_state_dict, lo

_CHECKPOINT_VERSION = None


def set_checkpoint_version(value):
    global _CHECKPOINT_VERSION
    if _CHECKPOINT_VERSION is not None:
@@ -47,10 +48,12 @@ def set_checkpoint_version(value):
            "checkpoint versions do not match"
    _CHECKPOINT_VERSION = value


def get_checkpoint_version():
    global _CHECKPOINT_VERSION
    return _CHECKPOINT_VERSION


def check_checkpoint_args(checkpoint_args):
    """Ensure fixed arguments for a model are the same for the input
    arguments and the one retrieved from checkpoint."""
@@ -172,8 +175,8 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
        torch.save(state_dict, checkpoint_name)

    if args.deepspeed:
-        #ascendspeed model uses state_dict_for_save_checkpointing instead of the standard state_dict
-        #state_dict is used by deepspeed for module saving so it needs to point to the right function
+        # ascendspeed model uses state_dict_for_save_checkpointing instead of the standard state_dict
+        # state_dict is used by deepspeed for module saving so it needs to point to the right function
        if args.no_pipeline_parallel:
            original_state_dict = model[0].module.state_dict
            model[0].module.state_dict = model[0].module.state_dict_for_save_checkpoint
@@ -234,7 +237,7 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
    # specific to self attention so should work for cross attention as well
    while hasattr(model, 'module'):
        model = model.module
-    #attention_module = model.language_model.encoder.layers[0].self_attention
+    # attention_module = model.language_model.encoder.layers[0].self_attention
    attention_module = model.language_model.encoder.layers[0].attention
    hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head
    num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition
@@ -267,6 +270,7 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
    return t


def fix_query_key_value_ordering(model, checkpoint_version):
    """Fix up query/key/value matrix ordering if checkpoint
    version is smaller than 2.0
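Illustrative sketch (not part of this change): fix_query_key_value_ordering regroups the fused first dimension of the query/key/value parameters when loading checkpoints older than version 2.0. The snippet below shows the kind of view/permute such a regrouping performs; the dimension names and sizes are invented for illustration and do not come from the diff.

import torch

# Assumed sizes: num_splits=3 (q, k, v), heads=2, head_dim=4, hidden=8.
num_splits, heads, head_dim, hidden = 3, 2, 4, 8
t = torch.randn(num_splits * heads * head_dim, hidden)

# Interpret the fused first dimension as (num_splits, heads, head_dim) ...
grouped_splits_first = t.view(num_splits, heads, head_dim, hidden)
# ... or as (heads, head_dim, num_splits), then move num_splits to the front.
grouped_heads_first = t.view(heads, head_dim, num_splits, hidden).permute(2, 0, 1, 3)

print(grouped_splits_first.shape, grouped_heads_first.contiguous().view(-1, hidden).shape)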
@@ -6,4 +6,4 @@ LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)

%$(LIBEXT): %.cpp
-	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
@@ -17,6 +17,7 @@ from ascendspeed.data.indexed_dataset import make_dataset as make_indexed_datase

logger = logging.getLogger(__name__)


def build_train_valid_test_datasets(
    data_prefix,
    data_impl,
@@ -161,6 +162,7 @@ def build_dataset_group(
    return all_datasets


def _build_single_datasets(
    data_prefix,
    range_string,
@@ -391,6 +393,7 @@ def _build_index_mappings(
    return shuffle_idx


def _build_shuffle_idx(nb_documents: int, np_rng):
    """Build the range [0, dataset_size) and shuffle."""
    dtype_ = np.int64
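Illustrative sketch (not part of this change): the docstring above describes _build_shuffle_idx as building the range [0, dataset_size) as int64 and shuffling it. A minimal standalone version of that documented behaviour, which may differ from the repo's exact implementation, looks like this:

import numpy as np

def build_shuffle_idx(nb_documents: int, np_rng: np.random.RandomState) -> np.ndarray:
    # Build [0, nb_documents) as int64, then shuffle in place with the provided RNG.
    shuffle_idx = np.arange(0, nb_documents, step=1, dtype=np.int64)
    np_rng.shuffle(shuffle_idx)
    return shuffle_idx

shuffle_idx = build_shuffle_idx(10, np.random.RandomState(1234))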
@@ -15,19 +15,23 @@

import enum


class LayerType(enum.Enum):
    encoder = 1
    decoder = 2


class AttnType(enum.Enum):
    self_attn = 1
    cross_attn = 2


class AttnMaskType(enum.Enum):
    padding = 1
    causal = 2
    prefix = 3


class PositionEmbeddingType(enum.Enum):
    rotary = 1
    absolute = 2
@@ -19,6 +19,7 @@ import math

from ascendspeed import print_rank_0, get_args


class AnnealingLR(object):
    """Anneals the learning rate."""
@@ -28,6 +28,7 @@ torch._C._jit_override_can_fuse_on_gpu(True)
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
@@ -36,6 +37,8 @@ def bias_gelu(bias, y):
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)


@torch.jit.script
def bias_gelu_back(g, bias, y):
    x = bias + y
@@ -44,6 +47,7 @@ def bias_gelu_back(g, bias, y):
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff*g


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    # bias is an optional argument
@@ -57,4 +61,5 @@ class GeLUFunction(torch.autograd.Function):
        tmp = bias_gelu_back(grad_output, bias, input)
        return tmp, tmp


bias_gelu_impl = GeLUFunction.apply
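Illustrative sketch (not part of this change): the comments above quote both the tanh approximation used by bias_gelu and the erf-based "actual" gelu. A quick standalone numerical check of those two formulas, using the constants that appear in the hunks, is:

import torch

def gelu_tanh(x):
    # tanh approximation, constants 0.79788456 and 0.044715 as quoted in the diff
    return 0.5 * x * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))

def gelu_erf(x):
    # "actual" gelu from the comment: x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
    return x * 0.5 * (1.0 + torch.erf(x * 0.70710678))

x = torch.linspace(-4.0, 4.0, steps=9)
print((gelu_tanh(x) - gelu_erf(x)).abs().max())  # small approximation error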
@@ -7,6 +7,7 @@ from ascendspeed.model.utils import log_debug_usage

logger = logging.get_logger(__name__)


class _GLUBaseModule(nn.Module):
    def __init__(self, activation_fn):
        super().__init__()
@@ -292,6 +292,7 @@ class Embedding(MegatronModule):
                print('***WARNING*** expected tokentype embeddings in the '
                      'checkpoint but could not find it', flush=True)


class EmbeddingPipe(Embedding):

    def forward(self, inputs, **kwargs):
@@ -1,6 +1,5 @@
# coding=utf-8
# Copyright (c) 2023, HUAWEI CORPORATION. All rights reserved.
# Extracted from: https://github.com/EleutherAI/gpt-neox
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -59,6 +59,7 @@ torch._C._jit_override_can_fuse_on_gpu(True)
        hyperparameters: transformer hyperparameters
"""


class ParallelMLP(MegatronModule):
    """MLP.
@@ -119,6 +120,7 @@ class ParallelMLP(MegatronModule):
        output, output_bias = self.dense_4h_to_h(intermediate_parallel)
        return output, output_bias


class ParallelAttention(MegatronModule):
    """Parallel self-attention layer abstract class.
@@ -658,6 +660,7 @@ class ParallelTransformerLayer(MegatronModule):
            alibi = alibi.repeat(batch_size, 1, 1)
        return alibi


class ParallelTransformerLayerPipe(ParallelTransformerLayer):
    """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
@@ -697,6 +700,7 @@ class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        else:
            raise RuntimeError('Received more inputs than understood.')


class ParallelTransformer(MegatronModule):
    """Transformer class."""
@@ -791,6 +795,7 @@ class ParallelTransformer(MegatronModule):
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _get_layer(self, layer_number):
        return self.layers[layer_number]
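Illustrative sketch (not part of this change): the `alibi = alibi.repeat(batch_size, 1, 1)` line above expands a per-head ALiBi bias for the batch. A rough standalone sketch of how such a bias is commonly built (per-head power-of-two slopes times relative positions, assuming a power-of-two head count; shapes and names here are assumptions, not the repo's exact code):

import torch

def build_alibi(num_heads: int, seq_len: int) -> torch.Tensor:
    # per-head slopes 2^(-8/n), 2^(-16/n), ... for a power-of-two head count n
    slopes = torch.tensor([2 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)])
    positions = torch.arange(seq_len).unsqueeze(0)          # [1, seq_len]
    alibi = slopes.unsqueeze(1).unsqueeze(1) * positions    # [heads, 1, seq_len]
    return alibi

alibi = build_alibi(num_heads=8, seq_len=16)   # [8, 1, 16]
alibi = alibi.repeat(4, 1, 1)                  # [batch*heads, 1, seq_len] for batch_size=4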
@@ -21,6 +21,7 @@ import torch

from ascendspeed import get_args


def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""
    def init_(tensor):
@@ -61,15 +62,20 @@ def get_linear_layer(rows, columns, init_method):
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)

-#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
+# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter


@torch.jit.script
def erf_gelu(x):
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
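Illustrative sketch (not part of this change): init_method_normal above returns a closure that initialises a tensor from N(0, sigma). A minimal standalone version of that pattern, which may differ in detail from the repo's code, is:

import torch

def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""
    def init_(tensor):
        # in-place normal initialisation with the captured standard deviation
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
    return init_

w = torch.empty(16, 16)
init_method_normal(0.02)(w)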
@@ -277,7 +277,7 @@ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
        grad_output = grad_output.reshape(grad_output.shape[0] * grad_output.shape[1],
                                          grad_output.shape[2])
        total_input = total_input.reshape(total_input.shape[0] * total_input.shape[1],
-                                          total_input.shape[2])
+                                          total_input.shape[2])

        if ctx.sequence_parallel:
            dim_size = list(input_.size())
@@ -364,8 +364,11 @@ def linear_with_grad_accumulation_and_async_allreduce(
    with torch.cuda.amp.autocast(enabled=False):
        return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)


linear_with_grad_accumulation_and_async_allreduce.warned = False


class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.
@@ -460,7 +463,6 @@ class ColumnParallelLinear(torch.nn.Module):
        self.sequence_parallel_enabled = sequence_parallel_enabled

    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.is_expert_without_slicing or self.sequence_parallel_enabled: # non-expert only tensor parallelism
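Illustrative sketch (not part of this change): the backward pass above flattens [seq, batch, hidden] activations and output gradients to 2-D before forming the weight gradient, since grad_W = grad_output^T @ input only needs matrix shapes. A standalone illustration with made-up dimensions:

import torch

seq, batch, hidden_in, hidden_out = 8, 2, 16, 32
total_input = torch.randn(seq, batch, hidden_in)
grad_output = torch.randn(seq, batch, hidden_out)

# flatten the sequence and batch dimensions into one leading dimension
grad_output_2d = grad_output.reshape(seq * batch, hidden_out)
total_input_2d = total_input.reshape(seq * batch, hidden_in)
grad_weight = grad_output_2d.t().matmul(total_input_2d)   # [hidden_out, hidden_in]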
@@ -74,6 +74,7 @@ def _gather(input_):
    return output


def _gather_along_first_dim(input_):
    """Gather tensors and concatinate along the first dimension."""
@@ -92,6 +93,7 @@ def _gather_along_first_dim(input_):
    return output


def _reduce_scatter_along_first_dim(input_):
    """Reduce-scatter the input tensor across model parallel group."""
    world_size = get_tensor_model_parallel_world_size()
@@ -111,6 +113,7 @@ def _reduce_scatter_along_first_dim(input_):
                                 group=get_tensor_model_parallel_group())
    return output


def _split_along_first_dim(input_):
    """Split the tensor along its first dimension and keep the
    corresponding slice."""
@@ -132,6 +135,7 @@ def _split_along_first_dim(input_):
    return output


class _ScatterToSequenceParallelRegion(torch.autograd.Function):
    """Split the input and keep only the corresponding chuck to the rank."""
@@ -238,7 +242,6 @@ class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
        return _gather_along_first_dim(grad_output)


class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from model parallel region and concatinate."""
@@ -274,11 +277,14 @@ def scatter_to_tensor_model_parallel_region(input_):
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)


def gather_from_tensor_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)


def reduce_scatter_to_sequence_parallel_region(input_):
    return _ReduceScatterToSequenceParallelRegion.apply(input_)


def scatter_to_sequence_parallel_region(input_):
    return _ScatterToSequenceParallelRegion.apply(input_)
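Illustrative sketch (not part of this change): _split_along_first_dim above splits a tensor along dim 0 and keeps the slice belonging to the local rank. A standalone, single-process illustration of that semantics, with world_size and rank as stand-ins for what the real code reads from the tensor-model-parallel group:

import torch

def split_along_first_dim(input_: torch.Tensor, world_size: int, rank: int) -> torch.Tensor:
    # each rank keeps an equal-sized contiguous chunk of the first dimension
    assert input_.size(0) % world_size == 0
    chunk = input_.size(0) // world_size
    return input_[rank * chunk:(rank + 1) * chunk].contiguous()

x = torch.arange(8).float()
local = split_along_first_dim(x, world_size=4, rank=1)   # tensor([2., 3.])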
@@ -26,6 +26,7 @@ from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
from .distrib_optimizer import DistributedOptimizer


def _get_params_for_weight_decay_optimization(modules):
    """Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and baises will have no weight decay but the rest will.
@@ -33,6 +33,7 @@ from ascendspeed.core import parallel_state
from ascendspeed.model.module import param_is_not_shared
from ascendspeed.mpu.layers import param_is_not_tensor_parallel_duplicate


def clip_grad_norm_fp32(parameters, grads_for_norm,
                        max_norm, norm_type=2,
                        model_parallel_group=None):
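Illustrative sketch (not part of this change): clip_grad_norm_fp32 implements the usual gradient-norm clipping: compute the norm over the selected grads, then scale every grad by max_norm / total_norm when the norm is too large. The single-process sketch below omits the all-reduce of the norm across the model-parallel group that the real function performs:

import torch

def clip_grad_norm_sketch(grads, max_norm, norm_type=2):
    # total norm over all gradient tensors
    total_norm = torch.norm(torch.stack([g.norm(norm_type) for g in grads]), norm_type)
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for g in grads:
            g.mul_(clip_coeff)   # scale gradients in place
    return total_norm

grads = [torch.randn(4, 4) for _ in range(3)]
clip_grad_norm_sketch(grads, max_norm=1.0)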
@@ -21,6 +21,7 @@ from abc import abstractmethod
import torch
from deepspeed.accelerator import get_accelerator


class MegatronGradScaler(ABC):

    def __init__(self, initial_scale):
@@ -196,6 +196,7 @@ def generate_samples_input_from_file(model):
# and drop in the https://github.com/EleutherAI/lm-evaluation-harness
# codebase. The lm-evaluation-harness code can now call this function
# similar to their current generate function call used for gpt style models.


def generate_samples_eval(model, context, max_gen_length, eos_token_id):
    # Generate samples for lm evaluation
    # NEED TO THINK ABOUT eos token
@@ -34,6 +34,7 @@ from ascendspeed.mpu.layers import param_is_not_tensor_parallel_duplicate
from ascendspeed import get_num_microbatches
from deepspeed.accelerator import get_accelerator


def unwrap_model(model, module_instances=(torchDDP)):
    return_list = True
    if not isinstance(model, list):
@@ -259,6 +260,7 @@ def get_parameters_in_billions(model):
    return approx_parameters_in_billions*gpus_per_model/(1e9)


def throughput_calculator(model, args, iteration_time, total_iterations):
    gpus_per_model = torch.distributed.get_world_size(group = parallel_state.get_model_parallel_group())
    batch_size = args.micro_batch_size * get_num_microbatches() * args.data_parallel_size
@@ -285,6 +287,7 @@ def throughput_calculator(model, args, iteration_time, total_iterations):
    tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12))
    return samples_per_second, tflops, approx_parameters_in_billions


def checkpoint_throughput_calculator(model, latency_second):
    approx_parameters_in_billions = get_parameters_in_billions(model)
    checkpoint_multiplier = 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4)
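Worked example (not part of this change) of the checkpoint_multiplier = 14 comment above: per parameter, fp16 weights (2 B) + fp32 weights (4 B) + fp32 momentum (4 B) + fp32 variance (4 B) = 14 B. The model size below is an arbitrary example, not taken from the diff:

params_in_billions = 7.0
checkpoint_bytes = params_in_billions * 1e9 * 14
print(f"{checkpoint_bytes / 2**30:.1f} GiB")   # ≈ 91.3 GiB for a 7B-parameter model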
@@ -6,9 +6,9 @@ from pathlib import Path
import xmlrunner


-#=============================
+# =============================
# ST test, run with shell
-#=============================
+# =============================
def success_check(res):
    if res != 0:
        sys.exit(1)
@@ -44,9 +44,10 @@ class ST_Test:
        for shell_file in self.shell_file_list:
            success_check(os.system("sh {}".format(shell_file)))

-#===============================================
+# ===============================================
# UT test, run with pytest, waiting for more ...
-#===============================================
+# ===============================================

if __name__ == "__main__":
    st_test = ST_Test()
@@ -16,8 +16,8 @@ BLOOM model is from: [A 176B-Parameter Open-Access Multilingual Language Model](
- [Script](#script)

- [Performance](#performance)
-  - [Machine performance](#machine-performance)
-  - [Accuracy of the loss](#accuracy-of-the-loss)
+  - [Machine performance](#machine-performance)
+  - [Accuracy of the loss](#accuracy-of-the-loss)

- [Fine-tune and Evaluation](#fine-tune-and-evaluation)
@@ -16,8 +16,8 @@ LLaMA model is from: [LLaMA: OPen and Efficient Foundation Language Models](http
- [Script](#script)

- [Performance](#performance)
-  - [Machine performance](#machine-performance)
-  - [Accuracy of the loss](#accuracy-of-the-loss)
+  - [Machine performance](#machine-performance)
+  - [Accuracy of the loss](#accuracy-of-the-loss)

- [Citation](#citation)
@@ -133,6 +133,7 @@ def get_batch(data_iterator):
    return tokens, labels, loss_mask, attention_mask, position_ids


def data_post_process(data, data_sampler_state_dict):
    args = get_args()
    if args.data_efficiency_curriculum_learning:
@@ -158,6 +159,7 @@ def data_post_process(data, data_sampler_state_dict):
            args.data_efficiency_curriculum_learning_seqlen_type = None
    return data


def get_batch_pipe(data):
    """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
    args = get_args()
@@ -212,6 +214,7 @@ def loss_func(loss_mask, output_tensor):
        loss = loss + moe_loss
        return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss}


def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask):
    mos_loss = 0
    alpha = args.kd_alpha_ce
@@ -242,6 +245,7 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at
            mos_loss = mos_loss.div(args.seq_length) * beta
    return mos_loss


def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
@@ -259,6 +263,7 @@ def forward_step(data_iterator, model):
    return output_tensor, partial(loss_func, loss_mask)


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid, and test datasets."""
    args = get_args()
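Illustrative sketch (not part of this change): calculate_mos_loss above computes a knowledge-distillation term that is scaled by kd_alpha_ce / beta and divided by the sequence length. The generic standalone sketch below only mirrors that shape; the temperature, reduction and weighting are assumptions for illustration, not the repo's implementation:

import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, seq_length, beta=1.0):
    # soft-target cross entropy between student and (detached) teacher logits
    log_p_student = F.log_softmax(student_logits, dim=-1)
    p_teacher = F.softmax(teacher_logits.detach(), dim=-1)
    per_token = -(p_teacher * log_p_student).sum(dim=-1)   # [batch, seq]
    return per_token.sum(dim=-1).mean().div(seq_length) * beta

student = torch.randn(2, 16, 100)   # [batch, seq, vocab]
teacher = torch.randn(2, 16, 100)
loss = distillation_loss(student, teacher, seq_length=16)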
@@ -184,6 +184,7 @@ def get_batch_pipe(data):
        args.eod_mask_loss)
    return (tokens, attention_mask), (labels, loss_mask)


def loss_func(loss_mask, output_tensor):
    args = get_args()
@@ -232,6 +233,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
    return train_ds, valid_ds, test_ds


if __name__ == "__main__":
    torch.npu.set_compile_mode(jit_compile=True)
    pretrain(train_valid_test_datasets_provider,
@@ -220,12 +220,14 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
    return train_ds, valid_ds, test_ds


@timeout(1200)
def test():
    pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, forward_step,
             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
             data_post_process=data_post_process)


if __name__ == "__main__":
    os.makedirs("./ckpt_llama", exist_ok=True)
    test()