!155 Code inspection fixes: codecheck blank-line and indentation changes

Merge pull request !155 from chenzhihong/codecheck
i-robot 2023-10-11 08:21:53 +00:00 committed by Gitee
commit bb09ab2962
24 changed files with 72 additions and 18 deletions

View File

@ -40,6 +40,7 @@ from ascendspeed.model.lora_utils import is_enable_lora, get_lora_state_dict, lo
_CHECKPOINT_VERSION = None
def set_checkpoint_version(value):
global _CHECKPOINT_VERSION
if _CHECKPOINT_VERSION is not None:
@ -47,10 +48,12 @@ def set_checkpoint_version(value):
"checkpoint versions do not match"
_CHECKPOINT_VERSION = value
def get_checkpoint_version():
global _CHECKPOINT_VERSION
return _CHECKPOINT_VERSION
def check_checkpoint_args(checkpoint_args):
"""Ensure fixed arguments for a model are the same for the input
arguments and the one retrieved from checkpoint."""
@ -172,8 +175,8 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
torch.save(state_dict, checkpoint_name)
if args.deepspeed:
#ascendspeed model uses state_dict_for_save_checkpointing instead of the standard state_dict
#state_dict is used by deepspeed for module saving so it needs to point to the right function
# ascendspeed model uses state_dict_for_save_checkpointing instead of the standard state_dict
# state_dict is used by deepspeed for module saving so it needs to point to the right function
if args.no_pipeline_parallel:
original_state_dict = model[0].module.state_dict
model[0].module.state_dict = model[0].module.state_dict_for_save_checkpoint
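Editor's note: the assignment above is easiest to read as a swap-and-restore around the DeepSpeed save call. A minimal sketch of that pattern, with a hypothetical helper name and assuming the checkpoint is written via the engine's save_checkpoint:

def save_with_checkpoint_state_dict(ds_engine, module, save_dir, tag):
    # DeepSpeed calls module.state_dict() internally, so point it at the
    # checkpoint-specific variant for the duration of the save, then restore it.
    original_state_dict = module.state_dict
    module.state_dict = module.state_dict_for_save_checkpoint
    try:
        ds_engine.save_checkpoint(save_dir, tag)
    finally:
        module.state_dict = original_state_dict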
@ -234,7 +237,7 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
# specific to self attention so should work for cross attention as well
while hasattr(model, 'module'):
model = model.module
#attention_module = model.language_model.encoder.layers[0].self_attention
# attention_module = model.language_model.encoder.layers[0].self_attention
attention_module = model.language_model.encoder.layers[0].attention
hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head
num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition
@ -267,6 +270,7 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
return t
def fix_query_key_value_ordering(model, checkpoint_version):
"""Fix up query/key/value matrix ordering if checkpoint
version is smaller than 2.0
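Editor's note: the reordering boils down to regrouping the fused QKV weight's first dimension. The snippet below is illustrative only (it is not the repository's _transpose_first_dim), assuming a weight whose first dimension is currently grouped with num_splits outermost and should become heads-outermost:

import torch

def regroup_qkv_first_dim(t, num_splits, heads, head_dim):
    rest = t.size()[1:]
    t = t.view(num_splits, heads, head_dim, *rest)   # old grouping: splits outermost
    t = t.transpose(0, 1).contiguous()               # heads become outermost
    return t.view(heads * num_splits * head_dim, *rest)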

View File

@ -6,4 +6,4 @@ LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)
%$(LIBEXT): %.cpp
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@

View File

@ -17,6 +17,7 @@ from ascendspeed.data.indexed_dataset import make_dataset as make_indexed_datase
logger = logging.getLogger(__name__)
def build_train_valid_test_datasets(
data_prefix,
data_impl,
@ -161,6 +162,7 @@ def build_dataset_group(
return all_datasets
def _build_single_datasets(
data_prefix,
range_string,
@ -391,6 +393,7 @@ def _build_index_mappings(
return shuffle_idx
def _build_shuffle_idx(nb_documents: int, np_rng):
"""Build the range [0, dataset_size) and shuffle."""
dtype_ = np.int64
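Editor's note: the docstring describes a plain permutation; a minimal sketch of that behaviour (not the full epoch-aware index building in this file) could look like:

import numpy as np

def build_shuffle_idx(nb_documents, np_rng):
    shuffle_idx = np.arange(nb_documents, dtype=np.int64)  # the range [0, nb_documents)
    np_rng.shuffle(shuffle_idx)                            # in-place permutation
    return shuffle_idx

# e.g. build_shuffle_idx(5, np.random.RandomState(1234))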

View File

@ -15,19 +15,23 @@
import enum
class LayerType(enum.Enum):
encoder = 1
decoder = 2
class AttnType(enum.Enum):
self_attn = 1
cross_attn = 2
class AttnMaskType(enum.Enum):
padding = 1
causal = 2
prefix = 3
class PositionEmbeddingType(enum.Enum):
rotary = 1
absolute = 2

View File

@ -19,6 +19,7 @@ import math
from ascendspeed import print_rank_0, get_args
class AnnealingLR(object):
"""Anneals the learning rate."""

View File

@ -28,6 +28,7 @@ torch._C._jit_override_can_fuse_on_gpu(True)
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
@torch.jit.script
def bias_gelu(bias, y):
x = bias + y
@ -36,6 +37,8 @@ def bias_gelu(bias, y):
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@torch.jit.script
def bias_gelu_back(g, bias, y):
x = bias + y
@ -44,6 +47,7 @@ def bias_gelu_back(g, bias, y):
ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
return ff*g
class GeLUFunction(torch.autograd.Function):
@staticmethod
# bias is an optional argument
@ -57,4 +61,5 @@ class GeLUFunction(torch.autograd.Function):
tmp = bias_gelu_back(grad_output, bias, input)
return tmp, tmp
bias_gelu_impl = GeLUFunction.apply

View File

@ -7,6 +7,7 @@ from ascendspeed.model.utils import log_debug_usage
logger = logging.get_logger(__name__)
class _GLUBaseModule(nn.Module):
def __init__(self, activation_fn):
super().__init__()

View File

@ -292,6 +292,7 @@ class Embedding(MegatronModule):
print('***WARNING*** expected tokentype embeddings in the '
'checkpoint but could not find it', flush=True)
class EmbeddingPipe(Embedding):
def forward(self, inputs, **kwargs):

View File

@ -1,6 +1,5 @@
# coding=utf-8
# Copyright (c) 2023, HUAWEI CORPORATION. All rights reserved.
# Extracted from: https://github.com/EleutherAI/gpt-neox
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

View File

@ -59,6 +59,7 @@ torch._C._jit_override_can_fuse_on_gpu(True)
hyperparameters: transformer hyperparameters
"""
class ParallelMLP(MegatronModule):
"""MLP.
@ -119,6 +120,7 @@ class ParallelMLP(MegatronModule):
output, output_bias = self.dense_4h_to_h(intermediate_parallel)
return output, output_bias
class ParallelAttention(MegatronModule):
"""Parallel self-attention layer abstract class.
@ -658,6 +660,7 @@ class ParallelTransformerLayer(MegatronModule):
alibi = alibi.repeat(batch_size, 1, 1)
return alibi
class ParallelTransformerLayerPipe(ParallelTransformerLayer):
"""Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
@ -697,6 +700,7 @@ class ParallelTransformerLayerPipe(ParallelTransformerLayer):
else:
raise RuntimeError('Received more inputs than understood.')
class ParallelTransformer(MegatronModule):
"""Transformer class."""
@ -791,6 +795,7 @@ class ParallelTransformer(MegatronModule):
global get_cuda_rng_tracker, checkpoint
get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
checkpoint = deepspeed.checkpointing.checkpoint
def _get_layer(self, layer_number):
return self.layers[layer_number]

View File

@ -21,6 +21,7 @@ import torch
from ascendspeed import get_args
def init_method_normal(sigma):
"""Init method based on N(0, sigma)."""
def init_(tensor):
@ -61,15 +62,20 @@ def get_linear_layer(rows, columns, init_method):
layer.bias.zero_()
return layer
@torch.jit.script
def gelu_impl(x):
"""OpenAI's gelu implementation."""
return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
(1.0 + 0.044715 * x * x)))
def openai_gelu(x):
return gelu_impl(x)
#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
@torch.jit.script
def erf_gelu(x):
return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
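Editor's note: a quick numeric check of the claim in the comment above, assuming recent PyTorch. The erf form matches torch.nn.functional.gelu, while the tanh form used by gelu_impl is a close approximation:

import torch
import torch.nn.functional as F

x = torch.linspace(-6.0, 6.0, steps=2001, dtype=torch.float64)
tanh_form = 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))
erf_form = x * 0.5 * (torch.erf(x / 1.41421356) + 1.0)
print(torch.max(torch.abs(erf_form - F.gelu(x))).item())   # essentially zero
print(torch.max(torch.abs(tanh_form - F.gelu(x))).item())  # small approximation error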

View File

@ -277,7 +277,7 @@ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
grad_output = grad_output.reshape(grad_output.shape[0] * grad_output.shape[1],
grad_output.shape[2])
total_input = total_input.reshape(total_input.shape[0] * total_input.shape[1],
total_input.shape[2])
total_input.shape[2])
if ctx.sequence_parallel:
dim_size = list(input_.size())
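Editor's note: the reshape in this hunk exists because the weight gradient is an ordinary matmul once the sequence and batch dimensions are flattened into a single token dimension. A small sketch with made-up sizes:

import torch

s, b, h, o = 8, 4, 16, 32
total_input = torch.randn(s, b, h)      # layer input, [seq, batch, hidden]
grad_output = torch.randn(s, b, o)      # gradient w.r.t. the layer output

go2d = grad_output.reshape(s * b, o)
ti2d = total_input.reshape(s * b, h)
grad_weight = go2d.t().matmul(ti2d)     # [o, h], same shape as the Linear weight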
@ -364,8 +364,11 @@ def linear_with_grad_accumulation_and_async_allreduce(
with torch.cuda.amp.autocast(enabled=False):
return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)
linear_with_grad_accumulation_and_async_allreduce.warned = False
class ColumnParallelLinear(torch.nn.Module):
"""Linear layer with column parallelism.
@ -460,7 +463,6 @@ class ColumnParallelLinear(torch.nn.Module):
self.sequence_parallel_enabled = sequence_parallel_enabled
def forward(self, input_):
# Set up backprop all-reduce.
if self.is_expert_without_slicing or self.sequence_parallel_enabled: # non-expert only tensor parallelism

View File

@ -74,6 +74,7 @@ def _gather(input_):
return output
def _gather_along_first_dim(input_):
"""Gather tensors and concatinate along the first dimension."""
@ -92,6 +93,7 @@ def _gather_along_first_dim(input_):
return output
def _reduce_scatter_along_first_dim(input_):
"""Reduce-scatter the input tensor across model parallel group."""
world_size = get_tensor_model_parallel_world_size()
@ -111,6 +113,7 @@ def _reduce_scatter_along_first_dim(input_):
group=get_tensor_model_parallel_group())
return output
def _split_along_first_dim(input_):
"""Split the tensor along its first dimension and keep the
corresponding slice."""
@ -132,6 +135,7 @@ def _split_along_first_dim(input_):
return output
class _ScatterToSequenceParallelRegion(torch.autograd.Function):
"""Split the input and keep only the corresponding chuck to the rank."""
@ -238,7 +242,6 @@ class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
return _gather_along_first_dim(grad_output)
class _GatherFromModelParallelRegion(torch.autograd.Function):
"""Gather the input from model parallel region and concatinate."""
@ -274,11 +277,14 @@ def scatter_to_tensor_model_parallel_region(input_):
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)
def gather_from_tensor_model_parallel_region(input_):
return _GatherFromModelParallelRegion.apply(input_)
def reduce_scatter_to_sequence_parallel_region(input_):
return _ReduceScatterToSequenceParallelRegion.apply(input_)
def scatter_to_sequence_parallel_region(input_):
return _ScatterToSequenceParallelRegion.apply(input_)
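Editor's note: the wrappers above expose gather and reduce-scatter along the first (sequence) dimension. A minimal stand-alone sketch of those two collectives, assuming the tensor-based APIs of recent torch.distributed and an already-initialised process group:

import torch
import torch.distributed as dist

def gather_along_first_dim(local, group=None):
    world = dist.get_world_size(group=group)
    out = torch.empty((local.size(0) * world, *local.size()[1:]),
                      dtype=local.dtype, device=local.device)
    dist.all_gather_into_tensor(out, local.contiguous(), group=group)
    return out                                   # [world * local_seq, ...]

def reduce_scatter_along_first_dim(full, group=None):
    world = dist.get_world_size(group=group)
    local = torch.empty((full.size(0) // world, *full.size()[1:]),
                        dtype=full.dtype, device=full.device)
    dist.reduce_scatter_tensor(local, full.contiguous(), group=group)
    return local                                 # summed, then this rank's shard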

View File

@ -26,6 +26,7 @@ from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
from .distrib_optimizer import DistributedOptimizer
def _get_params_for_weight_decay_optimization(modules):
"""Divide params into with-weight-decay and without-weight-decay groups.
Layernorms and biases will have no weight decay but the rest will.
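Editor's note: a minimal sketch of that grouping, assuming the criteria are "LayerNorm parameters and parameters named bias get no weight decay":

import torch

def params_for_weight_decay(modules, weight_decay=0.01):
    decay, no_decay = [], []
    for module in modules:
        for m in module.modules():
            is_norm = isinstance(m, torch.nn.LayerNorm)
            for name, p in m.named_parameters(recurse=False):
                if not p.requires_grad:
                    continue
                (no_decay if is_norm or name == 'bias' else decay).append(p)
    return [{'params': decay, 'weight_decay': weight_decay},
            {'params': no_decay, 'weight_decay': 0.0}]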

View File

@ -33,6 +33,7 @@ from ascendspeed.core import parallel_state
from ascendspeed.model.module import param_is_not_shared
from ascendspeed.mpu.layers import param_is_not_tensor_parallel_duplicate
def clip_grad_norm_fp32(parameters, grads_for_norm,
max_norm, norm_type=2,
model_parallel_group=None):
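Editor's note: conceptually this clips by a norm that is global across the model-parallel group. A simplified sketch of that idea (plain L2 norm only, without the fp16 and shared-parameter handling of the real implementation):

import torch
import torch.distributed as dist

def clip_by_global_l2_norm(parameters, grads_for_norm, max_norm, model_parallel_group=None):
    # local sum of squares, kept as a 1-element tensor so it can be all-reduced
    total_sq = torch.stack([(g.detach().float() ** 2).sum() for g in grads_for_norm]).sum().reshape(1)
    if model_parallel_group is not None:
        dist.all_reduce(total_sq, op=dist.ReduceOp.SUM, group=model_parallel_group)
    total_norm = total_sq.sqrt().item()
    clip_coeff = float(max_norm) / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for p in parameters:
            if p.grad is not None:
                p.grad.detach().mul_(clip_coeff)
    return total_norm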

View File

@ -21,6 +21,7 @@ from abc import abstractmethod
import torch
from deepspeed.accelerator import get_accelerator
class MegatronGradScaler(ABC):
def __init__(self, initial_scale):

View File

@ -196,6 +196,7 @@ def generate_samples_input_from_file(model):
# and drop in the https://github.com/EleutherAI/lm-evaluation-harness
# codebase. The lm-evaluation-harness code can now call this function
# similar to their current generate function call used for gpt style models.
def generate_samples_eval(model, context, max_gen_length, eos_token_id):
# Generate samples for lm evaluation
# NEED TO THINK ABOUT eos token

View File

@ -34,6 +34,7 @@ from ascendspeed.mpu.layers import param_is_not_tensor_parallel_duplicate
from ascendspeed import get_num_microbatches
from deepspeed.accelerator import get_accelerator
def unwrap_model(model, module_instances=(torchDDP)):
return_list = True
if not isinstance(model, list):
@ -259,6 +260,7 @@ def get_parameters_in_billions(model):
return approx_parameters_in_billions*gpus_per_model/(1e9)
def throughput_calculator(model, args, iteration_time, total_iterations):
gpus_per_model = torch.distributed.get_world_size(group = parallel_state.get_model_parallel_group())
batch_size = args.micro_batch_size * get_num_microbatches() * args.data_parallel_size
@ -285,6 +287,7 @@ def throughput_calculator(model, args, iteration_time, total_iterations):
tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12))
return samples_per_second, tflops, approx_parameters_in_billions
def checkpoint_throughput_calculator(model, latency_second):
approx_parameters_in_billions = get_parameters_in_billions(model)
checkpoint_multiplier = 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4)
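Editor's note: as a worked example of the multiplier in the comment above (2 bytes of fp16 weights plus 4 bytes each of fp32 weights, momentum and variance, i.e. 14 bytes per parameter), assuming a hypothetical 7B-parameter model and a 100 s save:

params_in_billions = 7.0
checkpoint_multiplier = 14                                   # bytes per parameter
checkpoint_gb = params_in_billions * checkpoint_multiplier   # 7e9 params * 14 B ≈ 98 GB
latency_second = 100.0
print(checkpoint_gb / latency_second, "GB/s")                # ≈ 0.98 GB/s effective write throughput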

View File

@ -6,9 +6,9 @@ from pathlib import Path
import xmlrunner
#=============================
# =============================
# ST test, run with shell
#=============================
# =============================
def success_check(res):
if res != 0:
sys.exit(1)
@ -44,9 +44,10 @@ class ST_Test:
for shell_file in self.shell_file_list:
success_check(os.system("sh {}".format(shell_file)))
#===============================================
# ===============================================
# UT test, run with pytest, waiting for more ...
#===============================================
# ===============================================
if __name__ == "__main__":
st_test = ST_Test()

View File

@ -16,8 +16,8 @@ BLOOM model is from: [A 176B-Parameter Open-Access Multilingual Language Model](
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Accuracy of the loss](#accuracy-of-the-loss)
- [Machine performance](#machine-performance)
- [Accuracy of the loss](#accuracy-of-the-loss)
- [Fine-tune and Evaluation](#fine-tune-and-evaluation)

View File

@ -16,8 +16,8 @@ LLaMA model is from: [LLaMA: Open and Efficient Foundation Language Models](http
- [Script](#script)
- [Performance](#performance)
- [Machine performance](#machine-performance)
- [Accuracy of the loss](#accuracy-of-the-loss)
- [Machine performance](#machine-performance)
- [Accuracy of the loss](#accuracy-of-the-loss)
- [Citation](#citation)

View File

@ -133,6 +133,7 @@ def get_batch(data_iterator):
return tokens, labels, loss_mask, attention_mask, position_ids
def data_post_process(data, data_sampler_state_dict):
args = get_args()
if args.data_efficiency_curriculum_learning:
@ -158,6 +159,7 @@ def data_post_process(data, data_sampler_state_dict):
args.data_efficiency_curriculum_learning_seqlen_type = None
return data
def get_batch_pipe(data):
"""Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
args = get_args()
@ -212,6 +214,7 @@ def loss_func(loss_mask, output_tensor):
loss = loss + moe_loss
return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss}
def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask):
mos_loss = 0
alpha = args.kd_alpha_ce
@ -242,6 +245,7 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at
mos_loss = mos_loss.div(args.seq_length) * beta
return mos_loss
def forward_step(data_iterator, model):
"""Forward step."""
args = get_args()
@ -259,6 +263,7 @@ def forward_step(data_iterator, model):
return output_tensor, partial(loss_func, loss_mask)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()

View File

@ -184,6 +184,7 @@ def get_batch_pipe(data):
args.eod_mask_loss)
return (tokens, attention_mask), (labels, loss_mask)
def loss_func(loss_mask, output_tensor):
args = get_args()
@ -232,6 +233,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
return train_ds, valid_ds, test_ds
if __name__ == "__main__":
torch.npu.set_compile_mode(jit_compile=True)
pretrain(train_valid_test_datasets_provider,

View File

@ -220,12 +220,14 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
return train_ds, valid_ds, test_ds
@timeout(1200)
def test():
pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder,forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
data_post_process=data_post_process)
if __name__ == "__main__":
os.makedirs("./ckpt_llama", exist_ok=True)
test()