Mirror of https://gitee.com/ascend/ModelLink.git (synced 2024-12-05 05:17:40 +08:00)
!155 Inspection rectification: codecheck blank-line and indentation fixes
Merge pull request !155 from chenzhihong/codecheck
Commit bb09ab2962
@@ -40,6 +40,7 @@ from ascendspeed.model.lora_utils import is_enable_lora, get_lora_state_dict, lo

_CHECKPOINT_VERSION = None


def set_checkpoint_version(value):
    global _CHECKPOINT_VERSION
    if _CHECKPOINT_VERSION is not None:
@@ -47,10 +48,12 @@ def set_checkpoint_version(value):
            "checkpoint versions do not match"
    _CHECKPOINT_VERSION = value


def get_checkpoint_version():
    global _CHECKPOINT_VERSION
    return _CHECKPOINT_VERSION


def check_checkpoint_args(checkpoint_args):
    """Ensure fixed arguments for a model are the same for the input
    arguments and the one retrieved from checkpoint."""
@@ -172,8 +175,8 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler):
        torch.save(state_dict, checkpoint_name)

    if args.deepspeed:
-        #ascendspeed model uses state_dict_for_save_checkpointing instead of the standard state_dict
-        #state_dict is used by deepspeed for module saving so it needs to point to the right function
+        # ascendspeed model uses state_dict_for_save_checkpointing instead of the standard state_dict
+        # state_dict is used by deepspeed for module saving so it needs to point to the right function
        if args.no_pipeline_parallel:
            original_state_dict = model[0].module.state_dict
            model[0].module.state_dict = model[0].module.state_dict_for_save_checkpoint
@@ -234,7 +237,7 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
    # specific to self attention so should work for cross attention as well
    while hasattr(model, 'module'):
        model = model.module
-    #attention_module = model.language_model.encoder.layers[0].self_attention
+    # attention_module = model.language_model.encoder.layers[0].self_attention
    attention_module = model.language_model.encoder.layers[0].attention
    hidden_size_per_attention_head = attention_module.hidden_size_per_attention_head
    num_attention_heads_per_partition = attention_module.num_attention_heads_per_partition
@@ -267,6 +270,7 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
    return t


def fix_query_key_value_ordering(model, checkpoint_version):
    """Fix up query/key/value matrix ordering if checkpoint
    version is smaller than 2.0
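Illustrative sketch (not part of this change): fix_query_key_value_ordering regroups the fused first dimension of the query/key/value parameters when loading checkpoints older than version 2.0. The snippet below shows the kind of view/permute such a regrouping performs; the dimension names and sizes are invented for illustration and do not come from the diff.

import torch

# Assumed sizes: num_splits=3 (q, k, v), heads=2, head_dim=4, hidden=8.
num_splits, heads, head_dim, hidden = 3, 2, 4, 8
t = torch.randn(num_splits * heads * head_dim, hidden)

# Interpret the fused first dimension as (num_splits, heads, head_dim) ...
grouped_splits_first = t.view(num_splits, heads, head_dim, hidden)
# ... or as (heads, head_dim, num_splits), then move num_splits to the front.
grouped_heads_first = t.view(heads, head_dim, num_splits, hidden).permute(2, 0, 1, 3)

print(grouped_splits_first.shape, grouped_heads_first.contiguous().view(-1, hidden).shape)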
@@ -6,4 +6,4 @@ LIBEXT = $(shell python3-config --extension-suffix)
default: $(LIBNAME)$(LIBEXT)

%$(LIBEXT): %.cpp
-	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
+	$(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
@@ -17,6 +17,7 @@ from ascendspeed.data.indexed_dataset import make_dataset as make_indexed_datase

logger = logging.getLogger(__name__)


def build_train_valid_test_datasets(
    data_prefix,
    data_impl,
@@ -161,6 +162,7 @@ def build_dataset_group(
    return all_datasets


def _build_single_datasets(
    data_prefix,
    range_string,
@@ -391,6 +393,7 @@ def _build_index_mappings(
    return shuffle_idx


def _build_shuffle_idx(nb_documents: int, np_rng):
    """Build the range [0, dataset_size) and shuffle."""
    dtype_ = np.int64
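Illustrative sketch (not part of this change): the docstring above describes _build_shuffle_idx as building the range [0, dataset_size) as int64 and shuffling it. A minimal standalone version of that documented behaviour, which may differ from the repo's exact implementation, looks like this:

import numpy as np

def build_shuffle_idx(nb_documents: int, np_rng: np.random.RandomState) -> np.ndarray:
    # Build [0, nb_documents) as int64, then shuffle in place with the provided RNG.
    shuffle_idx = np.arange(0, nb_documents, step=1, dtype=np.int64)
    np_rng.shuffle(shuffle_idx)
    return shuffle_idx

shuffle_idx = build_shuffle_idx(10, np.random.RandomState(1234))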
@@ -15,19 +15,23 @@

import enum


class LayerType(enum.Enum):
    encoder = 1
    decoder = 2


class AttnType(enum.Enum):
    self_attn = 1
    cross_attn = 2


class AttnMaskType(enum.Enum):
    padding = 1
    causal = 2
    prefix = 3


class PositionEmbeddingType(enum.Enum):
    rotary = 1
    absolute = 2
@@ -19,6 +19,7 @@ import math

from ascendspeed import print_rank_0, get_args


class AnnealingLR(object):
    """Anneals the learning rate."""
@@ -28,6 +28,7 @@ torch._C._jit_override_can_fuse_on_gpu(True)
# actual gelu is:
# x * 0.5 * (1.0 + torch.erf(x * 0.70710678))


@torch.jit.script
def bias_gelu(bias, y):
    x = bias + y
@@ -36,6 +37,8 @@ def bias_gelu(bias, y):
# gradient of tanh approximation of gelu
# gradient of actual gelu is:
# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)


@torch.jit.script
def bias_gelu_back(g, bias, y):
    x = bias + y
@@ -44,6 +47,7 @@ def bias_gelu_back(g, bias, y):
    ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
    return ff*g


class GeLUFunction(torch.autograd.Function):
    @staticmethod
    # bias is an optional argument
@@ -57,4 +61,5 @@ class GeLUFunction(torch.autograd.Function):
        tmp = bias_gelu_back(grad_output, bias, input)
        return tmp, tmp


bias_gelu_impl = GeLUFunction.apply
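Illustrative sketch (not part of this change): the comments above quote both the tanh approximation used by bias_gelu and the erf-based "actual" gelu. A quick standalone numerical check of those two formulas, using the constants that appear in the hunks, is:

import torch

def gelu_tanh(x):
    # tanh approximation, constants 0.79788456 and 0.044715 as quoted in the diff
    return 0.5 * x * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))

def gelu_erf(x):
    # "actual" gelu from the comment: x * 0.5 * (1.0 + torch.erf(x * 0.70710678))
    return x * 0.5 * (1.0 + torch.erf(x * 0.70710678))

x = torch.linspace(-4.0, 4.0, steps=9)
print((gelu_tanh(x) - gelu_erf(x)).abs().max())  # small approximation error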
@@ -7,6 +7,7 @@ from ascendspeed.model.utils import log_debug_usage

logger = logging.get_logger(__name__)


class _GLUBaseModule(nn.Module):
    def __init__(self, activation_fn):
        super().__init__()
@@ -292,6 +292,7 @@ class Embedding(MegatronModule):
                print('***WARNING*** expected tokentype embeddings in the '
                      'checkpoint but could not find it', flush=True)


class EmbeddingPipe(Embedding):

    def forward(self, inputs, **kwargs):
@@ -1,6 +1,5 @@
# coding=utf-8
# Copyright (c) 2023, HUAWEI CORPORATION. All rights reserved.
# Extracted from: https://github.com/EleutherAI/gpt-neox
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -59,6 +59,7 @@ torch._C._jit_override_can_fuse_on_gpu(True)
        hyperparameters: transformer hyperparameters
"""


class ParallelMLP(MegatronModule):
    """MLP.
@@ -119,6 +120,7 @@ class ParallelMLP(MegatronModule):
        output, output_bias = self.dense_4h_to_h(intermediate_parallel)
        return output, output_bias


class ParallelAttention(MegatronModule):
    """Parallel self-attention layer abstract class.
@@ -658,6 +660,7 @@ class ParallelTransformerLayer(MegatronModule):
            alibi = alibi.repeat(batch_size, 1, 1)
        return alibi


class ParallelTransformerLayerPipe(ParallelTransformerLayer):
    """Extends ParallelTransformerLayer to forward attention_mask through the pipeline.
@@ -697,6 +700,7 @@ class ParallelTransformerLayerPipe(ParallelTransformerLayer):
        else:
            raise RuntimeError('Received more inputs than understood.')


class ParallelTransformer(MegatronModule):
    """Transformer class."""
@@ -791,6 +795,7 @@ class ParallelTransformer(MegatronModule):
            global get_cuda_rng_tracker, checkpoint
            get_cuda_rng_tracker = deepspeed.checkpointing.get_cuda_rng_tracker
            checkpoint = deepspeed.checkpointing.checkpoint

    def _get_layer(self, layer_number):
        return self.layers[layer_number]
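Illustrative sketch (not part of this change): the `alibi = alibi.repeat(batch_size, 1, 1)` line above expands a per-head ALiBi bias for the batch. A rough standalone sketch of how such a bias is commonly built (per-head power-of-two slopes times relative positions, assuming a power-of-two head count; shapes and names here are assumptions, not the repo's exact code):

import torch

def build_alibi(num_heads: int, seq_len: int) -> torch.Tensor:
    # per-head slopes 2^(-8/n), 2^(-16/n), ... for a power-of-two head count n
    slopes = torch.tensor([2 ** (-8.0 * (i + 1) / num_heads) for i in range(num_heads)])
    positions = torch.arange(seq_len).unsqueeze(0)          # [1, seq_len]
    alibi = slopes.unsqueeze(1).unsqueeze(1) * positions    # [heads, 1, seq_len]
    return alibi

alibi = build_alibi(num_heads=8, seq_len=16)   # [8, 1, 16]
alibi = alibi.repeat(4, 1, 1)                  # [batch*heads, 1, seq_len] for batch_size=4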
@@ -21,6 +21,7 @@ import torch

from ascendspeed import get_args


def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""
    def init_(tensor):
@@ -61,15 +62,20 @@ def get_linear_layer(rows, columns, init_method):
        layer.bias.zero_()
    return layer


@torch.jit.script
def gelu_impl(x):
    """OpenAI's gelu implementation."""
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
                                       (1.0 + 0.044715 * x * x)))


def openai_gelu(x):
    return gelu_impl(x)

-#This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter
+# This is actually Python equivalent of torch.nn.functional.gelu(), also with type hints for ONNX exporter


@torch.jit.script
def erf_gelu(x):
    return x * 0.5 * (torch.erf(x / 1.41421).to(dtype=x.dtype)+torch.ones_like(x).to(dtype=x.dtype))
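Illustrative sketch (not part of this change): init_method_normal above returns a closure that initialises a tensor from N(0, sigma). A minimal standalone version of that pattern, which may differ in detail from the repo's code, is:

import torch

def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""
    def init_(tensor):
        # in-place normal initialisation with the captured standard deviation
        return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
    return init_

w = torch.empty(16, 16)
init_method_normal(0.02)(w)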
@@ -277,7 +277,7 @@ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
        grad_output = grad_output.reshape(grad_output.shape[0] * grad_output.shape[1],
                                          grad_output.shape[2])
        total_input = total_input.reshape(total_input.shape[0] * total_input.shape[1],
-                                          total_input.shape[2])
+                                          total_input.shape[2])

        if ctx.sequence_parallel:
            dim_size = list(input_.size())
@@ -364,8 +364,11 @@ def linear_with_grad_accumulation_and_async_allreduce(
    with torch.cuda.amp.autocast(enabled=False):
        return LinearWithGradAccumulationAndAsyncCommunication.apply(*args)


linear_with_grad_accumulation_and_async_allreduce.warned = False


class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.
@@ -460,7 +463,6 @@ class ColumnParallelLinear(torch.nn.Module):
        self.sequence_parallel_enabled = sequence_parallel_enabled

    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.is_expert_without_slicing or self.sequence_parallel_enabled: # non-expert only tensor parallelism
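Illustrative sketch (not part of this change): the backward pass above flattens [seq, batch, hidden] activations and output gradients to 2-D before forming the weight gradient, since grad_W = grad_output^T @ input only needs matrix shapes. A standalone illustration with made-up dimensions:

import torch

seq, batch, hidden_in, hidden_out = 8, 2, 16, 32
total_input = torch.randn(seq, batch, hidden_in)
grad_output = torch.randn(seq, batch, hidden_out)

# flatten the sequence and batch dimensions into one leading dimension
grad_output_2d = grad_output.reshape(seq * batch, hidden_out)
total_input_2d = total_input.reshape(seq * batch, hidden_in)
grad_weight = grad_output_2d.t().matmul(total_input_2d)   # [hidden_out, hidden_in]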
@@ -74,6 +74,7 @@ def _gather(input_):
    return output


def _gather_along_first_dim(input_):
    """Gather tensors and concatinate along the first dimension."""
@@ -92,6 +93,7 @@ def _gather_along_first_dim(input_):
    return output


def _reduce_scatter_along_first_dim(input_):
    """Reduce-scatter the input tensor across model parallel group."""
    world_size = get_tensor_model_parallel_world_size()
@@ -111,6 +113,7 @@ def _reduce_scatter_along_first_dim(input_):
                                 group=get_tensor_model_parallel_group())
    return output


def _split_along_first_dim(input_):
    """Split the tensor along its first dimension and keep the
    corresponding slice."""
@@ -132,6 +135,7 @@ def _split_along_first_dim(input_):
    return output


class _ScatterToSequenceParallelRegion(torch.autograd.Function):
    """Split the input and keep only the corresponding chuck to the rank."""
@@ -238,7 +242,6 @@ class _ReduceScatterToSequenceParallelRegion(torch.autograd.Function):
        return _gather_along_first_dim(grad_output)


class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from model parallel region and concatinate."""
@@ -274,11 +277,14 @@ def scatter_to_tensor_model_parallel_region(input_):
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
    return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)


def gather_from_tensor_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)


def reduce_scatter_to_sequence_parallel_region(input_):
    return _ReduceScatterToSequenceParallelRegion.apply(input_)


def scatter_to_sequence_parallel_region(input_):
    return _ScatterToSequenceParallelRegion.apply(input_)
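Illustrative sketch (not part of this change): _split_along_first_dim above splits a tensor along dim 0 and keeps the slice belonging to the local rank. A standalone, single-process illustration of that semantics, with world_size and rank as stand-ins for what the real code reads from the tensor-model-parallel group:

import torch

def split_along_first_dim(input_: torch.Tensor, world_size: int, rank: int) -> torch.Tensor:
    # each rank keeps an equal-sized contiguous chunk of the first dimension
    assert input_.size(0) % world_size == 0
    chunk = input_.size(0) // world_size
    return input_[rank * chunk:(rank + 1) * chunk].contiguous()

x = torch.arange(8).float()
local = split_along_first_dim(x, world_size=4, rank=1)   # tensor([2., 3.])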
@@ -26,6 +26,7 @@ from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer
from .distrib_optimizer import DistributedOptimizer


def _get_params_for_weight_decay_optimization(modules):
    """Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and baises will have no weight decay but the rest will.
@@ -33,6 +33,7 @@ from ascendspeed.core import parallel_state
from ascendspeed.model.module import param_is_not_shared
from ascendspeed.mpu.layers import param_is_not_tensor_parallel_duplicate


def clip_grad_norm_fp32(parameters, grads_for_norm,
                        max_norm, norm_type=2,
                        model_parallel_group=None):
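Illustrative sketch (not part of this change): clip_grad_norm_fp32 implements the usual gradient-norm clipping: compute the norm over the selected grads, then scale every grad by max_norm / total_norm when the norm is too large. The single-process sketch below omits the all-reduce of the norm across the model-parallel group that the real function performs:

import torch

def clip_grad_norm_sketch(grads, max_norm, norm_type=2):
    # total norm over all gradient tensors
    total_norm = torch.norm(torch.stack([g.norm(norm_type) for g in grads]), norm_type)
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for g in grads:
            g.mul_(clip_coeff)   # scale gradients in place
    return total_norm

grads = [torch.randn(4, 4) for _ in range(3)]
clip_grad_norm_sketch(grads, max_norm=1.0)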
@@ -21,6 +21,7 @@ from abc import abstractmethod
import torch
from deepspeed.accelerator import get_accelerator


class MegatronGradScaler(ABC):

    def __init__(self, initial_scale):
@@ -196,6 +196,7 @@ def generate_samples_input_from_file(model):
# and drop in the https://github.com/EleutherAI/lm-evaluation-harness
# codebase. The lm-evaluation-harness code can now call this function
# similar to their current generate function call used for gpt style models.


def generate_samples_eval(model, context, max_gen_length, eos_token_id):
    # Generate samples for lm evaluation
    # NEED TO THINK ABOUT eos token
@@ -34,6 +34,7 @@ from ascendspeed.mpu.layers import param_is_not_tensor_parallel_duplicate
from ascendspeed import get_num_microbatches
from deepspeed.accelerator import get_accelerator


def unwrap_model(model, module_instances=(torchDDP)):
    return_list = True
    if not isinstance(model, list):
@@ -259,6 +260,7 @@ def get_parameters_in_billions(model):
    return approx_parameters_in_billions*gpus_per_model/(1e9)


def throughput_calculator(model, args, iteration_time, total_iterations):
    gpus_per_model = torch.distributed.get_world_size(group = parallel_state.get_model_parallel_group())
    batch_size = args.micro_batch_size * get_num_microbatches() * args.data_parallel_size
@@ -285,6 +287,7 @@ def throughput_calculator(model, args, iteration_time, total_iterations):
    tflops = flops_per_iteration / (elapsed_time_per_iter * args.world_size * (10**12))
    return samples_per_second, tflops, approx_parameters_in_billions


def checkpoint_throughput_calculator(model, latency_second):
    approx_parameters_in_billions = get_parameters_in_billions(model)
    checkpoint_multiplier = 14 # fp16 weights (2), fp32 weights (4), fp32 momentum (4), fp32 variance (4)
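Worked example (not part of this change) of the checkpoint_multiplier = 14 comment above: per parameter, fp16 weights (2 B) + fp32 weights (4 B) + fp32 momentum (4 B) + fp32 variance (4 B) = 14 B. The model size below is an arbitrary example, not taken from the diff:

params_in_billions = 7.0
checkpoint_bytes = params_in_billions * 1e9 * 14
print(f"{checkpoint_bytes / 2**30:.1f} GiB")   # ≈ 91.3 GiB for a 7B-parameter model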
@@ -6,9 +6,9 @@ from pathlib import Path
import xmlrunner


-#=============================
+# =============================
# ST test, run with shell
-#=============================
+# =============================
def success_check(res):
    if res != 0:
        sys.exit(1)
@@ -44,9 +44,10 @@ class ST_Test:
        for shell_file in self.shell_file_list:
            success_check(os.system("sh {}".format(shell_file)))

-#===============================================
+# ===============================================
# UT test, run with pytest, waiting for more ...
-#===============================================
+# ===============================================

if __name__ == "__main__":
    st_test = ST_Test()
@@ -16,8 +16,8 @@ BLOOM model is from: [A 176B-Parameter Open-Access Multilingual Language Model](
- [Script](#script)

- [Performance](#performance)
-  - [Machine performance](#machine-performance)
-  - [Accuracy of the loss](#accuracy-of-the-loss)
+  - [Machine performance](#machine-performance)
+  - [Accuracy of the loss](#accuracy-of-the-loss)

- [Fine-tune and Evaluation](#fine-tune-and-evaluation)
@@ -16,8 +16,8 @@ LLaMA model is from: [LLaMA: OPen and Efficient Foundation Language Models](http
- [Script](#script)

- [Performance](#performance)
-  - [Machine performance](#machine-performance)
-  - [Accuracy of the loss](#accuracy-of-the-loss)
+  - [Machine performance](#machine-performance)
+  - [Accuracy of the loss](#accuracy-of-the-loss)

- [Citation](#citation)
@@ -133,6 +133,7 @@ def get_batch(data_iterator):
    return tokens, labels, loss_mask, attention_mask, position_ids


def data_post_process(data, data_sampler_state_dict):
    args = get_args()
    if args.data_efficiency_curriculum_learning:
@@ -158,6 +159,7 @@ def data_post_process(data, data_sampler_state_dict):
            args.data_efficiency_curriculum_learning_seqlen_type = None
    return data


def get_batch_pipe(data):
    """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
    args = get_args()
@@ -212,6 +214,7 @@ def loss_func(loss_mask, output_tensor):
        loss = loss + moe_loss
        return loss, {'lm loss': averaged_loss[0], 'moe loss': moe_loss}


def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, attention_mask):
    mos_loss = 0
    alpha = args.kd_alpha_ce
@@ -242,6 +245,7 @@ def calculate_mos_loss(args, stu_output, teacher_model, tokens, position_ids, at
            mos_loss = mos_loss.div(args.seq_length) * beta
    return mos_loss


def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
@@ -259,6 +263,7 @@ def forward_step(data_iterator, model):
    return output_tensor, partial(loss_func, loss_mask)


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid, and test datasets."""
    args = get_args()
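Illustrative sketch (not part of this change): calculate_mos_loss above computes a knowledge-distillation term that is scaled by kd_alpha_ce / beta and divided by the sequence length. The generic standalone sketch below only mirrors that shape; the temperature, reduction and weighting are assumptions for illustration, not the repo's implementation:

import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, seq_length, beta=1.0):
    # soft-target cross entropy between student and (detached) teacher logits
    log_p_student = F.log_softmax(student_logits, dim=-1)
    p_teacher = F.softmax(teacher_logits.detach(), dim=-1)
    per_token = -(p_teacher * log_p_student).sum(dim=-1)   # [batch, seq]
    return per_token.sum(dim=-1).mean().div(seq_length) * beta

student = torch.randn(2, 16, 100)   # [batch, seq, vocab]
teacher = torch.randn(2, 16, 100)
loss = distillation_loss(student, teacher, seq_length=16)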
@@ -184,6 +184,7 @@ def get_batch_pipe(data):
        args.eod_mask_loss)
    return (tokens, attention_mask), (labels, loss_mask)


def loss_func(loss_mask, output_tensor):
    args = get_args()
@@ -232,6 +233,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
    return train_ds, valid_ds, test_ds


if __name__ == "__main__":
    torch.npu.set_compile_mode(jit_compile=True)
    pretrain(train_valid_test_datasets_provider,
@@ -220,12 +220,14 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
    return train_ds, valid_ds, test_ds


@timeout(1200)
def test():
    pretrain(train_valid_test_datasets_provider, model_provider, ModelType.encoder_or_decoder, forward_step,
             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
             data_post_process=data_post_process)


if __name__ == "__main__":
    os.makedirs("./ckpt_llama", exist_ok=True)
    test()