diff --git a/README.md b/README.md index 50d106ea8..5ec7a1cde 100644 --- a/README.md +++ b/README.md @@ -744,6 +744,29 @@ ModelLink 通过模型并行与数据并行来训练大语言模型,为了演
【GTS】 【Test】 + + Gemma2 + 9B + 8K + Mcore + 1x8 + BF16 + 1713 + 1595 +
【GTS】 + 【Test】 + + + 27B + 8K + Mcore + 2x8 + BF16 + 827 + 800 +
【GTS】 + 【Test】 + GPT3 175B diff --git a/examples/README.md b/examples/README.md index 9e5f74bce..4ef99ca82 100644 --- a/examples/README.md +++ b/examples/README.md @@ -785,6 +785,7 @@ ModelLink已支持模型评估分数如下: | Bloom-176B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 64.5% | -- | ChatGLM3-6B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 61.5% | 61.4% | | CodeLLaMA-34B | Human Eval | 48.78% | [48.8%](https://paperswithcode.com/sota/code-generation-on-humaneval) | Gemma-2B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 39.6% | 39.7% | | Gemma-7B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 52.2% | 52.2% | InternLM-7B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 48.7% | [51.0](https://huggingface.co/internlm/internlm-7b) | +| Gemma2-9B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 70.7% | [71.3%](https://huggingface.co/google/gemma-2-9b) | Gemma2-27B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 75.5% | [75.2%](https://huggingface.co/google/gemma-2-27b) | | LLaMA-7B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 74.6% | [75.4](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | LLaMA-13B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 79.6% | [78.7](https://hub.opencompass.org.cn/dataset-detail/BoolQ) | | LLaMA-33B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 83.2% | [83.1](https://paperswithcode.com/sota/question-answering-on-boolq) | LLaMA-65B | [BoolQ](https://github.com/google-research-datasets/boolean-questions) | 85.7% | [86.6](https://paperswithcode.com/sota/question-answering-on-boolq) | | LLaMA2-7B | [MMLU](https://paperswithcode.com/dataset/mmlu) | 45.7% | 45.3% | LLaMA2-13B | [BoolQ](https://paperswithcode.com/dataset/boolq) | 82.2% | [81.7](https://paperswithcode.com/sota/question-answering-on-boolq) | diff --git a/examples/mcore/gemma2/ckpt_convert_gemma2_hf2mcore.sh b/examples/mcore/gemma2/ckpt_convert_gemma2_hf2mcore.sh new file mode 100644 index 000000000..24dc210c9 --- /dev/null +++ b/examples/mcore/gemma2/ckpt_convert_gemma2_hf2mcore.sh @@ -0,0 +1,17 @@ +# 修改 ascend-toolkit 路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +# 权重格式转换 +python tools/checkpoint/convert_ckpt.py \ + --use-mcore-models \ + --model-type-hf gemma2 \ + --model-type GPT \ + --loader hf_mcore \ + --saver mg_mcore \ + --params-dtype bf16 \ + --post-norm \ + --target-tensor-parallel-size 8 \ + --target-pipeline-parallel-size 1 \ + --load-dir ./model_from_hf/gemma2_hf/ \ + --save-dir ./model_weights/gemma2_mcore/ \ + --tokenizer-model ./model_from_hf/gemma2_hf/tokenizer.json \ No newline at end of file diff --git a/examples/mcore/gemma2/data_convert_gemma2_pretrain.sh b/examples/mcore/gemma2/data_convert_gemma2_pretrain.sh new file mode 100644 index 000000000..ffd27cfe0 --- /dev/null +++ b/examples/mcore/gemma2/data_convert_gemma2_pretrain.sh @@ -0,0 +1,11 @@ +# 请按照您的真实环境修改 set_env.sh 路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +mkdir ./dataset + +python ./preprocess_data.py \ + --input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \ + --tokenizer-name-or-path ./model_from_hf/gemma2_hf/ \ + --tokenizer-type PretrainedFromHF \ + --output-prefix ./dataset/enwiki \ + --workers 4 \ + --log-interval 1000 \ diff --git a/examples/mcore/gemma2/evaluate_gemma2_27b_ptd.sh b/examples/mcore/gemma2/evaluate_gemma2_27b_ptd.sh new file mode 100644 index 000000000..b25ff3a18 --- /dev/null +++ b/examples/mcore/gemma2/evaluate_gemma2_27b_ptd.sh @@
-0,0 +1,69 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# distributed config +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 + +# modify script model path and tokenizer path +TOKENIZER_PATH="your tokenizer directory path" +CHECKPOINT="your model directory path" + +# configure task and data path +DATA_PATH="./mmlu/test/" +TASK="mmlu" + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT" + +# configure generation parameters +torchrun $DISTRIBUTED_ARGS evaluation.py \ + --task-data-path ${DATA_PATH} \ + --task ${TASK}\ + --load ${CHECKPOINT} \ + --use-mcore-models \ + --tensor-model-parallel-size 8 \ + --pipeline-model-parallel-size 1 \ + --gelu-tanh \ + --post-norm \ + --query-pre-attn-scalar 144 \ + --output-logit-softcapping 30.0 \ + --attn-logit-softcapping 50.0 \ + --interleave-sliding-window 4096 \ + --group-query-attention \ + --num-query-groups 16 \ + --num-layers 46 \ + --hidden-size 4608 \ + --ffn-hidden-size 36864 \ + --num-attention-heads 32 \ + --kv-channels 128 \ + --max-position-embeddings 8192 \ + --seq-length 8192 \ + --max-new-tokens 1 \ + --position-embedding-type rope \ + --disable-bias-linear \ + --normalization RMSNorm \ + --add-rmsnorm-offset \ + --input-embeds-norm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --tokenizer-not-use-fast \ + --norm-epsilon 1e-06 \ + --evaluation-batch-size 1 \ + --micro-batch-size 1 \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --exit-on-missing-checkpoint \ + --no-load-rng \ + --no-load-optim \ + --vocab-size 256000 \ + --make-vocab-size-divisible-by 1 \ + --bf16 \ + --seed 42 | tee logs/evaluation_gemma2_27b_mcore_${TASK}.log diff --git a/examples/mcore/gemma2/evaluate_gemma2_9b_ptd.sh b/examples/mcore/gemma2/evaluate_gemma2_9b_ptd.sh new file mode 100644 index 000000000..e8d955821 --- /dev/null +++ b/examples/mcore/gemma2/evaluate_gemma2_9b_ptd.sh @@ -0,0 +1,69 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# distributed config +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 + +# modify script model path and tokenizer path +TOKENIZER_PATH="your tokenizer directory path" +CHECKPOINT="your model directory path" + +# configure task and data path +DATA_PATH="./mmlu/test/" +TASK="mmlu" + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT" + +# configure generation parameters +torchrun $DISTRIBUTED_ARGS evaluation.py \ + --task-data-path ${DATA_PATH} \ + --task ${TASK}\ + --load ${CHECKPOINT} \ + --use-mcore-models \ + --tensor-model-parallel-size 8 \ + --pipeline-model-parallel-size 1 \ + --gelu-tanh \ + --post-norm \ + --query-pre-attn-scalar 256 \ + --output-logit-softcapping 30.0 \ + --attn-logit-softcapping 50.0 \ + --interleave-sliding-window 4096 \ + --group-query-attention \ + --num-query-groups 8 \ + --num-layers 42 \ + --hidden-size 3584 \ + --ffn-hidden-size 14336 \ + --num-attention-heads 16 \ + --kv-channels 256 \ + --max-position-embeddings 8192 \ + --seq-length 8192 \ + --max-new-tokens 1 \ + --position-embedding-type rope \ + --disable-bias-linear \ + --normalization RMSNorm \ + --add-rmsnorm-offset \ + --input-embeds-norm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --tokenizer-not-use-fast \ + --norm-epsilon 1e-06 \ + 
--evaluation-batch-size 1 \ + --micro-batch-size 1 \ + --use-fused-rmsnorm \ + --no-masked-softmax-fusion \ + --exit-on-missing-checkpoint \ + --no-load-rng \ + --no-load-optim \ + --vocab-size 256000 \ + --make-vocab-size-divisible-by 1 \ + --bf16 \ + --seed 42 | tee logs/evaluation_gemma2_9b_mcore_${TASK}.log diff --git a/examples/mcore/gemma2/generate_gemma2_27b_ptd.sh b/examples/mcore/gemma2/generate_gemma2_27b_ptd.sh new file mode 100644 index 000000000..9272313bc --- /dev/null +++ b/examples/mcore/gemma2/generate_gemma2_27b_ptd.sh @@ -0,0 +1,66 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# Change for multinode config +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6001 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +TOKENIZER_PATH="your tokenizer directory path" +CHECKPOINT="your model directory path" + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT" + +torchrun $DISTRIBUTED_ARGS inference.py \ + --tensor-model-parallel-size 8 \ + --pipeline-model-parallel-size 1 \ + --use-mcore-models \ + --gelu-tanh \ + --post-norm \ + --query-pre-attn-scalar 144 \ + --output-logit-softcapping 30.0 \ + --attn-logit-softcapping 50.0 \ + --interleave-sliding-window 4096 \ + --group-query-attention \ + --num-query-groups 16 \ + --load ${CHECKPOINT} \ + --num-layers 46 \ + --hidden-size 4608 \ + --kv-channels 128 \ + --ffn-hidden-size 36864 \ + --num-attention-heads 32 \ + --position-embedding-type rope \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --max-new-tokens 256 \ + --micro-batch-size 1 \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --tokenizer-not-use-fast \ + --normalization RMSNorm \ + --add-rmsnorm-offset \ + --norm-epsilon 1e-06 \ + --input-embeds-norm \ + --disable-bias-linear \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --attention-softmax-in-fp32 \ + --no-load-optim \ + --no-load-rng \ + --no-masked-softmax-fusion \ + --no-gradient-accumulation-fusion \ + --exit-on-missing-checkpoint \ + --make-vocab-size-divisible-by 1 \ + --vocab-size 256000 \ + --bf16 \ + --seed 42 \ + | tee logs/generate_gemma2_27b_mcore.log diff --git a/examples/mcore/gemma2/generate_gemma2_9b_ptd.sh b/examples/mcore/gemma2/generate_gemma2_9b_ptd.sh new file mode 100644 index 000000000..75a2d4396 --- /dev/null +++ b/examples/mcore/gemma2/generate_gemma2_9b_ptd.sh @@ -0,0 +1,66 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +# Change for multinode config +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6001 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +# please fill these path configurations +TOKENIZER_PATH="your tokenizer directory path" +CHECKPOINT="your model directory path" + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT" + +torchrun $DISTRIBUTED_ARGS inference.py \ + --tensor-model-parallel-size 8 \ + --pipeline-model-parallel-size 1 \ + --use-mcore-models \ + --gelu-tanh \ + --post-norm \ + --query-pre-attn-scalar 256 \ + --output-logit-softcapping 30.0 \ + --attn-logit-softcapping 50.0 \ + --interleave-sliding-window 4096 \ + --group-query-attention \ + --num-query-groups 8 \ + --load ${CHECKPOINT} \ + --num-layers 42 \ + --hidden-size 3584 \ + --kv-channels 256 \ + --ffn-hidden-size 14336 \ +
--num-attention-heads 16 \ + --position-embedding-type rope \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --max-new-tokens 256 \ + --micro-batch-size 1 \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_PATH} \ + --tokenizer-not-use-fast \ + --normalization RMSNorm \ + --add-rmsnorm-offset \ + --norm-epsilon 1e-06 \ + --input-embeds-norm \ + --disable-bias-linear \ + --hidden-dropout 0 \ + --attention-dropout 0 \ + --attention-softmax-in-fp32 \ + --no-load-optim \ + --no-load-rng \ + --no-masked-softmax-fusion \ + --no-gradient-accumulation-fusion \ + --exit-on-missing-checkpoint \ + --make-vocab-size-divisible-by 1 \ + --vocab-size 256000 \ + --bf16 \ + --seed 42 \ + | tee logs/generate_gemma2_9b_mcore.log diff --git a/examples/mcore/gemma2/pretrain_gemma2_27b_ptd.sh b/examples/mcore/gemma2/pretrain_gemma2_27b_ptd.sh new file mode 100644 index 000000000..cdf3cc09c --- /dev/null +++ b/examples/mcore/gemma2/pretrain_gemma2_27b_ptd.sh @@ -0,0 +1,105 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=2 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_MODEL="your tokenizer path" +CKPT_LOAD_DIR="your model ckpt path" + +TP=8 +PP=2 + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --sequence-parallel \ + --use-mcore-models \ + --use-mc2 \ + --use-fused-rmsnorm \ + --use-fused-rotary-pos-emb \ + --gelu-tanh \ + --post-norm \ + --query-pre-attn-scalar 144 \ + --output-logit-softcapping 30.0 \ + --interleave-sliding-window 4096 \ + --num-layers 46 \ + --num-layer-list 20,26 \ + --hidden-size 4608 \ + --ffn-hidden-size 36864 \ + --num-attention-heads 32 \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_MODEL} \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 64 \ + --kv-channels 128 \ + --group-query-attention \ + --num-query-groups 16 \ + --make-vocab-size-divisible-by 1 \ + --lr 1.25e-6 \ + --train-iters 2000 \ + --lr-decay-style cosine \ + --disable-bias-linear \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --add-rmsnorm-offset \ + --norm-epsilon 1e-06 \ + --input-embeds-norm \ + --use-flash-attn \ + --use-distributed-optimizer \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tokenizer-padding-side left \ + --initial-loss-scale 4096 \ + --no-gradient-accumulation-fusion \ + --no-load-optim \ + --no-load-rng \ + --vocab-size 256000 \ + --bf16 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 2000 \ + --eval-interval 1000 \ + --eval-iters 0 \ +" + +torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl \ + --load ${CKPT_LOAD_DIR} \ + --save ${CKPT_SAVE_DIR} \ + | tee logs/train_gemma2_27b_mcore.log \ No newline at end of file diff --git a/examples/mcore/gemma2/pretrain_gemma2_9b_ptd.sh 
b/examples/mcore/gemma2/pretrain_gemma2_9b_ptd.sh new file mode 100644 index 000000000..50134dd27 --- /dev/null +++ b/examples/mcore/gemma2/pretrain_gemma2_9b_ptd.sh @@ -0,0 +1,104 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +CKPT_SAVE_DIR="your model save ckpt path" +DATA_PATH="your data path" +TOKENIZER_MODEL="your tokenizer path" +CKPT_LOAD_DIR="your model ckpt path" + +TP=8 +PP=1 + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --sequence-parallel \ + --use-mcore-models \ + --use-mc2 \ + --use-fused-rmsnorm \ + --use-fused-rotary-pos-emb \ + --gelu-tanh \ + --post-norm \ + --query-pre-attn-scalar 256 \ + --output-logit-softcapping 30.0 \ + --interleave-sliding-window 4096 \ + --num-layers 42 \ + --hidden-size 3584 \ + --ffn-hidden-size 14336 \ + --num-attention-heads 16 \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path ${TOKENIZER_MODEL} \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --micro-batch-size 1 \ + --global-batch-size 64 \ + --kv-channels 256 \ + --group-query-attention \ + --num-query-groups 8 \ + --make-vocab-size-divisible-by 1 \ + --lr 1.25e-6 \ + --train-iters 2000 \ + --lr-decay-style cosine \ + --disable-bias-linear \ + --attention-dropout 0.0 \ + --init-method-std 0.01 \ + --hidden-dropout 0.0 \ + --position-embedding-type rope \ + --normalization RMSNorm \ + --add-rmsnorm-offset \ + --norm-epsilon 1e-06 \ + --input-embeds-norm \ + --use-flash-attn \ + --use-distributed-optimizer \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --min-lr 1.25e-7 \ + --weight-decay 1e-1 \ + --lr-warmup-fraction 0.01 \ + --clip-grad 1.0 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --tokenizer-padding-side left \ + --initial-loss-scale 4096 \ + --no-gradient-accumulation-fusion \ + --no-load-optim \ + --no-load-rng \ + --vocab-size 256000 \ + --bf16 +" + +DATA_ARGS=" + --data-path $DATA_PATH \ + --split 100,0,0 +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 2000 \ + --eval-interval 1000 \ + --eval-iters 0 \ +" + +torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \ + $GPT_ARGS \ + $DATA_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl \ + --load ${CKPT_LOAD_DIR} \ + --save ${CKPT_SAVE_DIR} \ + | tee logs/train_gemma2_9b_mcore.log \ No newline at end of file diff --git a/modellink/arguments.py b/modellink/arguments.py index 88519f554..aa052a146 100644 --- a/modellink/arguments.py +++ b/modellink/arguments.py @@ -381,6 +381,13 @@ def _add_network_args(parser): help='Geglu activate function.') group.add_argument('--input-embeds-norm', action='store_true', default=False, help='input normalization.') + group.add_argument('--gelu-tanh', action='store_true', default=False, + help='Tanh Geglu activate function.') + group.add_argument('--output-logit-softcapping', type=float, help='output logit softcapping.') + group.add_argument('--attn-logit-softcapping', type=float, help='attention logit softcapping.') + group.add_argument('--query-pre-attn-scalar', type=int, help='attention scalar.') + group.add_argument('--interleave-sliding-window', type=int, + help='Window size when use interleave sliding window attention.') return parser diff --git a/modellink/core/models/gpt/gpt_model.py 
b/modellink/core/models/gpt/gpt_model.py index 2bc491032..e26ed5a6e 100644 --- a/modellink/core/models/gpt/gpt_model.py +++ b/modellink/core/models/gpt/gpt_model.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import torch from torch import Tensor from functools import wraps @@ -106,6 +106,10 @@ def gpt_model_forward(self, input_ids: Tensor, if args.output_multiplier_scale: logits = logits * args.output_multiplier_scale + if args.output_logit_softcapping: + logits = logits / args.output_logit_softcapping + logits = torch.tanh(logits) + logits = logits * args.output_logit_softcapping if labels is None: # [s b h] => [b s h] diff --git a/modellink/core/transformer/dot_product_attention.py b/modellink/core/transformer/dot_product_attention.py index 2a6ffad27..122501e0f 100644 --- a/modellink/core/transformer/dot_product_attention.py +++ b/modellink/core/transformer/dot_product_attention.py @@ -4,10 +4,11 @@ import math from functools import wraps +import torch from torch import Tensor import torch_npu from megatron.training import get_args -from megatron.core import mpu +from megatron.core import mpu, parallel_state, tensor_parallel from mindspeed.core.context_parallel.ring_context_parallel import ringattn_context_parallel from mindspeed.core.parallel_state import (get_context_parallel_group_for_hybrid_ring, get_context_parallel_for_hybrid_ring_world_size, @@ -60,6 +61,12 @@ def dot_product_attention_init_wrapper(fn): config.context_parallel_size = cp_size args = get_args() + self.attn_logit_softcapping = args.attn_logit_softcapping + if args.query_pre_attn_scalar: + self.norm_factor = args.query_pre_attn_scalar ** 0.5 + self.scale_mask_softmax.scale = 1.0 + self.softmax_scale = 1.0 / self.norm_factor + if args.multi_head_latent_attention: self.scale_mask_softmax.scale = True self.hidden_size_per_partition = args.num_attention_heads * args.v_head_dim @@ -85,7 +92,119 @@ def dot_product_attention_forward_wrapper(fn): if get_args().use_flash_attn: return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params) - return fn(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params) + + assert packed_seq_params is None, ( + "Packed sequence is not supported by DotProductAttention." + "Please use TEDotProductAttention instead." + ) + + # =================================== + # Raw attention scores. [b, n/p, s, s] + # =================================== + + # expand the key and value [sk, b, ng, hn] -> [sk, b, np, hn] + # This is a noop for normal attention where ng == np. When using group query attention this + # creates a view that has the keys and values virtually repeated along their dimension to + # match the number of queries. + + # attn_mask_type is not used. 
+ if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: + key = key.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + value = value.repeat_interleave( + self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 + ) + + # [b, np, sq, sk] + output_size = ( + query.size(1), + query.size(2), + query.size(0), + key.size(0), + ) + + # [sq, b, np, hn] -> [sq, b * np, hn] + # This will be a simple view when doing normal attention, but in group query attention + # the key and value tensors are repeated to match the queries so you can't use simple strides + # to extract the queries. + query = query.reshape(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key = key.view(output_size[3], output_size[0] * output_size[1], -1) + + # preallocting input tensor: [b * np, sq, sk] + matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor( + (output_size[0] * output_size[1], output_size[2], output_size[3]), query.dtype, "mpu", + ) + + # Raw attention scores. [b * np, sq, sk] + matmul_result = torch.baddbmm( + matmul_input_buffer, + query.transpose(0, 1), # [b * np, sq, hn] + key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=(1.0 / self.norm_factor), + ) + + if self.attn_logit_softcapping is not None: + matmul_result = matmul_result / self.attn_logit_softcapping + matmul_result = torch.tanh(matmul_result) + matmul_result = matmul_result * self.attn_logit_softcapping + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [b, np, sq, sk] + attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + + if not self.config.sequence_parallel: + with tensor_parallel.get_cuda_rng_tracker().fork(): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = ( + value.size(1), + value.size(2), + query.size(0), + value.size(3), + ) + + # change view [sk, b * np, hn] + value = value.view(value.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context = torch.bmm(attention_probs, value.transpose(0, 1)) + + # change view [b, np, sq, hn] + context = context.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context = context.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context.size()[:-2] + (self.hidden_size_per_partition,) + context = context.view(*new_context_shape) + + return context return wrapper diff --git a/modellink/core/transformer/mlp.py b/modellink/core/transformer/mlp.py index 0cbd49e1c..c0eb975ed 100644 --- a/modellink/core/transformer/mlp.py +++ b/modellink/core/transformer/mlp.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import math from functools import wraps import torch import torch.nn.functional as F @@ -85,6 +85,14 @@ def core_mlp_init(self, config, submodules, is_expert=False, input_size=None): self.config.activation_func = F.gelu self.config.bias_gelu_fusion = False + if _args.gelu_tanh: + def gelu_tanh_approximation(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + self.config.gated_linear_unit = True + self.config.activation_func = gelu_tanh_approximation + self.config.bias_gelu_fusion = False + ffn_hidden_size = self.config.ffn_hidden_size if self.config.gated_linear_unit: ffn_hidden_size *= 2 diff --git a/modellink/core/transformer/transformer_block.py b/modellink/core/transformer/transformer_block.py index 330228eb9..a51c1f0b8 100644 --- a/modellink/core/transformer/transformer_block.py +++ b/modellink/core/transformer/transformer_block.py @@ -142,7 +142,8 @@ def transformer_block_forward( # already creates viewless tensors. That said, make_viewless_tensor() # is called here to be future-proof and corner-case-proof. 
if self.input_embeds_norm and self.pre_process: - hidden_states = hidden_states * (self.hidden_size ** 0.5) + normalizer = torch.tensor(self.hidden_size ** 0.5, dtype=hidden_states.dtype) + hidden_states = hidden_states * normalizer hidden_states = make_viewless_tensor( inp=hidden_states, requires_grad=True, keep_graph=True, diff --git a/modellink/core/transformer/transformer_layer.py b/modellink/core/transformer/transformer_layer.py index 9895175e4..ad0351d42 100644 --- a/modellink/core/transformer/transformer_layer.py +++ b/modellink/core/transformer/transformer_layer.py @@ -17,6 +17,7 @@ from functools import wraps from dataclasses import dataclass, field from typing import Dict, Union +import torch from megatron.core.transformer.identity_op import IdentityFuncOp, IdentityOp from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.moe.moe_layer import MoELayer @@ -77,6 +78,9 @@ def transformer_layer_init_wrapper(fn): expert.layer_number = self.layer_number else: self.mlp.layer_number = self.layer_number + + self.is_sliding = not bool((self.layer_number - 1) % 2) + self.interleave_sliding_window = args_pos_norm.interleave_sliding_window return wrapper @@ -88,6 +92,15 @@ def transformer_layer_forward(self, hidden_states, attention_mask, context=None, # hidden_states: [s, b, h] args_pos_norm = get_args() + if self.interleave_sliding_window is not None and self.is_sliding and attention_mask is not None: + min_dtype = torch.finfo(hidden_states.dtype).min + sliding_window_mask = torch.tril( + torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.interleave_sliding_window + ) + attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask).bool() + # when decoding + if attention_mask.shape[-1] <= 1: + attention_mask = attention_mask[:, :, :, -self.interleave_sliding_window:] # Residual connection. residual = hidden_states diff --git a/tools/checkpoint/convert_ckpt.py b/tools/checkpoint/convert_ckpt.py index 7f921017e..16b15708a 100644 --- a/tools/checkpoint/convert_ckpt.py +++ b/tools/checkpoint/convert_ckpt.py @@ -63,7 +63,7 @@ def main(): help='Do not perform checking on the name and ordering of weights', dest='checking') parser.add_argument('--model-type-hf', type=str, default="llama2", - choices=['llama2', 'mixtral', 'chatglm3', 'gemma', 'bloom', 'qwen'], help='model-type') + choices=['llama2', 'mixtral', 'chatglm3', 'gemma', 'gemma2', 'bloom', 'qwen'], help='model-type') known_args, _ = parser.parse_known_args() loader = load_plugin('loader', known_args.loader) saver = load_plugin('saver', known_args.saver) diff --git a/tools/checkpoint/loader_hf_mcore.py b/tools/checkpoint/loader_hf_mcore.py index 366da2b2d..82394c613 100644 --- a/tools/checkpoint/loader_hf_mcore.py +++ b/tools/checkpoint/loader_hf_mcore.py @@ -55,6 +55,8 @@ def add_arguments(parser): 'This is added for computational efficiency reasons.') group.add_argument('--use-mcore-models', action='store_true', help='Use the implementation from megatron core') + group.add_argument('--post-norm', action='store_true', + help='post norm after attention or mlp.') def verify_transformers_version(): @@ -116,11 +118,17 @@ def get_message_preprocess(model, md): return message -def get_message_layer_norm(message, model, layer_idx, md): +def get_message_layer_norm(message, model, layer_idx, md, args=None): # Get non-parallel tensors from tp_rank 0. 
message["input norm weight"] = model.get_layers_input_layernorm_weight(layer_idx=layer_idx) message["post norm weight"] = model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=layer_idx) + if args.post_norm: + message["post norm weight"] = model.get_layers_self_attention_post_attention_layernorm_weight( + layer_idx=layer_idx) + message["pre mlp norm weight"] = model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=layer_idx) + message["post mlp norm weight"] = model.get_layers_self_attention_post_mlp_layernorm_weight(layer_idx=layer_idx) + if md.norm_has_bias: message["input norm bias"] = model.get_layers_input_layernorm_bias(layer_idx=layer_idx) message["post norm bias"] = model.get_layers_self_attention_pre_mlp_layernorm_bias(layer_idx=layer_idx) @@ -272,7 +280,7 @@ def _load_checkpoint(queue, args): for layer_idx in range(margs.num_layers): # Grab all parallel tensors for this layer. message = {} - message = get_message_layer_norm(message, model_mg, layer_idx, md) + message = get_message_layer_norm(message, model_mg, layer_idx, md, args) message = get_message_layer_attn(message, model_mg, layer_idx, md, args) message = get_message_layer_mlp(message, model_mg, layer_idx, md) diff --git a/tools/checkpoint/model_cfg.json b/tools/checkpoint/model_cfg.json index 408a30655..db2b848e5 100644 --- a/tools/checkpoint/model_cfg.json +++ b/tools/checkpoint/model_cfg.json @@ -94,11 +94,26 @@ "gemma": { "__base__": "base", "config_set_value": { - "seq_length": 4096, + "seq_length": 8192, "tie_word_embeddings": true, "kv_channels": 256 } }, + "gemma2": { + "__base__": "base", + "config_set_value": { + "seq_length": 8192, + "tie_word_embeddings": true + }, + "config_hf_key_mapping": { + "kv_channels": "head_dim" + }, + "model_hf_key_mapping": { + "layers_self_attention_post_attention_layernorm": "model.layers[layer_idx].post_attention_layernorm", + "layers_self_attention_pre_mlp_layernorm": "model.layers[layer_idx].pre_feedforward_layernorm", + "layers_self_attention_post_mlp_layernorm": "model.layers[layer_idx].post_feedforward_layernorm" + } + }, "bloom": { "__base__": "base", "config_set_value": { diff --git a/tools/checkpoint/models.py b/tools/checkpoint/models.py index 72f79d306..0a504d110 100644 --- a/tools/checkpoint/models.py +++ b/tools/checkpoint/models.py @@ -160,10 +160,16 @@ class ModelBase(abc.ABC): self.set_attn_state(layer_idx, src_model) self.set_mlp_state(layer_idx, src_model) input_layernorm_weight = src_model.get_layers_input_layernorm_weight(layer_idx=layer_idx) - pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=layer_idx) self.set_layers_input_layernorm_weight(layer_idx=layer_idx, data=input_layernorm_weight) - self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=layer_idx, data=pre_mlp_layernorm_weight) - + if self.args.post_norm: + post_attn_layernorm_weight = src_model.get_layers_self_attention_post_attention_layernorm_weight( + layer_idx=layer_idx) + self.set_layers_self_attention_post_attention_layernorm_weight(layer_idx=layer_idx, + data=post_attn_layernorm_weight) + else: + pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=layer_idx) + self.set_layers_self_attention_pre_mlp_layernorm_weight(layer_idx=layer_idx, data=pre_mlp_layernorm_weight) + if self.has_layers_input_layernorm_bias(layer_idx=layer_idx): input_layernorm_bias = src_model.get_layers_input_layernorm_bias(layer_idx=layer_idx) 
self.set_layers_input_layernorm_bias(layer_idx=layer_idx, data=input_layernorm_bias) @@ -198,6 +204,12 @@ class ModelBase(abc.ABC): fc2_bias = src_model.get_layers_mlp_linear_fc2_bias(**kwargs) self.set_layers_mlp_linear_fc2_bias(data=fc2_bias, **kwargs) + if self.args.post_norm: + pre_mlp_layernorm_weight = src_model.get_layers_self_attention_pre_mlp_layernorm_weight(**kwargs) + post_mlp_layernorm_weight = src_model.get_layers_self_attention_post_mlp_layernorm_weight(**kwargs) + self.set_layers_self_attention_pre_mlp_layernorm_weight(data=pre_mlp_layernorm_weight, **kwargs) + self.set_layers_self_attention_post_mlp_layernorm_weight(data=post_mlp_layernorm_weight, **kwargs) + def set_mlp_state(self, layer_idx, src_model): args = src_model.get_args() kwargs = {'layer_idx': layer_idx} @@ -301,6 +313,7 @@ class HuggingfaceModel(ModelBase): self.args = SimpleNamespace(**self.args) self.args.add_qkv_bias = self.args_cmd.add_qkv_bias self.args.add_dense_bias = self.args_cmd.add_dense_bias + self.args.post_norm = self.args_cmd.post_norm def get_modules_from_pretrained(self, device_map="cpu", trust_remote_code=True): # Load Huggingface model. @@ -575,6 +588,7 @@ class MegatronModel(ModelBase): self.args.w_pack = self.args_cmd.w_pack self.args.add_qkv_bias = self.args_cmd.add_qkv_bias self.args.add_dense_bias = self.args_cmd.add_dense_bias + self.args.post_norm = self.args_cmd.post_norm self.args.tokenizer_model = getattr(self.args_cmd, 'tokenizer_model', None) self.args.make_vocab_size_divisible_by = getattr(self.args_cmd, 'make_vocab_size_divisible_by', None) if self.args_cmd.params_dtype == 'bf16': @@ -870,7 +884,7 @@ class MegatronMCoreModel(MegatronModel): "layers_self_attention_linear_qkv": module_layer + "self_attention.linear_qkv", "layers_self_attention_q_layernorm": module_layer + "self_attention.q_layernorm", "layers_self_attention_k_layernorm": module_layer + "self_attention.k_layernorm", - "layers_self_attention_post_attention_layernorm": module_layer + "pre_mlp_layernorm", + "layers_self_attention_post_attention_layernorm": module_layer + "post_attn_norm", "layers_self_attention_pre_mlp_layernorm": module_layer + "pre_mlp_layernorm", "layers_mlp_linear_fc1": module_layer + "mlp.linear_fc1", "layers_mlp_linear_fc2": module_layer + "mlp.linear_fc2", diff --git a/tools/checkpoint/saver_mg_mcore.py b/tools/checkpoint/saver_mg_mcore.py index 81c9a63f0..22049e3bf 100644 --- a/tools/checkpoint/saver_mg_mcore.py +++ b/tools/checkpoint/saver_mg_mcore.py @@ -164,6 +164,10 @@ def set_model_layer_norm(model_mg, msg, md, **kwargs): margs = model_mg.get_args() + post_norm = margs.post_norm + if post_norm: + pre_mlp_norm_weight = msg.pop("pre mlp norm weight") + post_mlp_norm_weight = msg.pop("post mlp norm weight") # Save them to the model for ep_rank in range(margs.expert_model_parallel_size): kwargs["ep_rank"] = ep_rank @@ -174,6 +178,10 @@ def set_model_layer_norm(model_mg, msg, md, **kwargs): if input_norm_bias is not None: model_mg.set_layers_input_layernorm_bias(**kwargs, data=input_norm_bias) model_mg.set_layers_self_attention_pre_mlp_layernorm_weight(**kwargs, data=post_norm_weight) + if post_norm: + model_mg.set_layers_self_attention_pre_mlp_layernorm_weight(**kwargs, data=pre_mlp_norm_weight) + model_mg.set_layers_self_attention_post_attention_layernorm_weight(**kwargs, data=post_norm_weight) + model_mg.set_layers_self_attention_post_mlp_layernorm_weight(**kwargs, data=post_mlp_norm_weight) if post_norm_bias is not None: model_mg.set_layers_self_attention_pre_mlp_layernorm_bias(**kwargs, 
data=post_norm_bias)
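
The two soft-capping arguments wired in above apply the same tanh squashing: --output-logit-softcapping 30.0 on the LM-head logits in gpt_model.py and --attn-logit-softcapping 50.0 on the raw attention scores in dot_product_attention.py. A minimal standalone sketch of the operation, with hypothetical tensor shapes (not ModelLink code):

    import torch

    def softcap(scores: torch.Tensor, cap: float) -> torch.Tensor:
        # Divide, squash with tanh, rescale: values are smoothly bounded to (-cap, cap)
        # instead of being hard-clipped, so gradients stay non-zero everywhere.
        return cap * torch.tanh(scores / cap)

    logits = torch.randn(1, 8, 256000)        # hypothetical [b, s, vocab] output logits
    attn_scores = torch.randn(1, 16, 8, 8)    # hypothetical [b, heads, sq, sk] attention scores
    logits = softcap(logits, 30.0)            # --output-logit-softcapping
    attn_scores = softcap(attn_scores, 50.0)  # --attn-logit-softcapping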
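
--query-pre-attn-scalar replaces the usual 1/sqrt(head_dim) softmax scaling: the init wrapper above sets norm_factor = sqrt(query_pre_attn_scalar), so attention scores are scaled by 1/sqrt(scalar). A worked check using the 27B values from the scripts above (illustrative arithmetic only):

    import math

    # Gemma2-27B values taken from the scripts above
    hidden_size, num_heads, head_dim = 4608, 32, 128   # --hidden-size, --num-attention-heads, --kv-channels
    scalar = 144                                       # --query-pre-attn-scalar

    assert scalar == hidden_size // num_heads          # 144 = 4608 / 32, not the head dim of 128
    softmax_scale = 1.0 / math.sqrt(scalar)            # ~0.0833 instead of 1/sqrt(128) ~ 0.0884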
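
In the unfused attention path added to dot_product_attention.py, grouped key/value heads are expanded with repeat_interleave so they line up one-to-one with the query heads before the batched matmuls. A toy illustration with made-up shapes roughly matching the 9B configuration:

    import torch

    sq, sk, b = 8, 8, 1          # toy query/key lengths and batch size
    np_, ng, hn = 16, 8, 256     # query heads, KV groups, head dim (9B-like: 16 heads, 8 groups)

    query = torch.randn(sq, b, np_, hn)
    key = torch.randn(sk, b, ng, hn)
    value = torch.randn(sk, b, ng, hn)

    # Each KV group serves np_ // ng query heads, so repeat along the head dimension.
    key = key.repeat_interleave(np_ // ng, dim=2)      # -> [sk, b, np_, hn]
    value = value.repeat_interleave(np_ // ng, dim=2)  # -> [sk, b, np_, hn]
    assert key.shape == value.shape == (sk, b, np_, hn)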
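
--gelu-tanh switches the MLP to the gated, tanh-approximated GELU. The closed form added in core_mlp_init is the same approximation that recent PyTorch releases expose as F.gelu(..., approximate="tanh"); a quick numerical sanity check:

    import math
    import torch
    import torch.nn.functional as F

    def gelu_tanh_approximation(x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    x = torch.linspace(-4.0, 4.0, steps=101)
    assert torch.allclose(gelu_tanh_approximation(x), F.gelu(x, approximate="tanh"), atol=1e-5)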
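
The transformer_block.py change scales the input embeddings by sqrt(hidden_size) through a tensor created in the activation dtype rather than a Python float, so the bf16 rounding of the normalizer matches the Hugging Face Gemma2 reference. A minimal sketch with a hypothetical embedding tensor:

    import torch

    hidden_size = 3584                                         # Gemma2-9B hidden size
    x = torch.randn(2, 4, hidden_size, dtype=torch.bfloat16)   # hypothetical [b, s, h] embeddings

    # Cast the scalar to the activation dtype first, then multiply (as the patch does).
    normalizer = torch.tensor(hidden_size ** 0.5, dtype=x.dtype)
    x = x * normalizer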
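
--interleave-sliding-window 4096 restricts attention on alternating layers (is_sliding is true for odd layer_number) so each token only sees the previous 4096 positions, while the remaining layers keep global causal attention. A boolean-mask sketch with toy sizes, assuming the convention that True marks positions to be masked out:

    import torch

    seq_len, window = 8, 3   # toy sizes; the scripts above use a 4096-token window

    # Causal mask: True marks positions that must not be attended to.
    causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

    # Additionally mask everything more than `window - 1` tokens in the past,
    # mirroring the torch.tril(..., diagonal=-window) construction in transformer_layer.py.
    too_far = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=-window)

    sliding = causal | too_far
    # Row i now attends only to columns max(0, i - window + 1) .. i.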
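
The --post-norm plumbing exists because a Gemma2 decoder layer carries four RMSNorms rather than the usual two; the gemma2 entry in model_cfg.json maps the extra Hugging Face weights onto the new mcore-side hooks. A summary of the correspondence, as I read the mapping above (the input_layernorm row comes from the base config and is assumed here):

    # Gemma2 HF layer norm -> converter hook (per the model_cfg.json entry above)
    GEMMA2_NORM_MAPPING = {
        "input_layernorm":            "layers_input_layernorm",                          # before attention
        "post_attention_layernorm":   "layers_self_attention_post_attention_layernorm",  # after attention (--post-norm)
        "pre_feedforward_layernorm":  "layers_self_attention_pre_mlp_layernorm",         # before MLP
        "post_feedforward_layernorm": "layers_self_attention_post_mlp_layernorm",        # after MLP (--post-norm)
    }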