!1658 【mcore】Add baichuan2-13B and deepseek2-coder adaptation

Merge pull request !1658 from xiongliangcheng/deepseek-coder
This commit is contained in:
xiongliangcheng 2024-09-12 01:04:21 +00:00 committed by i-robot
parent b3c029ddd9
commit 2b84f8f52a
14 changed files with 545 additions and 21 deletions

View File

@ -186,10 +186,10 @@ ModelLink trains large language models via model parallelism and data parallelism; to demonstrate
<tr>
<td><a href="https://huggingface.co/baichuan-inc/Baichuan2-13B-Base/tree/main">13B</a></td>
<td>4K</td>
<th>Legacy</th>
<th>Mcore</th>
<td> 1x8</td>
<td> BF16 </td>
<td> 1668 </td>
<td> 1754 </td>
<td> -- </td>
<td> 2062 </td>
<td><center>【Ascend】</td>

View File

@ -72,7 +72,7 @@ def main():
help='Do not perform checking on the name and ordering of weights',
dest='checking')
parser.add_argument('--model-type-hf', type=str, default="llama2",
choices=['llama2', 'mixtral', 'chatglm3', 'gemma', 'gemma2', 'bloom', 'qwen', 'internlm2', 'deepseek2', 'minicpm', 'minicpm-moe'],
choices=['baichuan2', 'llama2', 'mixtral', 'chatglm3', 'gemma', 'gemma2', 'bloom', 'qwen', 'internlm2', 'deepseek2', 'minicpm', 'minicpm-moe'],
help='model type of huggingface')
known_args, _ = parser.parse_known_args()

View File

@ -0,0 +1,16 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Weight format conversion (HF -> mcore)
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf baichuan2 \
--model-type GPT \
--load-model-type hf \
--save-model-type mg \
--params-dtype bf16 \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--load-dir ./model_from_hf/Baichuan2-13B_hf/ \
--save-dir ./model_weights/Baichuan2-13B_mcore/ \
--tokenizer-model ./model_from_hf/Baichuan2-13B_hf/tokenizer.model

View File

@ -0,0 +1,15 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Weight format conversion (mcore -> HF)
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf baichuan2 \
--model-type GPT \
--load-model-type mg \
--save-model-type hf \
--params-dtype bf16 \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--load-dir ./model_weights/Baichuan2-13B_mcore/ \
--save-dir ./model_from_hf/Baichuan2-13B_hf/

View File

@ -0,0 +1,12 @@
# Please modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
mkdir ./dataset
# Dataset download URL: https://huggingface.co/datasets/tatsu-lab/alpaca/resolve/main/data/train-00000-of-00001-a09b74b3ef9c3b56.parquet
python ./preprocess_data.py \
--input ./dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet \
--tokenizer-name-or-path ./model_from_hf/Baichuan2-13B_hf/ \
--output-prefix ./dataset/alpaca \
--workers 4 \
--log-interval 1000 \
--tokenizer-type PretrainedFromHF

View File

@ -0,0 +1,53 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
CHECKPOINT="Your ckpt file path"
TOKENIZER_PATH="Your tokenizer path"
DATA_PATH="./boolq/data/test/"
TASK="boolq"
# Different tasks need different max_new_tokens values; please follow the instructions in the README.
torchrun $DISTRIBUTED_ARGS evaluation.py \
--task-data-path ${DATA_PATH} \
--task ${TASK} \
--seq-length 4096 \
--max-new-tokens 1 \
--max-position-embeddings 4096 \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--disable-bias-linear \
--swiglu \
--position-embedding-type alibi \
--square-alibi-mask \
--fill-neg-inf \
--load $CHECKPOINT \
--normalization RMSNorm \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
--fp16 \
--micro-batch-size 1 \
--use-fused-rmsnorm \
--exit-on-missing-checkpoint \
--no-load-rng \
--no-load-optim \
--untie-embeddings-and-output-weights \
--no-masked-softmax-fusion \
--make-vocab-size-divisible-by 32 \
--use-mcore-models \
--seed 42 | tee logs/eval_baichuan2_13b_mcore_${TASK}.log

View File

@ -0,0 +1,53 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Please fill in these path configurations
CHECKPOINT="your model directory path"
TOKENIZER_PATH="your tokenizer directory path"
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6001
NNODES=1
NODE_RANK=0
NPUS_PER_NODE=8
WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES))
DISTRIBUTED_ARGS="--nproc_per_node $NPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
torchrun $DISTRIBUTED_ARGS inference.py \
--use-mcore-models \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--seq-length 1024 \
--max-new-tokens 256 \
--micro-batch-size 1 \
--global-batch-size 16 \
--num-attention-heads 40 \
--max-position-embeddings 2048 \
--position-embedding-type alibi \
--square-alibi-mask \
--fill-neg-inf \
--swiglu \
--load ${CHECKPOINT} \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_PATH} \
--tokenizer-not-use-fast \
--fp16 \
--normalization RMSNorm \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-softmax-in-fp32 \
--no-load-optim \
--no-load-rng \
--no-masked-softmax-fusion \
--no-gradient-accumulation-fusion \
--exit-on-missing-checkpoint \
--make-vocab-size-divisible-by 32 \
| tee logs/generate_baichuan2_13b_mcore.log

View File

@ -0,0 +1,100 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer model path"
CKPT_LOAD_DIR="your model load ckpt path"
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--use-mcore-models \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--use-mc2 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 4096 \
--disable-bias-linear \
--max-position-embeddings 4096 \
--micro-batch-size 2 \
--global-batch-size 128 \
--untie-embeddings-and-output-weights \
--no-gradient-accumulation-fusion \
--make-vocab-size-divisible-by 32 \
--lr 1e-5 \
--load ${CKPT_LOAD_DIR} \
--train-iters 2000 \
--lr-decay-style cosine \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--position-embedding-type alibi \
--hidden-dropout 0.0 \
--norm-epsilon 1e-6 \
--normalization RMSNorm \
--use-fused-rmsnorm \
--use-flash-attn \
--use-fused-swiglu \
--swiglu \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--square-alibi-mask \
--fill-neg-inf \
--min-lr 1e-8 \
--weight-decay 1e-4 \
--clip-grad 1.0 \
--seed 1234 \
--adam-beta1 0.9 \
--initial-loss-scale 8188.0 \
--adam-beta2 0.98 \
--adam-eps 1.0e-8 \
--no-load-optim \
--no-load-rng \
--bf16
"
DATA_ARGS="
--data-path ${DATA_PATH} \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 0
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl \
--save ${CKPT_SAVE_DIR} \
| tee logs/train_baichuan2_13b_mcore.log

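Note: for the parallel layout above (8 NPUs, TP=8, PP=1), the data-parallel size and gradient-accumulation steps follow from standard Megatron batch arithmetic. A quick sanity check (a sketch, assuming no context or expert parallelism):

# Batch-size arithmetic implied by the script above (sketch; assumes no
# context- or expert-parallelism).
world_size, tp, pp = 8, 8, 1          # NPUS_PER_NODE, --tensor-..., --pipeline-...
micro_batch, global_batch = 2, 128    # --micro-batch-size, --global-batch-size

dp = world_size // (tp * pp)                      # data-parallel size: 1
grad_accum = global_batch // (micro_batch * dp)   # 64 micro-batches per optimizer step
assert micro_batch * dp * grad_accum == global_batch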
View File

@ -0,0 +1,18 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Weight format conversion (HF -> mcore); set the required parallel strategy
python convert_ckpt.py \
--use-mcore-models \
--moe-grouped-gemm \
--model-type-hf deepseek2 \
--model-type GPT \
--load-model-type hf \
--save-model-type mg \
--params-dtype bf16 \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--target-expert-parallel-size 8 \
--load-dir ./model_from_hf/deepseek2-coder-hf/ \
--save-dir ./model_weights/deepseek2-coder-mcore/ \
--tokenizer-model ./model_from_hf/deepseek2-coder-hf/

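Note: with --target-expert-parallel-size 8, the 160 routed experts of deepseek2-coder (see the pretrain script below) are sharded across the expert-parallel group. A minimal sketch of a contiguous expert-to-rank assignment (illustrative only; the converter's actual layout may differ):

# Illustrative sketch: mapping 160 routed experts onto EP=8 ranks with
# contiguous blocking (assumed layout, not the converter's real code).
num_experts = 160
ep_size = 8
experts_per_rank = num_experts // ep_size   # 20 experts per rank

def experts_on_rank(ep_rank: int) -> range:
    start = ep_rank * experts_per_rank
    return range(start, start + experts_per_rank)

assert list(experts_on_rank(0))[:3] == [0, 1, 2]
assert list(experts_on_rank(7))[-1] == 159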
View File

@ -0,0 +1,17 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Weight format conversion (mcore -> HF)
python convert_ckpt.py \
--use-mcore-models \
--moe-grouped-gemm \
--model-type-hf deepseek2 \
--model-type GPT \
--load-model-type mg \
--save-model-type hf \
--params-dtype bf16 \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--target-expert-parallel-size 1 \
--load-dir ./model_weights/deepseek2-coder-mcore/ \
--save-dir ./model_from_hf/deepseek2-coder-hf/

View File

@ -0,0 +1,14 @@
# Please modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
mkdir ./dataset
# Dataset download URL: https://huggingface.co/datasets/lsb/enwiki20230101/blob/main/data/train-00000-of-00042-d964455e17e96d5a.parquet
python ./preprocess_data.py \
--input ./dataset/train-00000-of-00042-d964455e17e96d5a.parquet \
--tokenizer-name-or-path ./model_from_hf/deepseek2-coder-hf/ \
--tokenizer-type PretrainedFromHF \
--handler-name GeneralPretrainHandler \
--output-prefix ./dataset/enwiki \
--json-keys text \
--workers 4 \
--log-interval 1000

View File

@ -0,0 +1,147 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CKPT_SAVE_DIR="your model save ckpt path"
DATA_PATH="your data path"
TOKENIZER_MODEL="your tokenizer path"
CKPT_LOAD_DIR="your model ckpt path"
TP=1
PP=1
EP=8
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
MLA_ARGS="
--multi-head-latent-attention \
--qk-rope-head-dim 64 \
--qk-nope-head-dim 128 \
--q-lora-rank 1536 \
--kv-lora-rank 512 \
--v-head-dim 128 \
--qk-layernorm \
"
MOE_ARGS="
--moe-grouped-gemm \
--moe-permutation-async-comm \
--moe-token-dispatcher-type allgather \
--first-k-dense-replace 1 \
--moe-layer-freq 1 \
--n-shared-experts 2 \
--num-experts 160 \
--moe-router-topk 6 \
--moe-intermediate-size 1536 \
--moe-router-load-balancing-type group_limited_greedy \
--topk-group 3 \
--moe-aux-loss-coeff 0.003 \
--moe-device-level-aux-loss-coeff 0.05 \
--moe-comm-aux-loss-coeff 0.02 \
--routed-scaling-factor 16.0 \
--seq-aux
"
ROPE_ARGS="
--rope-scaling-beta-fast 32 \
--rope-scaling-beta-slow 1 \
--rope-scaling-factor 40 \
--rope-scaling-mscale 0.707 \
--rope-scaling-mscale-all-dim 0.707 \
--rope-scaling-original-max-position-embeddings 4096 \
--rope-scaling-type yarn
"
GPT_ARGS="
--load $CKPT_LOAD_DIR \
--use-distributed-optimizer \
--use-flash-attn \
--shape-order BNSD \
--use-mcore-models \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--expert-model-parallel-size ${EP} \
--sequence-parallel \
--output-layer-slice-num 10 \
--num-layers 2 \
--hidden-size 5120 \
--ffn-hidden-size 12288 \
--num-attention-heads 128 \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_MODEL} \
--seq-length 8192 \
--max-position-embeddings 163840 \
--micro-batch-size 1 \
--global-batch-size 64 \
--make-vocab-size-divisible-by 1 \
--lr 1.0e-5 \
--train-iters 2000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.02 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rotary-pos-emb \
--use-rotary-position-embeddings \
--use-fused-swiglu \
--use-fused-rmsnorm \
--swiglu \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.0e-7 \
--weight-decay 1e-2 \
--lr-warmup-iters 500 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--adam-beta2 0.999 \
--initial-loss-scale 65536 \
--vocab-size 102400 \
--padded-vocab-size 102400 \
--rotary-base 10000 \
--no-gradient-accumulation-fusion \
--norm-epsilon 1e-6 \
--no-load-optim \
--no-load-rng \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 0 \
--no-save-optim \
--no-save-rng
"
python -m torch.distributed.launch $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
$MLA_ARGS \
$ROPE_ARGS \
$MOE_ARGS \
--distributed-backend nccl \
--save $CKPT_SAVE_DIR \
| tee logs/pretrain_deepseek2_ptd_8p.log

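Note: --moe-router-load-balancing-type group_limited_greedy with --topk-group 3 describes DeepSeek-V2-style routing: experts are partitioned into groups, only the top topk-group groups (ranked by their best affinity) stay eligible, and the final top-k experts are chosen within them. A minimal sketch of that selection, assuming 8 expert groups and softmax affinities (illustrative, not this repo's router code):

import torch

def group_limited_greedy_topk(scores, n_group, topk_group, top_k):
    # scores: [num_tokens, num_experts] router affinities.
    num_tokens, num_experts = scores.shape
    # Per-group best affinity, then keep the topk_group best groups per token.
    group_scores = scores.view(num_tokens, n_group, -1).max(dim=-1).values
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1.0)
    # Mask out experts belonging to dropped groups, then pick top_k experts.
    expert_mask = group_mask.unsqueeze(-1).expand(num_tokens, n_group, num_experts // n_group)
    masked_scores = scores.masked_fill(expert_mask.reshape(num_tokens, -1) == 0, float('-inf'))
    return torch.topk(masked_scores, k=top_k, dim=-1)

# e.g. 160 experts in (assumed) 8 groups, keep 3 groups, route each token to 6 experts
weights, expert_idx = group_limited_greedy_topk(torch.rand(4, 160), n_group=8, topk_group=3, top_k=6)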
View File

@ -17,6 +17,7 @@ from mindspeed.core.parallel_state import (get_context_parallel_group_for_hybrid
from modellink.model.transformer import get_attention_mask
from modellink.core.models.common.embeddings.rotary_pos_embedding import yarn_get_mscale
from modellink.utils import get_actual_seq_len
from modellink.model.alibi import Alibi
try:
from einops import rearrange
@ -63,6 +64,19 @@ def dot_product_attention_init_wrapper(fn):
args = get_args()
self.attn_logit_softcapping = args.attn_logit_softcapping
self.square_alibi_mask = args.square_alibi_mask
self.fill_neg_inf = args.fill_neg_inf
self.beta = 1.0
self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling
if self.apply_query_key_layer_scaling:
self.beta = 1.0 / self.layer_number
if args.position_embedding_type == 'alibi':
get_alibi(self, args.seq_length)
self.alibi_output_size = None
else:
self.alibi = None
if args.query_pre_attn_scalar:
self.norm_factor = args.query_pre_attn_scalar ** 0.5
self.scale_mask_softmax.scale = 1.0
@ -87,6 +101,21 @@ def dot_product_attention_init_wrapper(fn):
return wrapper
def get_alibi(self, seq_length):
args = get_args()
self.alibi = Alibi()
alibi = self.alibi._build_alibi_tensor(seq_length,
args.num_attention_heads,
args.square_alibi_mask,
args.fill_neg_inf,
).to(torch.cuda.current_device())
if args.params_dtype == torch.float16:
alibi = alibi.to(torch.float16)
elif args.params_dtype == torch.bfloat16:
alibi = alibi.to(torch.bfloat16)
self.alibi.alibi = alibi
def dot_product_attention_forward_wrapper(fn):
@wraps(fn)
def wrapper(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params):
@ -134,18 +163,27 @@ def dot_product_attention_forward_wrapper(fn):
key = key.view(output_size[3], output_size[0] * output_size[1], -1)
# preallocating input tensor: [b * np, sq, sk]
matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor(
(output_size[0] * output_size[1], output_size[2], output_size[3]), query.dtype, "mpu",
)
if self.alibi is None:
matmul_input_buffer = parallel_state.get_global_memory_buffer().get_tensor(
(output_size[0] * output_size[1], output_size[2], output_size[3]), query.dtype, "mpu",
)
# Raw attention scores. [b * np, sq, sk]
matmul_result = torch.baddbmm(
matmul_input_buffer,
query.transpose(0, 1), # [b * np, sq, hn]
key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
beta=0.0,
alpha=(1.0 / self.norm_factor),
)
# Raw attention scores. [b * np, sq, sk]
matmul_result = torch.baddbmm(
matmul_input_buffer,
query.transpose(0, 1), # [b * np, sq, hn]
key.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
beta=0.0,
alpha=(1.0 / self.norm_factor),
)
else:
if self.alibi.alibi_pse is None or self.alibi.output_size != output_size:
self.alibi.output_size = output_size
self.alibi.get_alibi_pse(attention_mask, output_size[0], output_size[2], output_size[3])
q_trans = query.transpose(0, 1).contiguous()
k_trans = key.transpose(0, 1).transpose(1, 2).contiguous()
matmul_result = self.beta * self.alibi.alibi_pse + torch.bmm(q_trans, k_trans) * (1.0 / self.norm_factor)
if self.attn_logit_softcapping is not None:
matmul_result = matmul_result / self.attn_logit_softcapping
@ -160,7 +198,13 @@ def dot_product_attention_forward_wrapper(fn):
# ===========================
# attention scores and attention mask [b, np, sq, sk]
attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask)
if self.square_alibi_mask:
attention_scores = torch.max(
attention_scores, torch.tensor(torch.finfo(attention_scores.dtype).min)
)
attention_probs = torch.nn.functional.softmax(attention_scores, -1)
else:
attention_probs: Tensor = self.scale_mask_softmax(attention_scores, attention_mask)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
@ -224,7 +268,7 @@ def dot_product_attention_forward(
args = get_args()
seq_length, _, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3]
seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3]
actual_seq_len = None
if args.reset_attention_mask or args.reset_position_ids:
query, key, value = [rearrange(x, 's b h d -> (s b) h d') for x in [query, key, value]]
@ -241,9 +285,18 @@ def dot_product_attention_forward(
raise AssertionError("self.hidden_size_per_attention_head should not be ZERO.")
scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) \
if self.scale_mask_softmax.scale is None else self.softmax_scale
if attention_mask is None or args.reset_attention_mask or args.reset_position_ids:
attention_mask = get_attention_mask()
if not hasattr(self, 'attention_mask') or \
self.attention_mask is None or \
self.attention_mask.shape[0] != seq_length or \
args.reset_attention_mask or args.reset_position_ids:
if self.alibi is not None:
self.attention_mask = torch.triu(
torch.ones(seq_length, seq_length),
1).bool().npu()
elif attention_mask is None:
self.attention_mask = get_attention_mask()
else:
self.attention_mask = attention_mask
if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']:
return do_ring_context_parallel(
query, key, value, head_num=n_head, softmax_scale=scale, attn_mask=attention_mask)
@ -253,11 +306,27 @@ def dot_product_attention_forward(
if use_sliding_windows:
args.pre_tockens = args.sliding_window
pse = None
size_record = key.shape
if self.alibi is not None and (self.alibi.output_size != size_record) and pse is None:
if args.shape_order != 'SBH':
raise ValueError(
'FlashAttention with ALiBi requires SBH shape_order, but got {}.'.format(args.shape_order))
self.alibi.output_size = size_record
self.alibi.get_alibi_pse(self.attention_mask, batch_size, query.shape[0], key.shape[0])
if self.alibi and pse is None:
pse = self.alibi.alibi_pse.reshape(
batch_size, n_head, self.alibi.alibi_pse.size(1), -1) * self.beta * self.norm_factor
args.pre_tockens = seq_length
args.sparse_mode = 0
output = torch_npu.npu_fusion_attention(
query, key, value, n_head, args.shape_order,
pse=None,
pse=pse,
padding_mask=None,
atten_mask=attention_mask,
atten_mask=self.attention_mask,
actual_seq_qlen=actual_seq_len,
actual_seq_kvlen=actual_seq_len,
scale=scale,

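Note: the ALiBi bias built by Alibi._build_alibi_tensor follows the standard construction: one slope per attention head (a geometric sequence derived from the head count), multiplied by the key positions, added to the attention scores before softmax. A standalone sketch of that construction, assuming the usual ALiBi slope formula (not this repo's exact implementation, which also supports the square-mask / fill-neg-inf variants):

import math
import torch

def alibi_slopes(num_heads: int) -> torch.Tensor:
    # Standard ALiBi slopes: geometric sequence based on the nearest power of two.
    def slopes_power_of_2(n):
        start = 2.0 ** (-(2.0 ** -(math.log2(n) - 3)))
        return [start * (start ** i) for i in range(n)]
    if math.log2(num_heads).is_integer():
        return torch.tensor(slopes_power_of_2(num_heads))
    closest = 2 ** math.floor(math.log2(num_heads))
    extra = slopes_power_of_2(2 * closest)[0::2][: num_heads - closest]
    return torch.tensor(slopes_power_of_2(closest) + extra)

def build_alibi_bias(seq_len: int, num_heads: int) -> torch.Tensor:
    # Returns [num_heads, 1, seq_len]: per-head slope times key position.
    positions = torch.arange(seq_len)
    return alibi_slopes(num_heads)[:, None, None] * positions[None, None, :]

bias = build_alibi_bias(seq_len=4096, num_heads=40)   # Baichuan2-13B: 40 heads, 4K context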
View File

@ -259,6 +259,16 @@
"layers_mlp_linear_fc2": "model.layers[layer_idx].mlp.experts[expert_idx].w2",
"final_layernorm": "model.norm"
}
},
"baichuan2": {
"__base__": "base",
"config_set_value": {
"qkv_type": "pack_gqa",
"max_position_embeddings": 4096
},
"model_hf_key_mapping": {
"layers_self_attention_linear_qkv_pack": "model.layers[layer_idx].self_attn.W_pack"
}
}
}
}
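Note: the baichuan2 mapping above reads the packed QKV projection from model.layers[layer_idx].self_attn.W_pack. During conversion the packed matrix has to be split back into Q, K and V; a minimal sketch for Baichuan2-13B shapes (hidden size 5120, 40 heads, no GQA), assuming a simple row-wise [Q; K; V] packing rather than the converter's actual logic:

import torch

hidden_size = 5120                                    # Baichuan2-13B
w_pack = torch.empty(3 * hidden_size, hidden_size)    # HF W_pack weight: [3*h, h]

# Row-wise split into the three projections (assumed packing order Q, K, V).
q_proj, k_proj, v_proj = torch.split(w_pack, hidden_size, dim=0)
assert q_proj.shape == k_proj.shape == v_proj.shape == (hidden_size, hidden_size)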