!1622 Optimize the location of the weight-conversion code & fix a reverse dependency

Merge pull request !1622 from sunjunjie/ckpt_position
sunjunjie 2024-09-09 06:37:36 +00:00 committed by i-robot
parent de65e81113
commit 58e8133311
81 changed files with 149 additions and 139 deletions

View File

@ -57,7 +57,7 @@
## Public Interface Statement
ModelLink has not yet released a wheel package and has no formal public interface; all functionality is invoked through shell scripts. The five entry scripts are [pretrain_gpt.py](https://gitee.com/ascend/ModelLink/blob/master/pretrain_gpt.py), [inference.py](https://gitee.com/ascend/ModelLink/blob/master/inference.py), [evaluation.py](https://gitee.com/ascend/ModelLink/blob/master/evaluation.py), [preprocess_data.py](https://gitee.com/ascend/ModelLink/blob/master/preprocess_data.py) and [convert_ckpt.py](https://gitee.com/ascend/ModelLink/blob/master/tools/checkpoint/convert_ckpt.py).
ModelLink has not yet released a wheel package and has no formal public interface; all functionality is invoked through shell scripts. The five entry scripts are [pretrain_gpt.py](https://gitee.com/ascend/ModelLink/blob/master/pretrain_gpt.py), [inference.py](https://gitee.com/ascend/ModelLink/blob/master/inference.py), [evaluation.py](https://gitee.com/ascend/ModelLink/blob/master/evaluation.py), [preprocess_data.py](https://gitee.com/ascend/ModelLink/blob/master/preprocess_data.py) and [convert_ckpt.py](https://gitee.com/ascend/ModelLink/blob/master/convert_ckpt.py).
## Communication Security Hardening

View File

@ -21,14 +21,17 @@ import sys
from functools import wraps
import torch.multiprocessing as mp
import modellink
from pretrain_gpt import model_provider
MODULE_ROOT = "modellink.tasks.checkpoint"
def load_plugin(plugin_type, name):
module_name = f"{plugin_type}_{name}"
module_name = f"{MODULE_ROOT}.{plugin_type}_{name}"
try:
plugin = importlib.import_module(module_name)
except ModuleNotFoundError:
module_name = name
module_name = f"{MODULE_ROOT}.{name}"
try:
plugin = importlib.import_module(module_name)
except ModuleNotFoundError:
@ -76,11 +79,11 @@ def main():
queue = mp.Queue(maxsize=args.max_queue_size)
print("Starting saver...")
saver_proc = mp.Process(target=saver.save_model_checkpoint, args=(queue, args))
saver_proc = mp.Process(target=saver.save_model_checkpoint, args=(model_provider, queue, args))
saver_proc.start()
print("Starting loader...")
loader.load_checkpoint(queue, args)
loader.load_checkpoint(model_provider, queue, args)
print("Waiting for saver to complete...")
saver_proc.join()
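Taken together, the hunks above relocate the entry script to the repository root and remove the reverse dependency: convert_ckpt.py now resolves loader/saver plugins under `modellink.tasks.checkpoint` and passes `model_provider` from `pretrain_gpt` into both worker processes, instead of each plugin importing it for itself. A minimal sketch of the resulting flow (the `convert` driver function is an illustrative condensation of `main()`, not part of the diff):

```python
import importlib
import torch.multiprocessing as mp

from pretrain_gpt import model_provider  # imported once, at the entry point only

MODULE_ROOT = "modellink.tasks.checkpoint"


def load_plugin(plugin_type, name):
    # Try e.g. "modellink.tasks.checkpoint.loader_llama2_hf" first,
    # then fall back to "modellink.tasks.checkpoint.llama2_hf".
    try:
        return importlib.import_module(f"{MODULE_ROOT}.{plugin_type}_{name}")
    except ModuleNotFoundError:
        return importlib.import_module(f"{MODULE_ROOT}.{name}")


def convert(args):  # illustrative driver, condensed from main()
    loader = load_plugin("loader", args.loader)
    saver = load_plugin("saver", args.saver)
    queue = mp.Queue(maxsize=args.max_queue_size)

    # The saver runs in its own process; loader and saver both receive
    # model_provider explicitly instead of importing it from pretrain_gpt.
    saver_proc = mp.Process(target=saver.save_model_checkpoint,
                            args=(model_provider, queue, args))
    saver_proc.start()
    loader.load_checkpoint(model_provider, queue, args)
    saver_proc.join()
```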

View File

@ -111,7 +111,7 @@ cd ../../
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \
@ -164,7 +164,7 @@ bash examples/llama2/ckpt_convert_llama2_hf2legacy.sh
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
@ -202,7 +202,7 @@ bash examples/llama2/ckpt_convert_llama2_legacy2hf.sh
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \
@ -227,7 +227,7 @@ bash examples/llama2/ckpt_convert_llama2_legacy2legacy_lora.sh
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Choose the parallel strategy you need
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--load-dir ./model_from_hf/Aquila-hf/ \
--save-dir ./model_weights/Aquila-legacy/ \

View File

@ -1,6 +1,6 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
python convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Choose the parallel strategy you need; use --params-dtype bf16 as needed
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--load-dir ./model_from_hf/Aquila2-hf/ \
--save-dir ./model_weights/Aquila2-legacy/ \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Use --params-dtype bf16 as needed
python tools/checkpoint/convert_ckpt.py --model-type GPT \
python convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Choose the parallel strategy you need; add --params-dtype bf16 \ as needed
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
python convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel strategy you need
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
python convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \

View File

@ -2,12 +2,12 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel strategy you need; use --params-dtype bf16 as needed
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \
--target-tensor-parallel-size 8 \
--target-pipeline-parallel-size 1 \
--load-dir ./model_from_hf/Bloom-hf/ \
--save-dir ./model_weights/Bloom-legacy/ \
--tokenizer-model None \

View File

@ -1,17 +1,16 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader mg_mcore \
--saver mg_mcore \
--save-model-type huggingface \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/Bloom-hf/ # <-- Fill in the original HF model path; the new weights are saved under ./model_from_hf/Bloom-hf/mg2hg/
--load-dir ./model_weights/Bloom-legacy/
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--model-type-hf bloom \
--add-qkv-bias \
--add-dense-bias
--add-dense-bias \
--load-dir ./model_weights/Bloom-legacy/ \
--save-dir ./model_from_hf/Bloom-hf/ # <-- Fill in the original HF model path; the new weights are saved under ./model_from_hf/Bloom-hf/mg2hg/
# Add --params-dtype bf16 \ as needed

View File

@ -101,7 +101,7 @@ Hardware configuration for ChatGLM3-6B training:
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader chatglm3_hf \
--saver megatron \
@ -121,7 +121,7 @@ Hardware configuration for ChatGLM3-6B training:
```shell
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -101,7 +101,7 @@ Here's a hardware summary of pre-training ChatGLM3-6B:
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# convert to ptd weights
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader chatglm3_hf \
--saver megatron \
@ -120,7 +120,7 @@ Here's a hardware summary of pre-training ChatGLM3-6B:
```shell
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader mg_mcore \
--saver mg_mcore \

View File

@ -1,7 +1,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
python convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel strategy
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type-hf gemma \
--model-type GPT \
--loader hf_mcore \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type-hf gemma \
--model-type GPT \
--loader mg_mcore \

View File

@ -1,6 +1,6 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format and set the required parallel configuration; use --num-layers-per-virtual-pipeline-stage 5 and --params-dtype bf16 as needed
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Add --num-layers-per-virtual-pipeline-stage 5 \ as needed
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type GPT \
--model-type-hf llama2 \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf llama2 \
--save-model-type huggingface \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format and set the required parallel strategy
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--moe-grouped-gemm \
--model-type-hf deepseek2 \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--moe-grouped-gemm \
--model-type-hf deepseek2 \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel strategy
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf gemma \
--model-type GPT \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel strategy
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf gemma \
--model-type GPT \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf gemma2 \
--model-type GPT \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf internlm2 \
--model-type GPT \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format and set the required parallel configuration; use --num-layers-per-virtual-pipeline-stage 5 and --params-dtype bf16 as needed
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf llama2 \
--model-type GPT \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel parameters you need
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -4,7 +4,7 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the required parallel configuration
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -1,7 +1,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the required weight-conversion parameters
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type GPT \
--loader hf_mcore \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf llama2 \
--model-type GPT \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--use-mcore-models \
--model-type-hf llama2 \
--save-model-type huggingface \

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Set the parallel parameters you need
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--model-type GPT \

View File

@ -6,7 +6,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader hf_mcore \
--saver mg_mcore \

View File

@ -1,4 +1,4 @@
# Modify line 39 of the modelling_qwen.py file
# Modify line 39 of the modellink_qwen.py file
# SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
# Change it to:
# SUPPORT_FP16 = True
@ -6,14 +6,14 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader mg_mcore \
--saver mg_mcore \
--save-model-type huggingface \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1 \
--save-dir ./model_from_hf/qwen-hf/ \ # Fill in the original HF model path; the new weights are saved under ./model_from_hf/qwen-hf/mg2hg/
--load-dir ./model_weights/qwen-legacy/ \
--model-type-hf qwen \
--add-qkv-bias
--add-qkv-bias \
--load-dir ./model_weights/qwen-legacy/ \
--save-dir ./model_from_hf/qwen-hf/ # Fill in the original HF model path; the new weights are saved under ./model_from_hf/qwen-hf/mg2hg/

View File

@ -2,7 +2,7 @@
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# Convert weight format
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader megatron \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py \
python convert_ckpt.py \
--model-type GPT \
--loader llama2_hf \
--saver megatron \

View File

@ -1,7 +1,7 @@
# Modify the set_env.sh path according to your actual environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
python tools/checkpoint/convert_ckpt.py --model-type GPT \
python convert_ckpt.py --model-type GPT \
--loader megatron \
--saver megatron \
--save-model-type save_huggingface_llama \

View File

@ -0,0 +1,14 @@
# coding=utf-8
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -19,8 +19,8 @@ import types
import logging as logger
import torch
import transformers
from models import get_megatron_model
from models import get_huggingface_model
from .models import get_megatron_model
from .models import get_huggingface_model
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@ -300,7 +300,7 @@ def get_message_output_layer(model, md):
return message
def _load_checkpoint(queue, args):
def _load_checkpoint(model_provider, queue, args):
# Llama-2 requires HF transformers >=4.31.0.
verify_transformers_version()
@ -316,7 +316,7 @@ def _load_checkpoint(queue, args):
args_hf = model_hf.get_args()
args_hf.moe_grouped_gemm = args.moe_grouped_gemm
model_mg = get_megatron_model(args_cmd=args)
model_mg = get_megatron_model(model_provider, args_cmd=args)
model_mg.initialize_megatron_args(args_hf, queue)
model_mg.set_tensor_model_parallel_world_size(model_mg.args.tensor_model_parallel_size)
@ -366,9 +366,9 @@ def _load_checkpoint(queue, args):
queue.put("done")
def load_checkpoint(queue, args):
def load_checkpoint(model_provider, queue, args):
try:
_load_checkpoint(queue, args)
_load_checkpoint(model_provider, queue, args)
except:
queue.put("exit")
raise

View File

@ -178,10 +178,9 @@ def set_layer_state(args, model, hf_model, layer_idx):
layer.post_attention_norm.weight.data.copy_(hf_layer.post_attention_layernorm.weight)
def load_checkpoint_to_model(args):
def load_checkpoint_to_model(model_provider, args):
'''Set model params.'''
from pretrain_gpt import model_provider
from transformers import AutoModelForCausalLM
# Load Huggingface model.
@ -199,7 +198,7 @@ def load_checkpoint_to_model(args):
return model
def _load_checkpoint(queue, args):
def _load_checkpoint(model_provider, queue, args):
# Llama-2 requires HF transformers >=4.31.0.
verify_transformers_version()
@ -329,7 +328,7 @@ def _load_checkpoint(queue, args):
# Get first pipe stage.
mpu.set_tensor_model_parallel_rank(0)
mpu.set_pipeline_model_parallel_rank(0)
model = load_checkpoint_to_model(margs)
model = load_checkpoint_to_model(model_provider, margs)
queue.put(md)
@ -433,9 +432,9 @@ def _load_checkpoint(queue, args):
queue.put("done")
def load_checkpoint(queue, args):
def load_checkpoint(model_provider, queue, args):
try:
_load_checkpoint(queue, args)
_load_checkpoint(model_provider, queue, args)
except:
queue.put("exit")
raise

View File

@ -133,10 +133,8 @@ def _load_checkpoint(queue, args):
# Determine how to make our models
if args.model_type == 'GPT':
from pretrain_gpt import model_provider
margs.model_type = ModelType.encoder_or_decoder
elif args.model_type == 'BERT':
from pretrain_bert import model_provider
margs.model_type = ModelType.encoder_or_decoder
else:
raise Exception(f'unrecognized model type: {args.model_type}')
@ -402,9 +400,9 @@ def _load_checkpoint(queue, args):
queue.put("done")
def load_checkpoint(queue, args):
def load_checkpoint(model_provider, queue, args):
try:
_load_checkpoint(queue, args)
_load_checkpoint(model_provider, queue, args)
except:
queue.put("exit")
raise

View File

@ -7,7 +7,7 @@ import sys
import types
import logging as logger
import torch
from models import get_megatron_model
from .models import get_megatron_model
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@ -315,7 +315,7 @@ def to_detach(message):
message[key] = value.detach()
def _load_checkpoint(queue, args):
def _load_checkpoint(model_provider, queue, args):
# Search in directory above this
sys.path.append(os.path.abspath(
@ -324,7 +324,7 @@ def _load_checkpoint(queue, args):
if args.megatron_path is not None:
sys.path.insert(0, args.megatron_path)
model_mg = get_megatron_model(args_cmd=args)
model_mg = get_megatron_model(model_provider, args_cmd=args)
model_mg.initialize_megatron_args(queue=queue, loader_megatron=True)
model_mg.set_tensor_model_parallel_world_size(model_mg.args.tensor_model_parallel_size)
@ -384,9 +384,9 @@ def _load_checkpoint(queue, args):
queue.put("done")
def load_checkpoint(queue, args):
def load_checkpoint(model_provider, queue, args):
try:
_load_checkpoint(queue, args)
_load_checkpoint(model_provider, queue, args)
except:
queue.put("exit")
raise
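All of the loaders above converge on the same contract: a public `load_checkpoint(model_provider, queue, args)` that forwards to a private `_load_checkpoint` and unblocks the saver process on failure. A skeleton of a conforming loader plugin under this contract (the weight-extraction body is elided; only the names visible in the diff are used):

```python
from .models import get_megatron_model


def _load_checkpoint(model_provider, queue, args):
    # Build the Megatron-side model with the injected provider; the plugin
    # itself no longer imports model_provider from pretrain_gpt.
    model_mg = get_megatron_model(model_provider, args_cmd=args)
    model_mg.initialize_megatron_args(queue=queue, loader_megatron=True)
    # ... extract parameters and stream them to the saver via `queue` ...
    queue.put("done")


def load_checkpoint(model_provider, queue, args):
    try:
        _load_checkpoint(model_provider, queue, args)
    except:  # noqa: E722 -- mirrors the source; any failure must unblock the saver
        queue.put("exit")
        raise
```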

View File

@ -18,7 +18,6 @@ from megatron.training.checkpointing import load_args_from_checkpoint
from megatron.training.global_vars import set_args
from megatron.training.checkpointing import load_checkpoint
from megatron.core import tensor_parallel
from pretrain_gpt import model_provider
from modellink.utils import parse_args
from modellink.training import model_provider_func_wrapper
from modellink.checkpointing import load_checkpoint_wrapper
@ -26,7 +25,6 @@ from modellink.checkpointing import load_checkpoint_wrapper
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
model_provider = model_provider_func_wrapper(model_provider)
load_checkpoint = load_checkpoint_wrapper(load_checkpoint)
@ -381,7 +379,7 @@ class HuggingfaceModel(ModelBase):
else:
load_dir = self.args_cmd.load_dir
self.module = [AutoModelForCausalLM.from_pretrained(load_dir, device_map=device_map, trust_remote_code=trust_remote_code)]
if self.args.torch_dtype in ["float16", "bfloat16"]:
if hasattr(self.args, "torch_dtype") and self.args.torch_dtype in ["float16", "bfloat16"]:
self.module[0] = self.module[0].to(eval(f'torch.{self.args.torch_dtype}'))
def get_module_mapping(self):
@ -639,8 +637,9 @@ class HuggingfaceModel(ModelBase):
class MegatronModel(ModelBase):
def __init__(self, args_cmd, md=None):
def __init__(self, model_provider, args_cmd, md=None):
super(MegatronModel, self).__init__(args_cmd)
self.model_provider = model_provider_func_wrapper(model_provider)
self.md = md
self.pp_stage_cache = []
@ -846,7 +845,7 @@ class MegatronModel(ModelBase):
pre_process = mpu.is_pipeline_first_stage()
post_process = mpu.is_pipeline_last_stage()
expert_parallel_size = mpu.get_expert_model_parallel_world_size()
this_model = model_provider(
this_model = self.model_provider(
pre_process=pre_process,
post_process=post_process
).to(self.args.params_dtype)
@ -854,7 +853,7 @@ class MegatronModel(ModelBase):
else:
pre_process = mpu.is_pipeline_first_stage()
post_process = mpu.is_pipeline_last_stage()
model_ = [model_provider(pre_process, post_process).to(self.args.params_dtype)]
model_ = [self.model_provider(pre_process, post_process).to(self.args.params_dtype)]
self.args.consumed_train_samples = 0
self.args.consumed_valid_samples = 0
if from_pretrained:
@ -1015,8 +1014,8 @@ class MegatronModel(ModelBase):
class MegatronLegacyModel(MegatronModel):
def __init__(self, args_cmd, md=None):
super(MegatronLegacyModel, self).__init__(args_cmd, md)
def __init__(self, model_provider, args_cmd, md=None):
super(MegatronLegacyModel, self).__init__(model_provider, args_cmd, md)
def get_module_mapping(self):
module_layer = "language_model.encoder.layers[layer_idx]."
@ -1042,8 +1041,8 @@ class MegatronLegacyModel(MegatronModel):
class MegatronMCoreModel(MegatronModel):
def __init__(self, args_cmd, md=None):
super(MegatronMCoreModel, self).__init__(args_cmd, md)
def __init__(self, model_provider, args_cmd, md=None):
super(MegatronMCoreModel, self).__init__(model_provider, args_cmd, md)
def get_module_mapping(self):
module_layer = "decoder.layers[layer_idx]."
@ -1098,11 +1097,11 @@ class MegatronMCoreModel(MegatronModel):
"layers_mlp_experts_weight2"] = module_layer + "mlp.experts.weight2"
def get_megatron_model(args_cmd, md=None):
def get_megatron_model(model_provider, args_cmd, md=None):
if args_cmd.use_mcore_models:
return MegatronMCoreModel(args_cmd=args_cmd, md=md)
return MegatronMCoreModel(model_provider, args_cmd=args_cmd, md=md)
else:
return MegatronLegacyModel(args_cmd=args_cmd, md=md)
return MegatronLegacyModel(model_provider, args_cmd=args_cmd, md=md)
def get_huggingface_model(args_cmd):
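On the models.py side, the same injection replaces the former module-level `model_provider` imported from `pretrain_gpt`: the constructors accept the provider, wrap it once with `model_provider_func_wrapper`, and every construction site calls `self.model_provider`. A condensed sketch of the wiring (the `build` helper is an illustrative stand-in for the larger model-construction methods in the diff):

```python
class MegatronModel(ModelBase):
    def __init__(self, model_provider, args_cmd, md=None):
        super(MegatronModel, self).__init__(args_cmd)
        # Wrap the injected provider once; all call sites go through self.
        self.model_provider = model_provider_func_wrapper(model_provider)
        self.md = md
        self.pp_stage_cache = []

    def build(self, pre_process, post_process):  # illustrative helper
        # Formerly a module-level model_provider resolved via a top-level import.
        return self.model_provider(pre_process, post_process).to(self.args.params_dtype)


def get_megatron_model(model_provider, args_cmd, md=None):
    # Factory dispatch is unchanged; only the provider is now threaded through.
    if args_cmd.use_mcore_models:
        return MegatronMCoreModel(model_provider, args_cmd=args_cmd, md=md)
    return MegatronLegacyModel(model_provider, args_cmd=args_cmd, md=md)
```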

View File

@ -344,7 +344,7 @@ def vocab_padding(md, margs, orig_tensor, _vocab_size_with_padding):
return full_word_embed
def save_model_checkpoint(queue, args):
def save_model_checkpoint(model_provider, queue, args):
# Search in directory above this
sys.path.append(os.path.abspath(
@ -503,10 +503,8 @@ def save_model_checkpoint(queue, args):
# Determine how to make our models
if md.model_type == 'GPT':
from pretrain_gpt import model_provider
margs.model_type = ModelType.encoder_or_decoder
elif md.model_type == 'BERT':
from pretrain_bert import model_provider
margs.model_type = ModelType.encoder_or_decoder
else:
raise Exception(f'unrecognized model type: {args.model_type}')

View File

@ -20,7 +20,7 @@ import logging as logger
import torch
from megatron.training.checkpointing import save_checkpoint
from megatron.core import mpu
from models import get_megatron_model
from .models import get_megatron_model
logger.basicConfig(format="")
logger.getLogger().setLevel(logger.INFO)
@ -416,7 +416,7 @@ def save_model(model_mg, md, **kwargs):
def save_huggingface(args, model):
'''Set model params.'''
from models import get_huggingface_model
from .models import get_huggingface_model
model_hf = get_huggingface_model(args)
model_hf.get_modules_from_pretrained()
args_cmd = model_hf.get_args_cmd()
@ -428,7 +428,7 @@ def save_huggingface(args, model):
model_hf.get_model_item().save_pretrained(save_dir)
def save_model_checkpoint(queue, args):
def save_model_checkpoint(model_provider, queue, args):
# Search in directory above this
sys.path.append(os.path.abspath(
os.path.join(os.path.dirname(__file__),
@ -470,7 +470,7 @@ def save_model_checkpoint(queue, args):
os.environ["WORLD_SIZE"] = f'{args.target_tensor_parallel_size * args.target_pipeline_parallel_size}'
# We want all arguments to come from us
model_mg = get_megatron_model(args_cmd=args, md=md)
model_mg = get_megatron_model(model_provider=model_provider, args_cmd=args, md=md)
model_mg.initialize_megatron_args(queue=queue, saver_megatron=True)
# Make models for first pipeline stage and fill in embeddings

View File

@ -30,7 +30,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(sys.argv[6], "iter_0000001")

View File

@ -30,7 +30,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(sys.argv[12], "iter_0000001")

View File

@ -30,7 +30,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(sys.argv[12], "iter_0000001")

View File

@ -23,7 +23,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(sys.argv[10], "iter_0000001")

View File

@ -37,7 +37,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
"""
# run convert weight
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)

View File

@ -31,7 +31,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")

View File

@ -30,7 +30,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")

View File

@ -31,7 +31,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")

View File

@ -30,7 +30,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")

View File

@ -31,7 +31,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")

View File

@ -30,7 +30,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")

View File

@ -37,7 +37,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")

View File

@ -31,7 +31,7 @@ class TestConvertCkptFromHuggingface(unittest.TestCase):
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")

View File

@ -117,7 +117,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parents[3]
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,
@ -143,7 +143,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parents[3]
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,
@ -168,7 +168,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parents[3]
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,
@ -191,7 +191,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parents[3]
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,
@ -216,7 +216,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,
@ -255,7 +255,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,
@ -290,7 +290,7 @@ class TestConvertCkptFromHuggingface:
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
"""
base_dir = Path(__file__).absolute().parents[3]
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,

View File

@ -24,7 +24,7 @@ class TestConvertCkptFromMegatron:
args = CovertCkptFromMegatronArgs()
base_dir = Path(__file__).absolute().parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,

View File

@ -35,7 +35,7 @@ class TestConvertCkptFromHuggingface:
dk = 128
base_dir = Path(__file__).absolute().parent.parent.parent
file_path = os.path.join(base_dir, "tools/checkpoint/convert_ckpt.py")
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = [
"--model-type", args.model_type,
"--loader", args.loader,