!1640 CI gate and pipeline rework, part 3: consolidate the tooling code framework and reserve a pipeline sample case

Merge pull request !1640 from guoxinjie/master
This commit is contained in:
guoxinjie 2024-09-12 12:35:17 +00:00 committed by i-robot
parent a2552c6a49
commit 9e590514e9
123 changed files with 117 additions and 7468 deletions

View File

@ -1,8 +1,46 @@
import os
import stat
import sys
from pathlib import Path
import subprocess
def read_files_from_txt(txt_file):
with open(txt_file, "r") as f:
return [line.strip() for line in f.readlines()]
def is_examples(file):
return file.startswith("example/")
def is_pipecase(file):
return file.startswith("tests/pipeline")
def is_markdown(file):
return file.endswith(".md")
def skip_ci_file(files, skip_cond):
for file in files:
if not any(condition(file) for condition in skip_cond):
return False
return True
def alter_skip_ci():
parent_dir = Path(__file__).absolute().parents[2]
raw_txt_file = os.path.join(parent_dir, "modify.txt")
if not os.path.exists(raw_txt_file):
return False
file_list = read_files_from_txt(raw_txt_file)
skip_conds = [
is_examples,
is_pipecase,
is_markdown
]
return skip_ci_file(file_list, skip_conds)
def acquire_exitcode(command):
@ -46,18 +84,8 @@ class ST_Test:
st_dir = "st"
self.st_shell = os.path.join(
test_dir, st_dir, "run.sh"
test_dir, st_dir, "st_run.sh"
)
llama_instruction_shell_file = os.path.join(
test_dir, st_dir, "test_llama_instruction_ptd.sh")
llama_pretrain_ha_save_shell_file = os.path.join(
test_dir, st_dir, "test_llama_pretrain_ha_save_ptd.sh")
llama_pretrain_ha_load_shell_file = os.path.join(
test_dir, st_dir, "test_llama_pretrain_ha_load_ptd.sh")
self.st_file_list = [
llama_instruction_shell_file
]
def run_st(self):
rectify_case = f"bash {self.st_shell}"
@ -65,24 +93,22 @@ class ST_Test:
if rectify_code != 0:
print("rectify case failed, check it.")
exit(1)
all_success = True
for shell_file in self.st_file_list:
command = f"sh {shell_file}"
st_exitcode = acquire_exitcode(command)
if st_exitcode != 0:
all_success = False
print(f"ST run {shell_file} failed")
exit(1)
if all_success:
print("ST test success")
else:
print("ST failed")
exit(1)
def run_tests():
ut = UT_Test()
st = ST_Test()
ut.run_ut()
st.run_st()
def main():
if alter_skip_ci():
print("Skipping CI")
else:
run_tests()
if __name__ == "__main__":
ut = UT_Test()
ut.run_ut()
st = ST_Test()
st.run_st()
main()
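For illustration, here is a minimal, self-contained sketch of the skip logic above (the file lists are hypothetical and not part of this change): CI is skipped only when every path recorded in modify.txt matches at least one skip condition.

```python
# Minimal sketch of the gate-keeping rule implemented by skip_ci_file/alter_skip_ci.
def is_examples(path): return path.startswith("example/")
def is_pipecase(path): return path.startswith("tests/pipeline")
def is_markdown(path): return path.endswith(".md")

SKIP_CONDS = [is_examples, is_pipecase, is_markdown]

def should_skip(files):
    # Skip CI only when every modified file matches at least one skip condition.
    return all(any(cond(f) for cond in SKIP_CONDS) for f in files)

print(should_skip(["example/demo.sh", "tests/pipeline/run.sh", "README.md"]))  # True  -> skip CI
print(should_skip(["README.md", "modellink/model.py"]))                        # False -> run UT/ST
```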

View File

@ -1,124 +0,0 @@
# GPT3 $\color{black}{\bf\tiny{【Model}}$ $\color{black}{\bf\tiny{contributed}}$ $\color{black}{\bf\tiny{by}}$ $\color{black}{\bf\tiny{Community】}}$
<p align="left">
<b>简体中文</b> |
<b><a href="README.md">English</a> </b>
</p>
# Contents
- [GPT3](#GPT3)
- [Contents](#contents)
- [GPT3-175B](#GPT3-175B)
- [Training-175B](#training)
- [Script](#script)
# GPT3-175B
## Training
Hardware configuration for GPT3-175B training:
| Hardware | Configuration |
| :--: | :-------------: |
| NPU | 128 x Ascend NPUs |
### Script
1. Clone the repository to your local server:
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout core_r0.6.0
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir vocab_file
mkdir dataset
```
2. Build the environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify the ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install MindSpeed
git clone https://gitee.com/ascend/MindSpeed.git
cd MindSpeed
git checkout 2b0edd2
pip install -r requirements.txt
pip3 install -e .
cd ..
# install other dependencies
pip install -r requirements.txt
```
3. Prepare the dataset and vocab file to launch the model
3.1 Prepare the dataset
The raw data can be downloaded from [here](https://huggingface.co/datasets/wikipedia/tree/main/data/20220301.en)
```shell
# download the enwiki raw data
# there are 41 files in total; a subset is enough to build the dataset
cd ./dataset
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00000-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00001-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00002-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00003-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00004-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00005-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00006-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00007-of-00041.parquet
cd ..
# download the vocab file and merge table
cd vocab_file
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
cd ..
# process into training data
python ./preprocess_data.py \
--input ./dataset/ \
--output-prefix ./dataset/gpt_text_sentence \
--tokenizer-type GPT2BPETokenizer \
--vocab-file ./vocab_file/gpt2-vocab.json \
--merge-file ./vocab_file/gpt2-merges.txt \
--append-eod \
--workers 4 \
--log-interval 1000
```
3.2 Pre-train in PTD mode
Configure the GPT3-175B PTD pre-training script: examples/gpt3/pretrain_gpt3_175B_ptd.sh
```shell
# set the ascend-toolkit path according to your environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# set the following parameters according to the actual storage paths
VOCAB_FILE="./vocab_file/gpt2-vocab.json" # vocab file
MERGE_FILE="./vocab_file/gpt2-merges.txt" # BPE merge table
DATA_PATH="./dataset/gpt_text_sentence" # dataset path
```
Launch the GPT3-175B PTD pre-training script: examples/gpt3/pretrain_gpt3_175B_ptd.sh
```shell
bash examples/gpt3/pretrain_gpt3_175B_ptd.sh
```

View File

@ -1,124 +0,0 @@
# GPT3 $\color{black}{\rm\tiny{【model}}$ $\color{black}{\rm\tiny{contributed}}$ $\color{black}{\rm\tiny{by}}$ $\color{black}{\rm\tiny{Community】}}$
<p align="left">
<b>English</b> |
<b><a href="README_en.md">English</a> </b>
</p>
# Contents
- [GPT3](#GPT3)
- [Contents](#contents)
- [GPT3-175B](#GPT3-175B)
- [Training-175B](#training)
- [Script](#script)
# GPT3-175B
## Training
Here is a hardware summary of pre-training GPT3-175B:
| Hardware | Value |
| :--: | :-------------: |
| NPU | 128 x Ascend NPUs |
### Script
1. Clone repository to your local server:
```shell
git clone https://gitee.com/ascend/ModelLink.git
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout core_r0.6.0
cp -r megatron ../ModelLink/
cd ..
cd ModelLink
mkdir logs
mkdir vocab_file
mkdir dataset
```
2. Build environment
```bash
# python3.8
conda create -n test python=3.8
conda activate test
# install torch and torch_npu
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
# modify ascend-toolkit path
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# install MindSpeed
git clone https://gitee.com/ascend/MindSpeed.git
cd MindSpeed
git checkout 2b0edd2
pip install -r requirements.txt
pip3 install -e .
cd ..
# install other packages
pip install -r requirements.txt
```
3. Prepare dataset and vocab file for pretrain
3.1 Prepare dataset
Download the GPT raw dataset from [here](https://huggingface.co/datasets/wikipedia/tree/main/data/20220301.en)
```shell
# download enwiki raw data
# There are 41 files in total; selecting a subset is enough to build the dataset.
cd ./dataset
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00000-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00001-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00002-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00003-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00004-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00005-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00006-of-00041.parquet
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00007-of-00041.parquet
cd ..
# download vocab file and merge table
cd vocab_file
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
cd ..
# process formal dataset
python ./preprocess_data.py \
--input ./dataset/ \
--output-prefix ./dataset/gpt_text_sentence \
--tokenizer-type GPT2BPETokenizer \
--vocab-file ./vocab_file/gpt2-vocab.json \
--merge-file ./vocab_file/gpt2-merges.txt \
--append-eod \
--workers 4 \
--log-interval 1000
```
3.2 Pre-training in PTD mode
Config GPT3-175B PTD pre-training script: examples/gpt3/pretrain_gpt3_175B_ptd.sh
```shell
# modify ascend-toolkit path according to your own config
source /usr/local/Ascend/ascend-toolkit/set_env.sh
# modify config according to your own actual situation
VOCAB_FILE="./vocab_file/gpt2-vocab.json" # vocab file for training
MERGE_FILE="./vocab_file/gpt2-merges.txt" # BPE merge file for training
DATA_PATH="./dataset/gpt_text_sentence" # dataset path
```
Launch GPT3-175B PTD pre-training script: examples/gpt3/pretrain_gpt3_175B_ptd.sh
```shell
bash examples/gpt3/pretrain_gpt3_175B_ptd.sh
```

View File

@ -84,6 +84,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--jit-compile \
--distributed-backend nccl 2>&1
--distributed-backend nccl \
| tee ./logs/pretrain_gpt3_175B_8layers.log

View File

@ -85,6 +85,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--jit-compile \
--distributed-backend nccl 2>&1
--distributed-backend nccl \
| tee ./logs/pretrain_gpt3_175B.log

View File

@ -41,3 +41,30 @@ def pytest_fixture_setup(fixturedef, request):
if getattr(fixturedef.func, "is_dist_fixture", False):
dist_fixture_class = fixturedef.func()
dist_fixture_class(request)
# We still want to configure the path arguments ourselves, because different
# scripts use different prefix names, so we use this method.
# As an added benefit, it is also more scalable.
def pytest_addoption(parser: pytest.Parser):
parser.addoption("--baseline-json", action="store", default=None,
help="Path to the baseline JSON file")
parser.addoption("--generate-log", action="store", default=None,
help="Path to the generate log file")
parser.addoption("--generate-json", action="store", default=None,
help="Path to the generate JSON file")
@pytest.fixture(autouse=True)
def baseline_json(request: pytest.FixtureRequest):
return request.config.getoption("--baseline-json")
@pytest.fixture(autouse=True)
def generate_log(request: pytest.FixtureRequest):
return request.config.getoption("--generate-log")
@pytest.fixture(autouse=True)
def generate_json(request: pytest.FixtureRequest):
return request.config.getoption("--generate-json")

View File

@ -1,6 +0,0 @@
#!/bin/bash
# Provide uniform access for the pipeline.
python tests/pipeline/Aquila2-7B/test_convert_weight_from_huggingface.py
pytest -s tests/pipeline/Aquila2-7B/test_generation.py
pytest -s tests/pipeline/Aquila2-7B/test_evaluation.py

View File

@ -1,60 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "32",
"--hidden-size", "4096",
"--ffn-hidden-size", "11008",
"--num-attention-heads", "32",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "1",
"--max-position-embeddings", "2048",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--load", "/home/dataset/Aquila2-7B-tp8-pp1/"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/Aquila2-7B-hf"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "512",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint"
],
"EVALUATION_PARAM": [
"--tokenizer-not-use-fast",
"--task-data-path", "/home/dataset/eval_dataset/boolq/test",
"--task", "boolq",
"--max-new-tokens", "1"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "8",
"--global-batch-size", "64",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--attention-softmax-in-fp32",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "2048"
],
"CONVERT_WEIGHT_PARAM": [
"--model-type", "GPT",
"--load-dir", "/home/dataset/Aquila2-7B-hf",
"--save-dir", "/home/dataset/Aquila2-7B-tp8-pp1",
"--loader", "llama2_hf",
"--saver", "megatron",
"--target-tensor-parallel-size", "8",
"--tokenizer-model", "/home/dataset/Aquila2-7B-hf/tokenizer.json"
]
}

View File

@ -1,64 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
import torch
from utils import ParamConfig
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self):
# configure params, the index starts from 1
sys.argv = [sys.argv[0]] + ParamConfig.convert_weight_param
def test_file_exist(self):
"""
Test that the files in `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(sys.argv[4], "*.bin"))
self.assertEqual(len(bin_file), 3)
self.assertTrue(os.path.exists(os.path.join(sys.argv[4], "pytorch_model.bin.index.json")))
def test_convert_weights_from_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder is composed of several distinct
layers that form a transformer layer, and these layers stack to make up the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(sys.argv[6], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three outer layers.
self.assertEqual(len(os.listdir(output_dir)), int(sys.argv[12]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(),
torch.Size([12501, 4096]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
# the encoder has a common final_norm, and each layer has the following six weight tensors
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(),
torch.Size([1536, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(),
torch.Size([4096, 512]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(),
torch.Size([2752, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(),
torch.Size([4096, 1376]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(),
torch.Size([4096]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([12501, 4096]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,100 +0,0 @@
import sys
import os
import json
import torch
import tqdm
import torch_npu
from utils import ParamConfig, assert_judge
from transformers import AutoTokenizer
import modellink
from megatron.legacy.model import GPTModel
from tests.common import DistributedTest
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.evaluation_param + config.auxiliary_param + config.tokenizer_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def get_result(self, tokenizer, result):
if result:
final_result = [result[0]]
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
final_result.append("T")
else:
final_result.append("F")
else:
final_result = None
return final_result
def test_boolq_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, trust_remote_code=True)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{passage}\nQuestion: {question}?\nAnswer:"
answer_result = {}
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "boolq" in path:
test_dir = path
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
with open(file_path, encoding='utf-8') as f:
boolq_question_list = []
for line in f.readlines():
boolq_question_list.append(json.loads(line))
boolq_question_list = boolq_question_list[:60]
subject_result = {}
acc_n = 0
for index, item in enumerate(boolq_question_list):
instruction = instruction_template.format(passage=item['passage'], question=item['question'])
result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
result = self.get_result(tokenizer, result)
if result:
answer = result[1]
else:
answer = None
try:
if torch.distributed.get_rank() == 0:
subject_result[str(index)] = answer
if subject_result[str(index)] == str(item['answer'])[0]:
acc_n += 1
except Exception as e:
raise e
if torch.distributed.get_rank() == 0:
total_n += len(boolq_question_list)
total_acc_n += acc_n
answer_result['Boolq_dataset'] = subject_result
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.83) < 0.03)

View File

@ -1,93 +0,0 @@
import sys
import os
import torch
import torch_npu
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
from tests.common import DistributedTest
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["解释一下“温故而知新”"]
output = model.generate(instruction, detokenize=False)
expected_output = [14727, 29728, 261, 22051, 1156, 3101, 422, 278, 0, 87,
18392, 221, 1906, 358, 132, 237, 79, 221, 261, 81,
14572, 2449, 2369, 72, 8022, 2449, 221, 261, 88, 14572]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
output[:20].unsqueeze(0).float())
print(cos_sim)
assert_judge(cos_sim > 0.80)
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
prompt = "解释一下“温故而知新”"
system_template = ""
dialog_template = "{instruction}"
template = system_template + dialog_template
instruction = template.format(instruction=prompt)
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [391, 426, 1774, 1906, 8627, 10281, 261, 36018, 837, 42310,
434, 10045, 6468, 29728, 278, 0, 43652, 43652, 1774, 1906,
28043, 609, 43652, 43652, 1774, 1906, 8627, 10281, 261, 36018]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
output[:20].unsqueeze(0).float())
print(cos_sim)
assert_judge(cos_sim > 0.80)

View File

@ -1,25 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
distributed_param = config_file["DISTRIBUTED_PARAM"]
network_size = config_file["NETWORK_SIZE"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
tokenizer_param = config_file["TOKENIZER_PARAM"]
convert_weight_param = config_file["CONVERT_WEIGHT_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,10 +0,0 @@
#!/bin/bash
# Provide uniform access for the pipeline.
python tests/pipeline/baichuan-13B/test_process_pretrain_data.py
python tests/pipeline/baichuan-13B/test_process_instruction_data.py
python tests/pipeline/baichuan-13B/test_convert_weight_from_huggingface.py
pytest -s tests/pipeline/baichuan-13B/test_generation.py
pytest -s tests/pipeline/baichuan-13B/test_evaluation.py
pytest -s tests/pipeline/baichuan-13B/test_lora.py
pytest -s tests/pipeline/baichuan-13B/test_trainer.py

View File

@ -1,136 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "40",
"--hidden-size", "5120",
"--ffn-hidden-size", "13696",
"--num-attention-heads", "40",
"--position-embedding-type", "alibi",
"--make-vocab-size-divisible-by", "64",
"--max-position-embeddings", "4096",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--load", "/home/dataset/baichuan-13B-tp8-pp1/"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/baichuan-13B-hf"
],
"PROCESS_DATA_INPUT_PATH": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet"
],
"PROCESS_PRETRAIN_DATA_PARAM": [
"--output-prefix", "/home/dataset/pretrain-dataset-baichuan-13B/alpaca",
"--workers", "4",
"--log-interval", "1000"
],
"PROCESS_INSTRUCTION_DATA_PARAM": [
"--output-prefix", "/home/dataset/tune-dataset-baichuan-13B/alpaca",
"--tokenizer-not-use-fast",
"--handler-name", "GeneralInstructionHandler",
"--append-eod"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint"
],
"EVALUATION_PARAM": [
"--tokenizer-not-use-fast",
"--task-data-path", "/home/dataset/eval_dataset/boolq/test",
"--task", "boolq",
"--max-new-tokens", "1"
],
"LORA_PARAM": [
"--finetune",
"--lora-r", "16",
"--lora-alpha", "32",
"--lora-target-modules", "query_key_value", "dense", "gate_proj","dense_h_to_4h", "dense_4h_to_h",
"--is-instruction-dataset",
"--tokenizer-not-use-fast",
"--save", "/home/dataset/lora-save-weight-baichuan-13B",
"--data-path", "/home/dataset/tune-dataset-baichuan-13B/alpaca",
"--train-iters", "10"
],
"LORA_INFERENCE_PARAM": [
"--lora-r", "16",
"--lora-alpha", "32",
"--lora-target-modules", "query_key_value", "dense", "gate_proj","dense_h_to_4h", "dense_4h_to_h",
"--tokenizer-not-use-fast",
"--lora-load", "/home/dataset/lora-save-weight-baichuan-13B"
],
"TRAINING_PARAM": [
"--tokenizer-type", "Llama2Tokenizer",
"--tokenizer-model", "/home/dataset/baichuan-13B-hf/tokenizer.model",
"--save", "/autotest/dataset/save-weight-baichuan-13B",
"--data-path", "/home/dataset/pretrain-dataset-baichuan-13B/alpaca_text_document",
"--train-iters", "10"
],
"REGULARIZATION": [
"--attention-dropout", "0.0",
"--hidden-dropout", "0.0",
"--weight-decay", "1e-1",
"--clip-grad", "1.0",
"--adam-beta1", "0.9",
"--adam-beta2", "0.95",
"--adam-eps","1.0e-5"
],
"LEARNING_RATE": [
"--lr", "1e-5",
"--lr-decay-style", "cosine",
"--min-lr", "1e-7"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "8",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--fp16",
"--attention-softmax-in-fp32",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "4096",
"--seed", "42"
],
"TRAINING_AUX": [
"--sequence-parallel",
"--initial-loss-scale", "1024.0",
"--use-fused-rmsnorm",
"--init-method-std", "0.01",
"--split", "100,0,0",
"--log-interval", "1",
"--save-interval", "10",
"--eval-interval", "1000",
"--eval-iters", "0",
"--distributed-backend", "nccl",
"--num-workers", "0"
],
"CONVERT_WEIGHT_PARAM": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--target-tensor-parallel-size", "8",
"--load-dir", "/home/dataset/baichuan-13B-hf",
"--save-dir", "/home/dataset/baichuan-13B-tp8-pp1",
"--tokenizer-model", "None",
"--w-pack", "True"
]
}

View File

@ -1,85 +0,0 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_DETECT=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH="/home/dataset/pretrain-dataset-baichuan-13B/alpaca_text_document"
TOKENIZER_MODEL="/home/dataset/baichuan-13B-hf/tokenizer.model"
CKPT_LOAD_DIR="/home/dataset/baichuan-13B-tp8-pp1"
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--sequence-parallel \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 13696 \
--num-attention-heads 40 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--seq-length 4096 \
--disable-bias-linear \
--max-position-embeddings 4096 \
--micro-batch-size 1 \
--global-batch-size 32 \
--untie-embeddings-and-output-weights \
--make-vocab-size-divisible-by 64 \
--lr 1e-5 \
--no-gradient-accumulation-fusion \
--load ${CKPT_LOAD_DIR} \
--train-iters 2000 \
--lr-decay-style cosine \
--attention-dropout 0.0 \
--position-embedding-type alibi \
--hidden-dropout 0.0 \
--normalization RMSNorm \
--use-fused-rmsnorm \
--swiglu \
--attention-softmax-in-fp32 \
--min-lr 1e-7 \
--weight-decay 1e-1 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--initial-loss-scale 1024.0 \
--adam-beta2 0.95 \
--adam-eps 1.0e-5 \
--no-load-optim \
--no-load-rng \
--fp16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 949,50,1
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 10000 \
--eval-interval 10000 \
--eval-iters 1
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl 2>&1 | tee /home/dataset/new_baichuan-13B.log

View File

@ -1,64 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
import torch
from utils import ParamConfig
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self):
# configure params, the index starts from 1
sys.argv = [sys.argv[0]] + ParamConfig.convert_weight_param
def test_file_exist(self):
"""
Test that the files in `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(sys.argv[10], "*.bin"))
self.assertEqual(len(bin_file), 3)
self.assertTrue(os.path.exists(os.path.join(sys.argv[10], "pytorch_model.bin.index.json")))
def test_convert_weights_from_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder is composed of several distinct
layers that form a transformer layer, and these layers stack to make up the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(sys.argv[12], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three outer layers.
self.assertEqual(len(os.listdir(output_dir)), int(sys.argv[8]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(),
torch.Size([8000, 5120]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([5120]))
# the encoder has a common final_norm, and each layer has the following six weight tensors
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(),
torch.Size([1920, 5120]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(),
torch.Size([5120, 640]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(),
torch.Size([3424, 5120]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(),
torch.Size([5120, 1712]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([5120]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(),
torch.Size([5120]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([8000, 5120]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,100 +0,0 @@
import sys
import os
import json
import torch
import tqdm
import torch_npu
from utils import ParamConfig, assert_judge
from transformers import AutoTokenizer
import modellink
from megatron.legacy.model import GPTModel
from tests.common import DistributedTest
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.evaluation_param + config.auxiliary_param + config.tokenizer_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def get_result(self, tokenizer, result):
if result:
final_result = [result[0]]
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
final_result.append("T")
else:
final_result.append("F")
else:
final_result = None
return final_result
def test_boolq_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, trust_remote_code=True)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{passage}\nQuestion: {question}?\nAnswer:"
answer_result = {}
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "boolq" in path:
test_dir = path
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
with open(file_path, encoding='utf-8') as f:
boolq_question_list = []
for line in f.readlines():
boolq_question_list.append(json.loads(line))
boolq_question_list = boolq_question_list[:60]
subject_result = {}
acc_n = 0
for index, item in enumerate(boolq_question_list):
instruction = instruction_template.format(passage=item['passage'], question=item['question'])
result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
result = self.get_result(tokenizer, result)
if result:
answer = result[1]
else:
answer = None
try:
if torch.distributed.get_rank() == 0:
subject_result[str(index)] = answer
if subject_result[str(index)] == str(item['answer'])[0]:
acc_n += 1
except Exception as e:
raise e
if torch.distributed.get_rank() == 0:
total_n += len(boolq_question_list)
total_acc_n += acc_n
answer_result['Boolq_dataset'] = subject_result
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.71) < 0.03)

View File

@ -1,97 +0,0 @@
import sys
import os
import torch
import torch_npu
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
from tests.common import DistributedTest
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["解释一下“温故而知新”"]
output = model.generate(instruction, detokenize=False)
expected_output = [5, 31694, 31829, 31290, 31356, 31226, 31125, 1231, 31178, 31387,
34360, 73, 31106, 5, 31106, 84, 31442, 32369, 85, 31106,
5, 31106, 53, 31694, 31143, 31694, 31434, 73, 31106, 5,
31106, 54, 31829, 31143, 32363, 31135, 3317, 73, 31106, 5,
31106, 55, 31226, 31143, 5916, 3317, 73, 31106, 5, 31106]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
output[:20].unsqueeze(0).float())
print(cos_sim)
assert_judge(cos_sim > 0.80)
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
prompt = "解释一下“温故而知新”"
system_template = ""
dialog_template = "{instruction}"
template = system_template + dialog_template
instruction = template.format(instruction=prompt)
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [5, 31694, 31829, 31290, 31356, 31226, 31125, 1231, 31178, 31387,
34360, 73, 31106, 5, 31106, 84, 31442, 32369, 85, 31106,
5, 31106, 53, 31694, 31143, 31694, 31434, 73, 31106, 5,
31106, 54, 31829, 31143, 32363, 31135, 3317, 73, 31106, 5,
31106, 55, 31226, 31143, 5916, 3317, 73, 31106, 5, 31106]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
output[:20].unsqueeze(0).float())
print(cos_sim)
assert_judge(cos_sim > 0.80)

View File

@ -1,134 +0,0 @@
import sys
import os
import torch
import torch_npu
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from megatron.core.enums import ModelType
from megatron.core.utils import get_model_config
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators
from tests.common import DistributedTest
class TestLora(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.network_size + config.tokenizer_param \
+ config.auxiliary_param + config.lora_param + config.regularization \
+ config.learning_rate_param + config.training_aux + config.distributed_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_megatron_lora_module(self):
self.init(config=ParamConfig)
from megatron.core import tensor_parallel
from pretrain_gpt import model_provider
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder
)
model = model[0]
for name, module in model.named_modules():
if name.endswith("query_key_value.lora_A.default"):
assert_judge(isinstance(module, torch.nn.Linear))
if name.endswith("query_key_value.lora_B.default"):
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
if name.endswith("dense.lora_A.default"):
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
if name.endswith("dense.lora_B.default"):
assert_judge(isinstance(module, torch.nn.Linear))
if name.endswith("dense_h_to_4h.lora_A.default"):
assert_judge(isinstance(module, torch.nn.Linear))
if name.endswith("dense_h_to_4h.lora_B.default"):
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
if name.endswith("dense_4h_to_h.lora_A.default"):
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
if name.endswith("dense_4h_to_h.lora_B.default"):
assert_judge(isinstance(module, torch.nn.Linear))
def test_lora(self):
self.init(config=ParamConfig)
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step, training_log, save_checkpoint_and_time, num_floating_point_operations
from megatron.core import mpu
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder
)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
total_loss_dict = {}
iteration = self.args.iteration
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
report_memory_flag = True
timers('interval-time', log_level=0).start(barrier=True)
saved_checkpoint = False
num_floating_point_operations_so_far = 0
while iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
batch_size = mpu.get_data_parallel_world_size() * \
self.args.micro_batch_size * \
get_num_microbatches()
self.args.consumed_train_samples += batch_size
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
loss_scale = optimizer.get_loss_scale().item()
params_norm = None
learning_rate = None
decoupled_learning_rate = None
for param_group in optimizer.param_groups:
if param_group['is_decoupled_lr']:
decoupled_learning_rate = param_group['lr']
else:
learning_rate = param_group['lr']
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
decoupled_learning_rate,
iteration, loss_scale,
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
if self.args.save and self.args.save_interval and \
iteration % self.args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
saved_checkpoint = True
if saved_checkpoint:
for file_name in os.listdir(self.args.save):
file_path = os.path.join(self.args.save, file_name)
if os.path.isfile(file_path):
assert_judge(file_path.endswith(".txt"))
else:
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)

View File

@ -1,81 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessInstructionData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
sys.argv = [sys.argv[0]] + ParamConfig.tokenizer_param + ParamConfig.process_data_input_path \
+ ParamConfig.process_instruction_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functionality of the tokenizer:
the instance of the tokenizer
the length of the vocabulary
the encode function
the decode function
the eod append
...(add more checks here if anything is missed)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 64000)
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1557, 52, 31141, 31150, 59, 31219])
self.assertEqual(self.tokenizer.detokenize(31338), '')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
def test_build_splitter(self):
"""
If there's no split_sentence, default process is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test whether the right data handler is obtained for instruction data.
"""
self.assertIsInstance(self.handler, GeneralInstructionHandler)
def test_serialize_to_disk(self):
"""
Test that the serialized dataset files are generated and are not empty (size checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = sys.argv[8].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 3)
self.assertEqual(len(idx_file), 3)
self.assertAlmostEqual((total_size / (1024 * 1024)), 90, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,81 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
sys.argv = [sys.argv[0]] + ParamConfig.tokenizer_param + ParamConfig.process_data_input_path \
+ ParamConfig.process_pretrain_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functionality of the tokenizer:
the instance of the tokenizer
the length of the vocabulary
the encode function
the decode function
the eos append
...(add more checks here if anything is missed)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 64000)
self.assertEqual(self.tokenizer.tokenize('bug'), [15498])
self.assertEqual(self.tokenizer.detokenize(23961), 'prolong')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
def test_build_splitter(self):
"""
If there's no split_sentence, default process is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test whether the right data handler is obtained for pretraining.
"""
self.assertIsInstance(self.handler, GeneralPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the serialized dataset files are generated and are not empty (size checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = sys.argv[8].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 25, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,152 +0,0 @@
import sys
import os
import torch
import torch_npu
from utils import ParamConfig, assert_judge
import modellink
from megatron.core.enums import ModelType
from megatron.core.utils import get_model_config
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
from tests.common import DistributedTest
class TestTraining(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.network_size + config.auxiliary_param \
+ config.regularization + config.learning_rate_param \
+ config.training_aux + config.distributed_param + config.training_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_training(self):
self.init(config=ParamConfig)
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
from megatron.core import mpu
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
total_loss_dict = {}
iteration = self.args.iteration
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
report_memory_flag = True
timers('interval-time', log_level=0).start(barrier=True)
saved_checkpoint = False
num_floating_point_operations_so_far = 0
while iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
batch_size = mpu.get_data_parallel_world_size() * \
self.args.micro_batch_size * \
get_num_microbatches()
self.args.consumed_train_samples += batch_size
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
loss_scale = optimizer.get_loss_scale().item()
params_norm = None
learning_rate = None
decoupled_learning_rate = None
for param_group in optimizer.param_groups:
if param_group['is_decoupled_lr']:
decoupled_learning_rate = param_group['lr']
else:
learning_rate = param_group['lr']
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
decoupled_learning_rate,
iteration, loss_scale,
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
if self.args.save and self.args.save_interval and \
iteration % self.args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
saved_checkpoint = True
break
if saved_checkpoint:
for file_name in os.listdir(self.args.save):
file_path = os.path.join(self.args.save, file_name)
if os.path.isfile(file_path):
assert_judge(file_path.endswith(".txt"))
else:
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
def test_breakpoint_renewal_training(self):
self.init(config=ParamConfig)
self.args.load = self.args.save
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_timers
from megatron.training.training import train_step
if self.args.load == self.args.save: # this can be regarded as a breakpoint-renewal (resume) training situation
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
iteration = self.args.iteration
if torch.distributed.get_rank() == 0:
print(f"iteration:{iteration}")
assert_judge(iteration == 10)
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
timers('interval-time', log_level=0).start(barrier=True)
if iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
if 'lm loss' in loss_dict.keys():
if torch.distributed.get_rank() == 0:
print(f"loss:{loss_dict['lm loss']}")
assert_judge(abs(loss_dict['lm loss'] - 0.97) < 0.1)

View File

@ -1,47 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
distributed_param,
network_size,
inference_param,
evaluation_param,
lora_param,
training_param,
training_auxiliary,
learning_rate,
regularization,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
distributed_param = config_file["DISTRIBUTED_PARAM"]
network_size = config_file["NETWORK_SIZE"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
lora_param = config_file["LORA_PARAM"]
training_param = config_file["TRAINING_PARAM"]
training_aux = config_file["TRAINING_AUX"]
learning_rate_param = config_file["LEARNING_RATE"]
regularization = config_file["REGULARIZATION"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
tokenizer_param = config_file["TOKENIZER_PARAM"]
process_pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA_PARAM"]
process_instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA_PARAM"]
process_data_input_path = config_file["PROCESS_DATA_INPUT_PATH"]
lora_inference_param = config_file["LORA_INFERENCE_PARAM"]
convert_weight_param = config_file["CONVERT_WEIGHT_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,5 +0,0 @@
# Provide uniform access for the pipeline.
python ./tests/pipeline/baichuan-7B/test_process_pretrain_data.py
python ./tests/pipeline/baichuan-7B/test_process_instruction_data.py

View File

@ -1,21 +0,0 @@
{
"PROCESS_PRETRAIN_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--output-prefix", "/home/dataset/pretrain-dataset-baichuan-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/baichuan-7B-hf",
"--workers", "4",
"--log-interval", "1000"
],
"PROCESS_INSTRUCTION_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "GeneralInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-baichuan-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/baichuan-7B-hf",
"--workers", "4",
"--log-interval", "1000",
"--append-eod"
]
}

View File

@ -1,84 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessInstructionData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functionality of the tokenizer:
the instance of the tokenizer
the length of the vocabulary
the encode function
the decode function
the eod append
...(add more checks here if anything is missed)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 64000)
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1557, 52, 31141, 31150, 59, 31219])
self.assertEqual(self.tokenizer.detokenize(31338), '')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
def test_build_splitter(self):
"""
If there's no split_sentence, default process is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test whether the right data handler is obtained for instruction data.
"""
self.assertIsInstance(self.handler, GeneralInstructionHandler)
def test_serialize_to_disk(self):
"""
Test that the preprocessed files are generated and are not empty (sizes checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 3)
self.assertEqual(len(idx_file), 3)
self.assertAlmostEqual((total_size / (1024 * 1024)), 89, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,88 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the eos append
...(add more checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 64000)
self.assertEqual(self.tokenizer.tokenize('bug'), [15498])
self.assertEqual(self.tokenizer.detokenize(23961), 'prolong')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
def test_build_splitter(self):
"""
If there is no split_sentences option, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for pretraining data.
"""
self.assertIsInstance(self.handler, GeneralPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the preprocessed files are generated and are not empty (sizes checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 25, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,20 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -8,7 +8,7 @@ from utils import ParamConfig, assert_judge
from transformers import AutoTokenizer
import modellink
from megatron.legacy.model import GPTModel
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
from modellink.tasks.evaluation.utils import add_text_generate_args

View File

@ -6,7 +6,7 @@ from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
class TestGeneration(DistributedTest):

View File

@ -1,5 +0,0 @@
# Provide uniform access for the pipeline.
python ./tests/pipeline/baichuan2-7B/test_process_pretrain_data.py
python ./tests/pipeline/baichuan2-7B/test_process_instruction_data.py

View File

@ -1,21 +0,0 @@
{
"PROCESS_PRETRAIN_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--output-prefix", "/home/dataset/pretrain-dataset-baichuan2-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/baichuan2-7B-hf",
"--workers", "4",
"--log-interval", "1000"
],
"PROCESS_INSTRUCTION_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "GeneralInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-baichuan2-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/baichuan2-7B-hf",
"--workers", "4",
"--log-interval", "1000",
"--append-eod"
]
}

View File

@ -1,83 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessInstructionData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the eod append
...(add more checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 125696)
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [92655, 92335, 92365, 92379, 92383, 92574])
self.assertEqual(self.tokenizer.detokenize(31338), ' Norman')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
def test_build_splitter(self):
"""
If there is no split_sentences option, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for instruction data.
"""
self.assertIsInstance(self.handler, GeneralInstructionHandler)
def test_serialize_to_disk(self):
"""
Test that the preprocessed files are generated and are not empty (sizes checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 3)
self.assertEqual(len(idx_file), 3)
self.assertAlmostEqual((total_size / (1024 * 1024)), 83, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,83 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the eos append
...(add more checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 125696)
self.assertEqual(self.tokenizer.tokenize('bug'), [44985])
self.assertEqual(self.tokenizer.detokenize(23961), '为孩子')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
def test_build_splitter(self):
"""
If there is no split_sentences option, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for pretraining data.
"""
self.assertIsInstance(self.handler, GeneralPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the preprocessed files are generated and are not empty (sizes checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 23, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,20 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -7,7 +7,7 @@ import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel

View File

@ -3,7 +3,7 @@ import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel

View File

@ -3,7 +3,7 @@ import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel

View File

@ -1,5 +0,0 @@
# Provide uniform access for the pipeline.
python tests/pipeline/codellama-34B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/codellama-34B/test_generation.py

View File

@ -1,62 +0,0 @@
{
"CONVERT_CKPT_PARAM": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--load-dir", "/home/dataset/codellama-34B-hf",
"--save-dir", "/home/dataset/codellama-34B-mt-t8p1",
"--target-tensor-parallel-size", "8",
"--target-pipeline-parallel-size", "1",
"--tokenizer-model", "None"
],
"NETWORK_SIZE": [
"--num-layers", "48",
"--hidden-size", "8192",
"--ffn-hidden-size", "22016",
"--num-attention-heads", "64",
"--max-position-embeddings", "16384",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "1",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--load", "/home/dataset/codellama-34B-mt-t8p1"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/codellama-34B-hf"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint",
"--attention-softmax-in-fp32"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "16",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--seed", "42",
"--use-fused-rmsnorm",
"--group-query-attention",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "4096",
"--num-query-groups", "8",
"--vocab-size", "32000",
"--padded-vocab-size", "32000",
"--rotary-base", "1000000"
]
}

View File

@ -1,59 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exist(self):
"""
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.bin"))
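# convert_ckpt_param[7] is the value of --load-dir configured in param_config.json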
self.assertEqual(len(bin_file), 7)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "pytorch_model.bin.index.json")))
def test_convert_weights_from_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder contains several distinct sub-layers
that make up each transformer layer, and these layers stack to form the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three top-level modules.
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([4000, 8192]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([8192]))
# the encoder has a shared final_norm, and each transformer layer contributes the following sub-layer weights
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1280, 8192]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([8192, 1024]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([5504, 8192]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([8192, 2752]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([8192]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([8192]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([4000, 8192]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,100 +0,0 @@
import sys
import os
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_greedy_search(self):
"""
Load the weights to build the model, construct the prompts to generate output,
and compare with the expected result for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["import socket\n\ndef ping_exponential_backoff(host: str):"]
output = model.generate(instruction, detokenize=False)
expected_output1 = [13, 1678, 9995, 13, 1678, 349, 292, 263, 3495, 773,
25658, 1250, 2696, 29889, 13, 1678, 9995, 13, 1678, 363,
474, 297, 3464, 29898, 29896, 29892, 29871, 29896, 29900, 1125,
13, 4706, 1018, 29901, 13, 9651, 9909, 29889, 29887, 621,
520, 29890, 948, 420, 29898, 3069, 29897, 13, 9651, 736]
expected_output2 = [13, 1678, 9995, 13, 1678, 349, 292, 263, 3495, 773,
25658, 1250, 2696, 29889, 13, 1678, 9995, 13, 1678, 9055,
353, 29871, 29896, 13, 1678, 1550, 5852, 29901, 13, 4706,
1018, 29901, 13, 9651, 9909, 29889, 29887, 621, 520, 29890,
948, 420, 29898, 3069, 29897, 13, 9651, 736, 13, 4706]
if torch.distributed.get_rank() == 0:
print(output)
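# fuzzy check: cosine similarity between the first 50 generated token ids and each expected sequence, taking the closer match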
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim1 = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
cos_sim2 = similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
cos_sim = torch.max(cos_sim1, cos_sim2)
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)
def test_beam_search(self):
"""
Load the weights to build the model, construct the prompts to generate output,
and compare with the expected result for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "def fibonacci("
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [29876, 1125, 13, 1678, 565, 302, 1275, 29871, 29900, 29901,
13, 4706, 736, 29871, 29900, 13, 1678, 25342, 302, 1275,
29871, 29896, 29901, 13, 4706, 736, 29871, 29896, 13, 1678,
1683, 29901, 13, 4706, 736, 18755, 265, 21566, 29898, 29876,
448, 29871, 29896, 29897, 718, 18755, 265, 21566, 29898, 29876]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output).unsqueeze(0).float().npu(),
output[:50].unsqueeze(0).float())
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)

View File

@ -1,33 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can configure the params in the `.json` file, including:
convert_ckpt_param,
network_size,
tokenizer_param,
distributed_param,
inference_param,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
network_size = config_file["NETWORK_SIZE"]
tokenizer_param = config_file["TOKENIZER_PARAM"]
distributed_param = config_file["DISTRIBUTED_PARAM"]
inference_param = config_file["INFERENCE_PARAM"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,130 +0,0 @@
import json
import argparse
from typing import Tuple, Optional
import pandas as pd
class Comparator:
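"""
Compare loss, throughput and memory between a baseline run and a test run,
using the extracted *_loss.tsv, *_memory.tsv and *_parameters.json files.
"""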
def __init__(self,
base_path_prefix: str,
test_path_prefix: str,
loss_error_rate: float = 0.02,
perf_error_rate: float = 0.03,
mem_error_rate: float = 0.003,
warm_up: int = 1,
compute_steps: int = 2000):
self.base_path_prefix = base_path_prefix
self.test_path_prefix = test_path_prefix
self.loss_error_rate = loss_error_rate
self.perf_error_rate = perf_error_rate
self.mem_error_rate = mem_error_rate
self.compute_steps = compute_steps
self.warm_up = warm_up
def _read_check_loss_file(self) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, int]]:
base_loss_pd = pd.read_csv(f"{self.base_path_prefix}_loss.tsv", sep='\t')
test_loss_pd = pd.read_csv(f"{self.test_path_prefix}_loss.tsv", sep='\t')
if len(base_loss_pd) < self.compute_steps or len(test_loss_pd) < self.compute_steps:
print("The log doesn't have enough steps to compute!")
return None
base_loss_start = base_loss_pd.loss.ne(float('inf')).argmax()
test_loss_start = test_loss_pd.loss.ne(float('inf')).argmax()
if base_loss_start != test_loss_start:
print("The first valid loss step of the baseline and test runs does not match!")
return None
return base_loss_pd, test_loss_pd, base_loss_start
def compare_memory(self) -> bool:
base_mem_pd = pd.read_csv(f"{self.base_path_prefix}_memory.tsv", sep='\t')
test_mem_pd = pd.read_csv(f"{self.test_path_prefix}_memory.tsv", sep='\t')
base_mem_mean = base_mem_pd.memory.mean()
test_mem_mean = test_mem_pd.memory.mean()
if base_mem_mean * (1 + self.mem_error_rate) < test_mem_mean:
print("Memory test failed!")
return False
print("Memory test pass!")
return True
def compare_perf(self) -> bool:
result = self._read_check_loss_file()
if not result:
return False
base_loss_pd, test_loss_pd, loss_start = result
with open(f"{self.base_path_prefix}_parameters.json") as f:
base_params = json.load(f)
with open(f"{self.test_path_prefix}_parameters.json") as f:
test_params = json.load(f)
if base_params != test_params:
print("The parameters are not equal")
return False
global_batch_size = base_params.get("global_batch_size") or base_params.get("train_batch_size")
seq_length = base_params.get("seq_length") or base_params.get("seq-length")
world_size = base_params.get("world_size", 8)
# Here we need to skip the first steps until the training is stable
base_itertime_mean = base_loss_pd[self.warm_up:self.compute_steps].iter_time.mean()
test_itertime_mean = test_loss_pd[self.warm_up:self.compute_steps].iter_time.mean()
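# throughput per device: tokens per global batch (global_batch_size * seq_length) / world_size / mean iteration time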
base_perf = global_batch_size * seq_length / world_size / base_itertime_mean
test_perf = global_batch_size * seq_length / world_size / test_itertime_mean
if (1 - self.perf_error_rate) * base_perf > test_perf:
print("Perf test failed!")
return False
print("Perf test pass!")
return True
def compare_loss(self) -> bool:
result = self._read_check_loss_file()
if not result:
return False
base_loss_pd, test_loss_pd, loss_start = result
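# relative loss deviation from the baseline, averaged over the comparison window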
loss_error_rates = (test_loss_pd[loss_start:self.compute_steps].loss - base_loss_pd[loss_start:self.compute_steps].loss) / base_loss_pd[loss_start:self.compute_steps].loss
if abs(loss_error_rates.mean()) > self.loss_error_rate:
print("Loss test failed!")
return False
print("Loss test pass!")
return True
def __call__(self) -> None:
self.compare_loss()
self.compare_perf()
self.compare_memory()
def main(args):
Comparator(args.base_path_prefix,
args.test_path_prefix,
args.loss_error_rate,
args.perf_error_rate,
args.mem_error_rate,
args.warm_up,
args.compute_steps)()
if __name__ == '__main__':
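# usage sketch (script name assumed): python compare_logs.py <base_prefix> <test_prefix> --warm_up 1 --compute_steps 2000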
parser = argparse.ArgumentParser(description='Compare log data between baseline and test')
parser.add_argument('base_path_prefix', help='The baseline path prefix')
parser.add_argument('test_path_prefix', help='The test path prefix')
parser.add_argument('--loss_error_rate', type=float, default=0.02, help='The loss error rate')
parser.add_argument('--perf_error_rate', type=float, default=0.03, help='The perf error rate')
parser.add_argument('--warm_up', type=int, default=1, help='The perf test start from warm_up step')
parser.add_argument('--mem_error_rate', type=float, default=0.003, help='The memory error rate')
parser.add_argument('--compute_steps', type=int, default=2000, help='The compute steps')
args = parser.parse_args()
main(args)

View File

@ -1,157 +0,0 @@
from abc import ABC, abstractmethod
import json
import argparse
class BaseLogExtractor(ABC):
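"""
Extract per-step loss and iteration time, per-rank memory usage and key hyper-parameters
from a training log, and save them as *_loss.tsv, *_memory.tsv and *_parameters.json.
"""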
def __init__(self, input_path:str, out_path_prefix:str):
self.input_path = input_path
self.out_path_prefix = out_path_prefix
self.losses = []
self.memories = []
self.parameters = {"global_batch_size": 0,
"seq_length": 0,
"world_size": 0}
@abstractmethod
def _extract_parameter(self, line: str) -> None:
pass
@abstractmethod
def _extract_iterline(self, line: str) -> None:
pass
@abstractmethod
def _extract_memory(self, line) -> None:
pass
def _extract(self) -> None:
with open(self.input_path, encoding='utf-8') as f:
for line in f:
self._extract_parameter(line)
self._extract_iterline(line)
self._extract_memory(line)
def _save(self) -> None:
loss_path = f"{self.out_path_prefix}_loss.tsv"
with open(loss_path, 'w') as f:
f.write("step\tloss\titer_time\n")
for step, loss, iter_time in self.losses:
f.write(f"{step}\t{loss}\t{iter_time}\n")
memory_path = f"{self.out_path_prefix}_memory.tsv"
with open(memory_path, 'w') as f:
f.write("rank_id\tmemory\n")
for rank_id, memory in sorted(self.memories):
f.write(f"{rank_id}\t{memory}\n")
parameters_path = f"{self.out_path_prefix}_parameters.json"
with open(parameters_path, 'w') as f:
json.dump(self.parameters, f, indent=4)
def __call__(self):
self._extract()
self._save()
class MegatronLogExtractor(BaseLogExtractor):
def _extract_parameter(self, line: str) -> None:
for param in self.parameters.keys():
if line.startswith(f" {param}"):
blank_pos = line.rfind(' ')
self.parameters[param] = int(line[blank_pos:])
def _extract_iterline(self, line: str):
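# Megatron iteration lines either start with ' iteration' or carry a fixed-width prefix (assumed, e.g. a timestamp) before ' iteration'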
if (len(line) < 23 or not line[22:].startswith(" iteration")) and (not line.startswith(" iteration")):
return
backslash_pos = line.find('/')
blank_pos = line.rfind(' ', 0, backslash_pos)
step = line[blank_pos:backslash_pos]
ms_pos = line.find('(ms):')
pipe_pos = line.find('|', ms_pos)
iter_time = line[ms_pos+6: pipe_pos-1]
loss_pos = line.find('lm loss:')
if loss_pos > 0:
bar_pos = line.find('|', loss_pos)
loss = line[loss_pos+9:bar_pos-1]
else:
loss = 'inf'
self.losses.append((int(step), float(loss), float(iter_time)))
def _extract_memory(self, line) -> None:
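# per-rank memory report lines look like '[Rank 0] ... allocated: 1234.5 | ...' and may report several ranks on one line (format assumed)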
if not line.startswith("[Rank"):
return
start = 0
while start >= 0:
rsb_pos = line.find(']', start)
rankid = line[start+6:rsb_pos]
mem_pos = line.find('allocated:', rsb_pos)
pipe_pos = line.find('|', mem_pos)
memory = line[mem_pos+11:pipe_pos-1]
self.memories.append((int(rankid), float(memory)))
start = line.find("[Rank", pipe_pos)
class DeepSpeedLogExtractor(BaseLogExtractor):
def __init__(self, input_path: str, out_path_prefix: str):
super().__init__(input_path, out_path_prefix)
self.parameters = {
"train_batch_size": 0,
"seq-length": 0
}
def _extract_parameter(self, line: str) -> None:
for param in self.parameters.keys():
param_pos = line.find(f" \"{param}\":")
if param_pos >= 0:
colon_pos = line.find(':', param_pos)
comma_pos = line.find(',', colon_pos)
self.parameters[param] = int(line[colon_pos + 1 : comma_pos])
def _extract_iterline(self, line: str):
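# DeepSpeed step lines look like 'steps: 100 loss: 2.34 iter time (s): 0.50 samples/sec: ...' (format assumed)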
if not line.startswith("steps: "):
return
step_pos = 0
loss_pos = line.find(' loss:')
iter_time_pos = line.find(' iter time (s):')
iter_time_end = line.find(' samples/sec:')
step = line[step_pos + 7 : loss_pos]
iter_time = line[iter_time_pos + 15 : iter_time_end]
if loss_pos > 0:
loss = line[loss_pos + 6 : iter_time_pos]
else:
loss = 'inf'
self.losses.append((int(step), float(loss), float(iter_time)))
def _extract_memory(self, line) -> None:
if not line.startswith("after 1 iterations memory (MB)"):
return
mem_pos = line.find('allocated: ')
pipe_pos = line.find('|', mem_pos)
memory = line[mem_pos + 11 : pipe_pos - 1]
self.memories.append((0, float(memory)))
def main(args):
if args.frame_kind.lower() == 'megatron':
MegatronLogExtractor(args.input_path, args.output_path_prefix)()
if args.frame_kind.lower() == 'deepspeed':
DeepSpeedLogExtractor(args.input_path, args.output_path_prefix)()
if __name__ == '__main__':
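# usage sketch (script name assumed): python extract_log.py megatron training.log logs/base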
parser = argparse.ArgumentParser(description='extract loss, performance and memory data from training log')
parser.add_argument('frame_kind', help='The training framework: Megatron, DeepSpeed or Torch')
parser.add_argument('input_path', help='The training log path')
parser.add_argument('output_path_prefix', help='The output path prefix')
args = parser.parse_args()
main(args)

View File

@ -1,7 +0,0 @@
# Provide uniform access for the pipeline.
python tests/pipeline/gemma-7B/test_process_pretrain_data.py
python tests/pipeline/gemma-7B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/gemma-7B/test_generation.py
pytest -s tests/pipeline/gemma-7B/test_evaluation.py

View File

@ -1,76 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "28",
"--hidden-size", "3072",
"--ffn-hidden-size", "24576",
"--num-attention-heads", "16",
"--max-position-embeddings", "8192",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "1",
"--normalization", "RMSNorm",
"--add-rmsnorm-offset",
"--geglu",
"--kv-channels", "256",
"--input-embeds-norm",
"--vocab-size", "256000"
],
"INFERENCE_AUX": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-model", "/home/dataset/gemma-7B-hf/tokenizer.model",
"--tokenizer-name-or-path", "/home/dataset/gemma-7B-hf",
"--load", "/home/dataset/gemma-7B-tp8-pp1",
"--seed", "42",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256"
],
"EVALUATION_PARAM": [
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test/",
"--max-new-tokens", "1"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "16",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--attention-softmax-in-fp32",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "8192"
],
"PROCESS_PRETRAIN_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--output-prefix", "/home/dataset/pretrain-dataset-gemma-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/gemma-7B-hf",
"--workers", "4",
"--log-interval", "1000"
],
"CONVERT_CKPT_FROM_HF": [
"--model-type", "GPT",
"--loader", "gemma_hf",
"--saver", "megatron",
"--target-tensor-parallel-size", "8",
"--load-dir", "/home/dataset/gemma-7B-hf",
"--save-dir", "/home/dataset/gemma-7B-tp8-pp1",
"--tokenizer-model", "/home/dataset/gemma-7B-hf/tokenizer.model"
]
}

View File

@ -1,56 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
import torch
from utils import ParamConfig
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exist(self):
"""
Test whether the files in `--load-dir` exist, including `.safetensors`, `.json`, etc.
"""
st_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.safetensors"))
self.assertEqual(len(st_file), 4)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "model.safetensors.index.json")))
def test_convert_weights_from_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder contains several distinct sub-layers
that make up each transformer layer, and these layers stack to form the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three top-level modules.
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([32000, 3072]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([3072]))
# the encoder has a shared final_norm, and each transformer layer contributes the following sub-layer weights
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1536, 3072]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([3072, 512]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([6144, 3072]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([3072, 3072]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([3072]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([3072]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,106 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
config.inference_aux + config.evaluation_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def get_result(self, tokenizer, result):
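# pick 'T' or 'F' by comparing the returned log-probabilities of the 'Yes' and 'No' tokens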
if result:
final_result = [result[0]]
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
final_result.append('T')
else:
final_result.append('F')
else:
final_result = None
return final_result
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
temp = []
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
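# strip the trailing '_test.csv' (9 characters) to recover the subject name (file naming assumed)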
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df[0:10]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0][1]
temp.append(answer)
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.572) < 0.1)

View File

@ -1,105 +0,0 @@
import sys
import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param +\
config.inference_aux + config.inference_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def edit_distance_similarity(self, text1, text2):
"""
Use edit distance to measure the similarity between two texts.
"""
distance = nltk.edit_distance(text1, text2)
try:
similarity = 1 - (distance / max(len(text1), len(text2)))
except ZeroDivisionError as e:
raise e
return similarity
def test_greedy_search(self):
"""
Load the weights to build the model, construct the prompts to generate output,
and compare with the expected result for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["how are you?", "Give me three tips for staying healthy."]
output = model.generate(instruction)
expect_output1 = [
""" I am an AI language model, and I am here to help you with your queries. How can I assist you today? """
]
expect_output2 = [
"""1. Eat a balanced diet \n2. Get regular exercise \n3. Get enough sleep """
]
expect_output1_seq = "".join(expect_output1)
expect_output2_seq = ''.join(expect_output2)
if torch.distributed.get_rank() == 0:
print(output[0])
print(output[1])
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
print("similarity1:", similarity1)
print("similarity2:", similarity2)
assert_judge(similarity1 > 0.8)
assert_judge(similarity2 > 0.8)
def test_beam_search(self):
"""
Load the weights to build the model, construct the prompts to generate output,
and compare with the expected result for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "What is the whether like today?"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False
)
expected_output = [
"""I do not have access to real-time information, therefore I cannot provide you with the weather for today."""
]
expected_output_seq = "".join(expected_output)
if torch.distributed.get_rank() == 0:
similarity = self.edit_distance_similarity(output[:40], expected_output_seq[:40])
print(output)
print("similarity:", similarity)
assert_judge(similarity > 0.75)

View File

@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the normal functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the eos append
...(add more checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 256000)
self.assertEqual(self.tokenizer.tokenize('bug'), [2, 4594])
self.assertEqual(self.tokenizer.detokenize(23961), ' infinite')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '<eos>')
def test_build_splitter(self):
"""
If there is no split_sentences option, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset; check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for pretraining data.
"""
self.assertIsInstance(self.handler, GeneralPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the preprocessed files are generated and are not empty (sizes checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 22, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,34 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can configure the params in the `.json` file, including:
distributed_param,
network_size,
inference_param,
evaluation_param,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
distributed_param = config_file["DISTRIBUTED_PARAM"]
network_size = config_file["NETWORK_SIZE"]
inference_aux = config_file["INFERENCE_AUX"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
convert_ckpt_param = config_file["CONVERT_CKPT_FROM_HF"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,7 +0,0 @@
# Provide uniform access for the pipeline.
python tests/pipeline/intern-7B/test_process_pretrain_data.py
python tests/pipeline/intern-7B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/intern-7B/test_generation.py
pytest -s tests/pipeline/intern-7B/test_evalution.py
pytest -s tests/pipeline/intern-7B/test_trainer.py

View File

@ -1,118 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "32",
"--hidden-size", "4096",
"--ffn-hidden-size", "11008",
"--num-attention-heads", "32",
"--max-position-embeddings", "2048",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "32",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--add-qkv-bias",
"--add-dense-bias",
"--skip-bias-add"
],
"INFERENCE_AUX": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-model", "/home/dataset/intern-hf/tokenizer.model",
"--tokenizer-name-or-path", "/home/dataset/intern-hf/",
"--load", "/home/dataset/intern-tp8-pp1/",
"--seed", "42",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "64"
],
"EVALUATION_PARAM": [
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test/",
"--task", "mmlu",
"--max-new-tokens", "2"
],
"TRAINING_PARAM": [
"--save", "/autotest/dataset/save-weight-intern",
"--data-path", "/home/dataset/pretrain-dataset-intern/alpaca_text_document",
"--train-iters", "15"
],
"REGULARIZATION": [
"--attention-dropout", "0.0",
"--hidden-dropout", "0.0",
"--weight-decay", "1e-1",
"--clip-grad", "1.0",
"--adam-beta1", "0.9",
"--adam-beta2", "0.95"
],
"LEARNING_RATE": [
"--lr", "1.25e-6",
"--lr-decay-style", "cosine",
"--lr-warmup-fraction", "0.01",
"--min-lr", "1.25e-7"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "8",
"--global-batch-size", "64",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--attention-softmax-in-fp32",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "2048"
],
"TRAINING_AUX": [
"--sequence-parallel",
"--initial-loss-scale", "65536",
"--use-flash-attn",
"--use-fused-rmsnorm",
"--init-method-std", "0.01",
"--split", "100,0,0",
"--log-interval", "1",
"--save-interval", "10",
"--eval-interval", "1000",
"--eval-iters", "0",
"--num-workers", "0",
"--distributed-backend", "nccl",
"--tokenizer-type", "Llama2Tokenizer",
"--tokenizer-model", "/home/dataset/intern-hf/tokenizer.model"
],
"PROCESS_PRETRAIN_DATA":[
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--output-prefix", "/home/dataset/pretrain-dataset-intern/alpaca",
"--tokenizer-name-or-path", "/home/dataset/intern-hf",
"--workers", "4",
"--log-interval", "1000",
"--handler-name", "AlpacaPretrainHandler",
"--tokenizer-not-use-fast",
"--append-eod"
],
"CONVERT_CKPT_PARAM":[
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--target-tensor-parallel-size", "8",
"--load-dir", "/home/dataset/intern-hf",
"--save-dir", "/home/dataset/intern-tp8-pp1",
"--tokenizer-model", "/home/dataset/intern-hf/tokenizer.model",
"--add-qkv-bias",
"--add-dense-bias"
]
}

View File

@ -1,61 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exist(self):
"""
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.bin"))
self.assertEqual(len(bin_file), 8)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "pytorch_model.bin.index.json")))
def test_convert_weights_from_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder contains several distinct sub-layers
that make up each transformer layer, and these layers stack to form the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three top-level modules.
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([12896, 4096]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
# the encoder has a shared final_norm, and each transformer layer contributes the following sub-layer weights
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1536, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.bias'].size(), torch.Size([1536]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([2752, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1376]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.bias'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([12896, 4096]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,96 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.evaluation_param + config.inference_aux + config.auxiliary_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, trust_remote_code=True)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
temp = []
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
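# strip the trailing '_test.csv' (9 characters) to recover the subject name (file naming assumed)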
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df.iloc[0:20]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0].strip()
temp.append(answer)
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.41) <= 0.02)

View File

@ -1,100 +0,0 @@
import sys
import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.inference_aux + config.auxiliary_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_greedy_search(self):
"""
Load the weights to build the model, construct the prompts to generate output,
and compare with the expected result for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["春夏秋冬,四个季节"]
output = model.generate(instruction, detokenize=False)
expected_output = [98899, 67921, 70257, 67780, 60443, 67942, 68212, 98899, 60357, 60443,
67942, 60515, 98899, 60357, 60443, 67942, 68123, 99157, 364, 61145,
98899, 60355, 67546, 60353, 62513, 60410, 98899, 60355, 72801, 61209,
60431, 98899, 60355, 60758, 70447, 83396, 98899, 60355, 60758, 60958,
60353, 68124, 99157, 364, 61145, 60353, 62513, 60410, 98899, 60355,
67546, 60353, 62513, 60410, 98899, 60355, 72801, 61209, 60431, 98899,
]
expected_output_seq = torch.tensor(expected_output)[:20].unsqueeze(0).float().npu()
output_seq = output[:20].unsqueeze(0).float()
if torch.distributed.get_rank() == 0:
print(len(output))
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(expected_output_seq, output_seq)
print("cos_sim:", cos_sim)
assert_judge(cos_sim > 0.80)
def test_beam_search(self):
"""
Load the weights to build the model, construct the prompts to generate output,
and compare with the expected result for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "春夏秋冬,四个季节"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [98899, 67921, 70257, 67780, 60724, 71526, 68881, 99157, 60450, 67921,
70257, 60417, 98899, 60661, 67780, 60724, 60434, 68108, 60477, 61472,
60353, 76934, 99157, 364, 72196, 98899, 75427, 83396, 99157, 364,
69025, 98899, 83649, 61549, 60511, 99157, 364, 75814, 98899, 62084,
60449, 61469, 61469, 99157, 364, 69713, 98899, 61139, 60620, 60862,
]
expected_output_seq = torch.tensor(expected_output)[:15].unsqueeze(0).float().npu()
output_seq = output[:15].unsqueeze(0).float()
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(expected_output_seq, output_seq)
print("cos_sim:", cos_sim)
assert_judge(cos_sim > 0.6)

View File

@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import AlpacaPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
config = ParamConfig
sys.argv = [sys.argv[0]] + config.process_pretrain_data
self.config = config.process_pretrain_data
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the basic functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the EOS token handling
(add further checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 103168)
self.assertEqual(self.tokenizer.tokenize('bug'), [1, 2463])
self.assertEqual(self.tokenizer.detokenize(23961), ' possibilities')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
def test_build_splitter(self):
"""
If sentence splitting is not enabled, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset: check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for pretraining.
"""
self.assertIsInstance(self.handler, AlpacaPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the pretrain data files are generated and are non-empty (size checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config[5].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 28, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,152 +0,0 @@
import sys
import os
import subprocess
import torch
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.core.enums import ModelType
from megatron.core.utils import get_model_config
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
class TestTraining(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.training_param + config.auxiliary_param + config.learning_rate_param + \
config.training_aux + config.regularization
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_training(self):
self.init(config=ParamConfig)
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
from megatron.core import mpu
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
total_loss_dict = {}
iteration = self.args.iteration
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
report_memory_flag = True
timers('interval-time', log_level=0).start(barrier=True)
num_floating_point_operations_so_far = 0
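# keep stepping until the first checkpoint is saved, then break out of the loop and verify the saved files below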
while iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
batch_size = mpu.get_data_parallel_world_size() * \
self.args.micro_batch_size * \
get_num_microbatches()
self.args.consumed_train_samples += batch_size
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
loss_scale = optimizer.get_loss_scale().item()
params_norm = None
learning_rate = None
decoupled_learning_rate = None
for param_group in optimizer.param_groups:
if param_group['is_decoupled_lr']:
decoupled_learning_rate = param_group['lr']
else:
learning_rate = param_group['lr']
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
decoupled_learning_rate,
iteration, loss_scale,
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
saved_checkpoint = False
if self.args.save and self.args.save_interval and \
iteration % self.args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
saved_checkpoint = True
break
if saved_checkpoint:
for file_name in os.listdir(self.args.save):
file_path = os.path.join(self.args.save, file_name)
if os.path.isfile(file_path):
assert_judge(file_path.endswith(".txt"))
else:
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
def test_breakpoint_renewal_training(self):
self.init(config=ParamConfig)
self.args.load = self.args.save
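# point --load at the directory saved by test_training so this run resumes from that checkpoint (iteration 10)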
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step
if self.args.load == self.args.save:
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
for model_module in model:
model_module.train()
timers = get_timers()
iteration = self.args.iteration
assert_judge(iteration == 10)
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
timers('interval-time', log_level=0).start(barrier=True)
if iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
if torch.distributed.get_rank() == 0:
print(f"iteration {iteration}: loss {loss_dict.get('lm loss')}")
assert_judge(abs(loss_dict.get('lm loss') - 8.37) < 0.3)

View File

@ -1,43 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
distributed_param,
network_size,
inference_param,
evaluation_param,
lora_param,
training_param,
training_auxiliary,
learning_rate,
regularization,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
distributed_param = config_file["DISTRIBUTED_PARAM"]
network_size = config_file["NETWORK_SIZE"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
training_param = config_file["TRAINING_PARAM"]
training_aux = config_file["TRAINING_AUX"]
learning_rate_param = config_file["LEARNING_RATE"]
regularization = config_file["REGULARIZATION"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
process_pretrain_data = config_file["PROCESS_PRETRAIN_DATA"]
inference_aux = config_file["INFERENCE_AUX"]
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,10 +0,0 @@
# Provide uniform access for the pipeline.
python tests/pipeline/llama2-7B/test_process_pretrain_data.py
python tests/pipeline/llama2-7B/test_process_instruction_data.py
python tests/pipeline/llama2-7B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/llama2-7B/test_generation.py
pytest -s tests/pipeline/llama2-7B/test_evaluation.py
pytest -s tests/pipeline/llama2-7B/test_lora.py
pytest -s tests/pipeline/llama2-7B/test_trainer.py

View File

@ -1,134 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "32",
"--hidden-size", "4096",
"--ffn-hidden-size", "11008",
"--num-attention-heads", "32",
"--max-position-embeddings", "4096",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "1",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights"
],
"INFERENCE_AUX": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-model", "/home/dataset/llama2-7B/tokenizer.model",
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
"--load", "/home/dataset/llama2-7B-tp8-pp1",
"--seed", "42",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256"
],
"EVALUATION_PARAM": [
"--task-data-path", "/home/dataset/eval_dataset/boolq/test/", "/home/dataset/eval_dataset/mmlu/test/",
"--max-new-tokens", "2"
],
"LORA_PARAM": [
"--finetune",
"--is-instruction-dataset",
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
"--lora-r", "16",
"--lora-alpha", "32",
"--lora-target-modules", "query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h",
"--load", "/home/dataset/llama2-7B-tp8-pp1",
"--save", "/home/dataset/lora-save-weight-llama2-7B",
"--data-path", "/home/dataset/tune-dataset-llama2-7B/alpaca",
"--train-iters", "10"
],
"TRAINING_PARAM": [
"--tokenizer-type", "Llama2Tokenizer",
"--tokenizer-model", "/home/dataset/llama2-7B/tokenizer.model",
"--save", "/autotest/dataset/save-weight-llama2-7B",
"--data-path", "/home/dataset/pretrain-dataset-llama2-7B/alpaca_text_document",
"--train-iters", "15"
],
"REGULARIZATION": [
"--attention-dropout", "0.0",
"--hidden-dropout", "0.0",
"--weight-decay", "1e-1",
"--clip-grad", "1.0",
"--adam-beta1", "0.9",
"--adam-beta2", "0.95"
],
"LEARNING_RATE": [
"--lr", "1.25e-6",
"--lr-decay-style", "cosine",
"--lr-warmup-fraction", "0.01",
"--min-lr", "1.25e-7"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "4",
"--global-batch-size", "16",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--attention-softmax-in-fp32",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "4096"
],
"TRAINING_AUX": [
"--sequence-parallel",
"--initial-loss-scale", "65536",
"--use-flash-attn",
"--use-fused-rmsnorm",
"--init-method-std", "0.01",
"--split", "100,0,0",
"--log-interval", "1",
"--save-interval", "10",
"--eval-interval", "1000",
"--eval-iters", "0",
"--num-workers", "0",
"--distributed-backend", "nccl"
],
"PROCESS_PRETRAIN_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--output-prefix", "/home/dataset/pretrain-dataset-llama2-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
"--workers", "4",
"--log-interval", "1000"
],
"PROCESS_INSTRUCTION_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "GeneralInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-llama2-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
"--workers", "4",
"--log-interval", "1000",
"--append-eod"
],
"CONVERT_CKPT_FROM_HF": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--target-tensor-parallel-size", "8",
"--load-dir", "/home/dataset/llama2-7B",
"--save-dir", "/home/dataset/llama2-7B-tp8-pp1",
"--tokenizer-model", "/home/dataset/llama2-7B/tokenizer.model"
]
}

View File

@ -1,85 +0,0 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=/home/dataset/pretrain-dataset-llama2-7B/alpaca_text_document
TOKENIZER_MODEL=/home/dataset/llama2-7B/tokenizer.model
TP=8
PP=1
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 11008 \
--num-attention-heads 32 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 4 \
--global-batch-size 16 \
--make-vocab-size-divisible-by 1 \
--lr 1.25e-6 \
--train-iters 2000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rmsnorm \
--swiglu \
--use-flash-attn \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--initial-loss-scale 65536 \
--adam-beta2 0.95 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--bf16
"
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 10000 \
--eval-interval 5000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl 2>&1 | tee /home/dataset/new_llama2-7B.log

View File

@ -1,58 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
import torch
from utils import ParamConfig
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exsit(self):
"""
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
"""
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.bin"))
self.assertEqual(len(bin_file), 2)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "pytorch_model.bin.index.json")))
def test_convert_weights_form_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. Inside the encoder, several distinct sub-layers
make up each transformer layer, and these layers are stacked to form the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three top-level parts.
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([4000, 4096]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
# the encoder has a shared final_norm and each layer contains the following six sub-layers
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1536, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([2752, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1376]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([4000, 4096]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,169 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
config.inference_aux + config.evaluation_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def get_result(self, tokenizer, result):
if result:
final_result = [result[0]]
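# decide the boolean answer by comparing the log-probabilities of the 'Yes' and 'No' tokens in the model output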
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
final_result.append('T')
else:
final_result.append('F')
else:
final_result = None
return final_result
def test_boolq_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{passage}\nQuestion: {question}?\nAnswer:"
total_acc_n = 0
total_n = 0
for path in self.args.task_data_path:
if "boolq" in path:
test_dir = path
print(test_dir)
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
with open(file_path, encoding='utf-8') as f:
boolq_question_list = []
for line in f.readlines():
boolq_question_list.append(json.loads(line))
boolq_question_list = boolq_question_list[:654]
subject_result = {}
acc_n = 0
for index, item in enumerate(boolq_question_list):
instruction = instruction_template.format(passage=item['passage'], question=item['question'])
result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
result = self.get_result(tokenizer, result)
if result:
answer = result[1]
else:
answer = None
try:
if torch.distributed.get_rank() == 0:
subject_result[str(index)] = answer
if subject_result[str(index)] == str(item['answer'])[0]:
acc_n += 1
except Exception as e:
if torch.distributed.get_rank() == 0:
raise e
if torch.distributed.get_rank() == 0:
total_n += len(boolq_question_list)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.775) < 0.01)
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
temp = []
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df[0:10]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0][0]
temp.append(answer)
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
assert_judge(abs(final_acc - 0.498) < 0.01)
print(final_acc)

View File

@ -1,113 +0,0 @@
import sys
import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param +\
config.inference_aux + config.inference_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def edit_distance_similarity(self, text1, text2):
"""
edit distance: to compare the similarity between two texts.
"""
distance = nltk.edit_distance(text1, text2)
try:
similarity = 1 - (distance / max(len(text1), len(text2)))
except ZeroDivisionError as e:
raise e
return similarity
def test_greedy_search(self):
"""
load the weights to build the model, construct prompts to generate output,
and compare with the expected output for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["how are you?", "Give me three tips for staying healthy."]
output = model.generate(instruction)
expect_output1 = [
"I'm doing well, thanks for asking! I've been keeping busy with work and spending time with friends and family. ",
"It's been great to have some time off from school and just relax a bit. How about you? How have you been?\n",
"\nI hope you're doing well! It's always great to catch up with you and hear about what's going on in your life. ",
"I'm looking forward to hearing all about it. Let me know if you want to hang out soon!"
]
expect_output2 = [
'\n\n1. Eat a balanced diet: A healthy diet should include a variety of fruits, vegetables, whole grains, lean proteins, and healthy fats. ',
'Aim to include a rainbow of colors on your plate to ensure you are getting a range of vitamins and minerals.',
'\n2. Stay hydrated: Drink plenty of water throughout the day, aiming for at least eight cups (64 ounces) daily. ',
'Limit your consumption of sugary drinks'
]
expect_output1_seq = "".join(expect_output1)
expect_output2_seq = ''.join(expect_output2)
if torch.distributed.get_rank() == 0:
print(output[0])
print(output[1])
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
print("similarity1:", similarity1)
print("similarity2:", similarity2)
assert_judge(similarity1 > 0.85)
assert_judge(similarity2 > 0.85)
def test_beam_search(self):
"""
load the weights to build the model, construct prompts to generate output,
and compare with the expected output for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "What is the whether like today?"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False
)
expected_output = [
"Answer:\nThe weather today is sunny with a high of 75 degrees Fahrenheit and a low of 50 degrees Fahrenheit. ",
"There is no rain or other weather alerts in the area.",
"\nWould you like to know the weather for a different location?"
]
expected_output_seq = "".join(expected_output)
if torch.distributed.get_rank() == 0:
similarity = self.edit_distance_similarity(output[:40], expected_output_seq[:40])
print(output)
print("similarity:", similarity)
assert_judge(similarity > 0.75)

View File

@ -1,133 +0,0 @@
import sys
import os
import torch
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.core.enums import ModelType
from megatron.core.utils import get_model_config
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators
class TestLora(DistributedTest):
world_size = 8
def init(self, config: ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.training_aux + config.network_size + \
config.auxiliary_param + config.learning_rate_param + config.regularization + config.lora_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_megatron_lora_module(self):
self.init(config=ParamConfig)
from megatron.core import tensor_parallel
from pretrain_gpt import model_provider
model, _, _ = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder
)
model = model[0]
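# expected LoRA layout: for column-parallel targets (query_key_value, dense_h_to_4h) lora_A is a plain nn.Linear and lora_B is column-parallel; for row-parallel targets (dense, dense_4h_to_h) lora_A is row-parallel and lora_B is a plain nn.Linear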
for name, module in model.named_modules():
if name.endswith("query_key_value.lora_A.default"):
assert_judge(isinstance(module, torch.nn.Linear))
if name.endswith("query_key_value.lora_B.default"):
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
if name.endswith("dense.lora_A.default"):
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
if name.endswith("dense.lora_B.default"):
assert_judge(isinstance(module, torch.nn.Linear))
if name.endswith("dense_h_to_4h.lora_A.default"):
assert_judge(isinstance(module, torch.nn.Linear))
if name.endswith("dense_h_to_4h.lora_B.default"):
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
if name.endswith("dense_4h_to_h.lora_A.default"):
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
if name.endswith("dense_4h_to_h.lora_B.default"):
assert_judge(isinstance(module, torch.nn.Linear))
def test_lora(self):
self.init(config=ParamConfig)
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step, training_log, save_checkpoint_and_time, num_floating_point_operations
from megatron.core import mpu
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder
)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
total_loss_dict = {}
iteration = self.args.iteration
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
report_memory_flag = True
timers('interval-time', log_level=0).start(barrier=True)
num_floating_point_operations_so_far = 0
while iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
batch_size = mpu.get_data_parallel_world_size() * \
self.args.micro_batch_size * \
get_num_microbatches()
self.args.consumed_train_samples += batch_size
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
loss_scale = optimizer.get_loss_scale().item()
params_norm = None
learning_rate = None
decoupled_learning_rate = None
for param_group in optimizer.param_groups:
if param_group['is_decoupled_lr']:
decoupled_learning_rate = param_group['lr']
else:
learning_rate = param_group['lr']
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
decoupled_learning_rate,
iteration, loss_scale,
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
saved_checkpoint = False
if self.args.save and self.args.save_interval and \
iteration % self.args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
saved_checkpoint = True
if saved_checkpoint:
for file_name in os.listdir(self.args.save):
file_path = os.path.join(self.args.save, file_name)
if os.path.isfile(file_path):
assert_judge(file_path.endswith(".txt"))
else:
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)

View File

@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessInstructionData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the basic functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the EOD token handling
(add further checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 32000)
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1, 529, 29900, 29916, 29943, 29955, 29958])
self.assertEqual(self.tokenizer.detokenize(31338), '')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
def test_build_splitter(self):
"""
If sentence splitting is not enabled, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset: check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for instruction data.
"""
self.assertIsInstance(self.handler, GeneralInstructionHandler)
def test_serialize_to_disk(self):
"""
Test that the instruction-tuning data files are generated and are non-empty (size checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 3)
self.assertEqual(len(idx_file), 3)
self.assertAlmostEqual((total_size / (1024 * 1024)), 93, delta=2)
if __name__ == "__main__":
unittest.main()

View File

@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the basic functions of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the EOS token handling
(add further checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 32000)
self.assertEqual(self.tokenizer.tokenize('bug'), [1, 6494])
self.assertEqual(self.tokenizer.detokenize(23961), 'Ukraine')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
def test_build_splitter(self):
"""
If sentence splitting is not enabled, the default splitter is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset: check the number of columns and rows.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is returned for pretraining.
"""
self.assertIsInstance(self.handler, GeneralPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the pretrain data files are generated and are non-empty (size checked in MB).
"""
self.handler.serialize_to_disk()
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 26, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,152 +0,0 @@
import sys
import os
import subprocess
import torch
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.core.enums import ModelType
from megatron.core.utils import get_model_config
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
class TestTraining(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.training_aux + config.network_size + \
config.auxiliary_param + config.learning_rate_param + config.regularization + config.training_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_training(self):
self.init(config=ParamConfig)
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
from megatron.core import mpu
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
total_loss_dict = {}
iteration = self.args.iteration
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
report_memory_flag = True
timers('interval-time', log_level=0).start(barrier=True)
num_floating_point_operations_so_far = 0
while iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
batch_size = mpu.get_data_parallel_world_size() * \
self.args.micro_batch_size * \
get_num_microbatches()
self.args.consumed_train_samples += batch_size
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
loss_scale = optimizer.get_loss_scale().item()
params_norm = None
learning_rate = None
decoupled_learning_rate = None
for param_group in optimizer.param_groups:
if param_group['is_decoupled_lr']:
decoupled_learning_rate = param_group['lr']
else:
learning_rate = param_group['lr']
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
decoupled_learning_rate,
iteration, loss_scale,
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
saved_checkpoint = False
if self.args.save and self.args.save_interval and \
iteration % self.args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
saved_checkpoint = True
break
if saved_checkpoint:
for file_name in os.listdir(self.args.save):
file_path = os.path.join(self.args.save, file_name)
if os.path.isfile(file_path):
assert_judge(file_path.endswith(".txt"))
else:
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
def test_breakpoint_renewal_training(self):
self.init(config=ParamConfig)
self.args.load = self.args.save
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_timers
from megatron.training.training import train_step
if self.args.load == self.args.save: # we can regard this as the breakpoint renewal (resume-from-checkpoint) training case
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, _, _ \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
for model_module in model:
model_module.train()
timers = get_timers()
iteration = self.args.iteration
assert_judge(iteration == 10)
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
timers('interval-time', log_level=0).start(barrier=True)
if iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
if torch.distributed.get_rank() == 0:
print(f"iteration {iteration}: loss {loss_dict.get('lm loss')}")
assert_judge(abs(7.6 - loss_dict.get('lm loss')) < 0.2)

View File

@ -1,45 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
distributed_param,
network_size,
inference_param,
evaluation_param,
lora_param,
training_param,
training_auxiliary,
learning_rate,
regularization,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
distributed_param = config_file["DISTRIBUTED_PARAM"]
network_size = config_file["NETWORK_SIZE"]
inference_aux = config_file["INFERENCE_AUX"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
lora_param = config_file["LORA_PARAM"]
training_param = config_file["TRAINING_PARAM"]
training_aux = config_file["TRAINING_AUX"]
learning_rate_param = config_file["LEARNING_RATE"]
regularization = config_file["REGULARIZATION"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
convert_ckpt_param = config_file["CONVERT_CKPT_FROM_HF"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,7 +0,0 @@
# Provide uniform access for the pipeline.
python tests/pipeline/llama3-8B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/llama3-8B/test_generation.py
pytest -s tests/pipeline/llama3-8B/test_evaluation.py
pytest -s tests/pipeline/llama3-8B/test_chat.py

View File

@ -1,76 +0,0 @@
{
"CONVERT_CKPT_PARAM": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--load-dir", "/home/dataset/llama3-8B-hf",
"--save-dir", "/home/dataset/llama3-8B-mt-t8p1",
"--target-tensor-parallel-size", "8",
"--target-pipeline-parallel-size", "1",
"--tokenizer-model", "None"
],
"NETWORK_SIZE": [
"--num-layers", "32",
"--hidden-size", "4096",
"--ffn-hidden-size", "14336",
"--num-attention-heads", "32",
"--max-position-embeddings", "8192",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "16032",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--load", "/home/dataset/llama3-8B-mt-t8p1"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/llama3-8B-hf"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint",
"--attention-softmax-in-fp32"
],
"INFERENCE_HF_CHAT_PARAM": [
"--hf-chat-template"
],
"INFERENCE_PROMPT_CHAT_PARAM": [
"--prompt-type", "llama3"
],
"EVALUATION_PARAM": [
"--tokenizer-not-use-fast",
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test",
"--task", "mmlu",
"--max-new-tokens", "1",
"--exit-on-missing-checkpoint"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "16",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--seed", "42",
"--use-fused-rmsnorm",
"--group-query-attention",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "8192",
"--num-query-groups", "8",
"--rotary-base", "500000"
]
}

View File

@ -1,123 +0,0 @@
import sys
import os
import nltk
import torch
from torch import distributed as dist
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args, chat_get_instruction, chat_print_and_update_histories
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig, chat_type=None):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
if chat_type == "hf_chat":
sys.argv = sys.argv + config.inference_hf_chat_param
elif chat_type == "prompt_chat":
sys.argv = sys.argv + config.inference_prompt_chat_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def edit_distance_similarity(self, text1, text2):
"""
edit distance: to compare the similarity between two texts.
"""
distance = nltk.edit_distance(text1, text2)
try:
similarity = 1 - (distance / max(len(text1), len(text2)))
except ZeroDivisionError as e:
raise e
return similarity
def run_chat(self, model, turn0outputExpect):
histories_no_template = []
histories_template = []
instruction = None
test_questions = ["你能推荐几本深度学习的书吗?", "上面推荐的书建议学习顺序呢?", "9.11和9.9谁大?"]
turns = 0
while turns < 3:
prompt = test_questions[turns]
instruction = chat_get_instruction(self.args, histories_no_template, histories_template, prompt)
responses = model.generate(
instruction,
do_sample=True,
top_k=self.args.top_k,
top_p=self.args.top_p,
tokenizer=None,
temperature=self.args.temperature,
max_new_tokens=self.args.max_new_tokens,
stream=True
)
output = chat_print_and_update_histories(self.args, responses, histories_no_template, histories_template, prompt)
if torch.distributed.get_rank() == 0:
print("-------------------------------")
print(output)
if(turns == 0):
similarity1 = self.edit_distance_similarity(output[:30], turn0outputExpect[0][:30])
similarity2 = self.edit_distance_similarity(output[:30], turn0outputExpect[1][:30])
print("similarity1:", similarity1)
print("similarity1:", similarity2)
assert_judge(max(similarity1, similarity2) > 0.75)
turns = turns + 1
def test_hf_chat(self):
"""Interactive dialog mode with multiple rounds of conversation"""
self.init(config=ParamConfig, chat_type="hf_chat")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
turn1outputExpect = []
turn1outputExpect1 = "Here are some highly recommended books on deep learning that can help you dive deeper into the subject:"
turn1outputExpect2 = '''Here are some highly recommended books for deep learning:\n\n**Foundational Books**\n\n1. **"Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville**: This is the bible of deep learning.'''
turn1outputExpect.append(turn1outputExpect1)
turn1outputExpect.append(turn1outputExpect2)
self.run_chat(model, turn1outputExpect)
def test_prompt_type_chat(self):
"""Interactive dialog mode with multiple rounds of conversation"""
self.init(config=ParamConfig, chat_type="prompt_chat")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
turn1outputExpect = []
turn1outputExpect1 = "Here are some highly recommended books on deep learning that can help you dive deeper into the subject:"
turn1outputExpect2 = '''Here are some highly recommended books for deep learning:\n\n**Foundational Books**\n\n1. **"Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville**: This is the bible of deep learning.'''
turn1outputExpect.append(turn1outputExpect1)
turn1outputExpect.append(turn1outputExpect2)
self.run_chat(model, turn1outputExpect)

View File

@ -1,60 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exsit(self):
"""
Test whether the files in `--load-dir` exist, including `.safetensors`, `.json`, etc.
"""
safetensors_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.safetensors"))
self.assertEqual(len(safetensors_file), 4)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "model.safetensors.index.json")))
def test_convert_weights_form_huggingface(self):
"""
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. Inside the encoder, several distinct sub-layers
make up each transformer layer, and these layers are stacked to form the whole model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract common content
# embedding, encoder and output_layer are the three top-level parts.
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([16032, 4096]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
# the encoder has a shared final_norm and each layer contains the following six sub-layers
weight_common_content['encoder'].pop('final_norm.weight')
print(weight_common_content['encoder']["layers.31.mlp.dense_h_to_4h._extra_state"])
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([768, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([3584, 4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1792]))
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([16032, 4096]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,94 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
config.evaluation_param + config.tokenizer_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df[0:10]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0].strip()
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(final_acc)
assert_judge(abs(final_acc - 0.687) < 0.02)
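The MMLU test above builds a few-shot prompt per CSV row and accumulates accuracy across subjects. Below is a minimal, self-contained sketch of that bookkeeping; the example row, few-shot text and predicted answer are hypothetical stand-ins for the real dataset and for `model.generate`.

```python
# Minimal sketch of the MMLU-style prompt formatting and accuracy bookkeeping;
# the row, few-shot text and prediction below are made-up placeholders.
import pandas as pd

INSTRUCTION_TEMPLATE = "{few_shot_examples}\n\n{question}\nAnswer:"

def build_prompt(few_shot_examples, row):
    question = (f"{row['question']}\nA. {row['A']}\nB. {row['B']}"
                f"\nC. {row['C']}\nD. {row['D']}")
    return INSTRUCTION_TEMPLATE.format(few_shot_examples=few_shot_examples,
                                       question=question)

data_df = pd.DataFrame(
    [["1 + 1 = ?", "1", "2", "3", "4", "B"]],
    columns=["question", "A", "B", "C", "D", "answer"],
)
few_shot = "0 + 1 = ?\nA. 0\nB. 1\nC. 2\nD. 3\nAnswer: B"

acc_n, total_n = 0, 0
for _, row in data_df.iterrows():
    prompt = build_prompt(few_shot, row)   # what would be sent to model.generate
    predicted = "B"                        # stand-in for the model's answer
    acc_n += int(predicted == row["answer"])
    total_n += 1
print(f"accuracy: {acc_n / total_n:.3f}")
```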

View File

@ -1,97 +0,0 @@
import sys
import os
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from megatron.training import get_args
from megatron.training.initialize import initialize_megatron
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
config.inference_param + config.auxiliary_param + config.tokenizer_param
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
self.args = get_args()
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["春夏秋冬,四个季节"]
output = model.generate(instruction, detokenize=False)
expected_output1 = [3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602,
3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602,
3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602,
3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602]
expected_output2 = [3922, 64803, 19483, 13646, 125436, 3922, 64803, 19483, 24273, 25129,
3922, 64803, 19483, 27384, 24273, 25129, 3922, 64803, 19483, 31809,
24273, 25129, 3922, 64803, 19483, 27384, 24273, 25129, 9554, 64803,
19483, 31809, 24273, 25129, 3922, 64803, 19483, 31809, 24273, 25129]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim1 = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
output[:40].unsqueeze(0).float())
cos_sim2 = similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
output[:40].unsqueeze(0).float())
cos_sim = torch.max(cos_sim1, cos_sim2)
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "北京奥运会"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
expected_output = [9554, 30867, 106633, 29430, 17905, 3922, 102446, 110125, 35287, 28038,
70090, 108025, 109169, 57668, 26123, 34208, 28038, 37046, 34208, 57668,
26123, 78640, 61075, 104261, 103302, 1811, 1049, 23, 8107, 24,
9953, 3922, 110284, 35287, 19000, 70090, 108448, 23039, 9554, 30537]
if torch.distributed.get_rank() == 0:
print(output)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output).unsqueeze(0).float().npu(),
output[:40].unsqueeze(0).float())
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.95)
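For greedy and beam search, the test above scores the first 40 generated token IDs against reference sequences with a cosine similarity and keeps the higher of the two scores. A CPU-only sketch of that check (the IDs below are arbitrary placeholders, and no `.npu()` move is performed):

```python
# Sketch of the token-ID cosine-similarity check used by the generation tests;
# token IDs are placeholders and tensors stay on CPU instead of the NPU.
import torch

def token_id_similarity(output_ids, reference_ids):
    similarity = torch.nn.CosineSimilarity(dim=1)
    ref = torch.tensor(reference_ids).unsqueeze(0).float()
    out = output_ids[: len(reference_ids)].unsqueeze(0).float()
    return similarity(ref, out)

output = torch.tensor([3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483])
reference = [3922, 64803, 19483, 105343, 56602, 3922, 64803, 19480]
print(token_id_similarity(output, reference))  # close to 1.0 for near-identical IDs
```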

View File

@ -1,38 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
convert_ckpt_param,
network_size,
tokenizer_param,
distributed_param,
inference_param,
evaluation_param,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
network_size = config_file["NETWORK_SIZE"]
tokenizer_param = config_file["TOKENIZER_PARAM"]
distributed_param = config_file["DISTRIBUTED_PARAM"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
inference_hf_chat_param = config_file["INFERENCE_HF_CHAT_PARAM"]
inference_prompt_chat_param = config_file["INFERENCE_PROMPT_CHAT_PARAM"]
def assert_judge(expression):
if not expression:
raise AssertionError
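`ParamConfig` above simply loads argv-style argument lists from `param_config.json` so the tests can splice them into `sys.argv` before Megatron parses its arguments. A small sketch of the same pattern, using a hypothetical config written to a temporary file:

```python
# Sketch of the ParamConfig pattern: keep argv-style lists in JSON and splice
# them into sys.argv. The file content and key below are hypothetical.
import json
import sys
import tempfile

config = {"NETWORK_SIZE": ["--num-layers", "32", "--hidden-size", "4096"]}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(config, f)
    config_path = f.name

with open(config_path) as f:
    network_size = json.load(f)["NETWORK_SIZE"]

sys.argv = [sys.argv[0]] + network_size  # downstream arg parsing then sees these flags
print(sys.argv)
```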

View File

@ -1,9 +0,0 @@
# Provide uniform access for the pipeline.
python tests/pipeline/mistral-7B/test_process_pretrain_data.py
python tests/pipeline/mistral-7B/test_process_instruction_data.py
python tests/pipeline/mistral-7B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/mistral-7B/test_generation.py
pytest -s tests/pipeline/mistral-7B/test_evaluation.py
pytest -s tests/pipeline/mistral-7B/test_trainer.py

View File

@ -1,123 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "32",
"--hidden-size", "4096",
"--ffn-hidden-size", "14336",
"--num-attention-heads", "32",
"--max-position-embeddings", "32768",
"--position-embedding-type", "rope",
"--group-query-attention",
"--num-query-groups", "8",
"--make-vocab-size-divisible-by", "1",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--sliding-window", "4096"
],
"INFERENCE_AUX": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
"--load", "/home/dataset/mistral-7B-tp8-pp1",
"--seed", "42",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256"
],
"EVALUATION_PARAM": [
"--task-data-path", "/home/dataset/eval_dataset/boolq/test/", "/home/dataset/eval_dataset/mmlu/test/",
"--max-new-tokens", "2"
],
"TRAINING_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
"--load", "/home/dataset/mistral-7B-tp8-pp1",
"--save", "/autotest/dataset/save-weight-mistral-7B",
"--data-path", "/home/dataset/pretrain-dataset-mistral-7B/alpaca_text_document",
"--train-iters", "15"
],
"REGULARIZATION": [
"--attention-dropout", "0.0",
"--hidden-dropout", "0.0",
"--weight-decay", "1e-1",
"--clip-grad", "1.0",
"--adam-beta1", "0.9",
"--adam-beta2", "0.95"
],
"LEARNING_RATE": [
"--lr", "1.25e-6",
"--lr-decay-style", "cosine",
"--lr-warmup-fraction", "0.01",
"--min-lr", "1.25e-7"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "1",
"--global-batch-size", "32",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--attention-softmax-in-fp32",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "8192"
],
"TRAINING_AUX": [
"--sequence-parallel",
"--initial-loss-scale", "65536",
"--use-flash-attn",
"--use-fused-rmsnorm",
"--init-method-std", "0.01",
"--split", "100,0,0",
"--log-interval", "1",
"--save-interval", "10",
"--eval-interval", "1000",
"--eval-iters", "0",
"--num-workers", "0",
"--distributed-backend", "nccl"
],
"PROCESS_PRETRAIN_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--output-prefix", "/home/dataset/pretrain-dataset-mistral-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
"--workers", "4",
"--log-interval", "1000"
],
"PROCESS_INSTRUCTION_DATA": [
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "GeneralInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-mistral-7B/alpaca",
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
"--workers", "4",
"--log-interval", "1000",
"--append-eod"
],
"CONVERT_CKPT_FROM_HF": [
"--model-type", "GPT",
"--loader", "llama2_hf",
"--saver", "megatron",
"--target-tensor-parallel-size", "8",
"--load-dir", "/home/dataset/mistral-7B",
"--save-dir", "/home/dataset/mistral-7B-tp8-pp1",
"--tokenizer-model", "/home/dataset/mistral-7B/tokenizer.model"
]
}

View File

@ -1,97 +0,0 @@
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0
GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
DATA_PATH=/home/dataset/pretrain-dataset-mistral-7B/alpaca_text_document
TOKENIZER_MODEL=/home/dataset/mistral-7B
TP=8
PP=1
NUM_LAYERS=32
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
GPT_ARGS="
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--sequence-parallel \
--sliding-window 4096 \
--num-layers ${NUM_LAYERS} \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--tokenizer-type PretrainedFromHF \
--tokenizer-name-or-path ${TOKENIZER_MODEL} \
--seq-length 32768 \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--global-batch-size 32 \
--make-vocab-size-divisible-by 1 \
--lr 1.25e-6 \
--train-iters 2000 \
--lr-decay-style cosine \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--attention-dropout 0.0 \
--init-method-std 0.01 \
--hidden-dropout 0.0 \
--position-embedding-type rope \
--normalization RMSNorm \
--use-fused-rmsnorm \
--swiglu \
--use-flash-attn \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--min-lr 1.25e-7 \
--weight-decay 1e-1 \
--lr-warmup-fraction 0.01 \
--clip-grad 1.0 \
--adam-beta1 0.9 \
--initial-loss-scale 65536 \
--adam-beta2 0.95 \
--no-gradient-accumulation-fusion \
--no-load-optim \
--no-load-rng \
--use-mc2 \
--use-fused-swiglu \
--use-rotary-position-embeddings \
--use-fused-rotary-pos-emb \
--use-distributed-optimizer \
--overlap-grad-reduce \
--bf16
"
# --save ${CKPT_SAVE_DIR} \
# --load ${CKPT_LOAD_DIR} \
DATA_ARGS="
--data-path $DATA_PATH \
--split 100,0,0
"
OUTPUT_ARGS="
--log-interval 1 \
--save-interval 10000 \
--eval-interval 5000 \
--eval-iters 0 \
"
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--distributed-backend nccl 2>&1 | tee /home/dataset/new_mistral-7B.log

View File

@ -1,64 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
import torch
from utils import ParamConfig
import modellink
class TestConvertCkptFromHuggingface(unittest.TestCase):
def setUp(self, config=ParamConfig):
# configure params, the index starts from 1
self.config = config
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
def test_file_exsit(self):
"""
Test that the expected files exist in `--load-dir`, e.g. the `.safetensors` shards and `model.safetensors.index.json`.
"""
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.safetensors"))
self.assertEqual(len(bin_file), 3)
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "model.safetensors.index.json")))
def test_convert_weights_form_huggingface(self):
"""
Test that the converted weights in `--save-dir` have the structure we expect. We check the top-level model
sections (embedding, final_norm, output layer and encoder); within the encoder, several distinct sub-layers
make up each transformer layer, and these layers stack to form the full model.
"""
base_dir = Path(__file__).absolute().parent.parent.parent.parent
file_path = os.path.join(base_dir, "convert_ckpt.py")
arguments = sys.argv[1:]
subprocess.run(["python", file_path] + arguments)
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
weight_common_content = weight_content['model']['language_model'] # extract the common content
# embedding, encoder and output_layer are the three top-level sections.
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([4000, 4096]))
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
# the encoder has a shared final_norm, and each layer contains the following six sub-layers
weight_common_content['encoder'].pop('final_norm.weight')
self.assertEqual(
weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([768, 4096]))
self.assertEqual(
weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
self.assertEqual(
weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([3584, 4096]))
self.assertEqual(
weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1792]))
self.assertEqual(
weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(
weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([4000, 4096]))
if __name__ == "__main__":
unittest.main()

View File

@ -1,107 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args
class TestEvaluation(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
config.inference_aux + config.evaluation_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def get_result(self, tokenizer, result):
if result:
final_result = [result[0]]
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
final_result.append('T')
else:
final_result.append('F')
else:
final_result = None
return final_result
def test_mmlu_evaluation(self):
self.init(config=ParamConfig)
from evaluation import model_provider
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_name_or_path=self.args.load
)
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
max_new_tokens = self.args.max_new_tokens
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
total_acc_n = 0
total_n = 0
test_dir = None
for path in self.args.task_data_path:
if "mmlu" in path:
test_dir = path
base_dir = Path(__file__).absolute().parent.parent.parent.parent
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
with open(template_dir, encoding='utf-8') as f:
mmlu_few_shot_template = json.load(f)
temp = []
for file in tqdm.tqdm(os.listdir(test_dir)):
file_path = os.path.join(test_dir, file)
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
subject_name = file[0: -9]
subject = subject_name.replace("_", " ")
acc_n = 0
data_df_test = data_df[0:10]
for index, row in data_df_test.iterrows():
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
subject=subject,
question=test_question)
chat_result = model.generate(
instruction,
do_sample=False,
max_new_tokens=max_new_tokens,
tokenizer=tokenizer,
stream=False,
return_output_log_probs=True
)
assert_judge(isinstance(chat_result, tuple))
assert_judge(isinstance(chat_result[1], torch.Tensor))
answer = None
if chat_result:
answer = chat_result[0][0]
temp.append(answer)
if answer == row['answer']:
acc_n += 1
if torch.distributed.get_rank() == 0:
total_n += len(data_df_test)
total_acc_n += acc_n
if torch.distributed.get_rank() == 0:
try:
final_acc = total_acc_n / total_n
except ZeroDivisionError as e:
raise e
print(f"==================== final acc: {final_acc} ====================")
assert_judge(abs(final_acc - 0.594) < 0.01)

View File

@ -1,117 +0,0 @@
import sys
import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param +\
config.inference_aux + config.inference_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def edit_distance_similarity(self, text1, text2):
"""
edit distance: to compare the similarity between two texts.
"""
distance = nltk.edit_distance(text1, text2)
try:
similarity = 1 - (distance / max(len(text1), len(text2)))
except ZeroDivisionError as e:
raise e
return similarity
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["how are you?", "Give me three tips for staying healthy."]
output = model.generate(instruction)
expect_output1 = [
"I'm doing well, thank you for asking! I've been keeping busy with work and various projects. "
"How about you? How have you been?"
]
expect_output2 = [
'\n\n1. Eat a balanced diet: Consuming a variety of nutrient-dense foods from all the food groups is '
'essential for maintaining good health.\n\n2. Stay hydrated: Water is essential for maintaining '
'good health. Aim to drink at least eight glasses of water a day.\n\n3. Get enough sleep: '
'Sleep is essential for maintaining good health. Aim to get at least seven to eight hours of'
' quality sleep each night.'
]
expect_output1_seq = "".join(expect_output1)
expect_output2_seq = ''.join(expect_output2)
if torch.distributed.get_rank() == 0:
print(output[0])
print(output[1])
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
print("similarity1:", similarity1)
print("similarity2:", similarity2)
assert_judge(similarity1 > 0.85)
assert_judge(similarity2 > 0.85)
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig)
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "What is the whether like today?"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False
)
expected_output = [
"The weather today is described as mostly sunny with a high temperature around 70 degrees "
"Fahrenheit (21 degrees Celsius).\n\nTo determine if the weather will be similar tomorrow, "
"you would need to check the weather forecast for tomorrow. The forecast may "
"indicate similar weather conditions, or it may suggest different conditions such as rain or clouds."
"\n\nTherefore, to answer your question, I would need to check the weather forecast for tomorrow. "
"Once I have that information, I can tell you whether the weather is expected to be similar to today, "
"or if it is expected to be different."
]
expected_output_seq = "".join(expected_output)
if torch.distributed.get_rank() == 0:
similarity = self.edit_distance_similarity(output[:40], expected_output_seq[:40])
print("similarity:", similarity)
assert_judge(similarity > 0.75)
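Free-form text outputs are scored with an edit distance normalized by the longer string, so 1.0 means identical and values near 0 mean unrelated. A short sketch of that metric, assuming `nltk` is available:

```python
# Normalized edit-distance similarity, as used by the generation tests above.
import nltk

def edit_distance_similarity(text1, text2):
    distance = nltk.edit_distance(text1, text2)
    longest = max(len(text1), len(text2))
    if longest == 0:
        raise ZeroDivisionError("both strings are empty")
    return 1 - distance / longest

print(edit_distance_similarity("staying healthy", "staying wealthy"))  # ~0.93
```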

View File

@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessInstructionData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the basic behaviour of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the appended eod token
(add further checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 32000)
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1, 523, 28734, 7355, 28787, 28767])
self.assertEqual(self.tokenizer.detokenize(31338), '')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
def test_build_splitter(self):
"""
If there's no split_sentence, default process is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset: check the expected columns and row counts.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is selected for instruction tuning.
"""
self.assertIsInstance(self.handler, GeneralInstructionHandler)
def test_serialize_to_disk(self):
"""
Test that the instruction-tuning output files are generated and that their total size (in MB) matches the expected value.
"""
self.handler.serialize_to_disk()
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 3)
self.assertEqual(len(idx_file), 3)
self.assertAlmostEqual((total_size / (1024 * 1024)), 90, delta=1)
if __name__ == "__main__":
unittest.main()
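`test_serialize_to_disk` above checks two things: that preprocessing wrote the expected number of `.bin`/`.idx` files, and that their combined size in MB lands near a reference value. A self-contained sketch of that bookkeeping over a hypothetical output folder:

```python
# Sketch of the output-file check: count .bin/.idx files and sum the folder
# size in MB. The folder path passed in is a hypothetical stand-in.
import glob
import os

def summarize_output(folder_path):
    bin_files = glob.glob(os.path.join(folder_path, "*.bin"))
    idx_files = glob.glob(os.path.join(folder_path, "*.idx"))
    total_bytes = sum(
        os.path.getsize(os.path.join(folder_path, name))
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    )
    return len(bin_files), len(idx_files), total_bytes / (1024 * 1024)

# n_bin, n_idx, size_mb = summarize_output("/path/to/preprocessed/output")
```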

View File

@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessPretrainData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
def test_build_tokenizer(self):
"""
Test the basic behaviour of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the appended eos token
(add further checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 32000)
self.assertEqual(self.tokenizer.tokenize('bug'), [1, 10079])
self.assertEqual(self.tokenizer.detokenize(23961), 'behaviors')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
def test_build_splitter(self):
"""
If there's no split_sentence, default process is `IdentitySplitter()`.
"""
pass
def test_build_dataset(self):
"""
Test the raw_dataset: check the expected columns and row counts.
"""
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
def test_get_dataset_handler(self):
"""
Test that the correct data handler is selected for pretraining.
"""
self.assertIsInstance(self.handler, GeneralPretrainHandler)
def test_serialize_to_disk(self):
"""
Test that the pretraining output files are generated and that their total size (in MB) matches the expected value.
"""
self.handler.serialize_to_disk()
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
total_size = 0
for file_name in os.listdir(folder_path):
file_path = os.path.join(folder_path, file_name)
if os.path.isfile(file_path):
total_size += os.path.getsize(file_path)
self.assertEqual(len(bin_file), 1)
self.assertEqual(len(idx_file), 1)
self.assertAlmostEqual((total_size / (1024 * 1024)), 26, delta=1)
if __name__ == "__main__":
unittest.main()

View File

@ -1,154 +0,0 @@
import sys
import os
import subprocess
import torch
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.core.enums import ModelType
from megatron.core.utils import get_model_config
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
class TestTraining(DistributedTest):
world_size = 8
def init(self, config=ParamConfig):
sys.argv = [sys.argv[0]] + config.distributed_param + config.training_aux + config.network_size + \
config.auxiliary_param + config.learning_rate_param + config.regularization + config.training_param
from megatron.training.initialize import initialize_megatron
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
from megatron.training import get_args
self.args = get_args()
def test_training(self):
self.init(config=ParamConfig)
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
from megatron.core import mpu
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, valid_data_iterator, test_data_iterator \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
if self.args.eval_iters == 0:
assert_judge(valid_data_iterator is None)
assert_judge(test_data_iterator is None)
for model_module in model:
model_module.train()
timers = get_timers()
total_loss_dict = {}
iteration = self.args.iteration
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
report_memory_flag = True
timers('interval-time', log_level=0).start(barrier=True)
num_floating_point_operations_so_far = 0
while iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
batch_size = mpu.get_data_parallel_world_size() * \
self.args.micro_batch_size * \
get_num_microbatches()
self.args.consumed_train_samples += batch_size
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
loss_scale = optimizer.get_loss_scale().item()
params_norm = None
learning_rate = None
decoupled_learning_rate = None
for param_group in optimizer.param_groups:
if param_group['is_decoupled_lr']:
decoupled_learning_rate = param_group['lr']
else:
learning_rate = param_group['lr']
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
decoupled_learning_rate,
iteration, loss_scale,
report_memory_flag, skipped_iter,
grad_norm, params_norm, num_zeros_in_grad)
saved_checkpoint = False
if self.args.save and self.args.save_interval and \
iteration % self.args.save_interval == 0:
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
saved_checkpoint = True
break
if saved_checkpoint:
for file_name in os.listdir(self.args.save):
file_path = os.path.join(self.args.save, file_name)
if os.path.isfile(file_path):
assert_judge(file_path.endswith(".txt"))
else:
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
def test_breakpoint_renewal_training(self):
self.init(config=ParamConfig)
self.args.load = self.args.save
torch.npu.set_compile_mode(jit_compile=True)
from pretrain_gpt import model_provider, forward_step
from pretrain_gpt import train_valid_test_datasets_provider
from megatron.training.global_vars import update_num_microbatches, get_timers
from megatron.training.training import train_step
if self.args.load == self.args.save:  # loading from the save dir, i.e. resuming from the checkpoint written above (breakpoint renewal training)
model, optimizer, lr_scheduler = setup_model_and_optimizer(
model_provider, ModelType.encoder_or_decoder)
assert_judge(isinstance(model, list))
config = get_model_config(model[0])
train_valid_test_datasets_provider.is_distributed = True
train_data_iterator, _, _ \
= build_train_valid_test_data_iterators(
train_valid_test_datasets_provider
)
for model_module in model:
model_module.train()
timers = get_timers()
iteration = self.args.iteration
assert_judge(iteration == 10)
config.grad_scale_func = optimizer.scale_loss
config.timers = timers
timers('interval-time', log_level=0).start(barrier=True)
if iteration < self.args.train_iters:
update_num_microbatches(self.args.consumed_train_samples)
self.args.curr_iteration = iteration
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
train_step(forward_step,
train_data_iterator,
model,
optimizer,
lr_scheduler,
config)
iteration += 1
if torch.distributed.get_rank() == 0:
print(f"iteration {iteration}: loss {loss_dict.get('lm loss')}")
assert_judge(abs(1.0998 - loss_dict.get('lm loss')) < 0.2)
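The `batch_size` computed inside the loop above is data-parallel size × micro batch size × number of micro-batches, and it should reproduce the configured global batch size each iteration. A quick arithmetic sketch with this pipeline's settings (8 devices, TP=8, PP=1, micro batch 1, global batch 32); the `num_microbatches` relation is the usual Megatron convention and is assumed here rather than imported:

```python
# Arithmetic check of the per-iteration sample accounting in the loop above;
# the num_microbatches formula is assumed (standard Megatron convention).
world_size = 8
tensor_parallel, pipeline_parallel = 8, 1
micro_batch_size, global_batch_size = 1, 32

data_parallel_size = world_size // (tensor_parallel * pipeline_parallel)          # 1
num_microbatches = global_batch_size // (micro_batch_size * data_parallel_size)   # 32
samples_per_iteration = data_parallel_size * micro_batch_size * num_microbatches  # 32

assert samples_per_iteration == global_batch_size
print(data_parallel_size, num_microbatches, samples_per_iteration)
```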

View File

@ -1,43 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass
@dataclass
class ParamConfig:
"""
We can config the params in the `.json` file including:
distributed_param,
network_size,
inference_param,
evaluation_param,
training_param,
training_auxiliary,
learning_rate,
regularization,
and other auxiliary_param.
"""
base_dir = Path(__file__).absolute().parent
param_config = os.path.join(base_dir, "param_config.json")
with open(param_config) as f:
config_file = json.load(f)
distributed_param = config_file["DISTRIBUTED_PARAM"]
network_size = config_file["NETWORK_SIZE"]
inference_aux = config_file["INFERENCE_AUX"]
inference_param = config_file["INFERENCE_PARAM"]
evaluation_param = config_file["EVALUATION_PARAM"]
training_param = config_file["TRAINING_PARAM"]
training_aux = config_file["TRAINING_AUX"]
learning_rate_param = config_file["LEARNING_RATE"]
regularization = config_file["REGULARIZATION"]
auxiliary_param = config_file["AUXILIARY_PARAM"]
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
convert_ckpt_param = config_file["CONVERT_CKPT_FROM_HF"]
def assert_judge(expression):
if not expression:
raise AssertionError

View File

@ -1,151 +0,0 @@
{
"NETWORK_SIZE": [
"--num-layers", "32",
"--hidden-size", "4096",
"--ffn-hidden-size", "11008",
"--num-attention-heads", "32",
"--max-position-embeddings", "32768",
"--position-embedding-type", "rope",
"--make-vocab-size-divisible-by", "16",
"--normalization", "RMSNorm",
"--swiglu",
"--untie-embeddings-and-output-weights",
"--add-qkv-bias"
],
"TOKENIZER_PARAM": [
"--tokenizer-type", "PretrainedFromHF",
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/"
],
"DISTRIBUTED_PARAM": [
"--tensor-model-parallel-size", "1",
"--pipeline-model-parallel-size", "1"
],
"AUXILIARY_PARAM": [
"--micro-batch-size", "2",
"--global-batch-size", "64",
"--no-masked-softmax-fusion",
"--disable-bias-linear",
"--no-gradient-accumulation-fusion",
"--bf16",
"--seed", "42",
"--use-fused-rmsnorm",
"--no-load-optim",
"--no-load-rng",
"--seq-length", "8192",
"--padded-vocab-size", "151936",
"--attention-softmax-in-fp32"
],
"OUTPUT_PARAM": [
"--log-interval", "1",
"--save-interval", "10000",
"--eval-interval", "1000",
"--eval-iters", "5"
],
"INSTRUCTION_PARAM": [
"--finetune",
"--is-instruction-dataset",
"--data-path", "/home/dataset/tune-dataset-qwen-7B/alpaca",
"--split", "90,5,5",
"--train-iters", "5"
],
"DISTRIBUTED_PARAM_TP8_PP1": [
"--tensor-model-parallel-size", "8",
"--pipeline-model-parallel-size", "1"
],
"PROCESS_INSTRUCTION_DATA": [
"--input", "train-00000-of-00001-a09b74b3ef9c3b56, alpaca_zh, sharegpt1, sharegpt2",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "LlamaFactoryInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/alpaca",
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/",
"--workers", "4",
"--log-interval", "1000",
"--append-eod",
"--prompt-type", "qwen",
"--dataset-dir", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/dataset/",
"--overwrite-cache"
],
"PROCESS_INSTRUCTION_DATA_MIX1": [
"--input", "train-00000-of-00001-a09b74b3ef9c3b56, alpaca_zh, sharegpt1, sharegpt2",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "LlamaFactoryInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/alpaca",
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/",
"--workers", "4",
"--log-interval", "1000",
"--append-eod",
"--prompt-type", "qwen",
"--dataset-dir", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/dataset/",
"--overwrite-cache",
"--interleave-probs", "0.1, 0.2, 0.3, 0.4",
"--mix-strategy", "interleave_under",
"--max-samples", "10"
],
"PROCESS_INSTRUCTION_DATA_MIX2": [
"--input", "train-00000-of-00001-a09b74b3ef9c3b56, alpaca_zh, sharegpt1, sharegpt2",
"--tokenizer-type", "PretrainedFromHF",
"--handler-name", "LlamaFactoryInstructionHandler",
"--output-prefix", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/alpaca",
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/",
"--workers", "4",
"--log-interval", "1000",
"--append-eod",
"--prompt-type", "qwen",
"--dataset-dir", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/dataset/",
"--overwrite-cache",
"--interleave-probs", "0.1, 0.2, 0.3, 0.4",
"--mix-strategy", "interleave_over",
"--max-samples", "10"
],
"INFERENCE_PARAM": [
"--max-new-tokens", "256",
"--tokenizer-not-use-fast",
"--exit-on-missing-checkpoint",
"--attention-softmax-in-fp32",
"--prompt-type", "qwen",
"--seed", "42",
"--load", "/home/dataset/Qwen-7B-v0.1-tp8-pp1/"
],
"BEAM_SEARCH_AUXILIARY_PARAM": [
"--task", "beam_search",
"--top-p", "0.95",
"--top-k", "50"
],
"GREEDY_SEARCH_AUXILIARY_PARAM": [
"--task", "greedy"
],
"DO_SAMPLE_AUXILIARY_PARAM": [
"--task", "do_sample",
"--top-p", "0.95",
"--top-k", "50"
],
"BEAM_SEARCH_WITH_SAMPLING_AUXILIARY_PARAM": [
"--task", "beam_search_with_sampling",
"--top-p", "0.95",
"--top-k", "50"
],
"RETURN_OUTPUT_LOG_PROBS_AUXILIARY_PARAM": [
"--task", "return_output_log_probs",
"--temperature 0.6",
"--top-p", "0.95",
"--top-k", "50"
]
}

View File

@ -1,6 +0,0 @@
# Provide uniform access for the pipeline.
pytest -s ./tests/pipeline/qwen-7B/test_instruction.py
pytest -s ./tests/pipeline/qwen-7B/test_process_instruction_data.py
pytest -s ./tests/pipeline/qwen-7B/test_generation.py
pytest -s ./tests/pipeline/qwen-7B/test_generation2.py

View File

@ -1,141 +0,0 @@
import sys
import os
import torch
import nltk
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from megatron.training import get_args, get_tokenizer
from megatron.training.initialize import initialize_megatron
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig, task=None):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param_tp8_pp1 + config.network_size + \
config.inference_param + config.beam_search_auxliary_param + config.auxiliary_param + config.tokenizer_param
if task == "beam_search_with_sampling":
sys.argv = sys.argv + config.beam_search_with_sampling_auxliary_param
elif task == "return_output_log_probs":
sys.argv = sys.argv + config.return_output_log_probs_auxliary_param
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
self.args = get_args()
def edit_distance_similarity(self, text1, text2):
"""
edit distance: to compare the similarity between two texts.
"""
distance = nltk.edit_distance(text1, text2)
try:
similarity = 1 - (distance / max(len(text1), len(text2)))
except ZeroDivisionError as e:
raise e
return similarity
def test_beam_search_with_sampling(self):
"""Beam Search with sampling"""
self.init(config=ParamConfig, task="beam_search_with_sampling")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = "Give me three tips for staying healthy."
output = model.generate(
instruction,
num_beams=2,
do_sample=True,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=self.args.max_new_tokens,
tokenizer=None,
stream=False
)
expect_output1 = '''1. Get enough sleep. A good night's sleep is important for your physical and mental health.\n2. Eat a balanced diet. Eating a variety of healthy foods can help you get the nutrients your body needs.\n3. Exercise regularly. Exercise can help you maintain a healthy weight, reduce stress, and improve your overall health.'''
expect_output2 = '''Sure, here are three tips for staying healthy:\n1. Eat a balanced diet that includes fruits, vegetables, whole grains, and lean proteins.\n2. Get regular exercise, such as going for a walk or doing yoga.\n3. Get enough sleep each night, ideally 7-8 hours.'''
if torch.distributed.get_rank() == 0:
print(output)
tokenizer = get_tokenizer()
similarity1 = self.edit_distance_similarity(output[:30], expect_output1[:30])
similarity2 = self.edit_distance_similarity(output[:30], expect_output2[:30])
print("similarity1:", similarity1)
print("similarity1:", similarity2)
assert_judge(max(similarity1, similarity2) > 0.75)
def test_return_output_log_probs(self):
"""Returns the probability distribution of tokens"""
self.init(config=ParamConfig, task="return_output_log_probs")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = "What is the whether like today?"
output1, log_probs = model.generate(
instruction,
do_sample=True,
top_k=self.args.top_k,
top_p=self.args.top_p,
temperature=self.args.temperature,
max_new_tokens=self.args.max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False,
return_output_log_probs=True
)
if torch.distributed.get_rank() == 0:
tokenizer = get_tokenizer()
print("--------------output1-------------")
print(output1)
print(tokenizer.decode(output1))
expected_output1 = [2132, 686, 6761, 389, 1380, 498, 525, 304, 279, 1879,
13, 576, 9104, 646, 387, 2155, 304, 2155, 7482, 624,
872, 198, 3838, 374, 279, 9104, 1075, 304, 7148, 5267,
77091, 198, 785, 9104, 304, 7148, 3351, 374, 39698, 323]
expected_output1_ext = [2132, 686, 6761, 389, 1380, 498, 525, 7407, 13, 16503,
498, 3291, 752, 697, 3728, 5267, 872, 198, 29596, 11902,
198, 77091, 198, 641, 9656, 11902, 11, 432, 594, 39698,
3351, 13, 576, 9315, 374, 220, 23, 15, 12348, 68723]
expected_output1_ext2 = [2132, 374, 83253, 16916, 3351, 382, 77091, 198, 3838, 374,
279, 9104, 1075, 3351, 5267, 2610, 525, 264, 10950, 17847,
13, 279, 198, 3838, 374, 279, 9104, 1075, 3351, 5267,
2610, 525, 264, 10950, 17847, 13, 279, 198, 3838, 374]
print("--------------log_probs----------------")
print(log_probs.shape)
assert_judge(log_probs.shape[0] == 256)
assert_judge(log_probs.shape[1] == 151936)
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output1[:40]).unsqueeze(0).float().npu(),
output1[:40].unsqueeze(0).float())
cos_sim = max(cos_sim, similarity(torch.tensor(expected_output1_ext[:40]).unsqueeze(0).float().npu(),
output1[:40].unsqueeze(0).float()))
cos_sim = max(cos_sim, similarity(torch.tensor(expected_output1_ext2[:40]).unsqueeze(0).float().npu(),
output1[:40].unsqueeze(0).float()))
print("similarity1: ", cos_sim)
assert_judge(cos_sim > 0.75)
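The shape assertion above follows directly from this test's launch configuration: one row of log-probabilities per generated token (`--max-new-tokens 256`) and one column per entry of the padded vocabulary (`--padded-vocab-size 151936`). A trivial sketch of that relationship:

```python
# Expected log_probs shape implied by this test's configuration.
max_new_tokens = 256
padded_vocab_size = 151936
expected_log_probs_shape = (max_new_tokens, padded_vocab_size)
print(expected_log_probs_shape)  # (256, 151936)
```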

View File

@ -1,165 +0,0 @@
import sys
import os
import torch
import nltk
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from megatron.training import get_args, get_tokenizer
from megatron.training.initialize import initialize_megatron
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
class TestGeneration(DistributedTest):
world_size = 8
def init(self, config=ParamConfig, task=None):
"""
initialize the environment and arguments
"""
sys.argv = [sys.argv[0]] + config.distributed_param_tp8_pp1 + config.network_size + \
config.inference_param + config.beam_search_auxliary_param + config.auxiliary_param + config.tokenizer_param
if task == "beam_search":
sys.argv = sys.argv + config.beam_search_auxliary_param
elif task == "greedy":
sys.argv = sys.argv + config.greedy_search_auxliary_param
elif task == "do_sample":
sys.argv = sys.argv + config.do_sample_auxliary_param
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
self.args = get_args()
def test_beam_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `beam search`.
"""
self.init(config=ParamConfig, task="beam_search")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
max_new_tokens = self.args.max_new_tokens
instruction = "如何提高身体素质"
output = model.generate(
instruction,
num_beams=2,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=max_new_tokens,
tokenizer=None,
stream=False,
detokenize=False
)
if torch.distributed.get_rank() == 0:
print("----------------------output-------------------------")
print(output)
expected_output1 = [100627, 101099, 100838, 104339, 101194, 3837, 87752, 99639, 6684, 31338,
96422, 28311, 16, 13, 4891, 251, 248, 68878, 101079, 5122,
106854, 104102, 71817, 16, 20, 15, 83031, 9370, 15946, 49567,
102660, 18830, 100316, 101079, 3837, 29524, 99234, 99314, 5373, 107530]
expected_output2 = [30534, 100627, 101099, 100838, 3837, 73670, 103975, 87752, 101082, 28311,
16, 13, 4891, 223, 98, 99446, 104579, 5122, 101907, 109635,
103170, 107151, 5373, 100912, 52510, 116570, 5373, 105349, 5373, 105373,
33108, 117094, 49567, 102100, 101252, 3837, 101153, 44636, 108461, 5373]
similarity = torch.nn.CosineSimilarity(dim=1)
cos_sim = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
output[:40].unsqueeze(0).float())
cos_sim = max(cos_sim, similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
output[:40].unsqueeze(0).float()))
print("similarity: ", cos_sim)
assert_judge(cos_sim > 0.85)
def edit_distance_similarity(self, text1, text2):
"""
edit distance: to compare the similarity between two texts.
"""
distance = nltk.edit_distance(text1, text2)
try:
similarity = 1 - (distance / max(len(text1), len(text2)))
except ZeroDivisionError as e:
raise e
return similarity
def test_greedy_search(self):
"""
load weight to get model and construct the prompts to generate output,
and compare with expected for `greedy search`.
"""
self.init(config=ParamConfig, task="greedy")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = ["What are the characteristics of Suzhou?", "Introducing the Forbidden City in Beijing."]
output = model.generate(instruction)
expect_output1 = [
"Suzhou is a city in China. It is known for its beautiful gardens, canals, and classical Chinese architecture. It is also known for its silk production and traditional arts and crafts. The city has a rich cultural heritage and is home to many historic temples and museums. Additionally, Suzhou is known for its cuisine, which features local specialties such as sweet and sour fish and rice cakes."
]
expect_output2 = [
'The Forbidden City is a palace complex in Beijing, China. It was the home of the emperors of China for almost 500 years, from the Ming Dynasty to the end of the Qing Dynasty. The complex covers an area of 72 hectares and has over 9,000 rooms. It is a UNESCO World Heritage Site and one of the most popular tourist attractions in China..'
]
expect_output1_seq = "".join(expect_output1)
expect_output2_seq = ''.join(expect_output2)
if torch.distributed.get_rank() == 0:
print("----------------------output1-------------------------")
print(output[0])
print("----------------------output2-------------------------")
print(output[1])
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
print("similarity1:", similarity1)
print("similarity2:", similarity2)
assert_judge(similarity1 > 0.85)
assert_judge(similarity2 > 0.85)
def test_do_sample(self):
"""Do Sample"""
self.init(config=ParamConfig, task="do_sample")
from inference import model_provider
model = GPTModel.from_pretrained(
model_provider=model_provider,
pretrained_model_name_or_path=self.args.load
)
instruction = "what is Disneyland?"
output = model.generate(
[instruction, instruction],
do_sample=True,
top_k=self.args.top_k,
top_p=self.args.top_p,
max_new_tokens=self.args.max_new_tokens,
tokenizer=None,
stream=False
)
expect_output1 = "Disneyland Park is an entertainment park located in Anaheim, California, United States. It is owned by the Disney Parks, Experiences and Consumer Products division of the American multinational conglomerate corporation the Walt Disney Company. It is also the first of seven theme parks built at Walt Disney's original vision, where visitors can enjoy various attractions, entertainment, and dining."
expect_output1_seq = "".join(expect_output1)
if torch.distributed.get_rank() == 0:
print(output)
tokenizer = get_tokenizer()
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
print("similarity1:", similarity1)
assert_judge(similarity1 > 0.85)

View File

@ -1,138 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
import sys
import os
import time
import numpy as np
import torch
from common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.data.dataset_utils import get_train_valid_test_split_
from megatron.training.initialize import initialize_megatron
from megatron.training import get_args
from modellink.data.mtf_dataset import get_packed_indexed_dataset, MTFDataset
from modellink.data.decoder_packed_mtf_dataset import DecoderPackedMTFDataset, _build_shuffle_idx
class TestInstruction(DistributedTest):
world_size = 1
def init(self, config=ParamConfig):
sys.argv = ([sys.argv[0]] + config.distributed_param + config.network_size + config.tokenizer_param +
config.auxiliary_param + config.instruction_param + config.output_param)
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
initialize_megatron(extra_args_provider=None,
args_defaults={'no_load_rng': True,
'no_load_optim': True})
self.args = get_args()
def _build_index_mappings(
self,
name,
data_prefix,
start_index,
nb_documents,
num_samples: int,
seed,
):
"""
- `shuffle_index` is [num_epoch * len(self.mtf)]
- `sample_index` is [num_sample, 2] (storing the start and end of the sample). We query the sample via `self.shuffle_index[start:end]`
"""
# rng state
np_rng = np.random.RandomState(seed=seed)
# Filename of the index mappings.
_filename = data_prefix
_filename += '_{}_indexmap'.format(name)
_filename += '_{}ns'.format(num_samples)
_filename += '_{}s'.format(seed)
shuffle_idx_filename = _filename + '_decoder_packed_shuffle_idx.npy'
if os.path.isfile(shuffle_idx_filename):
os.remove(shuffle_idx_filename)
# Build the indexed mapping if not exist.
if not os.path.isfile(shuffle_idx_filename):
print(' > WARNING: could not find index map files, building '
'the indices on rank 0 ...')
# iteratively add the entire dataset for every epoch and see if it's enough given current packing strategy
start_time = time.time()
epoch = 0
shuffle_idx = []
while len(shuffle_idx) <= num_samples:
new_document_ids = _build_shuffle_idx(nb_documents=nb_documents, start_index=start_index, np_rng=np_rng)
# Generate a shuffling of the entire dataset
shuffle_idx.extend(new_document_ids.tolist())
epoch += 1
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
print(' > elapsed time to build and save shuffle-idx and sample-idx mapping'
' (seconds): {:4f}'.format(time.time() - start_time))
# Load mappings.
start_time = time.time()
print(' > loading shuffle-idx mapping from {}'.format(
shuffle_idx_filename))
shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
print(' loaded indexed file in {:3.3f} seconds'.format(
time.time() - start_time))
return shuffle_idx, epoch
def test_train_valid_test_split(self):
self.init(config=ParamConfig)
data_prefix = self.args.data_path[0]
packed_indexed_dataset = get_packed_indexed_dataset(data_prefix=data_prefix)
total_num_of_documents = len(list(packed_indexed_dataset.values())[0])
assert_judge(52002 == total_num_of_documents)
splits = get_train_valid_test_split_(self.args.split, total_num_of_documents)
if self.args.train_samples:
train_samples = self.args.train_samples
else:
train_samples = self.args.train_iters * self.args.global_batch_size
eval_iters = (self.args.train_iters // self.args.eval_interval + 1) * \
self.args.eval_iters
test_iters = self.args.eval_iters
train_val_test_num_samples = [train_samples,
eval_iters * self.args.global_batch_size,
test_iters * self.args.global_batch_size]
def build_shuffle_index(index, name):
shuffle_index = None
if splits[index + 1] > splits[index]:
documents = np.arange(start=splits[index], stop=splits[index + 1],
step=1, dtype=np.int32)
mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, documents=documents)
shuffle_index = self._build_index_mappings(name=name, data_prefix=data_prefix, start_index=documents[0], nb_documents=len(documents), num_samples=train_val_test_num_samples[index], seed=self.args.seed)
return shuffle_index
train_shuffle_index, train_epoch = build_shuffle_index(0, 'train')
valid_shuffle_index, valid_epoch = build_shuffle_index(1, 'valid')
test_shuffle_index, test_epoch = build_shuffle_index(2, 'test')
### Count verification
assert_judge(abs(len(train_shuffle_index) - 0.9 * total_num_of_documents * train_epoch) <= train_epoch)
assert_judge(abs(len(valid_shuffle_index) - 0.05 * total_num_of_documents * valid_epoch) <= valid_epoch)
assert_judge(abs(len(test_shuffle_index) - 0.05 * total_num_of_documents * test_epoch) <= test_epoch)
### Document split verification
train_shuffle_index_set = set(train_shuffle_index)
valid_shuffle_index_set = set(valid_shuffle_index)
test_shuffle_index_set = set(test_shuffle_index)
assert_judge(len(train_shuffle_index_set & valid_shuffle_index_set) == 0)
assert_judge(len(test_shuffle_index_set & valid_shuffle_index_set) == 0)
assert_judge(len(train_shuffle_index_set & test_shuffle_index_set) == 0)
def test_instruction(self):
pass
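The split test above carves 52002 documents into 90/5/5 train/valid/test ranges and then verifies that the shuffled index sets never overlap. Below is a self-contained sketch of that boundary arithmetic and the disjointness check; the rounding is illustrative and not necessarily identical to Megatron's `get_train_valid_test_split_`:

```python
# Illustrative 90/5/5 split over 52002 documents plus the disjointness check;
# rounding here is a simple approximation, not Megatron's exact implementation.
def split_boundaries(weights, size):
    total = sum(weights)
    bounds = [0]
    for w in weights:
        bounds.append(bounds[-1] + int(round(w / total * size)))
    bounds[-1] = size  # absorb rounding drift into the last split
    return bounds

bounds = split_boundaries([90, 5, 5], 52002)
train = set(range(bounds[0], bounds[1]))
valid = set(range(bounds[1], bounds[2]))
test = set(range(bounds[2], bounds[3]))

# the three document ranges must not overlap
assert not (train & valid) and not (valid & test) and not (train & test)
print(bounds)  # e.g. [0, 46802, 49402, 52002]
```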

View File

@ -1,157 +0,0 @@
# coding=utf-8
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import sys
import os
import glob
from utils import ParamConfig
from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import LlamaFactoryInstructionHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter
class TestProcessInstructionData(unittest.TestCase):
@classmethod
def setUpClass(self):
# configure params, the index starts from 1
self.config = ParamConfig
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
self.args = get_args()
self.tokenizer = build_tokenizer(self.args)
self.splitter = build_splitter(self.args)
self.raw_dataset = build_dataset(self.args)
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
# for test_build_dataset_mix1
sys.argv = [sys.argv[0]] + self.config.instruction_data_mix_param1
self.args = get_args()
self.raw_dataset_mix1 = build_dataset(self.args)
# for test_build_dataset_mix2
sys.argv = [sys.argv[0]] + self.config.instruction_data_mix_param2
self.args = get_args()
self.raw_dataset_mix2 = build_dataset(self.args)
def test_build_tokenizer(self):
"""
Test the basic behaviour of the tokenizer:
the tokenizer instance type
the vocabulary size
the encode function
the decode function
the appended eod token
(add further checks here if anything is missing)
"""
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
self.assertEqual(self.tokenizer.vocab_size, 151851)
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [27, 15, 9770, 22, 29])
self.assertEqual(self.tokenizer.detokenize(31338), '�')
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '<|im_end|>')
def test_build_splitter(self):
"""
If there's no split_sentence, default process is `IdentitySplitter()`.
"""
pass
def test_build_dataset_mix1(self):
"""
Test the raw_dataset: check the expected columns and row counts.
outputs["prompt"] = prompt
outputs["response"] = response
outputs["system"].append(sample[dataset_attr.system] if dataset_attr.system else "")
outputs["tools"].append("")
"""
print("-------------------test_build_dataset_mix1-------------------------")
print(len(self.raw_dataset_mix1.__getitem__("prompt")))
print(len(self.raw_dataset_mix1.__getitem__("response")))
print(len(self.raw_dataset_mix1.__getitem__("system")))
print(len(self.raw_dataset_mix1.__getitem__("tools")))
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("prompt")), 40)
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("response")), 40)
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("system")), 40)
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("tools")), 40)

    def test_build_dataset_mix2(self):
        """
        Test the raw_dataset: check the number of columns and rows.
            outputs["prompt"] = prompt
            outputs["response"] = response
            outputs["system"].append(sample[dataset_attr.system] if dataset_attr.system else "")
            outputs["tools"].append("")
        """
        print("----------------test_build_dataset_mix2--------------------------")
        print(len(self.raw_dataset_mix2.__getitem__("prompt")))
        print(len(self.raw_dataset_mix2.__getitem__("response")))
        print(len(self.raw_dataset_mix2.__getitem__("system")))
        print(len(self.raw_dataset_mix2.__getitem__("tools")))
        self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("prompt")), 40)
        self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("response")), 40)
        self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("system")), 40)
        self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("tools")), 40)

    def test_build_dataset(self):
        """
        Test the raw_dataset: check the number of columns and rows.
            outputs["prompt"] = prompt
            outputs["response"] = response
            outputs["system"].append(sample[dataset_attr.system] if dataset_attr.system else "")
            outputs["tools"].append("")
        """
        self.assertEqual(len(self.raw_dataset.__getitem__("prompt")), 62981)
        self.assertEqual(len(self.raw_dataset.__getitem__("response")), 62981)
        self.assertEqual(len(self.raw_dataset.__getitem__("system")), 62981)
        self.assertEqual(len(self.raw_dataset.__getitem__("tools")), 62981)

    def test_get_dataset_handler(self):
        """
        Test that the right data handler is returned for instruction data.
        """
        self.assertIsInstance(self.handler, LlamaFactoryInstructionHandler)

    def test_serialize_to_disk(self):
        """
        Test that serialization generates the .bin/.idx files and that their total size (in MB) is as expected.
        """
        self.handler.serialize_to_disk()
        folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
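        # instruction_data_param[7] is assumed to be the output prefix (a path ending in "/alpaca"),
        # so stripping "/alpaca" yields the directory that holds the generated files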
        bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
        idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
        total_size = 0
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                total_size += os.path.getsize(file_path)
        self.assertEqual(len(bin_file), 3)
        self.assertEqual(len(idx_file), 3)
        self.assertAlmostEqual((total_size / (1024 * 1024)), 111, delta=1)


if __name__ == "__main__":
    unittest.main()

View File

@ -1,52 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
import json
import os
from pathlib import Path
from dataclasses import dataclass


@dataclass
class ParamConfig:
    """
    We can configure the params in the `.json` file, including:
        convert_ckpt_param,
        network_size,
        tokenizer_param,
        distributed_param,
        inference_param,
        evaluation_param,
        and other auxiliary_param.
    """
    base_dir = Path(__file__).absolute().parent
    param_config = os.path.join(base_dir, "param_config.json")
    with open(param_config) as f:
        config_file = json.load(f)

    network_size = config_file["NETWORK_SIZE"]
    tokenizer_param = config_file["TOKENIZER_PARAM"]
    distributed_param = config_file["DISTRIBUTED_PARAM"]
    distributed_param_tp8_pp1 = config_file["DISTRIBUTED_PARAM_TP8_PP1"]
    auxiliary_param = config_file["AUXILIARY_PARAM"]
    instruction_param = config_file["INSTRUCTION_PARAM"]
    output_param = config_file["OUTPUT_PARAM"]

    # preprocess instruction data
    instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
    instruction_data_mix_param1 = config_file["PROCESS_INSTRUCTION_DATA_MIX1"]
    instruction_data_mix_param2 = config_file["PROCESS_INSTRUCTION_DATA_MIX2"]

    # inference
    inference_param = config_file["INFERENCE_PARAM"]
    beam_search_auxliary_param = config_file["BEAM_SEARCH_AUXILIARY_PARAM"]
    greedy_search_auxliary_param = config_file["GREEDY_SEARCH_AUXILIARY_PARAM"]
    do_sample_auxliary_param = config_file["DO_SAMPLE_AUXILIARY_PARAM"]
    beam_search_with_sampling_auxliary_param = config_file["BEAM_SEARCH_WITH_SAMPLING_AUXILIARY_PARAM"]
    return_output_log_probs_auxliary_param = config_file["RETURN_OUTPUT_LOG_PROBS_AUXILIARY_PARAM"]


def assert_judge(expression):
    if not expression:
        raise AssertionError

View File

@ -11,7 +11,7 @@ import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel

View File

@ -6,7 +6,7 @@ import sys
import os
import torch
import torch_npu
from tests.common import DistributedTest
from tests.test_tools.dist_test import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel

View File

@ -1,69 +0,0 @@
{
    "CONVERT_CKPT_PARAM": [
        "--model-type", "GPT",
        "--loader", "llama2_hf",
        "--saver", "megatron",
        "--load-dir", "/home/dataset/yi-34B-hf",
        "--save-dir", "/home/dataset/yi-34B-mt-t8p1",
        "--target-tensor-parallel-size", "8",
        "--target-pipeline-parallel-size", "1",
        "--tokenizer-model", "None"
    ],
    "NETWORK_SIZE": [
        "--num-layers", "60",
        "--hidden-size", "7168",
        "--ffn-hidden-size", "20480",
        "--num-attention-heads", "56",
        "--max-position-embeddings", "4096",
        "--position-embedding-type", "rope",
        "--make-vocab-size-divisible-by", "1",
        "--normalization", "RMSNorm",
        "--swiglu",
        "--untie-embeddings-and-output-weights",
        "--load", "/home/dataset/yi-34B-mt-t8p1"
    ],
    "TOKENIZER_PARAM": [
        "--tokenizer-type", "PretrainedFromHF",
        "--tokenizer-name-or-path", "/home/dataset/yi-34B-hf"
    ],
    "DISTRIBUTED_PARAM": [
        "--tensor-model-parallel-size", "8",
        "--pipeline-model-parallel-size", "1"
    ],
    "INFERENCE_PARAM": [
        "--max-new-tokens", "256",
        "--tokenizer-not-use-fast",
        "--exit-on-missing-checkpoint",
        "--attention-softmax-in-fp32"
    ],
    "EVALUATION_PARAM": [
        "--tokenizer-not-use-fast",
        "--task-data-path", "/home/dataset/eval_dataset/mmlu/test",
        "--task", "mmlu",
        "--max-new-tokens", "1",
        "--exit-on-missing-checkpoint"
    ],
    "AUXILIARY_PARAM": [
        "--micro-batch-size", "1",
        "--global-batch-size", "16",
        "--no-masked-softmax-fusion",
        "--disable-bias-linear",
        "--no-gradient-accumulation-fusion",
        "--bf16",
        "--seed", "42",
        "--use-fused-rmsnorm",
        "--group-query-attention",
        "--no-load-optim",
        "--no-load-rng",
        "--seq-length", "4096",
        "--num-query-groups", "8",
        "--vocab-size", "64000",
        "--rotary-base", "5000000"
    ]
}

View File

@ -1,59 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch
import modellink


class TestConvertCkptFromHuggingface(unittest.TestCase):

    def setUp(self, config=ParamConfig):
        # configure params, the index starts from 1
        self.config = config
        sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param

    def test_file_exsit(self):
        """
        Test whether the files in `--load-dir` exist, including `.bin`, `.json`, ...
        """
        bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.bin"))
        self.assertEqual(len(bin_file), 7)
        self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "pytorch_model.bin.index.json")))

    def test_convert_weights_form_huggingface(self):
        """
        Test whether the converted weights in `--save-dir` look as expected. We check the model layer names,
        including embedding, final_norm, output and encoder. In the encoder, several different sub-layers
        compose a transformer layer, and these layers stack to form the whole model.
        """
        base_dir = Path(__file__).absolute().parent.parent.parent.parent
        file_path = os.path.join(base_dir, "convert_ckpt.py")
        arguments = sys.argv[1:]
        subprocess.run(["python", file_path] + arguments)
        output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
        weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
        weight_common_content = weight_content['model']['language_model']  # extract common content
        # embedding, encoder and output_layer are the three top-level modules
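        # convert_ckpt_param[11] is the --target-tensor-parallel-size value ("8" in the config above),
        # so iter_0000001 should contain one mp_rank_* directory per tensor-parallel rank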
        self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
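        # the checked shapes are per-rank slices under TP=8: 64000/8 = 8000 embedding rows,
        # qkv rows = (56/8 query heads + 2 * 8/8 kv heads) * 128 head_dim = 1152, and the swiglu MLP
        # gives 2 * 20480/8 = 5120 rows in dense_h_to_4h and 20480/8 = 2560 columns in dense_4h_to_h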
        self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([8000, 7168]))
        self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([7168]))
        # the encoder has a common final_norm, and each layer has the following six sub-layers
        weight_common_content['encoder'].pop('final_norm.weight')
        self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1152, 7168]))
        self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([7168, 896]))
        self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([5120, 7168]))
        self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([7168, 2560]))
        self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([7168]))
        self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([7168]))
        self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([8000, 7168]))


if __name__ == "__main__":
    unittest.main()

View File

@ -1,94 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args


class TestEvaluation(DistributedTest):
    world_size = 8

    def init(self, config=ParamConfig):
        sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
                   config.evaluation_param + config.tokenizer_param
        from megatron.training.initialize import initialize_megatron
        os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
        initialize_megatron(extra_args_provider=add_text_generate_args,
                            args_defaults={'no_load_rng': True,
                                           'no_load_optim': True})
        from megatron.training import get_args
        self.args = get_args()

    def test_mmlu_evaluation(self):
        self.init(config=ParamConfig)
        from evaluation import model_provider
        from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
        model = GPTModel.from_pretrained(
            model_provider=model_provider,
            pretrained_name_or_path=self.args.load
        )
        tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
        max_new_tokens = self.args.max_new_tokens
        instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
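        # note: the template has no {subject} placeholder, so the subject keyword passed to
        # str.format() below is accepted but ignored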
        total_acc_n = 0
        total_n = 0
        test_dir = None
        for path in self.args.task_data_path:
            if "mmlu" in path:
                test_dir = path
        base_dir = Path(__file__).absolute().parent.parent.parent.parent
        template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
        with open(template_dir, encoding='utf-8') as f:
            mmlu_few_shot_template = json.load(f)
        for file in tqdm.tqdm(os.listdir(test_dir)):
            file_path = os.path.join(test_dir, file)
            data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
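            # MMLU test files are named like "<subject>_test.csv"; dropping the trailing
            # 9 characters ("_test.csv") leaves the subject name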
            subject_name = file[0: -9]
            subject = subject_name.replace("_", " ")
            acc_n = 0
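            # only the first 10 questions of each subject are evaluated, presumably to keep the run short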
            data_df_test = data_df[0:10]
            for index, row in data_df_test.iterrows():
                test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
                instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
                                                          subject=subject,
                                                          question=test_question)
                chat_result = model.generate(
                    instruction,
                    do_sample=False,
                    max_new_tokens=max_new_tokens,
                    tokenizer=tokenizer,
                    stream=False,
                    return_output_log_probs=True
                )
                assert_judge(isinstance(chat_result, tuple))
                assert_judge(isinstance(chat_result[1], torch.Tensor))
                answer = None
                if chat_result:
                    answer = chat_result[0].strip()
                if answer == row['answer']:
                    acc_n += 1
            if torch.distributed.get_rank() == 0:
                total_n += len(data_df_test)
                total_acc_n += acc_n
        if torch.distributed.get_rank() == 0:
            try:
                final_acc = total_acc_n / total_n
            except ZeroDivisionError as e:
                raise e
            print(final_acc)
            assert_judge(abs(final_acc - 0.803) < 0.01)

Some files were not shown because too many files have changed in this diff.