Mirror of https://gitee.com/ascend/ModelLink.git, synced 2024-12-02 03:48:24 +08:00
!1640 CI gate and pipeline rework, part 3: consolidate the tooling code framework and reserve a pipeline sample case
Merge pull request !1640 from guoxinjie/master
This commit is contained in:
parent a2552c6a49
commit 9e590514e9
@@ -1,8 +1,46 @@
import os
import stat
import sys
from pathlib import Path
import subprocess


def read_files_from_txt(txt_file):
    with open(txt_file, "r") as f:
        return [line.strip() for line in f.readlines()]


def is_examples(file):
    return file.startswith("example/")


def is_pipecase(file):
    return file.startswith("tests/pipeline")


def is_markdown(file):
    return file.endswith(".md")


def skip_ci_file(files, skip_cond):
    for file in files:
        if not any(condition(file) for condition in skip_cond):
            return False
    return True


def alter_skip_ci():
    parent_dir = Path(__file__).absolute().parents[2]
    raw_txt_file = os.path.join(parent_dir, "modify.txt")

    if not os.path.exists(raw_txt_file):
        return False

    file_list = read_files_from_txt(raw_txt_file)
    skip_conds = [
        is_examples,
        is_pipecase,
        is_markdown
    ]

    return skip_ci_file(file_list, skip_conds)


def acquire_exitcode(command):
@@ -46,18 +84,8 @@ class ST_Test:
|
||||
|
||||
st_dir = "st"
|
||||
self.st_shell = os.path.join(
|
||||
test_dir, st_dir, "run.sh"
|
||||
test_dir, st_dir, "st_run.sh"
|
||||
)
|
||||
llama_instruction_shell_file = os.path.join(
|
||||
test_dir, st_dir, "test_llama_instruction_ptd.sh")
|
||||
llama_pretrain_ha_save_shell_file = os.path.join(
|
||||
test_dir, st_dir, "test_llama_pretrain_ha_save_ptd.sh")
|
||||
llama_pretrain_ha_load_shell_file = os.path.join(
|
||||
test_dir, st_dir, "test_llama_pretrain_ha_load_ptd.sh")
|
||||
|
||||
self.st_file_list = [
|
||||
llama_instruction_shell_file
|
||||
]
|
||||
|
||||
def run_st(self):
|
||||
rectify_case = f"bash {self.st_shell}"
|
||||
@@ -65,24 +93,22 @@ class ST_Test:
|
||||
if rectify_code != 0:
|
||||
print("rectify case failed, check it.")
|
||||
exit(1)
|
||||
all_success = True
|
||||
for shell_file in self.st_file_list:
|
||||
command = f"sh {shell_file}"
|
||||
st_exitcode = acquire_exitcode(command)
|
||||
if st_exitcode != 0:
|
||||
all_success = False
|
||||
print(f"ST run {shell_file} failed")
|
||||
exit(1)
|
||||
|
||||
if all_success:
|
||||
print("ST test success")
|
||||
else:
|
||||
print("ST failed")
|
||||
exit(1)
|
||||
|
||||
def run_tests():
|
||||
ut = UT_Test()
|
||||
st = ST_Test()
|
||||
|
||||
ut.run_ut()
|
||||
st.run_st()
|
||||
|
||||
|
||||
def main():
|
||||
if alter_skip_ci():
|
||||
print("Skipping CI")
|
||||
else:
|
||||
run_tests()
|
||||
|
||||
if __name__ == "__main__":
|
||||
ut = UT_Test()
|
||||
ut.run_ut()
|
||||
st = ST_Test()
|
||||
st.run_st()
|
||||
main()
|
||||
|
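A minimal sketch of how the skip-CI helpers above behave, assuming `modify.txt` lists one changed file path per line and the helpers are in scope; the file names below are hypothetical. CI is skipped only when every changed file matches at least one skip condition.

```python
# Illustrative only: hypothetical changed-file list, not part of the commit.
changed_files = [
    "example/llama2/pretrain_llama2_7b_ptd.sh",   # matches is_examples
    "tests/pipeline/Aquila2-7B/st_run.sh",        # matches is_pipecase
    "README.md",                                  # matches is_markdown
]
skip_conds = [is_examples, is_pipecase, is_markdown]

# Every file matches a skip condition, so CI can be skipped.
assert skip_ci_file(changed_files, skip_conds) is True

# One source change breaks the condition, so CI must run.
assert skip_ci_file(changed_files + ["modellink/model.py"], skip_conds) is False
```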
@@ -1,124 +0,0 @@
|
||||
# GPT3 $\color{black}{\bf\tiny{【社区贡献模型】}}$
|
||||
|
||||
<p align="left">
|
||||
<b>简体中文</b> |
|
||||
<b><a href="README.md">English</a> </b>
|
||||
</p>
|
||||
|
||||
# 目录
|
||||
|
||||
- [GPT3](#GPT3)
|
||||
- [目录](#目录)
|
||||
- [GPT3-175B](#GPT3-175B)
|
||||
- [训练-175B](#训练)
|
||||
- [脚本](#脚本)
|
||||
|
||||
# GPT3-175B
|
||||
|
||||
## 训练
|
||||
|
||||
GPT3-175B 训练的硬件配置:
|
||||
|
||||
| 硬件 | 配置 |
|
||||
| :--: | :-------------: |
|
||||
| NPU | 128 x Ascend NPUs |
|
||||
|
||||
### 脚本
|
||||
|
||||
1. 克隆仓库到本地服务器:
|
||||
|
||||
```shell
|
||||
git clone https://gitee.com/ascend/ModelLink.git
|
||||
git clone https://github.com/NVIDIA/Megatron-LM.git
|
||||
cd Megatron-LM
|
||||
git checkout core_r0.6.0
|
||||
cp -r megatron ../ModelLink/
|
||||
cd ..
|
||||
cd ModelLink
|
||||
mkdir logs
|
||||
mkdir vocab_file
|
||||
mkdir dataset
|
||||
```
|
||||
|
||||
2. 搭建环境
|
||||
|
||||
```bash
|
||||
# python3.8
|
||||
conda create -n test python=3.8
|
||||
conda activate test
|
||||
|
||||
# 安装 torch 和 torch_npu
|
||||
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
|
||||
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
|
||||
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
|
||||
|
||||
# 修改 ascend-toolkit 路径
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
# 安装 MindSpeed
|
||||
git clone https://gitee.com/ascend/MindSpeed.git
|
||||
cd MindSpeed
|
||||
git checkout 2b0edd2
|
||||
pip install -r requirements.txt
|
||||
pip3 install -e .
|
||||
cd ..
|
||||
|
||||
# 安装其他依赖
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. 准备数据、词表来拉起模型
|
||||
3.1 准备数据
|
||||
|
||||
可以从 [这里](https://huggingface.co/datasets/wikipedia/tree/main/data/20220301.en) 下载原始数据
|
||||
```shell
|
||||
# 下载 enwiki 数据
|
||||
# 总共有 41 个文件,我们可以选择部分来制作数据
|
||||
cd ./dataset
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00000-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00001-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00002-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00003-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00004-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00005-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00006-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00007-of-00041.parquet
|
||||
cd ..
|
||||
|
||||
# 下载 vocab file 和 merge table
|
||||
cd vocab_file
|
||||
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
|
||||
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
|
||||
cd ..
|
||||
|
||||
# 处理成训练数据
|
||||
python ./preprocess_data.py \
|
||||
--input ./dataset/ \
|
||||
--output-prefix ./dataset/gpt_text_sentence \
|
||||
--tokenizer-type GPT2BPETokenizer \
|
||||
--vocab-file ./vocab_file/gpt2-vocab.json \
|
||||
--merge-file ./vocab_file/gpt2-merges.txt \
|
||||
--append-eod \
|
||||
--workers 4 \
|
||||
--log-interval 1000
|
||||
```
|
||||
|
||||
3.2 用 ptd 模式进行预训练
|
||||
配置 GPT3-175B PTD 预训练脚本: examples/gpt3/pretrain_gpt3_175B_ptd.sh
|
||||
|
||||
```shell
|
||||
# 请根据真实情况配置 ascend-toolkit 路径
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
# 请根据真实存放路径配置以下参数
|
||||
VOCAB_FILE="./vocab_file/gpt2-vocab.json" # 词表
|
||||
MERGE_FILE="./vocab_file/gpt2-merges.txt" # BPE 合并表
|
||||
DATA_PATH="./dataset/gpt_text_sentence" # 数据路径
|
||||
```
|
||||
|
||||
拉起 GPT3-175B PTD 预训练脚本: examples/gpt3/pretrain_gpt3_175B_ptd.sh
|
||||
|
||||
```shell
|
||||
bash examples/gpt3/pretrain_gpt3_175B_ptd.sh
|
||||
```
|
||||
|
@@ -1,124 +0,0 @@
|
||||
# GPT3 $\color{black}{\rm\tiny{【model}}$ $\color{black}{\rm\tiny{contributed}}$ $\color{black}{\rm\tiny{by}}$ $\color{black}{\rm\tiny{Community】}}$
|
||||
|
||||
<p align="left">
|
||||
<b>English</b> |
|
||||
<b><a href="README_en.md">English</a> </b>
|
||||
</p>
|
||||
|
||||
# Contents
|
||||
|
||||
- [GPT3](#GPT3)
|
||||
- [Contents](#contents)
|
||||
- [GPT3-175B](#GPT3-175B)
|
||||
- [Training-175B](#training)
|
||||
- [Script](#script)
|
||||
|
||||
# GPT3-175B
|
||||
|
||||
## Training
|
||||
|
||||
Here is a hardware summary for pre-training GPT3-175B:
|
||||
|
||||
| Hardware | Value |
|
||||
| :--: | :-------------: |
|
||||
| NPU | 128 x Ascend NPUs |
|
||||
|
||||
### Script
|
||||
|
||||
1. Clone repository to your local server:
|
||||
|
||||
```shell
|
||||
git clone https://gitee.com/ascend/ModelLink.git
|
||||
git clone https://github.com/NVIDIA/Megatron-LM.git
|
||||
cd Megatron-LM
|
||||
git checkout core_r0.6.0
|
||||
cp -r megatron ../ModelLink/
|
||||
cd ..
|
||||
cd ModelLink
|
||||
mkdir logs
|
||||
mkdir vocab_file
|
||||
mkdir dataset
|
||||
```
|
||||
|
||||
2. Build environment
|
||||
|
||||
```bash
|
||||
# python3.8
|
||||
conda create -n test python=3.8
|
||||
conda activate test
|
||||
|
||||
# install torch and torch_npu
|
||||
pip install torch-2.1.0-cp38-cp38m-manylinux2014_aarch64.whl
|
||||
pip install torch_npu-2.1.0*-cp38-cp38m-linux_aarch64.whl
|
||||
pip install apex-0.1_ascend*-cp38-cp38m-linux_aarch64.whl
|
||||
|
||||
# modify ascend-toolkit path
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
# install MindSpeed
|
||||
git clone https://gitee.com/ascend/MindSpeed.git
|
||||
cd MindSpeed
|
||||
git checkout 2b0edd2
|
||||
pip install -r requirements.txt
|
||||
pip3 install -e .
|
||||
cd ..
|
||||
|
||||
# install other packages
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. Prepare dataset and vocab file for pretrain
|
||||
3.1 Prepare dataset
|
||||
|
||||
Download the GPT raw dataset from [here](https://huggingface.co/datasets/wikipedia/tree/main/data/20220301.en)
|
||||
```shell
|
||||
# download enwiki raw data
|
||||
# There are 41 files in total; we can select just a subset to build our dataset.
|
||||
cd ./dataset
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00000-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00001-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00002-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00003-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00004-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00005-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00006-of-00041.parquet
|
||||
wget https://huggingface.co/datasets/wikipedia/blob/main/data/20220301.en/train-00007-of-00041.parquet
|
||||
cd ..
|
||||
|
||||
# download vocab file and merge table
|
||||
cd vocab_file
|
||||
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
|
||||
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
|
||||
cd ..
|
||||
|
||||
# process formal dataset
|
||||
python ./preprocess_data.py \
|
||||
--input ./dataset/ \
|
||||
--output-prefix ./dataset/gpt_text_sentence \
|
||||
--tokenizer-type GPT2BPETokenizer \
|
||||
--vocab-file ./vocab_file/gpt2-vocab.json \
|
||||
--merge-file ./vocab_file/gpt2-merges.txt \
|
||||
--append-eod \
|
||||
--workers 4 \
|
||||
--log-interval 1000
|
||||
```
|
||||
|
||||
3.2 pre-training in ptd mode
|
||||
Config GPT3-175B PTD pre-training script: examples/gpt3/pretrain_gpt3_175B_ptd.sh
|
||||
|
||||
```shell
|
||||
# modify ascend-toolkit path according to your own config
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
|
||||
# modify config according to your own actual situation
|
||||
VOCAB_FILE="./vocab_file/gpt2-vocab.json" # vocab file for training
|
||||
MERGE_FILE="./vocab_file/gpt2-merges.txt" # BPE merge file for training
|
||||
DATA_PATH="./dataset/gpt_text_sentence" # dataset path
|
||||
```
|
||||
|
||||
Launch GPT3-175B PTD pre-training script: examples/gpt3/pretrain_gpt3_175B_ptd.sh
|
||||
|
||||
```shell
|
||||
bash examples/gpt3/pretrain_gpt3_175B_ptd.sh
|
||||
```
|
||||
|
@@ -84,6 +84,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
|
||||
$GPT_ARGS \
|
||||
$DATA_ARGS \
|
||||
$OUTPUT_ARGS \
|
||||
--jit-compile \
|
||||
--distributed-backend nccl 2>&1
|
||||
--distributed-backend nccl \
|
||||
| tee ./logs/pretrain_gpt3_175B_8layers.log
|
||||
|
@@ -85,6 +85,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
|
||||
$GPT_ARGS \
|
||||
$DATA_ARGS \
|
||||
$OUTPUT_ARGS \
|
||||
--jit-compile \
|
||||
--distributed-backend nccl 2>&1
|
||||
--distributed-backend nccl \
|
||||
| tee ./logs/pretrain_gpt3_175B.log
|
||||
|
@@ -41,3 +41,30 @@ def pytest_fixture_setup(fixturedef, request):
    if getattr(fixturedef.func, "is_dist_fixture", False):
        dist_fixture_class = fixturedef.func()
        dist_fixture_class(request)


# We still want to configure the path arguments ourselves, because different
# scripts use a different prefix_name, so we use this method.
# As you can see, it also gives us more scalability.
def pytest_addoption(parser: pytest.Parser):
    parser.addoption("--baseline-json", action="store", default=None,
                     help="Path to the baseline JSON file")
    parser.addoption("--generate-log", action="store", default=None,
                     help="Path to the generated log file")
    parser.addoption("--generate-json", action="store", default=None,
                     help="Path to the generated JSON file")


@pytest.fixture(autouse=True)
def baseline_json(request: pytest.FixtureRequest):
    return request.config.getoption("--baseline-json")


@pytest.fixture(autouse=True)
def generate_log(request: pytest.FixtureRequest):
    return request.config.getoption("--generate-log")


@pytest.fixture(autouse=True)
def generate_json(request: pytest.FixtureRequest):
    return request.config.getoption("--generate-json")
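A minimal sketch of how a pipeline test could consume the fixtures above; the test file name, function name and option values are hypothetical, not part of the commit.

```python
# Hypothetical test, e.g. tests/pipeline/some_model/test_compare.py (illustrative only).
# Run as: pytest -s test_compare.py --baseline-json=baseline.json --generate-json=generate.json
import json


def test_results_match_baseline(baseline_json, generate_json):
    # The fixtures defined in conftest.py hand the CLI option values straight to the test.
    if baseline_json is None or generate_json is None:
        return  # options not provided; nothing to compare
    with open(baseline_json) as f:
        baseline = json.load(f)
    with open(generate_json) as f:
        generated = json.load(f)
    # Compare the generated metrics against the recorded baseline keys.
    assert set(baseline.keys()) == set(generated.keys())
```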
@@ -1,6 +0,0 @@
#!/bin/bash
# Provide uniform access for the pipeline.

python tests/pipeline/Aquila2-7B/test_convert_weight_from_huggingface.py
pytest -s tests/pipeline/Aquila2-7B/test_generation.py
pytest -s tests/pipeline/Aquila2-7B/test_evaluation.py
@@ -1,60 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "32",
|
||||
"--hidden-size", "4096",
|
||||
"--ffn-hidden-size", "11008",
|
||||
"--num-attention-heads", "32",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "1",
|
||||
"--max-position-embeddings", "2048",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--load", "/home/dataset/Aquila2-7B-tp8-pp1/"
|
||||
],
|
||||
|
||||
"TOKENIZER_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/Aquila2-7B-hf"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "512",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--tokenizer-not-use-fast",
|
||||
"--task-data-path", "/home/dataset/eval_dataset/boolq/test",
|
||||
"--task", "boolq",
|
||||
"--max-new-tokens", "1"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "8",
|
||||
"--global-batch-size", "64",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "2048"
|
||||
],
|
||||
|
||||
"CONVERT_WEIGHT_PARAM": [
|
||||
"--model-type", "GPT",
|
||||
"--load-dir", "/home/dataset/Aquila2-7B-hf",
|
||||
"--save-dir", "/home/dataset/Aquila2-7B-tp8-pp1",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--tokenizer-model", "/home/dataset/Aquila2-7B-hf/tokenizer.json"
|
||||
]
|
||||
}
|
@@ -1,64 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from utils import ParamConfig
|
||||
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# configure params, the index starts from 1
|
||||
sys.argv = [sys.argv[0]] + ParamConfig.convert_weight_param
|
||||
|
||||
def test_file_exsit(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
|
||||
"""
|
||||
bin_file = glob.glob(os.path.join(sys.argv[4], "*.bin"))
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertTrue(os.path.exists(os.path.join(sys.argv[4], "pytorch_model.bin.index.json")))
|
||||
|
||||
def test_convert_weights_form_huggingface(self):
|
||||
"""
|
||||
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. In the encoder, several sub-layers
compose each transformer layer, and these layers stack to form the whole model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(sys.argv[6], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model']  # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three outer layers.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(sys.argv[12]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(),
|
||||
torch.Size([12501, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
# the encoder has a common final_norm, and each transformer layer has the following six weights
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(),
|
||||
torch.Size([1536, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(),
|
||||
torch.Size([4096, 512]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(),
|
||||
torch.Size([2752, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(),
|
||||
torch.Size([4096, 1376]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(),
|
||||
torch.Size([4096]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([12501, 4096]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,100 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import torch
|
||||
import tqdm
|
||||
import torch_npu
|
||||
from utils import ParamConfig, assert_judge
|
||||
from transformers import AutoTokenizer
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from tests.common import DistributedTest
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
class TestEvaluation(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.evaluation_param + config.auxiliary_param + config.tokenizer_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def get_result(self, tokenizer, result):
|
||||
if result:
|
||||
final_result = [result[0]]
|
||||
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
|
||||
final_result.append("T")
|
||||
else:
|
||||
final_result.append("F")
|
||||
else:
|
||||
final_result = None
|
||||
return final_result
|
||||
|
||||
def test_boolq_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, trust_remote_code=True)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{passage}\nQuestion: {question}?\nAnswer:"
|
||||
|
||||
answer_result = {}
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
test_dir = None
|
||||
for path in self.args.task_data_path:
|
||||
if "boolq" in path:
|
||||
test_dir = path
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
with open(file_path, encoding='utf-8') as f:
|
||||
boolq_question_list = []
|
||||
for line in f.readlines():
|
||||
boolq_question_list.append(json.loads(line))
|
||||
boolq_question_list = boolq_question_list[:60]
|
||||
subject_result = {}
|
||||
acc_n = 0
|
||||
for index, item in enumerate(boolq_question_list):
|
||||
instruction = instruction_template.format(passage=item['passage'], question=item['question'])
|
||||
result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
result = self.get_result(tokenizer, result)
|
||||
if result:
|
||||
answer = result[1]
|
||||
else:
|
||||
answer = None
|
||||
try:
|
||||
if torch.distributed.get_rank() == 0:
|
||||
subject_result[str(index)] = answer
|
||||
if subject_result[str(index)] == str(item['answer'])[0]:
|
||||
acc_n += 1
|
||||
except Exception as e:
|
||||
raise e
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(boolq_question_list)
|
||||
total_acc_n += acc_n
|
||||
answer_result['Boolq_dataset'] = subject_result
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
print(final_acc)
|
||||
assert_judge(abs(final_acc - 0.83) < 0.03)
|
@@ -1,93 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
from tests.common import DistributedTest
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.inference_param + config.auxiliary_param + config.tokenizer_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["解释一下“温故而知新”"]
|
||||
output = model.generate(instruction, detokenize=False)
|
||||
expected_output = [14727, 29728, 261, 22051, 1156, 3101, 422, 278, 0, 87,
|
||||
18392, 221, 1906, 358, 132, 237, 79, 221, 261, 81,
|
||||
14572, 2449, 2369, 72, 8022, 2449, 221, 261, 88, 14572]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
|
||||
output[:20].unsqueeze(0).float())
|
||||
print(cos_sim)
|
||||
assert_judge(cos_sim > 0.80)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
prompt = "解释一下“温故而知新”"
|
||||
system_template = ""
|
||||
dialog_template = "{instruction}"
|
||||
template = system_template + dialog_template
|
||||
instruction = template.format(instruction=prompt)
|
||||
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False
|
||||
)
|
||||
expected_output = [391, 426, 1774, 1906, 8627, 10281, 261, 36018, 837, 42310,
|
||||
434, 10045, 6468, 29728, 278, 0, 43652, 43652, 1774, 1906,
|
||||
28043, 609, 43652, 43652, 1774, 1906, 8627, 10281, 261, 36018]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
|
||||
output[:20].unsqueeze(0).float())
|
||||
print(cos_sim)
|
||||
assert_judge(cos_sim > 0.80)
|
@@ -1,25 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
evaluation_param = config_file["EVALUATION_PARAM"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
tokenizer_param = config_file["TOKENIZER_PARAM"]
|
||||
convert_weight_param = config_file["CONVERT_WEIGHT_PARAM"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@@ -1,10 +0,0 @@
#!/bin/bash
# Provide uniform access for the pipeline.

python tests/pipeline/baichuan-13B/test_process_pretrain_data.py
python tests/pipeline/baichuan-13B/test_process_instruction_data.py
python tests/pipeline/baichuan-13B/test_convert_weight_from_huggingface.py
pytest -s tests/pipeline/baichuan-13B/test_generation.py
pytest -s tests/pipeline/baichuan-13B/test_evaluation.py
pytest -s tests/pipeline/baichuan-13B/test_lora.py
pytest -s tests/pipeline/baichuan-13B/test_trainer.py
@@ -1,136 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "40",
|
||||
"--hidden-size", "5120",
|
||||
"--ffn-hidden-size", "13696",
|
||||
"--num-attention-heads", "40",
|
||||
"--position-embedding-type", "alibi",
|
||||
"--make-vocab-size-divisible-by", "64",
|
||||
"--max-position-embeddings", "4096",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--load", "/home/dataset/baichuan-13B-tp8-pp1/"
|
||||
],
|
||||
|
||||
"TOKENIZER_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/baichuan-13B-hf"
|
||||
],
|
||||
|
||||
"PROCESS_DATA_INPUT_PATH": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet"
|
||||
],
|
||||
|
||||
"PROCESS_PRETRAIN_DATA_PARAM": [
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-baichuan-13B/alpaca",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA_PARAM": [
|
||||
"--output-prefix", "/home/dataset/tune-dataset-baichuan-13B/alpaca",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--handler-name", "GeneralInstructionHandler",
|
||||
"--append-eod"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--tokenizer-not-use-fast",
|
||||
"--task-data-path", "/home/dataset/eval_dataset/boolq/test",
|
||||
"--task", "boolq",
|
||||
"--max-new-tokens", "1"
|
||||
],
|
||||
|
||||
"LORA_PARAM": [
|
||||
"--finetune",
|
||||
"--lora-r", "16",
|
||||
"--lora-alpha", "32",
|
||||
"--lora-target-modules", "query_key_value", "dense", "gate_proj","dense_h_to_4h", "dense_4h_to_h",
|
||||
"--is-instruction-dataset",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--save", "/home/dataset/lora-save-weight-baichuan-13B",
|
||||
"--data-path", "/home/dataset/tune-dataset-baichuan-13B/alpaca",
|
||||
"--train-iters", "10"
|
||||
],
|
||||
|
||||
"LORA_INFERENCE_PARAM": [
|
||||
"--lora-r", "16",
|
||||
"--lora-alpha", "32",
|
||||
"--lora-target-modules", "query_key_value", "dense", "gate_proj","dense_h_to_4h", "dense_4h_to_h",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--lora-load", "/home/dataset/lora-save-weight-baichuan-13B"
|
||||
],
|
||||
"TRAINING_PARAM": [
|
||||
"--tokenizer-type", "Llama2Tokenizer",
|
||||
"--tokenizer-model", "/home/dataset/baichuan-13B-hf/tokenizer.model",
|
||||
"--save", "/autotest/dataset/save-weight-baichuan-13B",
|
||||
"--data-path", "/home/dataset/pretrain-dataset-baichuan-13B/alpaca_text_document",
|
||||
"--train-iters", "10"
|
||||
],
|
||||
|
||||
"REGULARIZATION": [
|
||||
"--attention-dropout", "0.0",
|
||||
"--hidden-dropout", "0.0",
|
||||
"--weight-decay", "1e-1",
|
||||
"--clip-grad", "1.0",
|
||||
"--adam-beta1", "0.9",
|
||||
"--adam-beta2", "0.95",
|
||||
"--adam-eps","1.0e-5"
|
||||
],
|
||||
|
||||
"LEARNING_RATE": [
|
||||
"--lr", "1e-5",
|
||||
"--lr-decay-style", "cosine",
|
||||
"--min-lr", "1e-7"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "1",
|
||||
"--global-batch-size", "8",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--fp16",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "4096",
|
||||
"--seed", "42"
|
||||
],
|
||||
|
||||
"TRAINING_AUX": [
|
||||
"--sequence-parallel",
|
||||
"--initial-loss-scale", "1024.0",
|
||||
"--use-fused-rmsnorm",
|
||||
"--init-method-std", "0.01",
|
||||
"--split", "100,0,0",
|
||||
"--log-interval", "1",
|
||||
"--save-interval", "10",
|
||||
"--eval-interval", "1000",
|
||||
"--eval-iters", "0",
|
||||
"--distributed-backend", "nccl",
|
||||
"--num-workers", "0"
|
||||
],
|
||||
|
||||
"CONVERT_WEIGHT_PARAM": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--load-dir", "/home/dataset/baichuan-13B-hf",
|
||||
"--save-dir", "/home/dataset/baichuan-13B-tp8-pp1",
|
||||
"--tokenizer-model", "None",
|
||||
"--w-pack", "True"
|
||||
]
|
||||
}
|
@@ -1,85 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
export CUDA_DEVICE_MAX_CONNECTIONS=1
|
||||
export NPU_DETECT=0
|
||||
|
||||
GPUS_PER_NODE=8
|
||||
MASTER_ADDR=localhost
|
||||
MASTER_PORT=6000
|
||||
NNODES=1
|
||||
NODE_RANK=0
|
||||
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
|
||||
|
||||
DATA_PATH="/home/dataset/pretrain-dataset-baichuan-13B/alpaca_text_document"
|
||||
TOKENIZER_MODEL="/home/dataset/baichuan-13B-hf/tokenizer.model"
|
||||
CKPT_LOAD_DIR="/home/dataset/baichuan-13B-tp8-pp1"
|
||||
|
||||
TP=8
|
||||
PP=1
|
||||
|
||||
DISTRIBUTED_ARGS="
|
||||
--nproc_per_node $GPUS_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $NODE_RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT
|
||||
"
|
||||
|
||||
GPT_ARGS="
|
||||
--tensor-model-parallel-size $TP \
|
||||
--pipeline-model-parallel-size $PP \
|
||||
--sequence-parallel \
|
||||
--num-layers 40 \
|
||||
--hidden-size 5120 \
|
||||
--ffn-hidden-size 13696 \
|
||||
--num-attention-heads 40 \
|
||||
--tokenizer-type Llama2Tokenizer \
|
||||
--tokenizer-model $TOKENIZER_MODEL \
|
||||
--seq-length 4096 \
|
||||
--disable-bias-linear \
|
||||
--max-position-embeddings 4096 \
|
||||
--micro-batch-size 1 \
|
||||
--global-batch-size 32 \
|
||||
--untie-embeddings-and-output-weights \
|
||||
--make-vocab-size-divisible-by 64 \
|
||||
--lr 1e-5 \
|
||||
--no-gradient-accumulation-fusion \
|
||||
--load ${CKPT_LOAD_DIR} \
|
||||
--train-iters 2000 \
|
||||
--lr-decay-style cosine \
|
||||
--attention-dropout 0.0 \
|
||||
--position-embedding-type alibi \
|
||||
--hidden-dropout 0.0 \
|
||||
--normalization RMSNorm \
|
||||
--use-fused-rmsnorm \
|
||||
--swiglu \
|
||||
--attention-softmax-in-fp32 \
|
||||
--min-lr 1e-7 \
|
||||
--weight-decay 1e-1 \
|
||||
--clip-grad 1.0 \
|
||||
--adam-beta1 0.9 \
|
||||
--initial-loss-scale 1024.0 \
|
||||
--adam-beta2 0.95 \
|
||||
--adam-eps 1.0e-5 \
|
||||
--no-load-optim \
|
||||
--no-load-rng \
|
||||
--fp16
|
||||
"
|
||||
|
||||
DATA_ARGS="
|
||||
--data-path $DATA_PATH \
|
||||
--split 949,50,1
|
||||
"
|
||||
|
||||
OUTPUT_ARGS="
|
||||
--log-interval 1 \
|
||||
--save-interval 10000 \
|
||||
--eval-interval 10000 \
|
||||
--eval-iters 1
|
||||
"
|
||||
|
||||
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
|
||||
$GPT_ARGS \
|
||||
$DATA_ARGS \
|
||||
$OUTPUT_ARGS \
|
||||
--distributed-backend nccl 2>&1 | tee /home/dataset/new_baichuan-13B.log
|
@@ -1,64 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from utils import ParamConfig
|
||||
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self):
|
||||
# configure params, the index starts from 1
|
||||
sys.argv = [sys.argv[0]] + ParamConfig.convert_weight_param
|
||||
|
||||
def test_file_exsit(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
|
||||
"""
|
||||
bin_file = glob.glob(os.path.join(sys.argv[10], "*.bin"))
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertTrue(os.path.exists(os.path.join(sys.argv[10], "pytorch_model.bin.index.json")))
|
||||
|
||||
def test_convert_weights_form_huggingface(self):
|
||||
"""
|
||||
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. In the encoder, several sub-layers
compose each transformer layer, and these layers stack to form the whole model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(sys.argv[12], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model']  # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three outer layers.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(sys.argv[8]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(),
|
||||
torch.Size([8000, 5120]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([5120]))
|
||||
|
||||
# the encoder has a common final_norm, and each transformer layer has the following six weights
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(),
|
||||
torch.Size([1920, 5120]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(),
|
||||
torch.Size([5120, 640]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(),
|
||||
torch.Size([3424, 5120]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(),
|
||||
torch.Size([5120, 1712]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([5120]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(),
|
||||
torch.Size([5120]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([8000, 5120]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,100 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import torch
|
||||
import tqdm
|
||||
import torch_npu
|
||||
from utils import ParamConfig, assert_judge
|
||||
from transformers import AutoTokenizer
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from tests.common import DistributedTest
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
class TestEvaluation(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.evaluation_param + config.auxiliary_param + config.tokenizer_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def get_result(self, tokenizer, result):
|
||||
if result:
|
||||
final_result = [result[0]]
|
||||
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
|
||||
final_result.append("T")
|
||||
else:
|
||||
final_result.append("F")
|
||||
else:
|
||||
final_result = None
|
||||
return final_result
|
||||
|
||||
def test_boolq_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, trust_remote_code=True)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{passage}\nQuestion: {question}?\nAnswer:"
|
||||
|
||||
answer_result = {}
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
test_dir = None
|
||||
for path in self.args.task_data_path:
|
||||
if "boolq" in path:
|
||||
test_dir = path
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
with open(file_path, encoding='utf-8') as f:
|
||||
boolq_question_list = []
|
||||
for line in f.readlines():
|
||||
boolq_question_list.append(json.loads(line))
|
||||
boolq_question_list = boolq_question_list[:60]
|
||||
subject_result = {}
|
||||
acc_n = 0
|
||||
for index, item in enumerate(boolq_question_list):
|
||||
instruction = instruction_template.format(passage=item['passage'], question=item['question'])
|
||||
result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
result = self.get_result(tokenizer, result)
|
||||
if result:
|
||||
answer = result[1]
|
||||
else:
|
||||
answer = None
|
||||
try:
|
||||
if torch.distributed.get_rank() == 0:
|
||||
subject_result[str(index)] = answer
|
||||
if subject_result[str(index)] == str(item['answer'])[0]:
|
||||
acc_n += 1
|
||||
except Exception as e:
|
||||
raise e
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(boolq_question_list)
|
||||
total_acc_n += acc_n
|
||||
answer_result['Boolq_dataset'] = subject_result
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
print(final_acc)
|
||||
assert_judge(abs(final_acc - 0.71) < 0.03)
|
@@ -1,97 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
from tests.common import DistributedTest
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.inference_param + config.auxiliary_param + config.tokenizer_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["解释一下“温故而知新”"]
|
||||
output = model.generate(instruction, detokenize=False)
|
||||
expected_output = [5, 31694, 31829, 31290, 31356, 31226, 31125, 1231, 31178, 31387,
|
||||
34360, 73, 31106, 5, 31106, 84, 31442, 32369, 85, 31106,
|
||||
5, 31106, 53, 31694, 31143, 31694, 31434, 73, 31106, 5,
|
||||
31106, 54, 31829, 31143, 32363, 31135, 3317, 73, 31106, 5,
|
||||
31106, 55, 31226, 31143, 5916, 3317, 73, 31106, 5, 31106]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
|
||||
output[:20].unsqueeze(0).float())
|
||||
print(cos_sim)
|
||||
assert_judge(cos_sim > 0.80)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
prompt = "解释一下“温故而知新”"
|
||||
system_template = ""
|
||||
dialog_template = "{instruction}"
|
||||
template = system_template + dialog_template
|
||||
instruction = template.format(instruction=prompt)
|
||||
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False
|
||||
)
|
||||
expected_output = [5, 31694, 31829, 31290, 31356, 31226, 31125, 1231, 31178, 31387,
|
||||
34360, 73, 31106, 5, 31106, 84, 31442, 32369, 85, 31106,
|
||||
5, 31106, 53, 31694, 31143, 31694, 31434, 73, 31106, 5,
|
||||
31106, 54, 31829, 31143, 32363, 31135, 3317, 73, 31106, 5,
|
||||
31106, 55, 31226, 31143, 5916, 3317, 73, 31106, 5, 31106]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output)[:20].unsqueeze(0).float().npu(),
|
||||
output[:20].unsqueeze(0).float())
|
||||
print(cos_sim)
|
||||
assert_judge(cos_sim > 0.80)
|
@@ -1,134 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from megatron.core.enums import ModelType
|
||||
from megatron.core.utils import get_model_config
|
||||
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators
|
||||
from tests.common import DistributedTest
|
||||
|
||||
|
||||
class TestLora(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.network_size + config.tokenizer_param \
|
||||
+ config.auxiliary_param + config.lora_param + config.regularization \
|
||||
+ config.learning_rate_param + config.training_aux + config.distributed_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_megatron_lora_module(self):
|
||||
self.init(config=ParamConfig)
|
||||
from megatron.core import tensor_parallel
|
||||
from pretrain_gpt import model_provider
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder
|
||||
)
|
||||
model = model[0]
|
||||
for name, module in model.named_modules():
|
||||
if name.endswith("query_key_value.lora_A.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
if name.endswith("query_key_value.lora_B.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
|
||||
|
||||
if name.endswith("dense.lora_A.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
|
||||
if name.endswith("dense.lora_B.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
|
||||
if name.endswith("dense_h_to_4h.lora_A.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
if name.endswith("dense_h_to_4h.lora_B.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
|
||||
|
||||
if name.endswith("dense_4h_to_h.lora_A.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
|
||||
if name.endswith("dense_4h_to_h.lora_B.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
|
||||
def test_lora(self):
|
||||
self.init(config=ParamConfig)
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step, training_log, save_checkpoint_and_time, num_floating_point_operations
|
||||
from megatron.core import mpu
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder
|
||||
)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
total_loss_dict = {}
|
||||
iteration = self.args.iteration
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
report_memory_flag = True
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
saved_checkpoint = False
|
||||
num_floating_point_operations_so_far = 0
|
||||
while iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
batch_size = mpu.get_data_parallel_world_size() * \
|
||||
self.args.micro_batch_size * \
|
||||
get_num_microbatches()
|
||||
self.args.consumed_train_samples += batch_size
|
||||
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
|
||||
loss_scale = optimizer.get_loss_scale().item()
|
||||
params_norm = None
|
||||
learning_rate = None
|
||||
decoupled_learning_rate = None
|
||||
for param_group in optimizer.param_groups:
|
||||
if param_group['is_decoupled_lr']:
|
||||
decoupled_learning_rate = param_group['lr']
|
||||
else:
|
||||
learning_rate = param_group['lr']
|
||||
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
|
||||
decoupled_learning_rate,
|
||||
iteration, loss_scale,
|
||||
report_memory_flag, skipped_iter,
|
||||
grad_norm, params_norm, num_zeros_in_grad)
|
||||
|
||||
if self.args.save and self.args.save_interval and \
|
||||
iteration % self.args.save_interval == 0:
|
||||
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
|
||||
saved_checkpoint = True
|
||||
if saved_checkpoint:
|
||||
for file_name in os.listdir(self.args.save):
|
||||
file_path = os.path.join(self.args.save, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
assert_judge(file_path.endswith(".txt"))
|
||||
else:
|
||||
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
|
@@ -1,81 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessInstructionData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
sys.argv = [sys.argv[0]] + ParamConfig.tokenizer_param + ParamConfig.process_data_input_path \
|
||||
+ ParamConfig.process_instruction_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test normal function of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eod append
|
||||
...(if anything is missing, feel free to add it)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 64000)
|
||||
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1557, 52, 31141, 31150, 59, 31219])
|
||||
self.assertEqual(self.tokenizer.detokenize(31338), '度')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset; we need to check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test whether we get the right data handler for instruction data.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralInstructionHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the preprocessed output files are generated and are not empty (size checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = sys.argv[8].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertEqual(len(idx_file), 3)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 90, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,81 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessPretrainData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
sys.argv = [sys.argv[0]] + ParamConfig.tokenizer_param + ParamConfig.process_data_input_path \
|
||||
+ ParamConfig.process_pretrain_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test normal function of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eos append
|
||||
...(if anything is missing, feel free to add it)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 64000)
|
||||
self.assertEqual(self.tokenizer.tokenize('bug'), [15498])
|
||||
self.assertEqual(self.tokenizer.detokenize(23961), 'prolong')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset; we need to check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test whether we get the right data handler for pretraining.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralPretrainHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the preprocessed output files are generated and are not empty (size checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = sys.argv[8].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 1)
|
||||
self.assertEqual(len(idx_file), 1)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 25, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,152 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.core.enums import ModelType
|
||||
from megatron.core.utils import get_model_config
|
||||
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
|
||||
from tests.common import DistributedTest
|
||||
|
||||
|
||||
class TestTraining(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.network_size + config.auxiliary_param \
|
||||
+ config.regularization + config.learning_rate_param \
|
||||
+ config.training_aux + config.distributed_param + config.training_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
|
||||
from megatron.core import mpu
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
total_loss_dict = {}
|
||||
iteration = self.args.iteration
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
report_memory_flag = True
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
saved_checkpoint = False
|
||||
num_floating_point_operations_so_far = 0
|
||||
while iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
batch_size = mpu.get_data_parallel_world_size() * \
|
||||
self.args.micro_batch_size * \
|
||||
get_num_microbatches()
|
||||
self.args.consumed_train_samples += batch_size
|
||||
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
|
||||
loss_scale = optimizer.get_loss_scale().item()
|
||||
params_norm = None
|
||||
learning_rate = None
|
||||
decoupled_learning_rate = None
|
||||
for param_group in optimizer.param_groups:
|
||||
if param_group['is_decoupled_lr']:
|
||||
decoupled_learning_rate = param_group['lr']
|
||||
else:
|
||||
learning_rate = param_group['lr']
|
||||
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
|
||||
decoupled_learning_rate,
|
||||
iteration, loss_scale,
|
||||
report_memory_flag, skipped_iter,
|
||||
grad_norm, params_norm, num_zeros_in_grad)
|
||||
|
||||
if self.args.save and self.args.save_interval and \
|
||||
iteration % self.args.save_interval == 0:
|
||||
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
|
||||
saved_checkpoint = True
|
||||
break
|
||||
|
||||
if saved_checkpoint:
|
||||
for file_name in os.listdir(self.args.save):
|
||||
file_path = os.path.join(self.args.save, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
assert_judge(file_path.endswith(".txt"))
|
||||
else:
|
||||
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
|
||||
|
||||
def test_breakpoint_renewal_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
self.args.load = self.args.save
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step
|
||||
if self.args.load == self.args.save:  # we can treat this as a resume-from-checkpoint (breakpoint renewal) training scenario
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
iteration = self.args.iteration
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(f"iteration:{iteration}")
|
||||
assert_judge(iteration == 10)
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
|
||||
if iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
if 'lm loss' in loss_dict.keys():
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(f"loss:{loss_dict['lm loss']}")
|
||||
assert_judge(abs(loss_dict['lm loss'] - 0.97) < 0.1)
|
@@ -1,47 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
"""
|
||||
We can config the params in the `.json` file including:
|
||||
distributed_param,
|
||||
network_size,
|
||||
inference_param,
|
||||
evaluation_param,
|
||||
lora_param,
|
||||
training_param,
|
||||
training_auxiliary,
|
||||
learning_rate,
|
||||
regularization,
|
||||
and other auxiliary_param.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
evaluation_param = config_file["EVALUATION_PARAM"]
|
||||
lora_param = config_file["LORA_PARAM"]
|
||||
training_param = config_file["TRAINING_PARAM"]
|
||||
training_aux = config_file["TRAINING_AUX"]
|
||||
learning_rate_param = config_file["LEARNING_RATE"]
|
||||
regularization = config_file["REGULARIZATION"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
tokenizer_param = config_file["TOKENIZER_PARAM"]
|
||||
process_pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA_PARAM"]
|
||||
process_instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA_PARAM"]
|
||||
process_data_input_path = config_file["PROCESS_DATA_INPUT_PATH"]
|
||||
lora_inference_param = config_file["LORA_INFERENCE_PARAM"]
|
||||
convert_weight_param = config_file["CONVERT_WEIGHT_PARAM"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@@ -1,5 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
python ./tests/pipeline/baichuan-7B/test_process_pretrain_data.py
|
||||
python ./tests/pipeline/baichuan-7B/test_process_instruction_data.py
|
||||
|
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"PROCESS_PRETRAIN_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-baichuan-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/baichuan-7B-hf",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "GeneralInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-baichuan-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/baichuan-7B-hf",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod"
|
||||
]
|
||||
}
|
@@ -1,84 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessInstructionData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test the basic functions of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eod append
|
||||
...(add further checks here if anything is missing)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 64000)
|
||||
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1557, 52, 31141, 31150, 59, 31219])
|
||||
self.assertEqual(self.tokenizer.detokenize(31338), '度')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If sentence splitting is not requested, the default splitter is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test that the correct data handler is selected for instruction data.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralInstructionHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that serialize_to_disk generates the dataset files and that their total size (in MB) is as expected.
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertEqual(len(idx_file), 3)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 89, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,88 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessPretrainData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test the basic functions of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eos append
|
||||
...(add further checks here if anything is missing)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 64000)
|
||||
self.assertEqual(self.tokenizer.tokenize('bug'), [15498])
|
||||
self.assertEqual(self.tokenizer.detokenize(23961), 'prolong')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
|
||||
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If sentence splitting is not requested, the default splitter is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test that the correct data handler is selected for pretraining.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralPretrainHandler)
|
||||
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that serialize_to_disk generates the dataset files and that their total size (in MB) is as expected.
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 1)
|
||||
self.assertEqual(len(idx_file), 1)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 25, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,20 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
|
||||
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@@ -8,7 +8,7 @@ from utils import ParamConfig, assert_judge
|
||||
from transformers import AutoTokenizer
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
|
@@ -6,7 +6,7 @@ from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
|
@@ -1,5 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
python ./tests/pipeline/baichuan2-7B/test_process_pretrain_data.py
|
||||
python ./tests/pipeline/baichuan2-7B/test_process_instruction_data.py
|
||||
|
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"PROCESS_PRETRAIN_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-baichuan2-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/baichuan2-7B-hf",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "GeneralInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-baichuan2-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/baichuan2-7B-hf",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod"
|
||||
]
|
||||
}
|
@@ -1,83 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessInstructionData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test the basic functions of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eod append
|
||||
...(add further checks here if anything is missing)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 125696)
|
||||
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [92655, 92335, 92365, 92379, 92383, 92574])
|
||||
self.assertEqual(self.tokenizer.detokenize(31338), ' Norman')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If sentence splitting is not requested, the default splitter is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test that the correct data handler is selected for instruction data.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralInstructionHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that serialize_to_disk generates the dataset files and that their total size (in MB) is as expected.
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertEqual(len(idx_file), 3)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 83, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,83 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessPretrainData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test the basic functions of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eos append
|
||||
...(add further checks here if anything is missing)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 125696)
|
||||
self.assertEqual(self.tokenizer.tokenize('bug'), [44985])
|
||||
self.assertEqual(self.tokenizer.detokenize(23961), '为孩子')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If sentence splitting is not requested, the default splitter is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test that the correct data handler is selected for pretraining.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralPretrainHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that serialize_to_disk generates the dataset files and that their total size (in MB) is as expected.
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 1)
|
||||
self.assertEqual(len(idx_file), 1)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 23, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,20 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
|
||||
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@@ -7,7 +7,7 @@ import pandas as pd
|
||||
import torch
|
||||
import torch_npu
|
||||
from transformers import AutoTokenizer
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
|
@@ -3,7 +3,7 @@ import os
|
||||
import nltk
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
|
@@ -3,7 +3,7 @@ import os
|
||||
import nltk
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
|
@@ -1,5 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
python tests/pipeline/codellama-34B/test_convert_ckpt_from_huggingface.py
|
||||
|
||||
pytest -s tests/pipeline/codellama-34B/test_generation.py
|
@@ -1,62 +0,0 @@
|
||||
{
|
||||
"CONVERT_CKPT_PARAM": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--load-dir", "/home/dataset/codellama-34B-hf",
|
||||
"--save-dir", "/home/dataset/codellama-34B-mt-t8p1",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--target-pipeline-parallel-size", "1",
|
||||
"--tokenizer-model", "None"
|
||||
],
|
||||
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "48",
|
||||
"--hidden-size", "8192",
|
||||
"--ffn-hidden-size", "22016",
|
||||
"--num-attention-heads", "64",
|
||||
"--max-position-embeddings", "16384",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "1",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--load", "/home/dataset/codellama-34B-mt-t8p1"
|
||||
],
|
||||
|
||||
"TOKENIZER_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/codellama-34B-hf"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint",
|
||||
"--attention-softmax-in-fp32"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "1",
|
||||
"--global-batch-size", "16",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--seed", "42",
|
||||
"--use-fused-rmsnorm",
|
||||
"--group-query-attention",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "4096",
|
||||
"--num-query-groups", "8",
|
||||
"--vocab-size", "32000",
|
||||
"--padded-vocab-size", "32000",
|
||||
"--rotary-base", "1000000"
|
||||
]
|
||||
}
|
@@ -1,59 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from utils import ParamConfig
|
||||
import torch
|
||||
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self, config=ParamConfig):
|
||||
# configure params, the index starts from 1
|
||||
self.config = config
|
||||
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
|
||||
|
||||
def test_file_exsit(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
|
||||
"""
|
||||
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.bin"))
|
||||
self.assertEqual(len(bin_file), 7)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "pytorch_model.bin.index.json")))
|
||||
|
||||
def test_convert_weights_form_huggingface(self):
|
||||
"""
|
||||
Test whether the weights in `--save-dir` are converted as expected. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder contains several distinct sub-layers
that make up one transformer layer, and these layers are stacked to form the whole model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model'] # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three top-level components.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([4000, 8192]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([8192]))
|
||||
|
||||
# the encoder has a shared final_norm, and each layer contains the following six weights
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1280, 8192]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([8192, 1024]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([5504, 8192]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([8192, 2752]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([8192]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([8192]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([4000, 8192]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,100 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.inference_param + config.auxiliary_param + config.tokenizer_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["import socket\n\ndef ping_exponential_backoff(host: str):"]
|
||||
output = model.generate(instruction, detokenize=False)
|
||||
expected_output1 = [13, 1678, 9995, 13, 1678, 349, 292, 263, 3495, 773,
|
||||
25658, 1250, 2696, 29889, 13, 1678, 9995, 13, 1678, 363,
|
||||
474, 297, 3464, 29898, 29896, 29892, 29871, 29896, 29900, 1125,
|
||||
13, 4706, 1018, 29901, 13, 9651, 9909, 29889, 29887, 621,
|
||||
520, 29890, 948, 420, 29898, 3069, 29897, 13, 9651, 736]
|
||||
expected_output2 = [13, 1678, 9995, 13, 1678, 349, 292, 263, 3495, 773,
|
||||
25658, 1250, 2696, 29889, 13, 1678, 9995, 13, 1678, 9055,
|
||||
353, 29871, 29896, 13, 1678, 1550, 5852, 29901, 13, 4706,
|
||||
1018, 29901, 13, 9651, 9909, 29889, 29887, 621, 520, 29890,
|
||||
948, 420, 29898, 3069, 29897, 13, 9651, 736, 13, 4706]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim1 = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
|
||||
output[:50].unsqueeze(0).float())
|
||||
cos_sim2 = similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
|
||||
output[:50].unsqueeze(0).float())
|
||||
cos_sim = torch.max(cos_sim1, cos_sim2)
|
||||
print("similarity: ", cos_sim)
|
||||
assert_judge(cos_sim > 0.95)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
instruction = "def fibonacci("
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False
|
||||
)
|
||||
expected_output = [29876, 1125, 13, 1678, 565, 302, 1275, 29871, 29900, 29901,
|
||||
13, 4706, 736, 29871, 29900, 13, 1678, 25342, 302, 1275,
|
||||
29871, 29896, 29901, 13, 4706, 736, 29871, 29896, 13, 1678,
|
||||
1683, 29901, 13, 4706, 736, 18755, 265, 21566, 29898, 29876,
|
||||
448, 29871, 29896, 29897, 718, 18755, 265, 21566, 29898, 29876]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output).unsqueeze(0).float().npu(),
|
||||
output[:50].unsqueeze(0).float())
|
||||
print("similarity: ", cos_sim)
|
||||
assert_judge(cos_sim > 0.95)
|
@@ -1,33 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
"""
|
||||
We can config the params in the `.json` file including:
|
||||
convert_ckpt_param,
|
||||
network_size,
|
||||
tokenizer_param,
|
||||
distributed_param,
|
||||
inference_param,
|
||||
and other auxiliary_param.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
tokenizer_param = config_file["TOKENIZER_PARAM"]
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@@ -1,130 +0,0 @@
|
||||
import json
|
||||
import argparse
|
||||
from typing import Tuple, Optional
|
||||
import pandas as pd
|
||||
|
||||
class Comparator:
|
||||
def __init__(self,
|
||||
base_path_prefix: str,
|
||||
test_path_prefix: str,
|
||||
loss_error_rate: float = 0.02,
|
||||
perf_error_rate: float = 0.03,
|
||||
mem_error_rate: float = 0.003,
|
||||
warm_up: int = 1,
|
||||
compute_steps: int = 2000):
|
||||
self.base_path_prefix = base_path_prefix
|
||||
self.test_path_prefix = test_path_prefix
|
||||
self.loss_error_rate = loss_error_rate
|
||||
self.perf_error_rate = perf_error_rate
|
||||
self.mem_error_rate = mem_error_rate
|
||||
self.compute_steps = compute_steps
|
||||
self.warm_up = warm_up
|
||||
|
||||
def _read_check_loss_file(self) -> Optional[Tuple[pd.DataFrame, pd.DataFrame, int]]:
|
||||
base_loss_pd = pd.read_csv(f"{self.base_path_prefix}_loss.tsv", sep='\t')
|
||||
test_loss_pd = pd.read_csv(f"{self.test_path_prefix}_loss.tsv", sep='\t')
|
||||
if len(base_loss_pd) < self.compute_steps or len(test_loss_pd) < self.compute_steps:
|
||||
print("The log doesn't have enough steps to compute!")
|
||||
return None
|
||||
|
||||
base_loss_start = base_loss_pd.loss.ne(float('inf')).argmax()
|
||||
test_loss_start = test_loss_pd.loss.ne(float('inf')).argmax()
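# ne(inf).argmax() locates the first step with a finite loss (missing losses are stored as inf);
# both runs must start reporting loss at the same step for the comparison to be meaningful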
|
||||
if base_loss_start != test_loss_start:
|
||||
print("The validate loss step is not equal!")
|
||||
return None
|
||||
|
||||
return base_loss_pd, test_loss_pd, base_loss_start
|
||||
|
||||
|
||||
|
||||
def compare_memory(self) -> bool:
|
||||
base_mem_pd = pd.read_csv(f"{self.base_path_prefix}_memory.tsv", sep='\t')
|
||||
test_mem_pd = pd.read_csv(f"{self.test_path_prefix}_memory.tsv", sep='\t')
|
||||
base_mem_mean = base_mem_pd.memory.mean()
|
||||
test_mem_mean = test_mem_pd.memory.mean()
|
||||
if base_mem_mean * (1 + self.mem_error_rate) < test_mem_mean:
|
||||
print("Memory test failed!")
|
||||
return False
|
||||
|
||||
print("Memory test pass!")
|
||||
return True
|
||||
|
||||
def compare_perf(self) -> bool:
|
||||
result = self._read_check_loss_file()
|
||||
if not result:
|
||||
return False
|
||||
|
||||
base_loss_pd, test_loss_pd, loss_start = result
|
||||
|
||||
with open(f"{self.base_path_prefix}_parameters.json") as f:
|
||||
base_params = json.load(f)
|
||||
|
||||
with open(f"{self.test_path_prefix}_parameters.json") as f:
|
||||
test_params = json.load(f)
|
||||
|
||||
if base_params != test_params:
|
||||
print("The parameters are not equal")
|
||||
return False
|
||||
|
||||
global_batch_size = base_params.get("global_batch_size") or base_params.get("train_batch_size")
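# Megatron-style logs provide global_batch_size/seq_length, DeepSpeed-style logs provide train_batch_size/seq-length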
|
||||
seq_length = base_params.get("seq_length") or base_params.get("seq-length")
|
||||
world_size = base_params.get("world_size", 8)
|
||||
|
||||
|
||||
# Here we need to skip the first steps until the training is stable
|
||||
base_itertime_mean = base_loss_pd[self.warm_up:self.compute_steps].iter_time.mean()
|
||||
test_itertime_mean = test_loss_pd[self.warm_up:self.compute_steps].iter_time.mean()
|
||||
|
||||
base_perf = global_batch_size * seq_length / world_size / base_itertime_mean
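# per-device throughput: tokens per iteration (global_batch_size * seq_length / world_size) divided by the mean iteration time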
|
||||
test_perf = global_batch_size * seq_length / world_size / test_itertime_mean
|
||||
|
||||
if (1 - self.perf_error_rate) * base_perf > test_perf:
|
||||
print("Perf test failed!")
|
||||
return False
|
||||
|
||||
print("Perf test pass!")
|
||||
return True
|
||||
|
||||
|
||||
def compare_loss(self) -> bool:
|
||||
result = self._read_check_loss_file()
|
||||
if not result:
|
||||
return False
|
||||
|
||||
base_loss_pd, test_loss_pd, loss_start = result
|
||||
|
||||
loss_error_rates = (test_loss_pd[loss_start:self.compute_steps].loss - base_loss_pd[loss_start:self.compute_steps].loss) / base_loss_pd[loss_start:self.compute_steps].loss
|
||||
if abs(loss_error_rates.mean()) > self.loss_error_rate:
|
||||
print("Loss test failed!")
|
||||
return False
|
||||
|
||||
print("Loss test pass!")
|
||||
return True
|
||||
|
||||
|
||||
def __call__(self) -> None:
|
||||
self.compare_loss()
|
||||
self.compare_perf()
|
||||
self.compare_memory()
|
||||
|
||||
def main(args):
|
||||
Comparator(args.base_path_prefix,
|
||||
args.test_path_prefix,
|
||||
args.loss_error_rate,
|
||||
args.perf_error_rate,
|
||||
args.mem_error_rate,
|
||||
args.warm_up,
|
||||
args.compute_steps)()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Compare log data between baseline and test')
|
||||
parser.add_argument('base_path_prefix', help='The baseline path prefix')
|
||||
parser.add_argument('test_path_prefix', help='The test path prefix')
|
||||
parser.add_argument('--loss_error_rate', type=float, default=0.02, help='The loss error rate')
|
||||
parser.add_argument('--perf_error_rate', type=float, default=0.03, help='The perf error rate')
|
||||
parser.add_argument('--warm_up', type=int, default=1, help='The perf test start from warm_up step')
|
||||
parser.add_argument('--mem_error_rate', type=float, default=0.003, help='The memory error rate')
|
||||
parser.add_argument('--compute_steps', type=int, default=2000, help='The compute steps')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
@@ -1,157 +0,0 @@
|
||||
from abc import ABC, abstractmethod
|
||||
import json
|
||||
import argparse
|
||||
|
||||
class BaseLogExtractor(ABC):
|
||||
def __init__(self, input_path:str, out_path_prefix:str):
|
||||
self.input_path = input_path
|
||||
self.out_path_prefix = out_path_prefix
|
||||
self.losses = []
|
||||
self.memories = []
|
||||
self.parameters = {"global_batch_size": 0,
|
||||
"seq_length": 0,
|
||||
"world_size": 0}
|
||||
|
||||
@abstractmethod
|
||||
def _extract_parameter(self, line: str) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _extract_iterline(self, line: str) -> None:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def _extract_memory(self, line) -> None:
|
||||
pass
|
||||
|
||||
def _extract(self) -> None:
|
||||
with open(self.input_path, encoding='utf-8') as f:
|
||||
for line in f:
|
||||
self._extract_parameter(line)
|
||||
self._extract_iterline(line)
|
||||
self._extract_memory(line)
|
||||
|
||||
|
||||
def _save(self) -> None:
|
||||
loss_path = f"{self.out_path_prefix}_loss.tsv"
|
||||
with open(loss_path, 'w') as f:
|
||||
f.write("step\tloss\titer_time\n")
|
||||
for step, loss, iter_time in self.losses:
|
||||
f.write(f"{step}\t{loss}\t{iter_time}\n")
|
||||
|
||||
memory_path = f"{self.out_path_prefix}_memory.tsv"
|
||||
with open(memory_path, 'w') as f:
|
||||
f.write("rank_id\tmemory\n")
|
||||
for rank_id, memory in sorted(self.memories):
|
||||
f.write(f"{rank_id}\t{memory}\n")
|
||||
|
||||
parameters_path = f"{self.out_path_prefix}_parameters.json"
|
||||
with open(parameters_path, 'w') as f:
|
||||
json.dump(self.parameters, f, indent=4)
|
||||
|
||||
|
||||
def __call__(self):
|
||||
self._extract()
|
||||
self._save()
|
||||
|
||||
|
||||
class MegatronLogExtractor(BaseLogExtractor):
|
||||
|
||||
def _extract_parameter(self, line: str) -> None:
|
||||
for param in self.parameters.keys():
|
||||
if line.startswith(f" {param}"):
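# Megatron prints its arguments as lines like '  global_batch_size .......... 16'; the value is the token after the last space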
|
||||
blank_pos = line.rfind(' ')
|
||||
self.parameters[param] = int(line[blank_pos:])
|
||||
|
||||
def _extract_iterline(self, line: str):
|
||||
|
||||
if (len(line) < 23 or not line[22:].startswith(" iteration")) and (not line.startswith(" iteration")):
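# not an iteration log line (checked both bare and with a leading ~22-character prefix, e.g. a timestamp); skip it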
|
||||
return
|
||||
|
||||
backslash_pos = line.find('/')
|
||||
blank_pos = line.rfind(' ', 0, backslash_pos)
|
||||
step = line[blank_pos:backslash_pos]
|
||||
ms_pos = line.find('(ms):')
|
||||
pipe_pos = line.find('|', ms_pos)
|
||||
iter_time = line[ms_pos+6: pipe_pos-1]
|
||||
loss_pos = line.find('lm loss:')
|
||||
if loss_pos > 0:
|
||||
bar_pos = line.find('|', loss_pos)
|
||||
loss = line[loss_pos+9:bar_pos-1]
|
||||
else:
|
||||
loss = 'inf'
|
||||
self.losses.append((int(step), float(loss), float(iter_time)))
|
||||
|
||||
def _extract_memory(self, line) -> None:
|
||||
if not line.startswith("[Rank"):
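# memory statistics only appear on lines such as '[Rank 0] ... allocated: 1234.5 | ...'; ignore everything else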
|
||||
return
|
||||
|
||||
start = 0
|
||||
while start >= 0:
|
||||
rsb_pos = line.find(']', start)
|
||||
rankid = line[start+6:rsb_pos]
|
||||
mem_pos = line.find('allocated:', rsb_pos)
|
||||
pipe_pos = line.find('|', mem_pos)
|
||||
memory = line[mem_pos+11:pipe_pos-1]
|
||||
self.memories.append((int(rankid), float(memory)))
|
||||
start = line.find("[Rank", pipe_pos)
|
||||
|
||||
|
||||
class DeepSpeedLogExtractor(BaseLogExtractor):
|
||||
|
||||
def __init__(self, input_path: str, out_path_prefix: str):
|
||||
super().__init__(input_path, out_path_prefix)
|
||||
self.parameters = {
|
||||
"train_batch_size": 0,
|
||||
"seq-length": 0
|
||||
}
|
||||
|
||||
def _extract_parameter(self, line: str) -> None:
|
||||
for param in self.parameters.keys():
|
||||
param_pos = line.find(f" \"{param}\":")
|
||||
if f" \"{param}\":" in line:
|
||||
colon_pos = line.find(':', param_pos)
|
||||
comma_pos = line.find(',', colon_pos)
|
||||
self.parameters[param] = int(line[colon_pos + 1 : comma_pos])
|
||||
|
||||
def _extract_iterline(self, line: str):
|
||||
if not line.startswith("steps: "):
|
||||
return
|
||||
|
||||
step_pos = 0
|
||||
loss_pos = line.find(' loss:')
|
||||
iter_time_pos = line.find(' iter time (s):')
|
||||
iter_time_end = line.find(' samples/sec:')
|
||||
|
||||
step = line[step_pos + 7 : loss_pos]
|
||||
iter_time = line[iter_time_pos + 15 : iter_time_end]
|
||||
if loss_pos > 0:
|
||||
loss = line[loss_pos + 6 : iter_time_pos]
|
||||
else:
|
||||
loss = 'inf'
|
||||
self.losses.append((int(step), float(loss), float(iter_time)))
|
||||
|
||||
def _extract_memory(self, line) -> None:
|
||||
if not line.startswith("after 1 iterations memory (MB)"):
|
||||
return
|
||||
|
||||
mem_pos = line.find('allocated: ')
|
||||
pipe_pos = line.find('|', mem_pos)
|
||||
memory = line[mem_pos + 11 : pipe_pos - 1]
|
||||
self.memories.append((0, float(memory)))
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.frame_kind.lower() == 'megatron':
|
||||
MegatronLogExtractor(args.input_path, args.output_path_prefix)()
|
||||
if args.frame_kind.lower() == 'deepspeed':
|
||||
DeepSpeedLogExtractor(args.input_path, args.output_path_prefix)()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='extract loss, performance and memory data from training log')
|
||||
parser.add_argument('frame_kind', help='The training framework: Megatron, DeepSpeed or Torch')
|
||||
parser.add_argument('input_path', help='The training log path')
|
||||
parser.add_argument('output_path_prefix', help='The output path prefix')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
@@ -1,7 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
python tests/pipeline/gemma-7B/test_process_pretrain_data.py
|
||||
python tests/pipeline/gemma-7B/test_convert_ckpt_from_huggingface.py
|
||||
|
||||
pytest -s tests/pipeline/gemma-7B/test_generation.py
|
||||
pytest -s tests/pipeline/gemma-7B/test_evaluation.py
|
@@ -1,76 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "28",
|
||||
"--hidden-size", "3072",
|
||||
"--ffn-hidden-size", "24576",
|
||||
"--num-attention-heads", "16",
|
||||
"--max-position-embeddings", "8192",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "1",
|
||||
"--normalization", "RMSNorm",
|
||||
"--add-rmsnorm-offset",
|
||||
"--geglu",
|
||||
"--kv-channels", "256",
|
||||
"--input-embeds-norm",
|
||||
"--vocab-size", "256000"
|
||||
],
|
||||
|
||||
"INFERENCE_AUX": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-model", "/home/dataset/gemma-7B-hf/tokenizer.model",
|
||||
"--tokenizer-name-or-path", "/home/dataset/gemma-7B-hf",
|
||||
"--load", "/home/dataset/gemma-7B-tp8-pp1",
|
||||
"--seed", "42",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test/",
|
||||
"--max-new-tokens", "1"
|
||||
],
|
||||
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "1",
|
||||
"--global-batch-size", "16",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "8192"
|
||||
],
|
||||
|
||||
|
||||
"PROCESS_PRETRAIN_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-gemma-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/gemma-7B-hf",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000"
|
||||
],
|
||||
|
||||
|
||||
"CONVERT_CKPT_FROM_HF": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "gemma_hf",
|
||||
"--saver", "megatron",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--load-dir", "/home/dataset/gemma-7B-hf",
|
||||
"--save-dir", "/home/dataset/gemma-7B-tp8-pp1",
|
||||
"--tokenizer-model", "/home/dataset/gemma-7B-hf/tokenizer.model"
|
||||
]
|
||||
}
|
@@ -1,56 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from utils import ParamConfig
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self, config=ParamConfig):
|
||||
# configure params, the index starts from 1
|
||||
self.config = config
|
||||
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
|
||||
|
||||
def test_file_exsit(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
|
||||
"""
|
||||
st_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.safetensors"))
|
||||
self.assertEqual(len(st_file), 4)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "model.safetensors.index.json")))
|
||||
|
||||
def test_convert_weights_form_huggingface(self):
|
||||
"""
|
||||
Test whether the weights in `--save-dir` are converted as expected. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder contains several distinct sub-layers
that make up one transformer layer, and these layers are stacked to form the whole model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model'] # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three top-level components.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([32000, 3072]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([3072]))
|
||||
|
||||
# the encoder has a shared final_norm, and each layer contains the following six weights
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1536, 3072]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([3072, 512]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([6144, 3072]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([3072, 3072]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([3072]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([3072]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@@ -1,106 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args


class TestEvaluation(DistributedTest):
    world_size = 8

    def init(self, config=ParamConfig):
        sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
                   config.inference_aux + config.evaluation_param
        from megatron.training.initialize import initialize_megatron
        os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
        initialize_megatron(extra_args_provider=add_text_generate_args,
                            args_defaults={'no_load_rng': True,
                                           'no_load_optim': True})

        from megatron.training import get_args
        self.args = get_args()

    def get_result(self, tokenizer, result):
        if result:
            final_result = [result[0]]
            if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
                final_result.append('T')
            else:
                final_result.append('F')
        else:
            final_result = None
        return final_result

    def test_mmlu_evaluation(self):
        self.init(config=ParamConfig)
        from evaluation import model_provider
        from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
        model = GPTModel.from_pretrained(
            model_provider=model_provider,
            pretrained_name_or_path=self.args.load
        )
        tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
        max_new_tokens = self.args.max_new_tokens

        instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"

        total_acc_n = 0
        total_n = 0

        test_dir = None
        for path in self.args.task_data_path:
            if "mmlu" in path:
                test_dir = path
        base_dir = Path(__file__).absolute().parent.parent.parent.parent
        template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
        with open(template_dir, encoding='utf-8') as f:
            mmlu_few_shot_template = json.load(f)

        temp = []
        for file in tqdm.tqdm(os.listdir(test_dir)):
            file_path = os.path.join(test_dir, file)
            data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
            subject_name = file[0: -9]
            subject = subject_name.replace("_", " ")
            acc_n = 0
            data_df_test = data_df[0:10]
            for index, row in data_df_test.iterrows():
                test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
                instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
                                                          subject=subject,
                                                          question=test_question)
                chat_result = model.generate(
                    instruction,
                    do_sample=False,
                    max_new_tokens=max_new_tokens,
                    tokenizer=tokenizer,
                    stream=False,
                    return_output_log_probs=True
                )
                assert_judge(isinstance(chat_result, tuple))
                assert_judge(isinstance(chat_result[1], torch.Tensor))
                answer = None
                if chat_result:
                    answer = chat_result[0][1]
                temp.append(answer)
                if answer == row['answer']:
                    acc_n += 1
            if torch.distributed.get_rank() == 0:
                total_n += len(data_df_test)
                total_acc_n += acc_n
        if torch.distributed.get_rank() == 0:
            try:
                final_acc = total_acc_n / total_n
            except ZeroDivisionError as e:
                raise e
            print(final_acc)
            assert_judge(abs(final_acc - 0.572) < 0.1)
@ -1,105 +0,0 @@
import sys
import os
import nltk
import torch
import torch_npu
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args


class TestGeneration(DistributedTest):
    world_size = 8

    def init(self, config=ParamConfig):
        """
        initialize the environment and arguments
        """
        sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
                   config.inference_aux + config.inference_param
        from megatron.training.initialize import initialize_megatron
        os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
        initialize_megatron(extra_args_provider=add_text_generate_args,
                            args_defaults={'no_load_rng': True,
                                           'no_load_optim': True})
        from megatron.training import get_args
        self.args = get_args()

    def edit_distance_similarity(self, text1, text2):
        """
        edit distance: to compare the similarity between two texts.
        """
        distance = nltk.edit_distance(text1, text2)
        try:
            similarity = 1 - (distance / max(len(text1), len(text2)))
        except ZeroDivisionError as e:
            raise e
        return similarity

    def test_greedy_search(self):
        """
        load the weights to get the model, construct the prompts to generate output,
        and compare with the expected output for `greedy search`.
        """
        self.init(config=ParamConfig)
        from inference import model_provider
        model = GPTModel.from_pretrained(
            model_provider=model_provider,
            pretrained_model_name_or_path=self.args.load
        )
        instruction = ["how are you?", "Give me three tips for staying healthy."]
        output = model.generate(instruction)
        expect_output1 = [
            """ I am an AI language model, and I am here to help you with your queries. How can I assist you today? """
        ]
        expect_output2 = [
            """1. Eat a balanced diet \n2. Get regular exercise \n3. Get enough sleep """
        ]

        expect_output1_seq = "".join(expect_output1)
        expect_output2_seq = ''.join(expect_output2)

        if torch.distributed.get_rank() == 0:
            print(output[0])
            print(output[1])

            similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
            similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
            print("similarity1:", similarity1)
            print("similarity2:", similarity2)
            assert_judge(similarity1 > 0.8)
            assert_judge(similarity2 > 0.8)

    def test_beam_search(self):
        """
        load the weights to get the model, construct the prompts to generate output,
        and compare with the expected output for `beam search`.
        """
        self.init(config=ParamConfig)
        from inference import model_provider
        model = GPTModel.from_pretrained(
            model_provider=model_provider,
            pretrained_model_name_or_path=self.args.load
        )
        max_new_tokens = self.args.max_new_tokens
        instruction = "What is the whether like today?"
        output = model.generate(
            instruction,
            num_beams=2,
            top_k=self.args.top_k,
            top_p=self.args.top_p,
            max_new_tokens=max_new_tokens,
            tokenizer=None,
            stream=False
        )
        expected_output = [
            """I do not have access to real-time information, therefore I cannot provide you with the weather for today."""
        ]
        expected_output_seq = "".join(expected_output)
        if torch.distributed.get_rank() == 0:
            similarity = self.edit_distance_similarity(output[:40], expected_output_seq[:40])
            print(output)
            print("similarity:", similarity)
            assert_judge(similarity > 0.75)
@ -1,82 +0,0 @@
import unittest
import sys
import os
import glob
from utils import ParamConfig

from modellink.tokenizer import build_tokenizer
from modellink.tokenizer.tokenizer import _AutoTokenizer
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
from preprocess_data import get_args, build_splitter


class TestProcessPretrainData(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        # configure params, the index starts from 1
        self.config = ParamConfig
        sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
        self.args = get_args()
        self.tokenizer = build_tokenizer(self.args)
        self.splitter = build_splitter(self.args)
        self.raw_dataset = build_dataset(self.args)
        self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)

    def test_build_tokenizer(self):
        """
        Test the normal functions of the tokenizer:
        the instance of the tokenizer,
        the length of the vocabulary,
        the encode function,
        the decode function,
        the eos append,
        ... (if anything else is missing, feel free to add it)
        """
        self.assertIsInstance(self.tokenizer, _AutoTokenizer)
        self.assertEqual(self.tokenizer.vocab_size, 256000)
        self.assertEqual(self.tokenizer.tokenize('bug'), [2, 4594])
        self.assertEqual(self.tokenizer.detokenize(23961), ' infinite')
        self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '<eos>')

    def test_build_splitter(self):
        """
        If there's no split_sentence, the default splitter is `IdentitySplitter()`.
        """
        pass

    def test_build_dataset(self):
        """
        Test the raw_dataset; we need to check the number of columns and rows.
        """
        self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
        self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
        self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
        self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)

    def test_get_dataset_handler(self):
        """
        Test whether we get the right data handler for pretraining.
        """
        self.assertIsInstance(self.handler, GeneralPretrainHandler)

    def test_serialize_to_disk(self):
        """
        Test that the pretraining data files are generated and are not empty (size in MB).
        """
        self.handler.serialize_to_disk()
        folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
        bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
        idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
        total_size = 0
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if os.path.isfile(file_path):
                total_size += os.path.getsize(file_path)
        self.assertEqual(len(bin_file), 1)
        self.assertEqual(len(idx_file), 1)
        self.assertAlmostEqual((total_size / (1024 * 1024)), 22, delta=1)


if __name__ == "__main__":
    unittest.main()
@ -1,34 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass


@dataclass
class ParamConfig:
    """
    We can configure the params in the `.json` file, including:
    distributed_param,
    network_size,
    inference_param,
    evaluation_param,
    and other auxiliary_param.
    """
    base_dir = Path(__file__).absolute().parent
    param_config = os.path.join(base_dir, "param_config.json")
    with open(param_config) as f:
        config_file = json.load(f)

    distributed_param = config_file["DISTRIBUTED_PARAM"]
    network_size = config_file["NETWORK_SIZE"]
    inference_aux = config_file["INFERENCE_AUX"]
    inference_param = config_file["INFERENCE_PARAM"]
    evaluation_param = config_file["EVALUATION_PARAM"]
    auxiliary_param = config_file["AUXILIARY_PARAM"]
    pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
    convert_ckpt_param = config_file["CONVERT_CKPT_FROM_HF"]


def assert_judge(expression):
    if not expression:
        raise AssertionError
@ -1,7 +0,0 @@
# Provide uniform access for pipeline.

python tests/pipeline/intern-7B/test_process_pretrain_data.py
python tests/pipeline/intern-7B/test_convert_ckpt_from_huggingface.py
pytest -s tests/pipeline/intern-7B/test_generation.py
pytest -s tests/pipeline/intern-7B/test_evalution.py
pytest -s tests/pipeline/intern-7B/test_trainer.py
@ -1,118 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "32",
|
||||
"--hidden-size", "4096",
|
||||
"--ffn-hidden-size", "11008",
|
||||
"--num-attention-heads", "32",
|
||||
"--max-position-embeddings", "2048",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "32",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--add-qkv-bias",
|
||||
"--add-dense-bias",
|
||||
"--skip-bias-add"
|
||||
],
|
||||
|
||||
"INFERENCE_AUX": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-model", "/home/dataset/intern-hf/tokenizer.model",
|
||||
"--tokenizer-name-or-path", "/home/dataset/intern-hf/",
|
||||
"--load", "/home/dataset/intern-tp8-pp1/",
|
||||
"--seed", "42",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "64"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test/",
|
||||
"--task", "mmlu",
|
||||
"--max-new-tokens", "2"
|
||||
],
|
||||
|
||||
"TRAINING_PARAM": [
|
||||
"--save", "/autotest/dataset/save-weight-intern",
|
||||
"--data-path", "/home/dataset/pretrain-dataset-intern/alpaca_text_document",
|
||||
"--train-iters", "15"
|
||||
],
|
||||
|
||||
"REGULARIZATION": [
|
||||
"--attention-dropout", "0.0",
|
||||
"--hidden-dropout", "0.0",
|
||||
"--weight-decay", "1e-1",
|
||||
"--clip-grad", "1.0",
|
||||
"--adam-beta1", "0.9",
|
||||
"--adam-beta2", "0.95"
|
||||
],
|
||||
|
||||
"LEARNING_RATE": [
|
||||
"--lr", "1.25e-6",
|
||||
"--lr-decay-style", "cosine",
|
||||
"--lr-warmup-fraction", "0.01",
|
||||
"--min-lr", "1.25e-7"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "8",
|
||||
"--global-batch-size", "64",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "2048"
|
||||
],
|
||||
|
||||
"TRAINING_AUX": [
|
||||
"--sequence-parallel",
|
||||
"--initial-loss-scale", "65536",
|
||||
"--use-flash-attn",
|
||||
"--use-fused-rmsnorm",
|
||||
"--init-method-std", "0.01",
|
||||
"--split", "100,0,0",
|
||||
"--log-interval", "1",
|
||||
"--save-interval", "10",
|
||||
"--eval-interval", "1000",
|
||||
"--eval-iters", "0",
|
||||
"--num-workers", "0",
|
||||
"--distributed-backend", "nccl",
|
||||
"--tokenizer-type", "Llama2Tokenizer",
|
||||
"--tokenizer-model", "/home/dataset/intern-hf/tokenizer.model"
|
||||
],
|
||||
|
||||
"PROCESS_PRETRAIN_DATA":[
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-intern/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/intern-hf",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--handler-name", "AlpacaPretrainHandler",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--append-eod"
|
||||
],
|
||||
|
||||
"CONVERT_CKPT_PARAM":[
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--load-dir", "/home/dataset/intern-hf",
|
||||
"--save-dir", "/home/dataset/intern-tp8-pp1",
|
||||
"--tokenizer-model", "/home/dataset/intern-hf/tokenizer.model",
|
||||
"--add-qkv-bias",
|
||||
"--add-dense-bias"
|
||||
]
|
||||
}
|
@ -1,61 +0,0 @@
import unittest
import sys
import os
import subprocess
import glob
from pathlib import Path
from utils import ParamConfig
import torch

import modellink


class TestConvertCkptFromHuggingface(unittest.TestCase):
    def setUp(self, config=ParamConfig):
        # configure params, the index starts from 1
        self.config = config
        sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param

    def test_file_exist(self):
        """
        Test whether the files in `--load-dir` exist, including `.bin`, `.json`, ...
        """
        bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.bin"))
        self.assertEqual(len(bin_file), 8)
        self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "pytorch_model.bin.index.json")))

    def test_convert_weights_from_huggingface(self):
        """
        Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
        including embedding, final_norm, output_layer and encoder. The encoder contains several distinct
        sub-layers that make up one transformer layer, and these layers stack to form the full model.
        """
        base_dir = Path(__file__).absolute().parent.parent.parent.parent
        file_path = os.path.join(base_dir, "convert_ckpt.py")
        arguments = sys.argv[1:]
        subprocess.run(["python", file_path] + arguments)
        output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
        weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
        weight_common_content = weight_content['model']['language_model']  # extract common content

        # embedding, encoder and output_layer are the three outer layers.
        self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
        self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([12896, 4096]))
        self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))

        # the encoder has a common final_norm and each layer has the following sub-layers
        weight_common_content['encoder'].pop('final_norm.weight')
        self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1536, 4096]))
        self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.bias'].size(), torch.Size([1536]))
        self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
        self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([2752, 4096]))
        self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1376]))
        self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
        self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
        self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.bias'].size(), torch.Size([4096]))

        self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([12896, 4096]))


if __name__ == "__main__":
    unittest.main()
@ -1,96 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch_npu
|
||||
from transformers import AutoTokenizer
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
class TestEvaluation(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.evaluation_param + config.inference_aux + config.auxiliary_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_mmlu_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path, trust_remote_code=True)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
|
||||
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
|
||||
test_dir = None
|
||||
for path in self.args.task_data_path:
|
||||
if "mmlu" in path:
|
||||
test_dir = path
|
||||
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
|
||||
with open(template_dir, encoding='utf-8') as f:
|
||||
mmlu_few_shot_template = json.load(f)
|
||||
|
||||
temp = []
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
|
||||
subject_name = file[0: -9]
|
||||
subject = subject_name.replace("_", " ")
|
||||
acc_n = 0
|
||||
data_df_test = data_df.iloc[0:20]
|
||||
for index, row in data_df_test.iterrows():
|
||||
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
|
||||
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
|
||||
subject=subject,
|
||||
question=test_question)
|
||||
chat_result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
assert_judge(isinstance(chat_result, tuple))
|
||||
assert_judge(isinstance(chat_result[1], torch.Tensor))
|
||||
answer = None
|
||||
if chat_result:
|
||||
answer = chat_result[0].strip()
|
||||
temp.append(answer)
|
||||
if answer == row['answer']:
|
||||
acc_n += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(data_df_test)
|
||||
total_acc_n += acc_n
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
print(final_acc)
|
||||
assert_judge(abs(final_acc - 0.41) <= 0.02)
|
@ -1,100 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import nltk
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.inference_param + config.inference_aux + config.auxiliary_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["春夏秋冬,四个季节"]
|
||||
output = model.generate(instruction, detokenize=False)
|
||||
|
||||
expected_output = [98899, 67921, 70257, 67780, 60443, 67942, 68212, 98899, 60357, 60443,
|
||||
67942, 60515, 98899, 60357, 60443, 67942, 68123, 99157, 364, 61145,
|
||||
98899, 60355, 67546, 60353, 62513, 60410, 98899, 60355, 72801, 61209,
|
||||
60431, 98899, 60355, 60758, 70447, 83396, 98899, 60355, 60758, 60958,
|
||||
60353, 68124, 99157, 364, 61145, 60353, 62513, 60410, 98899, 60355,
|
||||
67546, 60353, 62513, 60410, 98899, 60355, 72801, 61209, 60431, 98899,
|
||||
]
|
||||
|
||||
expected_output_seq = torch.tensor(expected_output)[:20].unsqueeze(0).float().npu()
|
||||
output_seq = output[:20].unsqueeze(0).float()
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(len(output))
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(expected_output_seq, output_seq)
|
||||
print("cos_sim:", cos_sim)
|
||||
assert_judge(cos_sim > 0.80)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
instruction = "春夏秋冬,四个季节"
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False
|
||||
)
|
||||
expected_output = [98899, 67921, 70257, 67780, 60724, 71526, 68881, 99157, 60450, 67921,
|
||||
70257, 60417, 98899, 60661, 67780, 60724, 60434, 68108, 60477, 61472,
|
||||
60353, 76934, 99157, 364, 72196, 98899, 75427, 83396, 99157, 364,
|
||||
69025, 98899, 83649, 61549, 60511, 99157, 364, 75814, 98899, 62084,
|
||||
60449, 61469, 61469, 99157, 364, 69713, 98899, 61139, 60620, 60862,
|
||||
]
|
||||
expected_output_seq = torch.tensor(expected_output)[:15].unsqueeze(0).float().npu()
|
||||
output_seq = output[:15].unsqueeze(0).float()
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(expected_output_seq, output_seq)
|
||||
print("cos_sim:", cos_sim)
|
||||
assert_judge(cos_sim > 0.6)
|
||||
|
@ -1,82 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import AlpacaPretrainHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessPretrainData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + config.process_pretrain_data
|
||||
self.config = config.process_pretrain_data
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test normal function of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eos append
|
||||
... (if anything else is missing, feel free to add it)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 103168)
|
||||
self.assertEqual(self.tokenizer.tokenize('bug'), [1, 2463])
|
||||
self.assertEqual(self.tokenizer.detokenize(23961), ' possibilities')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset; we need to check the number of columns and rows.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test whether we get the right data handler for pretraining.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, AlpacaPretrainHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the pretraining data files are generated and are not empty (size in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config[5].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 1)
|
||||
self.assertEqual(len(idx_file), 1)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 28, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,152 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import torch
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.core.enums import ModelType
|
||||
from megatron.core.utils import get_model_config
|
||||
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
|
||||
|
||||
|
||||
class TestTraining(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.training_param + config.auxiliary_param + config.learning_rate_param + \
|
||||
config.training_aux + config.regularization
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
|
||||
from megatron.core import mpu
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
total_loss_dict = {}
|
||||
iteration = self.args.iteration
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
report_memory_flag = True
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
num_floating_point_operations_so_far = 0
|
||||
|
||||
while iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
batch_size = mpu.get_data_parallel_world_size() * \
|
||||
self.args.micro_batch_size * \
|
||||
get_num_microbatches()
|
||||
self.args.consumed_train_samples += batch_size
|
||||
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
|
||||
loss_scale = optimizer.get_loss_scale().item()
|
||||
params_norm = None
|
||||
learning_rate = None
|
||||
decoupled_learning_rate = None
|
||||
for param_group in optimizer.param_groups:
|
||||
if param_group['is_decoupled_lr']:
|
||||
decoupled_learning_rate = param_group['lr']
|
||||
else:
|
||||
learning_rate = param_group['lr']
|
||||
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
|
||||
decoupled_learning_rate,
|
||||
iteration, loss_scale,
|
||||
report_memory_flag, skipped_iter,
|
||||
grad_norm, params_norm, num_zeros_in_grad)
|
||||
saved_checkpoint = False
|
||||
if self.args.save and self.args.save_interval and \
|
||||
iteration % self.args.save_interval == 0:
|
||||
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
|
||||
saved_checkpoint = True
|
||||
break
|
||||
|
||||
if saved_checkpoint:
|
||||
for file_name in os.listdir(self.args.save):
|
||||
file_path = os.path.join(self.args.save, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
assert_judge(file_path.endswith(".txt"))
|
||||
else:
|
||||
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
|
||||
|
||||
|
||||
def test_breakpoint_renewal_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
self.args.load = self.args.save
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step
|
||||
if self.args.load == self.args.save:
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
iteration = self.args.iteration
|
||||
assert_judge(iteration == 10)
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
|
||||
if iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(f"iteration {iteration}: loss {loss_dict.get('lm loss')}")
|
||||
assert_judge(abs(loss_dict.get('lm loss') - 8.37) < 0.3)
|
@ -1,43 +0,0 @@
import json
import os
from pathlib import Path
from dataclasses import dataclass


@dataclass
class ParamConfig:
    """
    We can configure the params in the `.json` file, including:
    distributed_param,
    network_size,
    inference_param,
    evaluation_param,
    lora_param,
    training_param,
    training_auxiliary,
    learning_rate,
    regularization,
    and other auxiliary_param.
    """
    base_dir = Path(__file__).absolute().parent
    param_config = os.path.join(base_dir, "param_config.json")
    with open(param_config) as f:
        config_file = json.load(f)

    distributed_param = config_file["DISTRIBUTED_PARAM"]
    network_size = config_file["NETWORK_SIZE"]
    inference_param = config_file["INFERENCE_PARAM"]
    evaluation_param = config_file["EVALUATION_PARAM"]
    training_param = config_file["TRAINING_PARAM"]
    training_aux = config_file["TRAINING_AUX"]
    learning_rate_param = config_file["LEARNING_RATE"]
    regularization = config_file["REGULARIZATION"]
    auxiliary_param = config_file["AUXILIARY_PARAM"]
    process_pretrain_data = config_file["PROCESS_PRETRAIN_DATA"]
    inference_aux = config_file["INFERENCE_AUX"]
    convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]


def assert_judge(expression):
    if not expression:
        raise AssertionError
@ -1,10 +0,0 @@
# Provide uniform access for pipeline.

python tests/pipeline/llama2-7B/test_process_pretrain_data.py
python tests/pipeline/llama2-7B/test_process_instruction_data.py
python tests/pipeline/llama2-7B/test_convert_ckpt_from_huggingface.py

pytest -s tests/pipeline/llama2-7B/test_generation.py
pytest -s tests/pipeline/llama2-7B/test_evaluation.py
pytest -s tests/pipeline/llama2-7B/test_lora.py
pytest -s tests/pipeline/llama2-7B/test_trainer.py
@ -1,134 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "32",
|
||||
"--hidden-size", "4096",
|
||||
"--ffn-hidden-size", "11008",
|
||||
"--num-attention-heads", "32",
|
||||
"--max-position-embeddings", "4096",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "1",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights"
|
||||
],
|
||||
|
||||
"INFERENCE_AUX": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-model", "/home/dataset/llama2-7B/tokenizer.model",
|
||||
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
|
||||
"--load", "/home/dataset/llama2-7B-tp8-pp1",
|
||||
"--seed", "42",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--task-data-path", "/home/dataset/eval_dataset/boolq/test/", "/home/dataset/eval_dataset/mmlu/test/",
|
||||
"--max-new-tokens", "2"
|
||||
],
|
||||
|
||||
"LORA_PARAM": [
|
||||
"--finetune",
|
||||
"--is-instruction-dataset",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
|
||||
"--lora-r", "16",
|
||||
"--lora-alpha", "32",
|
||||
"--lora-target-modules", "query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h",
|
||||
"--load", "/home/dataset/llama2-7B-tp8-pp1",
|
||||
"--save", "/home/dataset/lora-save-weight-llama2-7B",
|
||||
"--data-path", "/home/dataset/tune-dataset-llama2-7B/alpaca",
|
||||
"--train-iters", "10"
|
||||
],
|
||||
|
||||
"TRAINING_PARAM": [
|
||||
"--tokenizer-type", "Llama2Tokenizer",
|
||||
"--tokenizer-model", "/home/dataset/llama2-7B/tokenizer.model",
|
||||
"--save", "/autotest/dataset/save-weight-llama2-7B",
|
||||
"--data-path", "/home/dataset/pretrain-dataset-llama2-7B/alpaca_text_document",
|
||||
"--train-iters", "15"
|
||||
],
|
||||
|
||||
"REGULARIZATION": [
|
||||
"--attention-dropout", "0.0",
|
||||
"--hidden-dropout", "0.0",
|
||||
"--weight-decay", "1e-1",
|
||||
"--clip-grad", "1.0",
|
||||
"--adam-beta1", "0.9",
|
||||
"--adam-beta2", "0.95"
|
||||
],
|
||||
|
||||
"LEARNING_RATE": [
|
||||
"--lr", "1.25e-6",
|
||||
"--lr-decay-style", "cosine",
|
||||
"--lr-warmup-fraction", "0.01",
|
||||
"--min-lr", "1.25e-7"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "4",
|
||||
"--global-batch-size", "16",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "4096"
|
||||
],
|
||||
|
||||
"TRAINING_AUX": [
|
||||
"--sequence-parallel",
|
||||
"--initial-loss-scale", "65536",
|
||||
"--use-flash-attn",
|
||||
"--use-fused-rmsnorm",
|
||||
"--init-method-std", "0.01",
|
||||
"--split", "100,0,0",
|
||||
"--log-interval", "1",
|
||||
"--save-interval", "10",
|
||||
"--eval-interval", "1000",
|
||||
"--eval-iters", "0",
|
||||
"--num-workers", "0",
|
||||
"--distributed-backend", "nccl"
|
||||
],
|
||||
|
||||
"PROCESS_PRETRAIN_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-llama2-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "GeneralInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-llama2-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/llama2-7B",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod"
|
||||
],
|
||||
|
||||
"CONVERT_CKPT_FROM_HF": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--load-dir", "/home/dataset/llama2-7B",
|
||||
"--save-dir", "/home/dataset/llama2-7B-tp8-pp1",
|
||||
"--tokenizer-model", "/home/dataset/llama2-7B/tokenizer.model"
|
||||
]
|
||||
}
|
@ -1,85 +0,0 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NPU_ASD_ENABLE=0

GPUS_PER_NODE=8
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DATA_PATH=/home/dataset/pretrain-dataset-llama2-7B/alpaca_text_document
TOKENIZER_MODEL=/home/dataset/llama2-7B/tokenizer.model
TP=8
PP=1

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

GPT_ARGS="
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --sequence-parallel \
    --num-layers 32 \
    --hidden-size 4096 \
    --ffn-hidden-size 11008 \
    --num-attention-heads 32 \
    --tokenizer-type Llama2Tokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --micro-batch-size 4 \
    --global-batch-size 16 \
    --make-vocab-size-divisible-by 1 \
    --lr 1.25e-6 \
    --train-iters 2000 \
    --lr-decay-style cosine \
    --untie-embeddings-and-output-weights \
    --disable-bias-linear \
    --attention-dropout 0.0 \
    --init-method-std 0.01 \
    --hidden-dropout 0.0 \
    --position-embedding-type rope \
    --normalization RMSNorm \
    --use-fused-rmsnorm \
    --swiglu \
    --use-flash-attn \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --min-lr 1.25e-7 \
    --weight-decay 1e-1 \
    --lr-warmup-fraction 0.01 \
    --clip-grad 1.0 \
    --adam-beta1 0.9 \
    --initial-loss-scale 65536 \
    --adam-beta2 0.95 \
    --no-gradient-accumulation-fusion \
    --no-load-optim \
    --no-load-rng \
    --bf16
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --split 100,0,0
"

OUTPUT_ARGS="
    --log-interval 1 \
    --save-interval 10000 \
    --eval-interval 5000 \
    --eval-iters 0 \
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl 2>&1 | tee /home/dataset/new_llama2-7B.log
@ -1,58 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from utils import ParamConfig
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self, config=ParamConfig):
|
||||
# configure params, the index starts from 1
|
||||
self.config = config
|
||||
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
|
||||
|
||||
def test_file_exist(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, ...
|
||||
"""
|
||||
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.bin"))
|
||||
self.assertEqual(len(bin_file), 2)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "pytorch_model.bin.index.json")))
|
||||
|
||||
def test_convert_weights_form_huggingface(self):
|
||||
"""
|
||||
Test whether the weights are converted as expected in `--save-dir`. We check the model layer names,
including embedding, final_norm, output and encoder. The encoder contains several distinct layers
that make up one transformer block, and these blocks stack to form the whole model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model']  # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three outer layers.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([4000, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
# the encoder has a common final_norm and each layer has the following six sub-layers
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1536, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([2752, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1376]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([4000, 4096]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,169 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch_npu
|
||||
from transformers import AutoTokenizer
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
class TestEvaluation(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
|
||||
config.inference_aux + config.evaluation_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def get_result(self, tokenizer, result):
|
||||
if result:
|
||||
final_result = [result[0]]
|
||||
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
|
||||
final_result.append('T')
|
||||
else:
|
||||
final_result.append('F')
|
||||
else:
|
||||
final_result = None
|
||||
return final_result
|
||||
|
||||
def test_boolq_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{passage}\nQuestion: {question}?\nAnswer:"
|
||||
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
|
||||
for path in self.args.task_data_path:
|
||||
if "boolq" in path:
|
||||
test_dir = path
|
||||
print(test_dir)
|
||||
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
with open(file_path, encoding='utf-8') as f:
|
||||
boolq_question_list = []
|
||||
for line in f.readlines():
|
||||
boolq_question_list.append(json.loads(line))
|
||||
boolq_question_list = boolq_question_list[:654]
|
||||
subject_result = {}
|
||||
acc_n = 0
|
||||
for index, item in enumerate(boolq_question_list):
|
||||
instruction = instruction_template.format(passage=item['passage'], question=item['question'])
|
||||
result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
result = self.get_result(tokenizer, result)
|
||||
if result:
|
||||
answer = result[1]
|
||||
else:
|
||||
answer = None
|
||||
try:
|
||||
if torch.distributed.get_rank() == 0:
|
||||
subject_result[str(index)] = answer
|
||||
if subject_result[str(index)] == str(item['answer'])[0]:
|
||||
acc_n += 1
|
||||
except Exception as e:
|
||||
if torch.distributed.get_rank() == 0:
|
||||
raise e
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(boolq_question_list)
|
||||
total_acc_n += acc_n
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
print(final_acc)
|
||||
assert_judge((final_acc - 0.775) < 0.01)
|
||||
|
||||
def test_mmlu_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
|
||||
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
|
||||
test_dir = None
|
||||
for path in self.args.task_data_path:
|
||||
if "mmlu" in path:
|
||||
test_dir = path
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
|
||||
with open(template_dir, encoding='utf-8') as f:
|
||||
mmlu_few_shot_template = json.load(f)
|
||||
|
||||
temp = []
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
|
||||
subject_name = file[0: -9]
|
||||
subject = subject_name.replace("_", " ")
|
||||
acc_n = 0
|
||||
data_df_test = data_df[0:10]
|
||||
for index, row in data_df_test.iterrows():
|
||||
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
|
||||
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
|
||||
subject=subject,
|
||||
question=test_question)
|
||||
chat_result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
assert_judge(isinstance(chat_result, tuple))
|
||||
assert_judge(isinstance(chat_result[1], torch.Tensor))
|
||||
answer = None
|
||||
if chat_result:
|
||||
answer = chat_result[0][0]
|
||||
temp.append(answer)
|
||||
if answer == row['answer']:
|
||||
acc_n += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(data_df_test)
|
||||
total_acc_n += acc_n
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
assert_judge(abs(final_acc - 0.498) < 0.01)
|
||||
print(final_acc)
|
@ -1,113 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import nltk
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param +\
|
||||
config.inference_aux + config.inference_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def edit_distance_similarity(self, text1, text2):
|
||||
"""
|
||||
edit distance: to compare the similarity between two texts.
|
||||
"""
|
||||
distance = nltk.edit_distance(text1, text2)
|
||||
try:
|
||||
similarity = 1 - (distance / max(len(text1), len(text2)))
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
return similarity
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["how are you?", "Give me three tips for staying healthy."]
|
||||
output = model.generate(instruction)
|
||||
expect_output1 = [
|
||||
"I'm doing well, thanks for asking! I've been keeping busy with work and spending time with friends and family. ",
|
||||
"It's been great to have some time off from school and just relax a bit. How about you? How have you been?\n",
|
||||
"\nI hope you're doing well! It's always great to catch up with you and hear about what's going on in your life. ",
|
||||
"I'm looking forward to hearing all about it. Let me know if you want to hang out soon!"
|
||||
]
|
||||
expect_output2 = [
|
||||
'\n\n1. Eat a balanced diet: A healthy diet should include a variety of fruits, vegetables, whole grains, lean proteins, and healthy fats. ',
|
||||
'Aim to include a rainbow of colors on your plate to ensure you are getting a range of vitamins and minerals.',
|
||||
'\n2. Stay hydrated: Drink plenty of water throughout the day, aiming for at least eight cups (64 ounces) daily. ',
|
||||
'Limit your consumption of sugary drinks'
|
||||
]
|
||||
|
||||
expect_output1_seq = "".join(expect_output1)
|
||||
expect_output2_seq = ''.join(expect_output2)
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output[0])
|
||||
print(output[1])
|
||||
|
||||
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
|
||||
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
|
||||
print("similarity1:", similarity1)
|
||||
print("similarity2:", similarity2)
|
||||
assert_judge(similarity1 > 0.85)
|
||||
assert_judge(similarity2 > 0.85)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
instruction = "What is the whether like today?"
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False
|
||||
)
|
||||
expected_output = [
|
||||
"Answer:\nThe weather today is sunny with a high of 75 degrees Fahrenheit and a low of 50 degrees Fahrenheit. ",
|
||||
"There is no rain or other weather alerts in the area.",
|
||||
"\nWould you like to know the weather for a different location?"
|
||||
]
|
||||
expected_output_seq = "".join(expected_output)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
similarity = self.edit_distance_similarity(output[:40], expected_output_seq[:40])
|
||||
print(output)
|
||||
print("similarity:", similarity)
|
||||
assert_judge(similarity > 0.75)
|
@ -1,133 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.core.enums import ModelType
|
||||
from megatron.core.utils import get_model_config
|
||||
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators
|
||||
|
||||
|
||||
class TestLora(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config: ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.training_aux + config.network_size + \
|
||||
config.auxiliary_param + config.learning_rate_param + config.regularization + config.lora_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_megatron_lora_module(self):
|
||||
self.init(config=ParamConfig)
|
||||
from megatron.core import tensor_parallel
|
||||
from pretrain_gpt import model_provider
|
||||
model, _, _ = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder
|
||||
)
|
||||
model = model[0]
|
||||
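# expected LoRA wrapping: for column-parallel projections (query_key_value, dense_h_to_4h) lora_A is a plain
# torch.nn.Linear and lora_B is ColumnParallelLinear; for row-parallel projections (dense, dense_4h_to_h) the roles flip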
for name, module in model.named_modules():
|
||||
if name.endswith("query_key_value.lora_A.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
if name.endswith("query_key_value.lora_B.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
|
||||
|
||||
if name.endswith("dense.lora_A.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
|
||||
if name.endswith("dense.lora_B.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
|
||||
if name.endswith("dense_h_to_4h.lora_A.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
if name.endswith("dense_h_to_4h.lora_B.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.ColumnParallelLinear))
|
||||
|
||||
if name.endswith("dense_4h_to_h.lora_A.default"):
|
||||
assert_judge(isinstance(module, tensor_parallel.RowParallelLinear))
|
||||
if name.endswith("dense_4h_to_h.lora_B.default"):
|
||||
assert_judge(isinstance(module, torch.nn.Linear))
|
||||
|
||||
def test_lora(self):
|
||||
self.init(config=ParamConfig)
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step, training_log, save_checkpoint_and_time, num_floating_point_operations
|
||||
from megatron.core import mpu
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder
|
||||
)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
total_loss_dict = {}
|
||||
iteration = self.args.iteration
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
report_memory_flag = True
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
num_floating_point_operations_so_far = 0
|
||||
|
||||
while iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
batch_size = mpu.get_data_parallel_world_size() * \
|
||||
self.args.micro_batch_size * \
|
||||
get_num_microbatches()
|
||||
self.args.consumed_train_samples += batch_size
|
||||
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
|
||||
loss_scale = optimizer.get_loss_scale().item()
|
||||
params_norm = None
|
||||
learning_rate = None
|
||||
decoupled_learning_rate = None
|
||||
for param_group in optimizer.param_groups:
|
||||
if param_group['is_decoupled_lr']:
|
||||
decoupled_learning_rate = param_group['lr']
|
||||
else:
|
||||
learning_rate = param_group['lr']
|
||||
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
|
||||
decoupled_learning_rate,
|
||||
iteration, loss_scale,
|
||||
report_memory_flag, skipped_iter,
|
||||
grad_norm, params_norm, num_zeros_in_grad)
|
||||
saved_checkpoint = False
|
||||
if self.args.save and self.args.save_interval and \
|
||||
iteration % self.args.save_interval == 0:
|
||||
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
|
||||
saved_checkpoint = True
|
||||
if saved_checkpoint:
|
||||
for file_name in os.listdir(self.args.save):
|
||||
file_path = os.path.join(self.args.save, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
assert_judge(file_path.endswith(".txt"))
|
||||
else:
|
||||
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
|
||||
|
@ -1,82 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessInstructionData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test the basic functions of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eod append
|
||||
...(add more checks here if anything is missing)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 32000)
|
||||
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1, 529, 29900, 29916, 29943, 29955, 29958])
|
||||
self.assertEqual(self.tokenizer.detokenize(31338), '堂')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw dataset: check the number of rows in each column.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test that the correct data handler is selected for instruction data.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralInstructionHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the serialized instruction data files are generated and are not empty (size checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertEqual(len(idx_file), 3)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 93, delta=2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,82 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessPretrainData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test the basic functions of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eos append
|
||||
...(add more checks here if anything is missing)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 32000)
|
||||
self.assertEqual(self.tokenizer.tokenize('bug'), [1, 6494])
|
||||
self.assertEqual(self.tokenizer.detokenize(23961), 'Ukraine')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw dataset: check the number of rows in each column.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test that the correct data handler is selected for pretraining.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralPretrainHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the serialized pretraining data files are generated and are not empty (size checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 1)
|
||||
self.assertEqual(len(idx_file), 1)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 26, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,152 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import torch
|
||||
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.core.enums import ModelType
|
||||
from megatron.core.utils import get_model_config
|
||||
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
|
||||
|
||||
|
||||
class TestTraining(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.training_aux + config.network_size + \
|
||||
config.auxiliary_param + config.learning_rate_param + config.regularization + config.training_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
|
||||
from megatron.core import mpu
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
total_loss_dict = {}
|
||||
iteration = self.args.iteration
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
report_memory_flag = True
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
num_floating_point_operations_so_far = 0
|
||||
|
||||
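# mirror the core of Megatron's training loop: one train_step per iteration, followed by logging and periodic checkpointing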
while iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
batch_size = mpu.get_data_parallel_world_size() * \
|
||||
self.args.micro_batch_size * \
|
||||
get_num_microbatches()
|
||||
self.args.consumed_train_samples += batch_size
|
||||
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
|
||||
loss_scale = optimizer.get_loss_scale().item()
|
||||
params_norm = None
|
||||
learning_rate = None
|
||||
decoupled_learning_rate = None
|
||||
for param_group in optimizer.param_groups:
|
||||
if param_group['is_decoupled_lr']:
|
||||
decoupled_learning_rate = param_group['lr']
|
||||
else:
|
||||
learning_rate = param_group['lr']
|
||||
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
|
||||
decoupled_learning_rate,
|
||||
iteration, loss_scale,
|
||||
report_memory_flag, skipped_iter,
|
||||
grad_norm, params_norm, num_zeros_in_grad)
|
||||
saved_checkpoint = False
|
||||
if self.args.save and self.args.save_interval and \
|
||||
iteration % self.args.save_interval == 0:
|
||||
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
|
||||
saved_checkpoint = True
|
||||
break
|
||||
|
||||
if saved_checkpoint:
|
||||
for file_name in os.listdir(self.args.save):
|
||||
file_path = os.path.join(self.args.save, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
assert_judge(file_path.endswith(".txt"))
|
||||
else:
|
||||
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
|
||||
|
||||
|
||||
def test_breakpoint_renewal_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
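# point --load at the --save directory so this run resumes from the checkpoint written by test_training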
self.args.load = self.args.save
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step
|
||||
if self.args.load == self.args.save:  # We can regard this as a breakpoint-renewal (resume-from-checkpoint) training situation
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, _, _ \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
iteration = self.args.iteration
|
||||
assert_judge(iteration == 10)
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
|
||||
if iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(f"iteration {iteration}: loss {loss_dict.get('lm loss')}")
|
||||
assert_judge(abs(7.6 - loss_dict.get('lm loss')) < 0.2)
|
@ -1,45 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
"""
|
||||
We can config the params in the `.json` file including:
|
||||
distributed_param,
|
||||
network_size,
|
||||
inference_param,
|
||||
evaluation_param,
|
||||
lora_param,
|
||||
training_param,
|
||||
training_auxiliary,
|
||||
learning_rate,
|
||||
regularization,
|
||||
and other auxiliary_param.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
inference_aux = config_file["INFERENCE_AUX"]
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
evaluation_param = config_file["EVALUATION_PARAM"]
|
||||
lora_param = config_file["LORA_PARAM"]
|
||||
training_param = config_file["TRAINING_PARAM"]
|
||||
training_aux = config_file["TRAINING_AUX"]
|
||||
learning_rate_param = config_file["LEARNING_RATE"]
|
||||
regularization = config_file["REGULARIZATION"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
|
||||
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
|
||||
convert_ckpt_param = config_file["CONVERT_CKPT_FROM_HF"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@ -1,7 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
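# convert the HuggingFace checkpoint first; the pytest cases below load the converted weights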
python tests/pipeline/llama3-8B/test_convert_ckpt_from_huggingface.py
|
||||
|
||||
pytest -s tests/pipeline/llama3-8B/test_generation.py
|
||||
pytest -s tests/pipeline/llama3-8B/test_evaluation.py
|
||||
pytest -s tests/pipeline/llama3-8B/test_chat.py
|
@ -1,76 +0,0 @@
|
||||
{
|
||||
"CONVERT_CKPT_PARAM": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--load-dir", "/home/dataset/llama3-8B-hf",
|
||||
"--save-dir", "/home/dataset/llama3-8B-mt-t8p1",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--target-pipeline-parallel-size", "1",
|
||||
"--tokenizer-model", "None"
|
||||
],
|
||||
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "32",
|
||||
"--hidden-size", "4096",
|
||||
"--ffn-hidden-size", "14336",
|
||||
"--num-attention-heads", "32",
|
||||
"--max-position-embeddings", "8192",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "16032",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--load", "/home/dataset/llama3-8B-mt-t8p1"
|
||||
],
|
||||
|
||||
"TOKENIZER_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/llama3-8B-hf"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint",
|
||||
"--attention-softmax-in-fp32"
|
||||
],
|
||||
|
||||
"INFERENCE_HF_CHAT_PARAM": [
|
||||
"--hf-chat-template"
|
||||
],
|
||||
|
||||
"INFERENCE_PROMPT_CHAT_PARAM": [
|
||||
"--prompt-type", "llama3"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--tokenizer-not-use-fast",
|
||||
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test",
|
||||
"--task", "mmlu",
|
||||
"--max-new-tokens", "1",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "1",
|
||||
"--global-batch-size", "16",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--seed", "42",
|
||||
"--use-fused-rmsnorm",
|
||||
"--group-query-attention",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "8192",
|
||||
"--num-query-groups", "8",
|
||||
"--rotary-base", "500000"
|
||||
]
|
||||
}
|
@ -1,123 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import nltk
|
||||
import torch
|
||||
from torch import distributed as dist
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args, chat_get_instruction, chat_print_and_update_histories
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig, chat_type=None):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.inference_param + config.auxiliary_param + config.tokenizer_param
|
||||
|
||||
if chat_type == "hf_chat":
|
||||
sys.argv = sys.argv + config.inference_hf_chat_param
|
||||
elif chat_type == "prompt_chat":
|
||||
sys.argv = sys.argv + config.inference_prompt_chat_param
|
||||
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
|
||||
def edit_distance_similarity(self, text1, text2):
|
||||
"""
|
||||
edit distance: to compare the similarity between two texts.
|
||||
"""
|
||||
distance = nltk.edit_distance(text1, text2)
|
||||
try:
|
||||
similarity = 1 - (distance / max(len(text1), len(text2)))
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
return similarity
|
||||
|
||||
|
||||
def run_chat(self, model, turn0outputExpect):
|
||||
histories_no_template = []
|
||||
histories_template = []
|
||||
instruction = None
|
||||
|
||||
test_questions = ["你能推荐几本深度学习的书吗?", "上面推荐的书建议学习顺序呢?", "9.11和9.9谁大?"]
|
||||
|
||||
turns = 0
|
||||
while turns < 3:
|
||||
|
||||
prompt = test_questions[turns]
|
||||
|
||||
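# build the model input from the accumulated dialogue histories (with and without chat template) and the new user prompt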
instruction = chat_get_instruction(self.args, histories_no_template, histories_template, prompt)
|
||||
|
||||
responses = model.generate(
|
||||
instruction,
|
||||
do_sample=True,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
tokenizer=None,
|
||||
temperature=self.args.temperature,
|
||||
max_new_tokens=self.args.max_new_tokens,
|
||||
stream=True
|
||||
)
|
||||
output = chat_print_and_update_histories(self.args, responses, histories_no_template, histories_template, prompt)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print("-------------------------------")
|
||||
print(output)
|
||||
|
||||
if turns == 0:
|
||||
similarity1 = self.edit_distance_similarity(output[:30], turn0outputExpect[0][:30])
|
||||
similarity2 = self.edit_distance_similarity(output[:30], turn0outputExpect[1][:30])
|
||||
print("similarity1:", similarity1)
|
||||
print("similarity1:", similarity2)
|
||||
assert_judge(max(similarity1, similarity2) > 0.75)
|
||||
|
||||
turns = turns + 1
|
||||
|
||||
|
||||
def test_hf_chat(self):
|
||||
"""Interactive dialog mode with multiple rounds of conversation"""
|
||||
self.init(config=ParamConfig, chat_type="hf_chat")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
turn1outputExpect = []
|
||||
turn1outputExpect1 = "Here are some highly recommended books on deep learning that can help you dive deeper into the subject:"
|
||||
turn1outputExpect2 = '''Here are some highly recommended books for deep learning:\n\n**Foundational Books**\n\n1. **"Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville**: This is the bible of deep learning.'''
|
||||
|
||||
turn1outputExpect.append(turn1outputExpect1)
|
||||
turn1outputExpect.append(turn1outputExpect2)
|
||||
|
||||
self.run_chat(model, turn1outputExpect)
|
||||
|
||||
|
||||
def test_prompt_type_chat(self):
|
||||
"""Interactive dialog mode with multiple rounds of conversation"""
|
||||
self.init(config=ParamConfig, chat_type="prompt_chat")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
turn1outputExpect = []
|
||||
turn1outputExpect1 = "Here are some highly recommended books on deep learning that can help you dive deeper into the subject:"
|
||||
turn1outputExpect2 = '''Here are some highly recommended books for deep learning:\n\n**Foundational Books**\n\n1. **"Deep Learning" by Ian Goodfellow, Yoshua Bengio, and Aaron Courville**: This is the bible of deep learning.'''
|
||||
|
||||
turn1outputExpect.append(turn1outputExpect1)
|
||||
turn1outputExpect.append(turn1outputExpect2)
|
||||
|
||||
self.run_chat(model, turn1outputExpect)
|
||||
|
@ -1,60 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from utils import ParamConfig
|
||||
import torch
|
||||
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self, config=ParamConfig):
|
||||
# configure params, the index starts from 1
|
||||
self.config = config
|
||||
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
|
||||
|
||||
def test_file_exist(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including the `.safetensors` shards and `model.safetensors.index.json`.
|
||||
"""
|
||||
safetensors_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.safetensors"))
|
||||
self.assertEqual(len(safetensors_file), 4)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "model.safetensors.index.json")))
|
||||
|
||||
def test_convert_weights_from_huggingface(self):
|
||||
"""
|
||||
Test whether the weights are converted as expected into `--save-dir`. We check the model layer names,
|
||||
including the embedding, final_norm, output and encoder layers. The encoder contains several distinct sub-layers
|
||||
that together compose a transformer layer, and these layers stack to form the full model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model']  # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three top-level components.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([16032, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
# the encoder has a shared final_norm, and each layer contains the following six sub-layers
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
print(weight_common_content['encoder']["layers.31.mlp.dense_h_to_4h._extra_state"])
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([768, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([3584, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1792]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([16032, 4096]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,94 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch_npu
|
||||
from transformers import AutoTokenizer
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
class TestEvaluation(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
|
||||
config.evaluation_param + config.tokenizer_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_mmlu_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
|
||||
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
|
||||
test_dir = None
|
||||
for path in self.args.task_data_path:
|
||||
if "mmlu" in path:
|
||||
test_dir = path
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
|
||||
with open(template_dir, encoding='utf-8') as f:
|
||||
mmlu_few_shot_template = json.load(f)
|
||||
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
|
||||
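# file names follow the "<subject>_test.csv" pattern; strip the trailing 9 characters to recover the subject name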
subject_name = file[0: -9]
|
||||
subject = subject_name.replace("_", " ")
|
||||
acc_n = 0
|
||||
data_df_test = data_df[0:10]
|
||||
for index, row in data_df_test.iterrows():
|
||||
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
|
||||
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
|
||||
subject=subject,
|
||||
question=test_question)
|
||||
chat_result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
assert_judge(isinstance(chat_result, tuple))
|
||||
assert_judge(isinstance(chat_result[1], torch.Tensor))
|
||||
answer = None
|
||||
if chat_result:
|
||||
answer = chat_result[0].strip()
|
||||
if answer == row['answer']:
|
||||
acc_n += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(data_df_test)
|
||||
total_acc_n += acc_n
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
print(final_acc)
|
||||
assert_judge(abs(final_acc - 0.687) < 0.02)
|
||||
|
@ -1,97 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from megatron.training import get_args
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + \
|
||||
config.inference_param + config.auxiliary_param + config.tokenizer_param
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
self.args = get_args()
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["春夏秋冬,四个季节"]
|
||||
output = model.generate(instruction, detokenize=False)
|
||||
expected_output1 = [3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602,
|
||||
3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602,
|
||||
3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602,
|
||||
3922, 64803, 19483, 105343, 56602, 3922, 64803, 19483, 105343, 56602]
|
||||
expected_output2 = [3922, 64803, 19483, 13646, 125436, 3922, 64803, 19483, 24273, 25129,
|
||||
3922, 64803, 19483, 27384, 24273, 25129, 3922, 64803, 19483, 31809,
|
||||
24273, 25129, 3922, 64803, 19483, 27384, 24273, 25129, 9554, 64803,
|
||||
19483, 31809, 24273, 25129, 3922, 64803, 19483, 31809, 24273, 25129]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
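# compare the first 40 generated token ids against both expected sequences via cosine similarity and keep the higher score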
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim1 = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
|
||||
output[:40].unsqueeze(0).float())
|
||||
cos_sim2 = similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
|
||||
output[:40].unsqueeze(0).float())
|
||||
cos_sim = torch.max(cos_sim1, cos_sim2)
|
||||
print("similarity: ", cos_sim)
|
||||
assert_judge(cos_sim > 0.95)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
instruction = "北京奥运会"
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False
|
||||
)
|
||||
expected_output = [9554, 30867, 106633, 29430, 17905, 3922, 102446, 110125, 35287, 28038,
|
||||
70090, 108025, 109169, 57668, 26123, 34208, 28038, 37046, 34208, 57668,
|
||||
26123, 78640, 61075, 104261, 103302, 1811, 1049, 23, 8107, 24,
|
||||
9953, 3922, 110284, 35287, 19000, 70090, 108448, 23039, 9554, 30537]
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output).unsqueeze(0).float().npu(),
|
||||
output[:40].unsqueeze(0).float())
|
||||
print("similarity: ", cos_sim)
|
||||
assert_judge(cos_sim > 0.95)
|
@ -1,38 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
"""
|
||||
We can config the params in the `.json` file including:
|
||||
convert_ckpt_param,
|
||||
network_size,
|
||||
tokenizer_param,
|
||||
distributed_param,
|
||||
inference_param,
|
||||
evaluation_param,
|
||||
and other auxiliary_param.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
convert_ckpt_param = config_file["CONVERT_CKPT_PARAM"]
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
tokenizer_param = config_file["TOKENIZER_PARAM"]
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
evaluation_param = config_file["EVALUATION_PARAM"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
|
||||
inference_hf_chat_param = config_file["INFERENCE_HF_CHAT_PARAM"]
|
||||
inference_prompt_chat_param = config_file["INFERENCE_PROMPT_CHAT_PARAM"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@ -1,9 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
python tests/pipeline/mistral-7B/test_process_pretrain_data.py
|
||||
python tests/pipeline/mistral-7B/test_process_instruction_data.py
|
||||
python tests/pipeline/mistral-7B/test_convert_ckpt_from_huggingface.py
|
||||
|
||||
pytest -s tests/pipeline/mistral-7B/test_generation.py
|
||||
pytest -s tests/pipeline/mistral-7B/test_evaluation.py
|
||||
pytest -s tests/pipeline/mistral-7B/test_trainer.py
|
@ -1,123 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "32",
|
||||
"--hidden-size", "4096",
|
||||
"--ffn-hidden-size", "14336",
|
||||
"--num-attention-heads", "32",
|
||||
"--max-position-embeddings", "32768",
|
||||
"--position-embedding-type", "rope",
|
||||
"--group-query-attention",
|
||||
"--num-query-groups", "8",
|
||||
"--make-vocab-size-divisible-by", "1",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--sliding-window", "4096"
|
||||
],
|
||||
|
||||
"INFERENCE_AUX": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
|
||||
"--load", "/home/dataset/mistral-7B-tp8-pp1",
|
||||
"--seed", "42",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--task-data-path", "/home/dataset/eval_dataset/boolq/test/", "/home/dataset/eval_dataset/mmlu/test/",
|
||||
"--max-new-tokens", "2"
|
||||
],
|
||||
|
||||
"TRAINING_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
|
||||
"--load", "/home/dataset/mistral-7B-tp8-pp1",
|
||||
"--save", "/autotest/dataset/save-weight-mistral-7B",
|
||||
"--data-path", "/home/dataset/pretrain-dataset-mistral-7B/alpaca_text_document",
|
||||
"--train-iters", "15"
|
||||
],
|
||||
|
||||
"REGULARIZATION": [
|
||||
"--attention-dropout", "0.0",
|
||||
"--hidden-dropout", "0.0",
|
||||
"--weight-decay", "1e-1",
|
||||
"--clip-grad", "1.0",
|
||||
"--adam-beta1", "0.9",
|
||||
"--adam-beta2", "0.95"
|
||||
],
|
||||
|
||||
"LEARNING_RATE": [
|
||||
"--lr", "1.25e-6",
|
||||
"--lr-decay-style", "cosine",
|
||||
"--lr-warmup-fraction", "0.01",
|
||||
"--min-lr", "1.25e-7"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "1",
|
||||
"--global-batch-size", "32",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "8192"
|
||||
],
|
||||
|
||||
"TRAINING_AUX": [
|
||||
"--sequence-parallel",
|
||||
"--initial-loss-scale", "65536",
|
||||
"--use-flash-attn",
|
||||
"--use-fused-rmsnorm",
|
||||
"--init-method-std", "0.01",
|
||||
"--split", "100,0,0",
|
||||
"--log-interval", "1",
|
||||
"--save-interval", "10",
|
||||
"--eval-interval", "1000",
|
||||
"--eval-iters", "0",
|
||||
"--num-workers", "0",
|
||||
"--distributed-backend", "nccl"
|
||||
],
|
||||
|
||||
"PROCESS_PRETRAIN_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--output-prefix", "/home/dataset/pretrain-dataset-mistral-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA": [
|
||||
"--input", "/home/dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "GeneralInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-mistral-7B/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/mistral-7B",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod"
|
||||
],
|
||||
|
||||
"CONVERT_CKPT_FROM_HF": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--load-dir", "/home/dataset/mistral-7B",
|
||||
"--save-dir", "/home/dataset/mistral-7B-tp8-pp1",
|
||||
"--tokenizer-model", "/home/dataset/mistral-7B/tokenizer.model"
|
||||
]
|
||||
}
|
@ -1,97 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
export CUDA_DEVICE_MAX_CONNECTIONS=1
|
||||
export NPU_ASD_ENABLE=0
|
||||
|
||||
GPUS_PER_NODE=8
|
||||
MASTER_ADDR=localhost
|
||||
MASTER_PORT=6000
|
||||
NNODES=1
|
||||
NODE_RANK=0
|
||||
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
|
||||
|
||||
DATA_PATH=/home/dataset/pretrain-dataset-mistral-7B/alpaca_text_document
|
||||
TOKENIZER_MODEL=/home/dataset/mistral-7B
|
||||
TP=8
|
||||
PP=1
|
||||
NUM_LAYERS=32
|
||||
|
||||
DISTRIBUTED_ARGS="
|
||||
--nproc_per_node $GPUS_PER_NODE \
|
||||
--nnodes $NNODES \
|
||||
--node_rank $NODE_RANK \
|
||||
--master_addr $MASTER_ADDR \
|
||||
--master_port $MASTER_PORT
|
||||
"
|
||||
|
||||
GPT_ARGS="
|
||||
--tensor-model-parallel-size ${TP} \
|
||||
--pipeline-model-parallel-size ${PP} \
|
||||
--sequence-parallel \
|
||||
--sliding-window 4096 \
|
||||
--num-layers ${NUM_LAYERS} \
|
||||
--hidden-size 4096 \
|
||||
--ffn-hidden-size 14336 \
|
||||
--num-attention-heads 32 \
|
||||
--group-query-attention \
|
||||
--num-query-groups 8 \
|
||||
--tokenizer-type PretrainedFromHF \
|
||||
--tokenizer-name-or-path ${TOKENIZER_MODEL} \
|
||||
--seq-length 32768 \
|
||||
--max-position-embeddings 32768 \
|
||||
--micro-batch-size 1 \
|
||||
--global-batch-size 32 \
|
||||
--make-vocab-size-divisible-by 1 \
|
||||
--lr 1.25e-6 \
|
||||
--train-iters 2000 \
|
||||
--lr-decay-style cosine \
|
||||
--untie-embeddings-and-output-weights \
|
||||
--disable-bias-linear \
|
||||
--attention-dropout 0.0 \
|
||||
--init-method-std 0.01 \
|
||||
--hidden-dropout 0.0 \
|
||||
--position-embedding-type rope \
|
||||
--normalization RMSNorm \
|
||||
--use-fused-rmsnorm \
|
||||
--swiglu \
|
||||
--use-flash-attn \
|
||||
--no-masked-softmax-fusion \
|
||||
--attention-softmax-in-fp32 \
|
||||
--min-lr 1.25e-7 \
|
||||
--weight-decay 1e-1 \
|
||||
--lr-warmup-fraction 0.01 \
|
||||
--clip-grad 1.0 \
|
||||
--adam-beta1 0.9 \
|
||||
--initial-loss-scale 65536 \
|
||||
--adam-beta2 0.95 \
|
||||
--no-gradient-accumulation-fusion \
|
||||
--no-load-optim \
|
||||
--no-load-rng \
|
||||
--use-mc2 \
|
||||
--use-fused-swiglu \
|
||||
--use-rotary-position-embeddings \
|
||||
--use-fused-rotary-pos-emb \
|
||||
--use-distributed-optimizer \
|
||||
--overlap-grad-reduce \
|
||||
--bf16
|
||||
"
|
||||
# --save ${CKPT_SAVE_DIR} \
|
||||
# --load ${CKPT_LOAD_DIR} \
|
||||
|
||||
DATA_ARGS="
|
||||
--data-path $DATA_PATH \
|
||||
--split 100,0,0
|
||||
"
|
||||
|
||||
OUTPUT_ARGS="
|
||||
--log-interval 1 \
|
||||
--save-interval 10000 \
|
||||
--eval-interval 5000 \
|
||||
--eval-iters 0 \
|
||||
"
|
||||
|
||||
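# launch distributed pretraining on the 8 local NPUs; output is also written to the log file via tee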
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
|
||||
$GPT_ARGS \
|
||||
$DATA_ARGS \
|
||||
$OUTPUT_ARGS \
|
||||
--distributed-backend nccl 2>&1 | tee /home/dataset/new_mistral-7B.log
|
@ -1,64 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import torch
|
||||
from utils import ParamConfig
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self, config=ParamConfig):
|
||||
# configure params, the index starts from 1
|
||||
self.config = config
|
||||
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
|
||||
|
||||
def test_file_exist(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including the `.safetensors` shards and `model.safetensors.index.json`.
|
||||
"""
|
||||
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[9], "*.safetensors"))
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[9], "model.safetensors.index.json")))
|
||||
|
||||
def test_convert_weights_from_huggingface(self):
|
||||
"""
|
||||
Test whether the weights are converted as expected into `--save-dir`. We check the model layer names,
|
||||
including the embedding, final_norm, output and encoder layers. The encoder contains several distinct sub-layers
|
||||
that together compose a transformer layer, and these layers stack to form the full model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(self.config.convert_ckpt_param[11], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model']  # extract common content
|
||||
|
||||
# embedding, encoder and output_layer are the three top-level components.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[7]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([4000, 4096]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
# the encoder has a shared final_norm, and each layer contains the following six sub-layers
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(
|
||||
weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([768, 4096]))
|
||||
self.assertEqual(
|
||||
weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([4096, 512]))
|
||||
self.assertEqual(
|
||||
weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([3584, 4096]))
|
||||
self.assertEqual(
|
||||
weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([4096, 1792]))
|
||||
self.assertEqual(
|
||||
weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([4096]))
|
||||
self.assertEqual(
|
||||
weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([4096]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([4000, 4096]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,107 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
import tqdm
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch_npu
|
||||
from transformers import AutoTokenizer
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.evaluation.utils import add_text_generate_args
|
||||
|
||||
|
||||
class TestEvaluation(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
|
||||
config.inference_aux + config.evaluation_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def get_result(self, tokenizer, result):
|
||||
if result:
|
||||
final_result = [result[0]]
|
||||
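# decide the answer by comparing the output log-probs of the "Yes" and "No" tokens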
if result[1][0][tokenizer.encode("Yes")[-1]] >= result[1][0][tokenizer.encode("No")[-1]]:
|
||||
final_result.append('T')
|
||||
else:
|
||||
final_result.append('F')
|
||||
else:
|
||||
final_result = None
|
||||
return final_result
|
||||
|
||||
def test_mmlu_evaluation(self):
|
||||
self.init(config=ParamConfig)
|
||||
from evaluation import model_provider
|
||||
from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_name_or_path=self.args.load
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
|
||||
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"
|
||||
|
||||
total_acc_n = 0
|
||||
total_n = 0
|
||||
|
||||
test_dir = None
|
||||
for path in self.args.task_data_path:
|
||||
if "mmlu" in path:
|
||||
test_dir = path
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
|
||||
with open(template_dir, encoding='utf-8') as f:
|
||||
mmlu_few_shot_template = json.load(f)
|
||||
|
||||
temp = []
|
||||
for file in tqdm.tqdm(os.listdir(test_dir)):
|
||||
file_path = os.path.join(test_dir, file)
|
||||
data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
|
||||
subject_name = file[0: -9]
|
||||
subject = subject_name.replace("_", " ")
|
||||
acc_n = 0
|
||||
data_df_test = data_df[0:10]
|
||||
for index, row in data_df_test.iterrows():
|
||||
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
|
||||
instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
|
||||
subject=subject,
|
||||
question=test_question)
|
||||
chat_result = model.generate(
|
||||
instruction,
|
||||
do_sample=False,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=tokenizer,
|
||||
stream=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
assert_judge(isinstance(chat_result, tuple))
|
||||
assert_judge(isinstance(chat_result[1], torch.Tensor))
|
||||
answer = None
|
||||
if chat_result:
|
||||
answer = chat_result[0][0]
|
||||
temp.append(answer)
|
||||
if answer == row['answer']:
|
||||
acc_n += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
total_n += len(data_df_test)
|
||||
total_acc_n += acc_n
|
||||
if torch.distributed.get_rank() == 0:
|
||||
try:
|
||||
final_acc = total_acc_n / total_n
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
print(f"==================== final acc: {final_acc} ====================")
|
||||
assert_judge(abs(final_acc - 0.594) < 0.01)
|
||||
|
@ -1,117 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import nltk
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param +\
|
||||
config.inference_aux + config.inference_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def edit_distance_similarity(self, text1, text2):
|
||||
"""
|
||||
edit distance: to compare the similarity between two texts.
|
||||
"""
|
||||
distance = nltk.edit_distance(text1, text2)
|
||||
try:
|
||||
similarity = 1 - (distance / max(len(text1), len(text2)))
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
return similarity
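# Worked example of the metric above (not part of the original test):
# nltk.edit_distance computes the Levenshtein distance, so for "kitten" vs
# "sitting" the distance is 3 and the similarity is
# 1 - 3 / max(6, 7) = 1 - 3/7, about 0.571; identical strings score 1.0.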
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
instruction = ["how are you?", "Give me three tips for staying healthy."]
|
||||
output = model.generate(instruction)
|
||||
|
||||
expect_output1 = [
|
||||
"I'm doing well, thank you for asking! I've been keeping busy with work and various projects. "
|
||||
"How about you? How have you been?"
|
||||
]
|
||||
expect_output2 = [
|
||||
'\n\n1. Eat a balanced diet: Consuming a variety of nutrient-dense foods from all the food groups is '
|
||||
'essential for maintaining good health.\n\n2. Stay hydrated: Water is essential for maintaining '
|
||||
'good health. Aim to drink at least eight glasses of water a day.\n\n3. Get enough sleep: '
|
||||
'Sleep is essential for maintaining good health. Aim to get at least seven to eight hours of'
|
||||
' quality sleep each night.'
|
||||
]
|
||||
|
||||
expect_output1_seq = "".join(expect_output1)
|
||||
expect_output2_seq = ''.join(expect_output2)
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output[0])
|
||||
print(output[1])
|
||||
|
||||
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
|
||||
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
|
||||
print("similarity1:", similarity1)
|
||||
print("similarity2:", similarity2)
|
||||
assert_judge(similarity1 > 0.85)
|
||||
assert_judge(similarity2 > 0.85)
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig)
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
instruction = "What is the whether like today?"
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False
|
||||
)
|
||||
|
||||
expected_output = [
|
||||
"The weather today is described as mostly sunny with a high temperature around 70 degrees "
|
||||
"Fahrenheit (21 degrees Celsius).\n\nTo determine if the weather will be similar tomorrow, "
|
||||
"you would need to check the weather forecast for tomorrow. The forecast may "
|
||||
"indicate similar weather conditions, or it may suggest different conditions such as rain or clouds."
|
||||
"\n\nTherefore, to answer your question, I would need to check the weather forecast for tomorrow. "
|
||||
"Once I have that information, I can tell you whether the weather is expected to be similar to today, "
|
||||
"or if it is expected to be different."
|
||||
]
|
||||
expected_output_seq = "".join(expected_output)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
similarity = self.edit_distance_similarity(output[:40], expected_output_seq[:40])
|
||||
print("similarity:", similarity)
|
||||
assert_judge(similarity > 0.75)
|
@ -1,82 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralInstructionHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessInstructionData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test normal function of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eod append
|
||||
... (add more checks here if anything is missed)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 32000)
|
||||
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [1, 523, 28734, 7355, 28787, 28767])
|
||||
self.assertEqual(self.tokenizer.detokenize(31338), 'ฉ')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of rows in each column.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test whether the correct data handler is returned for instruction data.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralInstructionHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the preprocessed instruction dataset files are generated and are non-empty (sizes checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertEqual(len(idx_file), 3)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 90, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,82 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import GeneralPretrainHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessPretrainData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.pretrain_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test normal function of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eos append
|
||||
... (add more checks here if anything is missed)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
self.assertEqual(self.tokenizer.vocab_size, 32000)
|
||||
self.assertEqual(self.tokenizer.tokenize('bug'), [1, 10079])
|
||||
self.assertEqual(self.tokenizer.detokenize(23961), 'behaviors')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eos), '</s>')
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of rows in each column.
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("instruction")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("input")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("output")), 52002)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("text")), 52002)
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test whether the correct data handler is returned for pretraining.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, GeneralPretrainHandler)
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the preprocessed pretraining dataset files are generated and are non-empty (sizes checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.pretrain_data_param[5].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 1)
|
||||
self.assertEqual(len(idx_file), 1)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 26, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,154 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import torch
|
||||
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.core.enums import ModelType
|
||||
from megatron.core.utils import get_model_config
|
||||
from megatron.training.training import setup_model_and_optimizer, build_train_valid_test_data_iterators, num_floating_point_operations
|
||||
|
||||
|
||||
class TestTraining(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param + config.training_aux + config.network_size + \
|
||||
config.auxiliary_param + config.learning_rate_param + config.regularization + config.training_param
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
from megatron.training import get_args
|
||||
self.args = get_args()
|
||||
|
||||
def test_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step, training_log, save_checkpoint_and_time
|
||||
from megatron.core import mpu
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, valid_data_iterator, test_data_iterator \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
if self.args.eval_iters == 0:
|
||||
assert_judge(valid_data_iterator is None)
|
||||
assert_judge(test_data_iterator is None)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
total_loss_dict = {}
|
||||
iteration = self.args.iteration
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
report_memory_flag = True
|
||||
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
num_floating_point_operations_so_far = 0
|
||||
|
||||
|
||||
while iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
batch_size = mpu.get_data_parallel_world_size() * \
|
||||
self.args.micro_batch_size * \
|
||||
get_num_microbatches()
|
||||
self.args.consumed_train_samples += batch_size
|
||||
num_floating_point_operations_so_far += num_floating_point_operations(self.args, batch_size)
|
||||
loss_scale = optimizer.get_loss_scale().item()
|
||||
params_norm = None
|
||||
learning_rate = None
|
||||
decoupled_learning_rate = None
|
||||
for param_group in optimizer.param_groups:
|
||||
if param_group['is_decoupled_lr']:
|
||||
decoupled_learning_rate = param_group['lr']
|
||||
else:
|
||||
learning_rate = param_group['lr']
|
||||
report_memory_flag = training_log(loss_dict, total_loss_dict, learning_rate,
|
||||
decoupled_learning_rate,
|
||||
iteration, loss_scale,
|
||||
report_memory_flag, skipped_iter,
|
||||
grad_norm, params_norm, num_zeros_in_grad)
|
||||
saved_checkpoint = False
|
||||
if self.args.save and self.args.save_interval and \
|
||||
iteration % self.args.save_interval == 0:
|
||||
save_checkpoint_and_time(iteration, model, optimizer, lr_scheduler, num_floating_point_operations_so_far)
|
||||
saved_checkpoint = True
|
||||
break
|
||||
|
||||
if saved_checkpoint:
|
||||
for file_name in os.listdir(self.args.save):
|
||||
file_path = os.path.join(self.args.save, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
assert_judge(file_path.endswith(".txt"))
|
||||
else:
|
||||
assert_judge(len(os.listdir(file_path)) == self.args.tensor_model_parallel_size)
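# Presumably the save directory follows the usual Megatron checkpoint layout:
# a top-level `latest_checkpointed_iteration.txt` tracker file plus an
# `iter_XXXXXXX/` directory containing one `mp_rank_XX` shard per
# tensor-parallel rank, which is what the two assertions above verify.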
|
||||
|
||||
|
||||
def test_breakpoint_renewal_training(self):
|
||||
self.init(config=ParamConfig)
|
||||
self.args.load = self.args.save
|
||||
torch.npu.set_compile_mode(jit_compile=True)
|
||||
from pretrain_gpt import model_provider, forward_step
|
||||
from pretrain_gpt import train_valid_test_datasets_provider
|
||||
from megatron.training.global_vars import update_num_microbatches, get_timers
|
||||
from megatron.training.training import train_step
|
||||
if self.args.load == self.args.save:  # load == save, so this run resumes from the checkpoint just saved (breakpoint renewal training)
|
||||
model, optimizer, lr_scheduler = setup_model_and_optimizer(
|
||||
model_provider, ModelType.encoder_or_decoder)
|
||||
assert_judge(isinstance(model, list))
|
||||
|
||||
config = get_model_config(model[0])
|
||||
train_valid_test_datasets_provider.is_distributed = True
|
||||
train_data_iterator, _, _ \
|
||||
= build_train_valid_test_data_iterators(
|
||||
train_valid_test_datasets_provider
|
||||
)
|
||||
|
||||
for model_module in model:
|
||||
model_module.train()
|
||||
|
||||
timers = get_timers()
|
||||
iteration = self.args.iteration
|
||||
assert_judge(iteration == 10)
|
||||
config.grad_scale_func = optimizer.scale_loss
|
||||
config.timers = timers
|
||||
timers('interval-time', log_level=0).start(barrier=True)
|
||||
|
||||
if iteration < self.args.train_iters:
|
||||
update_num_microbatches(self.args.consumed_train_samples)
|
||||
self.args.curr_iteration = iteration
|
||||
loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
|
||||
train_step(forward_step,
|
||||
train_data_iterator,
|
||||
model,
|
||||
optimizer,
|
||||
lr_scheduler,
|
||||
config)
|
||||
iteration += 1
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(f"iteration {iteration}: loss {loss_dict.get('lm loss')}")
|
||||
assert_judge(abs(1.0998 - loss_dict.get('lm loss')) < 0.2)
|
@ -1,43 +0,0 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
"""
|
||||
We can configure the params in the `.json` file, including:
|
||||
distributed_param,
|
||||
network_size,
|
||||
inference_param,
|
||||
evaluation_param,
|
||||
training_param,
|
||||
training_auxiliary,
|
||||
learning_rate,
|
||||
regularization,
|
||||
and other auxiliary_param.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
inference_aux = config_file["INFERENCE_AUX"]
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
evaluation_param = config_file["EVALUATION_PARAM"]
|
||||
training_param = config_file["TRAINING_PARAM"]
|
||||
training_aux = config_file["TRAINING_AUX"]
|
||||
learning_rate_param = config_file["LEARNING_RATE"]
|
||||
regularization = config_file["REGULARIZATION"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
pretrain_data_param = config_file["PROCESS_PRETRAIN_DATA"]
|
||||
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
|
||||
convert_ckpt_param = config_file["CONVERT_CKPT_FROM_HF"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@ -1,151 +0,0 @@
|
||||
{
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "32",
|
||||
"--hidden-size", "4096",
|
||||
"--ffn-hidden-size", "11008",
|
||||
"--num-attention-heads", "32",
|
||||
"--max-position-embeddings", "32768",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "16",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--add-qkv-bias"
|
||||
],
|
||||
|
||||
"TOKENIZER_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "1",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "2",
|
||||
"--global-batch-size", "64",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--seed", "42",
|
||||
"--use-fused-rmsnorm",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "8192",
|
||||
"--padded-vocab-size", "151936",
|
||||
"--attention-softmax-in-fp32"
|
||||
],
|
||||
|
||||
"OUTPUT_PARAM": [
|
||||
"--log-interval", "1",
|
||||
"--save-interval", "10000",
|
||||
"--eval-interval", "1000",
|
||||
"--eval-iters", "5"
|
||||
],
|
||||
|
||||
"INSTRUCTION_PARAM": [
|
||||
"--finetune",
|
||||
"--is-instruction-dataset",
|
||||
"--data-path", "/home/dataset/tune-dataset-qwen-7B/alpaca",
|
||||
"--split", "90,5,5",
|
||||
"--train-iters", "5"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM_TP8_PP1": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA": [
|
||||
"--input", "train-00000-of-00001-a09b74b3ef9c3b56, alpaca_zh, sharegpt1, sharegpt2",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "LlamaFactoryInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod",
|
||||
"--prompt-type", "qwen",
|
||||
"--dataset-dir", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/dataset/",
|
||||
"--overwrite-cache"
|
||||
],
|
||||
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA_MIX1": [
|
||||
"--input", "train-00000-of-00001-a09b74b3ef9c3b56, alpaca_zh, sharegpt1, sharegpt2",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "LlamaFactoryInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod",
|
||||
"--prompt-type", "qwen",
|
||||
"--dataset-dir", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/dataset/",
|
||||
"--overwrite-cache",
|
||||
"--interleave-probs", "0.1, 0.2, 0.3, 0.4",
|
||||
"--mix-strategy", "interleave_under",
|
||||
"--max-samples", "10"
|
||||
],
|
||||
|
||||
"PROCESS_INSTRUCTION_DATA_MIX2": [
|
||||
"--input", "train-00000-of-00001-a09b74b3ef9c3b56, alpaca_zh, sharegpt1, sharegpt2",
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--handler-name", "LlamaFactoryInstructionHandler",
|
||||
"--output-prefix", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/alpaca",
|
||||
"--tokenizer-name-or-path", "/home/dataset/qwen-7b-hf/",
|
||||
"--workers", "4",
|
||||
"--log-interval", "1000",
|
||||
"--append-eod",
|
||||
"--prompt-type", "qwen",
|
||||
"--dataset-dir", "/home/dataset/tune-dataset-qwen-7B/lfhandler_tune_dataset/dataset/",
|
||||
"--overwrite-cache",
|
||||
"--interleave-probs", "0.1, 0.2, 0.3, 0.4",
|
||||
"--mix-strategy", "interleave_over",
|
||||
"--max-samples", "10"
|
||||
],
|
||||
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint",
|
||||
"--attention-softmax-in-fp32",
|
||||
"--prompt-type", "qwen",
|
||||
"--seed", "42",
|
||||
"--load", "/home/dataset/Qwen-7B-v0.1-tp8-pp1/"
|
||||
],
|
||||
|
||||
|
||||
"BEAM_SEARCH_AUXILIARY_PARAM": [
|
||||
"--task", "beam_search",
|
||||
"--top-p", "0.95",
|
||||
"--top-k", "50"
|
||||
],
|
||||
|
||||
"GREEDY_SEARCH_AUXILIARY_PARAM": [
|
||||
"--task", "greedy"
|
||||
],
|
||||
|
||||
"DO_SAMPLE_AUXILIARY_PARAM": [
|
||||
"--task", "do_sample",
|
||||
"--top-p", "0.95",
|
||||
"--top-k", "50"
|
||||
],
|
||||
|
||||
"BEAM_SEARCH_WITH_SAMPLING_AUXILIARY_PARAM": [
|
||||
"--task", "beam_search_with_sampling",
|
||||
"--top-p", "0.95",
|
||||
"--top-k", "50"
|
||||
],
|
||||
|
||||
"RETURN_OUTPUT_LOG_PROBS_AUXILIARY_PARAM": [
|
||||
"--task", "return_output_log_probs",
|
||||
"--temperature 0.6",
|
||||
"--top-p", "0.95",
|
||||
"--top-k", "50"
|
||||
]
|
||||
}
|
@ -1,6 +0,0 @@
|
||||
# Provide uniform access for the pipeline.
|
||||
|
||||
pytest -s ./tests/pipeline/qwen-7B/test_instruction.py
|
||||
pytest -s ./tests/pipeline/qwen-7B/test_process_instruction_data.py
|
||||
pytest -s ./tests/pipeline/qwen-7B/test_generation.py
|
||||
pytest -s ./tests/pipeline/qwen-7B/test_generation2.py
|
@ -1,141 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import nltk
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from megatron.training import get_args, get_tokenizer
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig, task=None):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param_tp8_pp1 + config.network_size + \
|
||||
config.inference_param + config.beam_search_auxliary_param + config.auxiliary_param + config.tokenizer_param
|
||||
|
||||
if task == "beam_search_with_sampling":
|
||||
sys.argv = sys.argv + config.beam_search_with_sampling_auxliary_param
|
||||
elif task == "return_output_log_probs":
|
||||
sys.argv = sys.argv + config.return_output_log_probs_auxliary_param
|
||||
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
self.args = get_args()
|
||||
|
||||
|
||||
def edit_distance_similarity(self, text1, text2):
|
||||
"""
|
||||
edit distance: to compare the similarity between two texts.
|
||||
"""
|
||||
distance = nltk.edit_distance(text1, text2)
|
||||
try:
|
||||
similarity = 1 - (distance / max(len(text1), len(text2)))
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
return similarity
|
||||
|
||||
|
||||
def test_beam_search_with_sampling(self):
|
||||
"""Beam Search with sampling"""
|
||||
self.init(config=ParamConfig, task="beam_search_with_sampling")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
instruction = "Give me three tips for staying healthy."
|
||||
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
do_sample=True,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=self.args.max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False
|
||||
)
|
||||
|
||||
expect_output1 = '''1. Get enough sleep. A good night's sleep is important for your physical and mental health.\n2. Eat a balanced diet. Eating a variety of healthy foods can help you get the nutrients your body needs.\n3. Exercise regularly. Exercise can help you maintain a healthy weight, reduce stress, and improve your overall health.'''
|
||||
|
||||
expect_output2 = '''Sure, here are three tips for staying healthy:\n1. Eat a balanced diet that includes fruits, vegetables, whole grains, and lean proteins.\n2. Get regular exercise, such as going for a walk or doing yoga.\n3. Get enough sleep each night, ideally 7-8 hours.'''
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
tokenizer = get_tokenizer()
|
||||
|
||||
similarity1 = self.edit_distance_similarity(output[:30], expect_output1[:30])
|
||||
similarity2 = self.edit_distance_similarity(output[:30], expect_output2[:30])
|
||||
print("similarity1:", similarity1)
|
||||
print("similarity1:", similarity2)
|
||||
assert_judge(max(similarity1, similarity2) > 0.75)
|
||||
|
||||
|
||||
def test_return_output_log_probs(self):
|
||||
"""Returns the probability distribution of tokens"""
|
||||
self.init(config=ParamConfig, task="return_output_log_probs")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
instruction = "What is the whether like today?"
|
||||
|
||||
output1, log_probs = model.generate(
|
||||
instruction,
|
||||
do_sample=True,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
temperature=self.args.temperature,
|
||||
max_new_tokens=self.args.max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False,
|
||||
return_output_log_probs=True
|
||||
)
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
tokenizer = get_tokenizer()
|
||||
print("--------------output1-------------")
|
||||
print(output1)
|
||||
print(tokenizer.decode(output1))
|
||||
|
||||
expected_output1 = [2132, 686, 6761, 389, 1380, 498, 525, 304, 279, 1879,
|
||||
13, 576, 9104, 646, 387, 2155, 304, 2155, 7482, 624,
|
||||
872, 198, 3838, 374, 279, 9104, 1075, 304, 7148, 5267,
|
||||
77091, 198, 785, 9104, 304, 7148, 3351, 374, 39698, 323]
|
||||
|
||||
expected_output1_ext = [2132, 686, 6761, 389, 1380, 498, 525, 7407, 13, 16503,
|
||||
498, 3291, 752, 697, 3728, 5267, 872, 198, 29596, 11902,
|
||||
198, 77091, 198, 641, 9656, 11902, 11, 432, 594, 39698,
|
||||
3351, 13, 576, 9315, 374, 220, 23, 15, 12348, 68723]
|
||||
expected_output1_ext2 = [2132, 374, 83253, 16916, 3351, 382, 77091, 198, 3838, 374,
|
||||
279, 9104, 1075, 3351, 5267, 2610, 525, 264, 10950, 17847,
|
||||
13, 279, 198, 3838, 374, 279, 9104, 1075, 3351, 5267,
|
||||
2610, 525, 264, 10950, 17847, 13, 279, 198, 3838, 374]
|
||||
print("--------------log_probs----------------")
|
||||
print(log_probs.shape)
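# Presumably [max_new_tokens, padded_vocab_size] = [256, 151936], matching the
# INFERENCE_PARAM and AUXILIARY_PARAM values in this suite's param_config.json.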
|
||||
assert_judge(log_probs.shape[0] == 256)
|
||||
assert_judge(log_probs.shape[1] == 151936)
|
||||
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output1[:40]).unsqueeze(0).float().npu(),
|
||||
output1[:40].unsqueeze(0).float())
|
||||
cos_sim = max(cos_sim, similarity(torch.tensor(expected_output1_ext[:40]).unsqueeze(0).float().npu(),
|
||||
output1[:40].unsqueeze(0).float()))
|
||||
cos_sim = max(cos_sim, similarity(torch.tensor(expected_output1_ext2[:40]).unsqueeze(0).float().npu(),
|
||||
output1[:40].unsqueeze(0).float()))
|
||||
print("similarity1: ", cos_sim)
|
||||
assert_judge(cos_sim > 0.75)
|
@ -1,165 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import torch
|
||||
import nltk
|
||||
from tests.common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
from megatron.training import get_args, get_tokenizer
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
from modellink.tasks.inference.text_generation.infer_base import add_text_generate_args
|
||||
|
||||
|
||||
class TestGeneration(DistributedTest):
|
||||
world_size = 8
|
||||
|
||||
def init(self, config=ParamConfig, task=None):
|
||||
"""
|
||||
initialize the environment and arguments
|
||||
"""
|
||||
sys.argv = [sys.argv[0]] + config.distributed_param_tp8_pp1 + config.network_size + \
|
||||
config.inference_param + config.beam_search_auxliary_param + config.auxiliary_param + config.tokenizer_param
|
||||
|
||||
if task == "beam_search":
|
||||
sys.argv = sys.argv + config.beam_search_auxliary_param
|
||||
elif task == "greedy":
|
||||
sys.argv = sys.argv + config.greedy_search_auxliary_param
|
||||
elif task == "do_sample":
|
||||
sys.argv = sys.argv + config.do_sample_auxliary_param
|
||||
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=add_text_generate_args,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
self.args = get_args()
|
||||
|
||||
|
||||
def test_beam_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `beam search`.
|
||||
"""
|
||||
self.init(config=ParamConfig, task="beam_search")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
max_new_tokens = self.args.max_new_tokens
|
||||
instruction = "如何提高身体素质"
|
||||
output = model.generate(
|
||||
instruction,
|
||||
num_beams=2,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False,
|
||||
detokenize=False
|
||||
)
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print("----------------------output-------------------------")
|
||||
print(output)
|
||||
expected_output1 = [100627, 101099, 100838, 104339, 101194, 3837, 87752, 99639, 6684, 31338,
|
||||
96422, 28311, 16, 13, 4891, 251, 248, 68878, 101079, 5122,
|
||||
106854, 104102, 71817, 16, 20, 15, 83031, 9370, 15946, 49567,
|
||||
102660, 18830, 100316, 101079, 3837, 29524, 99234, 99314, 5373, 107530]
|
||||
|
||||
expected_output2 = [30534, 100627, 101099, 100838, 3837, 73670, 103975, 87752, 101082, 28311,
|
||||
16, 13, 4891, 223, 98, 99446, 104579, 5122, 101907, 109635,
|
||||
103170, 107151, 5373, 100912, 52510, 116570, 5373, 105349, 5373, 105373,
|
||||
33108, 117094, 49567, 102100, 101252, 3837, 101153, 44636, 108461, 5373]
|
||||
|
||||
similarity = torch.nn.CosineSimilarity(dim=1)
|
||||
cos_sim = similarity(torch.tensor(expected_output1).unsqueeze(0).float().npu(),
|
||||
output[:40].unsqueeze(0).float())
|
||||
cos_sim = max(cos_sim, similarity(torch.tensor(expected_output2).unsqueeze(0).float().npu(),
|
||||
output[:40].unsqueeze(0).float()))
|
||||
print("similarity: ", cos_sim)
|
||||
assert_judge(cos_sim > 0.85)
|
||||
|
||||
|
||||
def edit_distance_similarity(self, text1, text2):
|
||||
"""
|
||||
edit distance: to compare the similarity between two texts.
|
||||
"""
|
||||
distance = nltk.edit_distance(text1, text2)
|
||||
try:
|
||||
similarity = 1 - (distance / max(len(text1), len(text2)))
|
||||
except ZeroDivisionError as e:
|
||||
raise e
|
||||
return similarity
|
||||
|
||||
|
||||
def test_greedy_search(self):
|
||||
"""
|
||||
load weight to get model and construct the prompts to generate output,
|
||||
and compare with expected for `greedy search`.
|
||||
"""
|
||||
self.init(config=ParamConfig, task="greedy")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
instruction = ["What are the characteristics of Suzhou?", "Introducing the Forbidden City in Beijing."]
|
||||
output = model.generate(instruction)
|
||||
expect_output1 = [
|
||||
"Suzhou is a city in China. It is known for its beautiful gardens, canals, and classical Chinese architecture. It is also known for its silk production and traditional arts and crafts. The city has a rich cultural heritage and is home to many historic temples and museums. Additionally, Suzhou is known for its cuisine, which features local specialties such as sweet and sour fish and rice cakes."
|
||||
]
|
||||
expect_output2 = [
|
||||
'The Forbidden City is a palace complex in Beijing, China. It was the home of the emperors of China for almost 500 years, from the Ming Dynasty to the end of the Qing Dynasty. The complex covers an area of 72 hectares and has over 9,000 rooms. It is a UNESCO World Heritage Site and one of the most popular tourist attractions in China..'
|
||||
]
|
||||
|
||||
expect_output1_seq = "".join(expect_output1)
|
||||
expect_output2_seq = ''.join(expect_output2)
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print("----------------------output1-------------------------")
|
||||
print(output[0])
|
||||
print("----------------------output2-------------------------")
|
||||
print(output[1])
|
||||
|
||||
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
|
||||
similarity2 = self.edit_distance_similarity(output[1][:30], expect_output2_seq[:30])
|
||||
print("similarity1:", similarity1)
|
||||
print("similarity2:", similarity2)
|
||||
assert_judge(similarity1 > 0.85)
|
||||
assert_judge(similarity2 > 0.85)
|
||||
|
||||
|
||||
def test_do_sample(self):
|
||||
"""Do Sample"""
|
||||
self.init(config=ParamConfig, task="do_sample")
|
||||
from inference import model_provider
|
||||
model = GPTModel.from_pretrained(
|
||||
model_provider=model_provider,
|
||||
pretrained_model_name_or_path=self.args.load
|
||||
)
|
||||
|
||||
instruction = "what is Disneyland?"
|
||||
|
||||
output = model.generate(
|
||||
[instruction, instruction],
|
||||
do_sample=True,
|
||||
top_k=self.args.top_k,
|
||||
top_p=self.args.top_p,
|
||||
max_new_tokens=self.args.max_new_tokens,
|
||||
tokenizer=None,
|
||||
stream=False
|
||||
)
|
||||
|
||||
expect_output1 = "Disneyland Park is an entertainment park located in Anaheim, California, United States. It is owned by the Disney Parks, Experiences and Consumer Products division of the American multinational conglomerate corporation the Walt Disney Company. It is also the first of seven theme parks built at Walt Disney's original vision, where visitors can enjoy various attractions, entertainment, and dining."
|
||||
expect_output1_seq = "".join(expect_output1)
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(output)
|
||||
tokenizer = get_tokenizer()
|
||||
|
||||
similarity1 = self.edit_distance_similarity(output[0][:30], expect_output1_seq[:30])
|
||||
print("similarity1:", similarity1)
|
||||
assert_judge(similarity1 > 0.85)
|
@ -1,138 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import time
|
||||
import numpy as np
|
||||
import torch
|
||||
from common import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.data.dataset_utils import get_train_valid_test_split_
|
||||
from megatron.training.initialize import initialize_megatron
|
||||
from megatron.training import get_args
|
||||
from modellink.data.mtf_dataset import get_packed_indexed_dataset, MTFDataset
|
||||
from modellink.data.decoder_packed_mtf_dataset import DecoderPackedMTFDataset, _build_shuffle_idx
|
||||
|
||||
|
||||
class TestInstruction(DistributedTest):
|
||||
world_size = 1
|
||||
|
||||
def init(self, config=ParamConfig):
|
||||
sys.argv = ([sys.argv[0]] + config.distributed_param + config.network_size + config.tokenizer_param +
|
||||
config.auxiliary_param + config.instruction_param + config.output_param)
|
||||
os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
|
||||
initialize_megatron(extra_args_provider=None,
|
||||
args_defaults={'no_load_rng': True,
|
||||
'no_load_optim': True})
|
||||
|
||||
self.args = get_args()
|
||||
|
||||
def _build_index_mappings(
|
||||
self,
|
||||
name,
|
||||
data_prefix,
|
||||
start_index,
|
||||
nb_documents,
|
||||
num_samples: int,
|
||||
seed,
|
||||
):
|
||||
"""
|
||||
- `shuffle_index` is [num_epoch * len(self.mtf)]
|
||||
- `sample_index` is [num_sample, 2] (storing the start and end of the sample). We query the sample via `self.shuffle_index[start:end]`
|
||||
"""
|
||||
|
||||
# rng state
|
||||
np_rng = np.random.RandomState(seed=seed)
|
||||
|
||||
# Filename of the index mappings.
|
||||
_filename = data_prefix
|
||||
_filename += '_{}_indexmap'.format(name)
|
||||
_filename += '_{}ns'.format(num_samples)
|
||||
_filename += '_{}s'.format(seed)
|
||||
shuffle_idx_filename = _filename + '_decoder_packed_shuffle_idx.npy'
|
||||
if os.path.isfile(shuffle_idx_filename):
|
||||
os.remove(shuffle_idx_filename)
|
||||
# Build the indexed mapping if not exist.
|
||||
if not os.path.isfile(shuffle_idx_filename):
|
||||
|
||||
print(' > WARNING: could not find index map files, building '
|
||||
'the indices on rank 0 ...')
|
||||
|
||||
# iteratively add the entire dataset for every epoch and see if it's enough given current packing strategy
|
||||
start_time = time.time()
|
||||
epoch = 0
|
||||
shuffle_idx = []
|
||||
while len(shuffle_idx) <= num_samples:
|
||||
new_document_ids = _build_shuffle_idx(nb_documents=nb_documents, start_index=start_index, np_rng=np_rng)
|
||||
# Generate a shuffling of the entire dataset
|
||||
shuffle_idx.extend(new_document_ids.tolist())
|
||||
epoch += 1
|
||||
|
||||
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
|
||||
print(' > elapsed time to build and save shuffle-idx and sample-idx mapping'
|
||||
' (seconds): {:4f}'.format(time.time() - start_time))
|
||||
|
||||
|
||||
# Load mappings.
|
||||
start_time = time.time()
|
||||
print(' > loading shuffle-idx mapping from {}'.format(
|
||||
shuffle_idx_filename))
|
||||
shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
|
||||
print(' loaded indexed file in {:3.3f} seconds'.format(
|
||||
time.time() - start_time))
|
||||
|
||||
return shuffle_idx, epoch
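# A minimal illustration of the idea above (not from the original file),
# assuming `_build_shuffle_idx` returns a random permutation of the document
# ids offset by `start_index`, e.g. with numpy:
#
#   np_rng = np.random.RandomState(seed=42)
#   one_epoch = np_rng.permutation(5) + 100   # a permutation of [100..104]
#
# The while loop keeps appending such per-epoch permutations until more than
# `num_samples` entries exist, so the returned `epoch` counts how many passes
# over the dataset were needed.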
|
||||
|
||||
def test_train_valid_test_split(self):
|
||||
self.init(config=ParamConfig)
|
||||
data_prefix = self.args.data_path[0]
|
||||
packed_indexed_dataset = get_packed_indexed_dataset(data_prefix=data_prefix)
|
||||
total_num_of_documents = len(list(packed_indexed_dataset.values())[0])
|
||||
|
||||
assert_judge(52002 == total_num_of_documents)
|
||||
|
||||
splits = get_train_valid_test_split_(self.args.split, total_num_of_documents)
|
||||
if self.args.train_samples:
|
||||
train_samples = self.args.train_samples
|
||||
else:
|
||||
train_samples = self.args.train_iters * self.args.global_batch_size
|
||||
eval_iters = (self.args.train_iters // self.args.eval_interval + 1) * \
|
||||
self.args.eval_iters
|
||||
test_iters = self.args.eval_iters
|
||||
train_val_test_num_samples = [train_samples,
|
||||
eval_iters * self.args.global_batch_size,
|
||||
test_iters * self.args.global_batch_size]
|
||||
|
||||
def build_shuffle_index(index, name):
|
||||
shuffle_index = None
|
||||
if splits[index + 1] > splits[index]:
|
||||
documents = np.arange(start=splits[index], stop=splits[index + 1],
|
||||
step=1, dtype=np.int32)
|
||||
mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, documents=documents)
|
||||
shuffle_index = self._build_index_mappings(name=name, data_prefix=data_prefix, start_index=documents[0], nb_documents=len(documents), num_samples=train_val_test_num_samples[index], seed=self.args.seed)
|
||||
return shuffle_index
|
||||
|
||||
train_shuffle_index, train_epoch = build_shuffle_index(0, 'train')
|
||||
valid_shuffle_index, valid_epoch = build_shuffle_index(1, 'valid')
|
||||
test_shuffle_index, test_epoch = build_shuffle_index(2, 'test')
|
||||
|
||||
### Validate the sample counts
|
||||
assert_judge(abs(len(train_shuffle_index) - 0.9 * total_num_of_documents * train_epoch) <= train_epoch)
|
||||
assert_judge(abs(len(valid_shuffle_index) - 0.05 * total_num_of_documents * valid_epoch) <= valid_epoch)
|
||||
assert_judge(abs(len(test_shuffle_index) - 0.05 * total_num_of_documents * test_epoch) <= test_epoch)
|
||||
|
||||
### Validate the document split: train/valid/test must not overlap
|
||||
train_shuffle_index_set = set(train_shuffle_index)
|
||||
valid_shuffle_index_set = set(valid_shuffle_index)
|
||||
test_shuffle_index_set = set(test_shuffle_index)
|
||||
|
||||
assert_judge(len(train_shuffle_index_set & valid_shuffle_index_set) == 0)
|
||||
assert_judge(len(test_shuffle_index_set & valid_shuffle_index_set) == 0)
|
||||
assert_judge(len(train_shuffle_index_set & test_shuffle_index_set) == 0)
|
||||
|
||||
|
||||
def test_instruction(self):
|
||||
pass
|
@ -1,157 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import glob
|
||||
from utils import ParamConfig
|
||||
|
||||
from modellink.tokenizer import build_tokenizer
|
||||
from modellink.tokenizer.tokenizer import _AutoTokenizer
|
||||
from modellink.tasks.preprocess.data_handler import LlamaFactoryInstructionHandler
|
||||
from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
|
||||
from preprocess_data import get_args, build_splitter
|
||||
|
||||
|
||||
class TestProcessInstructionData(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(self):
|
||||
# configure params, the index starts from 1
|
||||
self.config = ParamConfig
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_param
|
||||
self.args = get_args()
|
||||
self.tokenizer = build_tokenizer(self.args)
|
||||
self.splitter = build_splitter(self.args)
|
||||
self.raw_dataset = build_dataset(self.args)
|
||||
self.handler = get_dataset_handler(self.args, self.raw_dataset, self.tokenizer, self.splitter)
|
||||
|
||||
# for test_build_dataset_mix1
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_mix_param1
|
||||
self.args = get_args()
|
||||
self.raw_dataset_mix1 = build_dataset(self.args)
|
||||
|
||||
# for test_build_dataset_mix2
|
||||
sys.argv = [sys.argv[0]] + self.config.instruction_data_mix_param2
|
||||
self.args = get_args()
|
||||
self.raw_dataset_mix2 = build_dataset(self.args)
|
||||
|
||||
|
||||
def test_build_tokenizer(self):
|
||||
"""
|
||||
Test normal function of the tokenizer:
|
||||
the instance of tokenizer
|
||||
the length of vocabulary
|
||||
the encode function
|
||||
the decode function
|
||||
the eod append
|
||||
... (add more checks here if anything is missed)
|
||||
"""
|
||||
self.assertIsInstance(self.tokenizer, _AutoTokenizer)
|
||||
|
||||
self.assertEqual(self.tokenizer.vocab_size, 151851)
|
||||
self.assertEqual(self.tokenizer.tokenize('<0xF7>'), [27, 15, 9770, 22, 29])
|
||||
self.assertEqual(self.tokenizer.detokenize(31338), '<EFBFBD>建')
|
||||
self.assertEqual(self.tokenizer.detokenize(self.tokenizer.eod), '<|im_end|>')
|
||||
|
||||
|
||||
def test_build_splitter(self):
|
||||
"""
|
||||
If there's no split_sentence, default process is `IdentitySplitter()`.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def test_build_dataset_mix1(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of rows in each of the columns built by the handler:
|
||||
outputs["prompt"] = prompt
|
||||
outputs["response"] = response
|
||||
outputs["system"].append(sample[dataset_attr.system] if dataset_attr.system else "")
|
||||
outputs["tools"].append("")
|
||||
"""
|
||||
print("-------------------test_build_dataset_mix1-------------------------")
|
||||
print(len(self.raw_dataset_mix1.__getitem__("prompt")))
|
||||
print(len(self.raw_dataset_mix1.__getitem__("response")))
|
||||
print(len(self.raw_dataset_mix1.__getitem__("system")))
|
||||
print(len(self.raw_dataset_mix1.__getitem__("tools")))
|
||||
|
||||
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("prompt")), 40)
|
||||
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("response")), 40)
|
||||
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("system")), 40)
|
||||
self.assertLessEqual(len(self.raw_dataset_mix1.__getitem__("tools")), 40)
|
||||
|
||||
|
||||
def test_build_dataset_mix2(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of rows in each of the columns built by the handler:
|
||||
outputs["prompt"] = prompt
|
||||
outputs["response"] = response
|
||||
outputs["system"].append(sample[dataset_attr.system] if dataset_attr.system else "")
|
||||
outputs["tools"].append("")
|
||||
"""
|
||||
print("----------------test_build_dataset_mix2--------------------------")
|
||||
print(len(self.raw_dataset_mix2.__getitem__("prompt")))
|
||||
print(len(self.raw_dataset_mix2.__getitem__("response")))
|
||||
print(len(self.raw_dataset_mix2.__getitem__("system")))
|
||||
print(len(self.raw_dataset_mix2.__getitem__("tools")))
|
||||
|
||||
self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("prompt")), 40)
|
||||
self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("response")), 40)
|
||||
self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("system")), 40)
|
||||
self.assertGreaterEqual(len(self.raw_dataset_mix2.__getitem__("tools")), 40)
|
||||
|
||||
|
||||
def test_build_dataset(self):
|
||||
"""
|
||||
Test the raw_dataset: check the number of rows in each of the columns built by the handler:
|
||||
outputs["prompt"] = prompt
|
||||
outputs["response"] = response
|
||||
outputs["system"].append(sample[dataset_attr.system] if dataset_attr.system else "")
|
||||
outputs["tools"].append("")
|
||||
"""
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("prompt")), 62981)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("response")), 62981)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("system")), 62981)
|
||||
self.assertEqual(len(self.raw_dataset.__getitem__("tools")), 62981)
|
||||
|
||||
|
||||
def test_get_dataset_handler(self):
|
||||
"""
|
||||
Test whether the correct data handler is returned for instruction data.
|
||||
"""
|
||||
self.assertIsInstance(self.handler, LlamaFactoryInstructionHandler)
|
||||
|
||||
|
||||
def test_serialize_to_disk(self):
|
||||
"""
|
||||
Test that the preprocessed instruction dataset files are generated and are non-empty (sizes checked in MB).
|
||||
"""
|
||||
self.handler.serialize_to_disk()
|
||||
folder_path = self.config.instruction_data_param[7].replace("/alpaca", "")
|
||||
bin_file = glob.glob(os.path.join(folder_path, "*.bin"))
|
||||
idx_file = glob.glob(os.path.join(folder_path, "*.idx"))
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
file_path = os.path.join(folder_path, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
self.assertEqual(len(bin_file), 3)
|
||||
self.assertEqual(len(idx_file), 3)
|
||||
self.assertAlmostEqual((total_size / (1024 * 1024)), 111, delta=1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
@ -1,52 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) Huawei Technologies Co., Ltd. 2020-2020. All rights reserved.
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParamConfig:
|
||||
"""
|
||||
We can configure the params in the `.json` file, including:
|
||||
convert_ckpt_param,
|
||||
network_size,
|
||||
tokenizer_param,
|
||||
distributed_param,
|
||||
inference_param,
|
||||
evaluation_param,
|
||||
and other auxiliary_param.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent
|
||||
param_config = os.path.join(base_dir, "param_config.json")
|
||||
with open(param_config) as f:
|
||||
config_file = json.load(f)
|
||||
|
||||
network_size = config_file["NETWORK_SIZE"]
|
||||
tokenizer_param = config_file["TOKENIZER_PARAM"]
|
||||
distributed_param = config_file["DISTRIBUTED_PARAM"]
|
||||
distributed_param_tp8_pp1 = config_file["DISTRIBUTED_PARAM_TP8_PP1"]
|
||||
auxiliary_param = config_file["AUXILIARY_PARAM"]
|
||||
instruction_param = config_file["INSTRUCTION_PARAM"]
|
||||
output_param = config_file["OUTPUT_PARAM"]
|
||||
|
||||
# preprocess instruction data
|
||||
instruction_data_param = config_file["PROCESS_INSTRUCTION_DATA"]
|
||||
instruction_data_mix_param1 = config_file["PROCESS_INSTRUCTION_DATA_MIX1"]
|
||||
instruction_data_mix_param2 = config_file["PROCESS_INSTRUCTION_DATA_MIX2"]
|
||||
|
||||
# inference
|
||||
inference_param = config_file["INFERENCE_PARAM"]
|
||||
beam_search_auxliary_param = config_file["BEAM_SEARCH_AUXILIARY_PARAM"]
|
||||
greedy_search_auxliary_param = config_file["GREEDY_SEARCH_AUXILIARY_PARAM"]
|
||||
do_sample_auxliary_param = config_file["DO_SAMPLE_AUXILIARY_PARAM"]
|
||||
beam_search_with_sampling_auxliary_param = config_file["BEAM_SEARCH_WITH_SAMPLING_AUXILIARY_PARAM"]
|
||||
return_output_log_probs_auxliary_param = config_file["RETURN_OUTPUT_LOG_PROBS_AUXILIARY_PARAM"]
|
||||
|
||||
|
||||
def assert_judge(expression):
|
||||
if not expression:
|
||||
raise AssertionError
|
@ -11,7 +11,7 @@ import pandas as pd
|
||||
import torch
|
||||
import torch_npu
|
||||
from transformers import AutoTokenizer
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
|
@ -6,7 +6,7 @@ import sys
|
||||
import os
|
||||
import torch
|
||||
import torch_npu
|
||||
from tests.common import DistributedTest
|
||||
from tests.test_tools.dist_test import DistributedTest
|
||||
from utils import ParamConfig, assert_judge
|
||||
import modellink
|
||||
from megatron.legacy.model import GPTModel
|
||||
|
@ -1,69 +0,0 @@
|
||||
{
|
||||
"CONVERT_CKPT_PARAM": [
|
||||
"--model-type", "GPT",
|
||||
"--loader", "llama2_hf",
|
||||
"--saver", "megatron",
|
||||
"--load-dir", "/home/dataset/yi-34B-hf",
|
||||
"--save-dir", "/home/dataset/yi-34B-mt-t8p1",
|
||||
"--target-tensor-parallel-size", "8",
|
||||
"--target-pipeline-parallel-size", "1",
|
||||
"--tokenizer-model", "None"
|
||||
],
|
||||
|
||||
"NETWORK_SIZE": [
|
||||
"--num-layers", "60",
|
||||
"--hidden-size", "7168",
|
||||
"--ffn-hidden-size", "20480",
|
||||
"--num-attention-heads", "56",
|
||||
"--max-position-embeddings", "4096",
|
||||
"--position-embedding-type", "rope",
|
||||
"--make-vocab-size-divisible-by", "1",
|
||||
"--normalization", "RMSNorm",
|
||||
"--swiglu",
|
||||
"--untie-embeddings-and-output-weights",
|
||||
"--load", "/home/dataset/yi-34B-mt-t8p1"
|
||||
],
|
||||
|
||||
"TOKENIZER_PARAM": [
|
||||
"--tokenizer-type", "PretrainedFromHF",
|
||||
"--tokenizer-name-or-path", "/home/dataset/yi-34B-hf"
|
||||
],
|
||||
|
||||
"DISTRIBUTED_PARAM": [
|
||||
"--tensor-model-parallel-size", "8",
|
||||
"--pipeline-model-parallel-size", "1"
|
||||
],
|
||||
|
||||
"INFERENCE_PARAM": [
|
||||
"--max-new-tokens", "256",
|
||||
"--tokenizer-not-use-fast",
|
||||
"--exit-on-missing-checkpoint",
|
||||
"--attention-softmax-in-fp32"
|
||||
],
|
||||
|
||||
"EVALUATION_PARAM": [
|
||||
"--tokenizer-not-use-fast",
|
||||
"--task-data-path", "/home/dataset/eval_dataset/mmlu/test",
|
||||
"--task", "mmlu",
|
||||
"--max-new-tokens", "1",
|
||||
"--exit-on-missing-checkpoint"
|
||||
],
|
||||
|
||||
"AUXILIARY_PARAM": [
|
||||
"--micro-batch-size", "1",
|
||||
"--global-batch-size", "16",
|
||||
"--no-masked-softmax-fusion",
|
||||
"--disable-bias-linear",
|
||||
"--no-gradient-accumulation-fusion",
|
||||
"--bf16",
|
||||
"--seed", "42",
|
||||
"--use-fused-rmsnorm",
|
||||
"--group-query-attention",
|
||||
"--no-load-optim",
|
||||
"--no-load-rng",
|
||||
"--seq-length", "4096",
|
||||
"--num-query-groups", "8",
|
||||
"--vocab-size", "64000",
|
||||
"--rotary-base", "5000000"
|
||||
]
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
import unittest
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import glob
|
||||
from pathlib import Path
|
||||
from utils import ParamConfig
|
||||
import torch
|
||||
|
||||
import modellink
|
||||
|
||||
|
||||
class TestConvertCkptFromHuggingface(unittest.TestCase):
|
||||
def setUp(self, config=ParamConfig):
|
||||
# configure params, the index starts from 1
|
||||
self.config = config
|
||||
sys.argv = [sys.argv[0]] + self.config.convert_ckpt_param
|
||||
|
||||
def test_file_exsit(self):
|
||||
"""
|
||||
Test whether the files in `--load-dir` exist, including `.bin`, `.json`, etc.
|
||||
"""
|
||||
bin_file = glob.glob(os.path.join(self.config.convert_ckpt_param[7], "*.bin"))
|
||||
self.assertEqual(len(bin_file), 7)
|
||||
self.assertTrue(os.path.exists(os.path.join(self.config.convert_ckpt_param[7], "pytorch_model.bin.index.json")))
|
||||
|
||||
def test_convert_weights_form_huggingface(self):
|
||||
"""
|
||||
Test whether the weight to be converted as we want in `--save-dir`. We will check the model layer name,
|
||||
including embedding, final_norm, output and encoder. In the encoder, there will be some different layers
|
||||
to compose the unique transformer layer and all these layer stack to compose the entity of the model.
|
||||
"""
|
||||
base_dir = Path(__file__).absolute().parent.parent.parent.parent
|
||||
file_path = os.path.join(base_dir, "convert_ckpt.py")
|
||||
arguments = sys.argv[1:]
|
||||
subprocess.run(["python", file_path] + arguments)
|
||||
output_dir = os.path.join(self.config.convert_ckpt_param[9], "iter_0000001")
|
||||
weight_content = torch.load(os.path.join(output_dir, "mp_rank_00/model_optim_rng.pt"))
|
||||
weight_common_content = weight_content['model']['language_model'] # extract commmon content
|
||||
|
||||
# embedding, encoder, output_layer is three out layers.
|
||||
self.assertEqual(len(os.listdir(output_dir)), int(self.config.convert_ckpt_param[11]))
|
||||
self.assertEqual(weight_common_content['embedding']['word_embeddings']['weight'].size(), torch.Size([8000, 7168]))
|
||||
self.assertEqual(weight_common_content['encoder']['final_norm.weight'].size(), torch.Size([7168]))
|
||||
|
||||
# encoder has a common final_norm and each one has folliowing six layers
|
||||
weight_common_content['encoder'].pop('final_norm.weight')
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.query_key_value.weight'].size(), torch.Size([1152, 7168]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.self_attention.dense.weight'].size(), torch.Size([7168, 896]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_h_to_4h.weight'].size(), torch.Size([5120, 7168]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.mlp.dense_4h_to_h.weight'].size(), torch.Size([7168, 2560]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.input_norm.weight'].size(), torch.Size([7168]))
|
||||
self.assertEqual(weight_common_content['encoder']['layers.0.post_attention_norm.weight'].size(), torch.Size([7168]))
|
||||
|
||||
self.assertEqual(weight_common_content['output_layer']['weight'].size(), torch.Size([8000, 7168]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
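The expected tensor shapes asserted above follow from the NETWORK_SIZE and DISTRIBUTED_PARAM values in the deleted param_config.json (tensor parallel size 8). A minimal sketch of that per-rank arithmetic, assuming head_dim = hidden_size / num_attention_heads:

```python
# Sketch of the per-tensor-parallel-rank shape arithmetic behind the shape
# assertions above. Values are taken from the Yi-34B param_config.json in
# this diff; head_dim = hidden_size // num_attention_heads is an assumption.
hidden_size = 7168
ffn_hidden_size = 20480
num_attention_heads = 56
num_query_groups = 8
vocab_size = 64000
tp = 8  # --target-tensor-parallel-size

head_dim = hidden_size // num_attention_heads        # 128
heads_per_rank = num_attention_heads // tp            # 7
groups_per_rank = num_query_groups // tp              # 1 (grouped-query attention)

qkv_rows = (heads_per_rank + 2 * groups_per_rank) * head_dim   # 1152
dense_cols = heads_per_rank * head_dim                          # 896
h_to_4h_rows = 2 * ffn_hidden_size // tp                        # 5120 (x2 for the swiglu gate)
four_h_to_h_cols = ffn_hidden_size // tp                         # 2560
embedding_rows = vocab_size // tp                                # 8000

assert (qkv_rows, dense_cols) == (1152, 896)
assert (h_to_4h_rows, four_h_to_h_cols) == (5120, 2560)
assert embedding_rows == 8000
```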
@ -1,94 +0,0 @@
import sys
import os
import json
from pathlib import Path
import tqdm
import pandas as pd
import torch
import torch_npu
from transformers import AutoTokenizer
from tests.common import DistributedTest
from utils import ParamConfig, assert_judge
import modellink
from megatron.legacy.model import GPTModel
from modellink.tasks.evaluation.utils import add_text_generate_args


class TestEvaluation(DistributedTest):
    world_size = 8

    def init(self, config=ParamConfig):
        sys.argv = [sys.argv[0]] + config.distributed_param + config.network_size + config.auxiliary_param + \
                   config.evaluation_param + config.tokenizer_param
        from megatron.training.initialize import initialize_megatron
        os.environ.update({"CUDA_DEVICE_MAX_CONNECTIONS": "1"})
        initialize_megatron(extra_args_provider=add_text_generate_args,
                            args_defaults={'no_load_rng': True,
                                           'no_load_optim': True})

        from megatron.training import get_args
        self.args = get_args()

    def test_mmlu_evaluation(self):
        self.init(config=ParamConfig)
        from evaluation import model_provider
        from modellink.tasks.evaluation.eval_impl.template import MMLU_TEMPLATE_DIR
        model = GPTModel.from_pretrained(
            model_provider=model_provider,
            pretrained_name_or_path=self.args.load
        )
        tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer_name_or_path)
        max_new_tokens = self.args.max_new_tokens

        instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"

        total_acc_n = 0
        total_n = 0

        test_dir = None
        for path in self.args.task_data_path:
            if "mmlu" in path:
                test_dir = path
        base_dir = Path(__file__).absolute().parent.parent.parent.parent
        template_dir = os.path.join(base_dir, MMLU_TEMPLATE_DIR)
        with open(template_dir, encoding='utf-8') as f:
            mmlu_few_shot_template = json.load(f)

        for file in tqdm.tqdm(os.listdir(test_dir)):
            file_path = os.path.join(test_dir, file)
            data_df = pd.read_csv(file_path, names=['question', 'A', 'B', 'C', 'D', 'answer'])
            subject_name = file[0: -9]
            subject = subject_name.replace("_", " ")
            acc_n = 0
            data_df_test = data_df[0:10]
            for index, row in data_df_test.iterrows():
                test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"
                instruction = instruction_template.format(few_shot_examples=mmlu_few_shot_template[subject_name],
                                                          subject=subject,
                                                          question=test_question)
                chat_result = model.generate(
                    instruction,
                    do_sample=False,
                    max_new_tokens=max_new_tokens,
                    tokenizer=tokenizer,
                    stream=False,
                    return_output_log_probs=True
                )
                assert_judge(isinstance(chat_result, tuple))
                assert_judge(isinstance(chat_result[1], torch.Tensor))
                answer = None
                if chat_result:
                    answer = chat_result[0].strip()
                if answer == row['answer']:
                    acc_n += 1
            if torch.distributed.get_rank() == 0:
                total_n += len(data_df_test)
                total_acc_n += acc_n
        if torch.distributed.get_rank() == 0:
            try:
                final_acc = total_acc_n / total_n
            except ZeroDivisionError as e:
                raise e
            print(final_acc)
            assert_judge(abs(final_acc - 0.803) < 0.01)
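The subject and prompt handling in this deleted test can be illustrated in isolation. A small sketch follows; the `_test.csv` suffix (9 characters, hence `file[0:-9]`) and the sample row are assumptions based on the standard MMLU test-set naming, not taken from this diff.

```python
# Sketch: how the MMLU test above derives the subject name and builds the
# prompt for one CSV row. The "_test.csv" suffix and the sample row are
# illustrative assumptions.
instruction_template = "{few_shot_examples}\n\n{question}\nAnswer:"

file_name = "abstract_algebra_test.csv"    # hypothetical MMLU test file
subject_name = file_name[0:-9]              # strips "_test.csv" -> "abstract_algebra"
subject = subject_name.replace("_", " ")    # -> "abstract algebra"

row = {"question": "2 + 2 equals?", "A": "3", "B": "4", "C": "5", "D": "6", "answer": "B"}
test_question = f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}"

few_shot_examples = "..."  # in the test, loaded per subject from the MMLU template JSON
instruction = instruction_template.format(few_shot_examples=few_shot_examples,
                                          question=test_question)
print(instruction)
```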
Some files were not shown because too many files have changed in this diff.