Mirror of https://gitee.com/ascend/ModelLink.git, synced 2024-11-29 18:38:39 +08:00

!1688 optimize llamafactory process dataset ut
Merge pull request !1688 from LeiZhenzhen/master
parent 80d27b3371
commit cd014cfde1

@@ -52,7 +52,8 @@ def choose_skip_ci(raw_txt_file):
 def filter_exec_ut(raw_txt_file):
     file_list = read_files_from_txt(raw_txt_file)
     filter_conds = [
-        is_ut
+        is_ut,
+        is_markdown
     ]
     for file in file_list:
         if not any(condition(file) for condition in filter_conds):
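Note: `is_ut` and `is_markdown` are predicates applied to each changed-file path; the hunk above only adds `is_markdown` alongside `is_ut` in the filter list. A minimal sketch of how such predicates could plug into this filter, assuming conventional implementations (the real `read_files_from_txt`, `is_ut` and `is_markdown` helpers live elsewhere in the CI scripts and are not shown in this diff):

# Sketch only: the helper bodies below are assumptions for illustration.
def is_ut(file_path: str) -> bool:
    # assumed convention: unit-test sources live under tests/ut/
    return file_path.startswith("tests/ut/")

def is_markdown(file_path: str) -> bool:
    # assumed convention: documentation-only changes end in .md
    return file_path.endswith(".md")

# the filter in the hunk feeds every changed file through these predicates
filter_conds = [is_ut, is_markdown]
changed_files = [
    "tests/ut/process_data/test_process_instruction_data_lf.py",  # example paths only
    "modellink/arguments.py",
]
for file in changed_files:
    matched = any(condition(file) for condition in filter_conds)
    print(file, "->", matched)
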
tests/ut/process_data/test_process_instruction_data_lf.json (new file, 61 lines)
@@ -0,0 +1,61 @@
+{
+    "test_alpaca_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "AlpacaStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/alpaca/alpaca",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen"
+            }
+        }
+    ],
+    "test_alpaca_history_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/oaast_sft.json",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "AlpacaStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen",
+                "map-keys": "{\"history\":\"history\"}"
+            }
+        }
+    ],
+    "test_sharegpt_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/sharegpt_formatted_data-evol-gpt4.jsonl",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "SharegptStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen",
+                "map-keys": "{\"system\":\"system_prompt\"}"
+            }
+        }
+    ],
+    "test_openai_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/sss.json",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "SharegptStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen",
+                "map-keys": "{\"messages\":\"messages\", \"tags\": {\"role_tag\": \"role\", \"content_tag\": \"content\", \"user_tag\": \"user\", \"assistant_tag\": \"assistant\", \"system_tag\": \"system\"}}"
+            }
+        }
+    ]
+}
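Note: the new test below reads this JSON through `create_testconfig(Path(__file__).with_suffix(".json"))` and relies on a `build_args` fixture from tests.test_tools.utils. A minimal sketch of the assumed behaviour, for orientation only (the real helpers in the repository may differ, and `params_to_argv` is a hypothetical name used here to illustrate how a "params" block could become CLI arguments for the preprocessing entry point):

# Sketch only: assumed behaviour of the test helpers; not the repository's implementation.
import json
import sys
from pathlib import Path

def create_testconfig(json_path):
    # load the sidecar JSON (e.g. test_process_instruction_data_lf.json) into a dict
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)

def params_to_argv(params):
    # hypothetical helper: expand {"input": "...", "workers": 4, ...} into
    # "--input ... --workers 4 ..." style arguments, roughly what the build_args
    # fixture is expected to set up before main() runs
    argv = [sys.argv[0]]
    for key, value in params.items():
        argv.extend([f"--{key}", str(value)])
    return argv

config = create_testconfig(Path("tests/ut/process_data/test_process_instruction_data_lf.json"))
print(params_to_argv(config["test_alpaca_dataset"][0]["params"]))
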
tests/ut/process_data/test_process_instruction_data_lf.py
@@ -1,158 +1,56 @@
-import sys
-import os
-import math
-import pandas as pd
-
-import modellink
-from tests.test_tools.utils import judge_expression, get_md5sum
-from modellink.tokenizer import build_tokenizer
-from modellink.tokenizer.tokenizer import _AutoTokenizer
-from modellink.tasks.preprocess.data_handler import AlpacaStyleInstructionHandler, SharegptStyleInstructionHandler
-from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
-from preprocess_data import get_args, build_splitter
-
-
-class TestProcessInstructionDataLf:
-
-    def setup_class(self):
-        # test for alpaca
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "AlpacaStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/alpaca/alpaca",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen"
-        ]
-        self.args = get_args()
-        self.tokenizer = build_tokenizer(self.args)
-        self.splitter = build_splitter(self.args)
-        self.raw_dataset_alpaca = build_dataset(self.args)
-        self.handler_alpaca = get_dataset_handler(self.args, self.raw_dataset_alpaca, self.tokenizer, self.splitter)
-
-        # test for alpaca history
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/oaast_sft.json",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "AlpacaStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/alpaca_his/alpaca_his",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen",
-            "--map-keys", '{"history":"history"}'
-        ]
-        self.args = get_args()
-        self.raw_dataset_alpaca_his = build_dataset(self.args)
-        self.handler_alpaca_his = get_dataset_handler(self.args, self.raw_dataset_alpaca_his, self.tokenizer, self.splitter)
-
-        # test for sharegpt
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/sharegpt_formatted_data-evol-gpt4.jsonl",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "SharegptStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/sharegpt/sharegpt",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen",
-            "--map-keys", '{"system":"system_prompt"}'
-        ]
-
-        self.args = get_args()
-        self.raw_dataset_sharegpt = build_dataset(self.args)
-        self.handler_sharegpt = get_dataset_handler(self.args, self.raw_dataset_sharegpt, self.tokenizer, self.splitter)
-
-        # test for openai
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/sss.json",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "SharegptStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/openai/openai",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen",
-            "--map-keys", '{"messages":"messages", "tags":{"role_tag": "role","content_tag": "content","user_tag": "user","assistant_tag": "assistant","system_tag": "system"} }'
-        ]
-
-        self.args = get_args()
-        self.raw_dataset_openai = build_dataset(self.args)
-        self.handler_openai = get_dataset_handler(self.args, self.raw_dataset_openai, self.tokenizer, self.splitter)
-
-
-    def test_get_dataset_handler(self):
-        """
-        Test if get the right data handler for pretrain
-        """
-        judge_expression(isinstance(self.handler_alpaca, AlpacaStyleInstructionHandler))
-        judge_expression(isinstance(self.handler_alpaca_his, AlpacaStyleInstructionHandler))
-        judge_expression(isinstance(self.handler_sharegpt, SharegptStyleInstructionHandler))
-        judge_expression(isinstance(self.handler_openai, SharegptStyleInstructionHandler))
-
-    def test_serialize_to_disk(self):
-        """
-        Test generate pretrain object files and files are not None(MB).
-        """
-        self.handler_alpaca.serialize_to_disk()
-        self.handler_alpaca_his.serialize_to_disk()
-        self.handler_sharegpt.serialize_to_disk()
-        self.handler_openai.serialize_to_disk()
-        folder_path1 = "/data/tune_dataset/alpaca/"
-        folder_path2 = "/data/tune_dataset/alpaca_his/"
-        folder_path3 = "/data/tune_dataset/sharegpt/"
-        folder_path4 = "/data/tune_dataset/openai/"
-
-        def check_file_num(folder_path):
-            bin_file = 0
-            idx_file = 0
-            total_size = 0
-            for file_name in os.listdir(folder_path):
-                file_path = os.path.join(folder_path, file_name)
-                if os.path.isfile(file_path):
-                    if file_path.endswith(".bin"):
-                        bin_file += 1
-                    if file_path.endswith(".idx"):
-                        idx_file += 1
-                    total_size += os.path.getsize(file_path)
-            judge_expression(bin_file == 3)
-            judge_expression(idx_file == 3)
-
-        check_file_num(folder_path1)
-        check_file_num(folder_path2)
-        check_file_num(folder_path3)
-        check_file_num(folder_path4)
-
-
-    def test_md5sum_with_llamafactoryhandler(self):
-        file_path_alpaca = "/data/tune_dataset/alpaca/alpaca"
-        file_path_alpaca_his = "/data/tune_dataset/alpaca_his/alpaca_his"
-        file_path_sharegpt = "/data/tune_dataset/sharegpt/sharegpt"
-        file_path_openai = "/data/tune_dataset/openai/openai"
-
-        file_path_compare_alpaca = "/data/tune_dataset/Llamafactoryhandler/alpaca/alpaca"
-        file_path_compare_alpaca_his = "/data/tune_dataset/Llamafactoryhandler/alpaca_history/alpaca_history"
-        file_path_compare_sharegpt = "/data/tune_dataset/Llamafactoryhandler/sharegpt/sharegpt_lf"
-        file_path_compare_openai = "/data/tune_dataset/Llamafactoryhandler/openai/sss"
-
-        def compare_md5sum(file_path1, file_path2):
-            judge_expression(get_md5sum(file_path1 + "_packed_attention_mask_document.idx") == get_md5sum(file_path2 + "_packed_attention_mask_document.idx"))
-            judge_expression(get_md5sum(file_path1 + "_packed_attention_mask_document.bin") == get_md5sum(file_path2 + "_packed_attention_mask_document.bin"))
-            judge_expression(get_md5sum(file_path1 + "_packed_input_ids_document.idx") == get_md5sum(file_path2 + "_packed_input_ids_document.idx"))
-            judge_expression(get_md5sum(file_path1 + "_packed_input_ids_document.bin") == get_md5sum(file_path2 + "_packed_input_ids_document.bin"))
-            judge_expression(get_md5sum(file_path1 + "_packed_labels_document.idx") == get_md5sum(file_path2 + "_packed_labels_document.idx"))
-            judge_expression(get_md5sum(file_path1 + "_packed_labels_document.bin") == get_md5sum(file_path2 + "_packed_labels_document.bin"))
-
-        compare_md5sum(file_path_alpaca, file_path_compare_alpaca)
-        compare_md5sum(file_path_alpaca_his, file_path_compare_alpaca_his)
-        compare_md5sum(file_path_sharegpt, file_path_compare_sharegpt)
-        compare_md5sum(file_path_openai, file_path_compare_openai)
+import os
+from pathlib import Path
+import pytest
+
+import modellink
+from tests.test_tools.utils import build_args, create_testconfig, compare_file_md5_same
+from preprocess_data import main
+
+
+class TestProcessInstructionDataLf:
+
+    test_config = create_testconfig(Path(__file__).with_suffix(".json"))
+
+    @pytest.mark.parametrize("params, base_path",
+        [
+            (test_config["test_alpaca_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/alpaca/alpaca"),
+            (test_config["test_alpaca_history_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/alpaca_history/alpaca_history"),
+            (test_config["test_sharegpt_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/sharegpt/sharegpt_lf"),
+            (test_config["test_openai_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/openai/sss")
+        ])
+    def test_datasets(self, build_args, params, base_path):
+        """
+        Tests dataset preprocessing and validates output files by comparing MD5 checksums.
+
+        Parameters:
+        - params: dict
+            A dictionary containing dataset-specific configurations, such as input files,
+            output prefix, and tokenizer information. Extracted from `test_config`.
+        - base_path: str
+            The base path of the reference dataset files (e.g., Alpaca, Alpaca History, ShareGPT, OpenAI).
+            Used to locate the ground truth files for comparison with the generated output.
+        """
+        # create output dir if it doesn't exist
+        out_dir = os.path.dirname(params["output-prefix"])
+        if not os.path.isdir(out_dir):
+            os.makedirs(out_dir)
+
+        # run the main preprocessing function
+        main()
+
+        # print dataset name for clarity
+        dataset_name = base_path.split('/')[-1]
+        print(f"=============== test_{dataset_name}_dataset =============")
+
+        prefix_str = params["output-prefix"].split('/')[-1]
+        mid_strs = ["_packed_attention_mask_document", "_packed_input_ids_document", "_packed_labels_document"]
+        end_suffixs = [".bin", ".idx"]
+
+        # loop through mid_strs and end_suffixs, checking file MD5 hashes
+        for mid_str in mid_strs:
+            for end_suffix in end_suffixs:
+                end_str = mid_str + end_suffix
+                base_file = base_path + end_str
+                test_file = params["output-prefix"] + end_str
+                assert compare_file_md5_same(base_file, test_file)
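The final assertion relies on compare_file_md5_same from tests.test_tools.utils. A minimal sketch of what such a check could look like, assuming a straightforward MD5 comparison (the repository's helper may be implemented differently):

# Sketch only: an assumed equivalent of compare_file_md5_same, not the repository's code.
import hashlib

def compare_file_md5_same(file_path1: str, file_path2: str) -> bool:
    def md5sum(path: str) -> str:
        digest = hashlib.md5()
        with open(path, "rb") as f:
            # hash in 1 MiB chunks so large .bin files do not load into memory at once
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()
    return md5sum(file_path1) == md5sum(file_path2)

# e.g. the alpaca case compares files such as
# /data/tune_dataset/alpaca/alpaca_packed_input_ids_document.bin
# against the reference under /data/tune_dataset/Llamafactoryhandler/alpaca/.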