Mirror of https://gitee.com/ascend/ModelLink.git, synced 2024-11-29 18:38:39 +08:00

!1688 optimize llamafactory process dataset ut
Merge pull request !1688 from LeiZhenzhen/master
parent 80d27b3371
commit cd014cfde1

@@ -52,7 +52,8 @@ def choose_skip_ci(raw_txt_file):
 def filter_exec_ut(raw_txt_file):
     file_list = read_files_from_txt(raw_txt_file)
     filter_conds = [
-        is_ut
+        is_ut,
+        is_markdown
     ]
     for file in file_list:
         if not any(condition(file) for condition in filter_conds):
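Note: `is_ut` and `is_markdown` are predicates applied to each changed-file path; the hunk above only adds `is_markdown` alongside `is_ut` in the filter list. A minimal sketch of how such predicates could plug into this filter, assuming conventional implementations (the real `read_files_from_txt`, `is_ut` and `is_markdown` helpers live elsewhere in the CI scripts and are not shown in this diff):

# Sketch only: the helper bodies below are assumptions for illustration.
def is_ut(file_path: str) -> bool:
    # assumed convention: unit-test sources live under tests/ut/
    return file_path.startswith("tests/ut/")

def is_markdown(file_path: str) -> bool:
    # assumed convention: documentation-only changes end in .md
    return file_path.endswith(".md")

# the filter in the hunk feeds every changed file through these predicates
filter_conds = [is_ut, is_markdown]
changed_files = [
    "tests/ut/process_data/test_process_instruction_data_lf.py",  # example paths only
    "modellink/arguments.py",
]
for file in changed_files:
    matched = any(condition(file) for condition in filter_conds)
    print(file, "->", matched)
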
tests/ut/process_data/test_process_instruction_data_lf.json (new file, 61 lines)
@@ -0,0 +1,61 @@
+{
+    "test_alpaca_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "AlpacaStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/alpaca/alpaca",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen"
+            }
+        }
+    ],
+    "test_alpaca_history_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/oaast_sft.json",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "AlpacaStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/alpaca_his/alpaca_his",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen",
+                "map-keys": "{\"history\":\"history\"}"
+            }
+        }
+    ],
+    "test_sharegpt_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/sharegpt_formatted_data-evol-gpt4.jsonl",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "SharegptStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen",
+                "map-keys": "{\"system\":\"system_prompt\"}"
+            }
+        }
+    ],
+    "test_openai_dataset": [
+        {
+            "params": {
+                "input": "/data/tune_dataset/sss.json",
+                "tokenizer-type": "PretrainedFromHF",
+                "handler-name": "SharegptStyleInstructionHandler",
+                "output-prefix": "/data/tune_dataset/sharegpt/sharegpt",
+                "tokenizer-name-or-path": "/data/qwen-7b/",
+                "workers": 4,
+                "log-interval": 1000,
+                "prompt-type": "qwen",
+                "map-keys": "{\"messages\":\"messages\", \"tags\": {\"role_tag\": \"role\", \"content_tag\": \"content\", \"user_tag\": \"user\", \"assistant_tag\": \"assistant\", \"system_tag\": \"system\"}}"
+            }
+        }
+    ]
+}
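Note: the new test below reads this JSON through `create_testconfig(Path(__file__).with_suffix(".json"))` and relies on a `build_args` fixture from tests.test_tools.utils. A minimal sketch of the assumed behaviour, for orientation only (the real helpers in the repository may differ, and `params_to_argv` is a hypothetical name used here to illustrate how a "params" block could become CLI arguments for the preprocessing entry point):

# Sketch only: assumed behaviour of the test helpers; not the repository's implementation.
import json
import sys
from pathlib import Path

def create_testconfig(json_path):
    # load the sidecar JSON (e.g. test_process_instruction_data_lf.json) into a dict
    with open(json_path, encoding="utf-8") as f:
        return json.load(f)

def params_to_argv(params):
    # hypothetical helper: expand {"input": "...", "workers": 4, ...} into
    # "--input ... --workers 4 ..." style arguments, roughly what the build_args
    # fixture is expected to set up before main() runs
    argv = [sys.argv[0]]
    for key, value in params.items():
        argv.extend([f"--{key}", str(value)])
    return argv

config = create_testconfig(Path("tests/ut/process_data/test_process_instruction_data_lf.json"))
print(params_to_argv(config["test_alpaca_dataset"][0]["params"]))
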
tests/ut/process_data/test_process_instruction_data_lf.py
@@ -1,158 +1,56 @@
-import sys
-import os
-import math
-import pandas as pd
-
-import modellink
-from tests.test_tools.utils import judge_expression, get_md5sum
-from modellink.tokenizer import build_tokenizer
-from modellink.tokenizer.tokenizer import _AutoTokenizer
-from modellink.tasks.preprocess.data_handler import AlpacaStyleInstructionHandler, SharegptStyleInstructionHandler
-from modellink.tasks.preprocess.data_handler import build_dataset, get_dataset_handler
-from preprocess_data import get_args, build_splitter
-
-
-class TestProcessInstructionDataLf:
-
-    def setup_class(self):
-        # test for alpaca
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/train-00000-of-00001-a09b74b3ef9c3b56.parquet",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "AlpacaStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/alpaca/alpaca",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen"
-        ]
-        self.args = get_args()
-        self.tokenizer = build_tokenizer(self.args)
-        self.splitter = build_splitter(self.args)
-        self.raw_dataset_alpaca = build_dataset(self.args)
-        self.handler_alpaca = get_dataset_handler(self.args, self.raw_dataset_alpaca, self.tokenizer, self.splitter)
-
-        # test for alpaca history
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/oaast_sft.json",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "AlpacaStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/alpaca_his/alpaca_his",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen",
-            "--map-keys", '{"history":"history"}'
-        ]
-        self.args = get_args()
-        self.raw_dataset_alpaca_his = build_dataset(self.args)
-        self.handler_alpaca_his = get_dataset_handler(self.args, self.raw_dataset_alpaca_his, self.tokenizer, self.splitter)
-
-        # test for sharegpt
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/sharegpt_formatted_data-evol-gpt4.jsonl",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "SharegptStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/sharegpt/sharegpt",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen",
-            "--map-keys", '{"system":"system_prompt"}'
-        ]
-
-        self.args = get_args()
-        self.raw_dataset_sharegpt = build_dataset(self.args)
-        self.handler_sharegpt = get_dataset_handler(self.args, self.raw_dataset_sharegpt, self.tokenizer, self.splitter)
-
-        # test for openai
-        sys.argv = [
-            sys.argv[0],
-            "--input", "/data/tune_dataset/sss.json",
-            "--tokenizer-type", "PretrainedFromHF",
-            "--handler-name", "SharegptStyleInstructionHandler",
-            "--output-prefix", "/data/tune_dataset/openai/openai",
-            "--tokenizer-name-or-path", "/data/qwen-7b/",
-            "--workers", "4",
-            "--log-interval", "1000",
-            "--prompt-type", "qwen",
-            "--map-keys", '{"messages":"messages", "tags":{"role_tag": "role","content_tag": "content","user_tag": "user","assistant_tag": "assistant","system_tag": "system"} }'
-        ]
-
-        self.args = get_args()
-        self.raw_dataset_openai = build_dataset(self.args)
-        self.handler_openai = get_dataset_handler(self.args, self.raw_dataset_openai, self.tokenizer, self.splitter)
-
-
-    def test_get_dataset_handler(self):
-        """
-        Test if get the right data handler for pretrain
-        """
-        judge_expression(isinstance(self.handler_alpaca, AlpacaStyleInstructionHandler))
-        judge_expression(isinstance(self.handler_alpaca_his, AlpacaStyleInstructionHandler))
-        judge_expression(isinstance(self.handler_sharegpt, SharegptStyleInstructionHandler))
-        judge_expression(isinstance(self.handler_openai, SharegptStyleInstructionHandler))
-
-    def test_serialize_to_disk(self):
-        """
-        Test generate pretrain object files and files are not None(MB).
-        """
-        self.handler_alpaca.serialize_to_disk()
-        self.handler_alpaca_his.serialize_to_disk()
-        self.handler_sharegpt.serialize_to_disk()
-        self.handler_openai.serialize_to_disk()
-        folder_path1 = "/data/tune_dataset/alpaca/"
-        folder_path2 = "/data/tune_dataset/alpaca_his/"
-        folder_path3 = "/data/tune_dataset/sharegpt/"
-        folder_path4 = "/data/tune_dataset/openai/"
-
-        def check_file_num(folder_path):
-            bin_file = 0
-            idx_file = 0
-            total_size = 0
-            for file_name in os.listdir(folder_path):
-                file_path = os.path.join(folder_path, file_name)
-                if os.path.isfile(file_path):
-                    if file_path.endswith(".bin"):
-                        bin_file += 1
-                    if file_path.endswith(".idx"):
-                        idx_file += 1
-                    total_size += os.path.getsize(file_path)
-            judge_expression(bin_file == 3)
-            judge_expression(idx_file == 3)
-
-        check_file_num(folder_path1)
-        check_file_num(folder_path2)
-        check_file_num(folder_path3)
-        check_file_num(folder_path4)
-
-
-    def test_md5sum_with_llamafactoryhandler(self):
-        file_path_alpaca = "/data/tune_dataset/alpaca/alpaca"
-        file_path_alpaca_his = "/data/tune_dataset/alpaca_his/alpaca_his"
-        file_path_sharegpt = "/data/tune_dataset/sharegpt/sharegpt"
-        file_path_openai = "/data/tune_dataset/openai/openai"
-
-        file_path_compare_alpaca = "/data/tune_dataset/Llamafactoryhandler/alpaca/alpaca"
-        file_path_compare_alpaca_his = "/data/tune_dataset/Llamafactoryhandler/alpaca_history/alpaca_history"
-        file_path_compare_sharegpt = "/data/tune_dataset/Llamafactoryhandler/sharegpt/sharegpt_lf"
-        file_path_compare_openai = "/data/tune_dataset/Llamafactoryhandler/openai/sss"
-
-        def compare_md5sum(file_path1, file_path2):
-            judge_expression(get_md5sum(file_path1 + "_packed_attention_mask_document.idx") == get_md5sum(file_path2 + "_packed_attention_mask_document.idx"))
-            judge_expression(get_md5sum(file_path1 + "_packed_attention_mask_document.bin") == get_md5sum(file_path2 + "_packed_attention_mask_document.bin"))
-            judge_expression(get_md5sum(file_path1 + "_packed_input_ids_document.idx") == get_md5sum(file_path2 + "_packed_input_ids_document.idx"))
-            judge_expression(get_md5sum(file_path1 + "_packed_input_ids_document.bin") == get_md5sum(file_path2 + "_packed_input_ids_document.bin"))
-            judge_expression(get_md5sum(file_path1 + "_packed_labels_document.idx") == get_md5sum(file_path2 + "_packed_labels_document.idx"))
-            judge_expression(get_md5sum(file_path1 + "_packed_labels_document.bin") == get_md5sum(file_path2 + "_packed_labels_document.bin"))
-
-        compare_md5sum(file_path_alpaca, file_path_compare_alpaca)
-        compare_md5sum(file_path_alpaca_his, file_path_compare_alpaca_his)
-        compare_md5sum(file_path_sharegpt, file_path_compare_sharegpt)
-        compare_md5sum(file_path_openai, file_path_compare_openai)
+import os
+from pathlib import Path
+import pytest
+
+import modellink
+from tests.test_tools.utils import build_args, create_testconfig, compare_file_md5_same
+from preprocess_data import main
+
+
+class TestProcessInstructionDataLf:
+
+    test_config = create_testconfig(Path(__file__).with_suffix(".json"))
+
+    @pytest.mark.parametrize("params, base_path",
+        [
+            (test_config["test_alpaca_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/alpaca/alpaca"),
+            (test_config["test_alpaca_history_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/alpaca_history/alpaca_history"),
+            (test_config["test_sharegpt_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/sharegpt/sharegpt_lf"),
+            (test_config["test_openai_dataset"][0], "/data/tune_dataset/Llamafactoryhandler/openai/sss")
+        ])
+    def test_datasets(self, build_args, params, base_path):
+        """
+        Tests dataset preprocessing and validates output files by comparing MD5 checksums.
+
+        Parameters:
+        - params: dict
+            A dictionary containing dataset-specific configurations, such as input files,
+            output prefix, and tokenizer information. Extracted from `test_config`.
+        - base_path: str
+            The base path of the reference dataset files (e.g., Alpaca, Alpaca History, ShareGPT, OpenAI).
+            Used to locate the ground truth files for comparison with the generated output.
+        """
+        # create output dir if it doesn't exist
+        out_dir = os.path.dirname(params["output-prefix"])
+        if not os.path.isdir(out_dir):
+            os.makedirs(out_dir)
+
+        # run the main preprocessing function
+        main()
+
+        # print dataset name for clarity
+        dataset_name = base_path.split('/')[-1]
+        print(f"=============== test_{dataset_name}_dataset =============")
+
+        prefix_str = params["output-prefix"].split('/')[-1]
+        mid_strs = ["_packed_attention_mask_document", "_packed_input_ids_document", "_packed_labels_document"]
+        end_suffixs = [".bin", ".idx"]
+
+        # loop through mid_strs and end_suffixs, checking file MD5 hashes
+        for mid_str in mid_strs:
+            for end_suffix in end_suffixs:
+                end_str = mid_str + end_suffix
+                base_file = base_path + end_str
+                test_file = params["output-prefix"] + end_str
+                assert compare_file_md5_same(base_file, test_file)
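The final assertion relies on compare_file_md5_same from tests.test_tools.utils. A minimal sketch of what such a check could look like, assuming a straightforward MD5 comparison (the repository's helper may be implemented differently):

# Sketch only: an assumed equivalent of compare_file_md5_same, not the repository's code.
import hashlib

def compare_file_md5_same(file_path1: str, file_path2: str) -> bool:
    def md5sum(path: str) -> str:
        digest = hashlib.md5()
        with open(path, "rb") as f:
            # hash in 1 MiB chunks so large .bin files do not load into memory at once
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()
    return md5sum(file_path1) == md5sum(file_path2)

# e.g. the alpaca case compares files such as
# /data/tune_dataset/alpaca/alpaca_packed_input_ids_document.bin
# against the reference under /data/tune_dataset/Llamafactoryhandler/alpaca/.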