From 0fc6c634b0b95702a3ebb729adae3ca779448966 Mon Sep 17 00:00:00 2001
From: zhuwenxing <wenxing.zhu@zilliz.com>
Date: Tue, 5 Nov 2024 08:42:23 +0800
Subject: [PATCH] test: fix tokenizer and monkey patch faker function (#37119)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
Signed-off-by: zhuwenxing <wxzhuyeah@gmail.com>
---
 tests/python_client/chaos/checker.py          |  20 +--
 tests/python_client/common/common_func.py     |  76 +++++++-
 .../testcases/test_full_text_search.py        |  14 +-
 tests/python_client/testcases/test_query.py   | 158 +++++++++++++++-
 tests/python_client/testcases/test_search.py  | 169 +++++++++++++++++-
 tests/restful_client_v2/base/testbase.py      |  13 +-
 .../testcases/test_jobs_operation.py          |  69 ++++---
 7 files changed, 460 insertions(+), 59 deletions(-)

diff --git a/tests/python_client/chaos/checker.py b/tests/python_client/chaos/checker.py
index 3164325468..598cda58e8 100644
--- a/tests/python_client/chaos/checker.py
+++ b/tests/python_client/chaos/checker.py
@@ -414,7 +414,7 @@ class Checker:
             self.insert_data(nb=constants.ENTITIES_FOR_SEARCH, partition_name=self.p_name)
             log.info(f"insert data for collection {c_name} cost {time.perf_counter() - t0}s")
 
-        self.initial_entities = self.c_wrap.num_entities  # do as a flush
+        self.initial_entities = self.c_wrap.collection.num_entities
         self.scale = 100000  # timestamp scale to make time.time() as int64
 
     def insert_data(self, nb=constants.DELTA_PER_INS, partition_name=None):
@@ -759,8 +759,7 @@ class InsertFlushChecker(Checker):
     def __init__(self, collection_name=None, flush=False, shards_num=2, schema=None):
         super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
         self._flush = flush
-        self.initial_entities = self.c_wrap.num_entities
-
+        self.initial_entities = self.c_wrap.collection.num_entities
     def keep_running(self):
         while True:
             t0 = time.time()
@@ -803,17 +802,12 @@ class FlushChecker(Checker):
         if collection_name is None:
             collection_name = cf.gen_unique_str("FlushChecker_")
         super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
-        self.initial_entities = self.c_wrap.num_entities
+        self.initial_entities = self.c_wrap.collection.num_entities
 
     @trace()
     def flush(self):
-        num_entities = self.c_wrap.num_entities
-        if num_entities >= (self.initial_entities + constants.DELTA_PER_INS):
-            result = True
-            self.initial_entities += constants.DELTA_PER_INS
-        else:
-            result = False
-        return num_entities, result
+        res, result = self.c_wrap.flush()
+        return res, result
 
     @exception_handler()
     def run_task(self):
@@ -839,7 +833,7 @@ class InsertChecker(Checker):
             collection_name = cf.gen_unique_str("InsertChecker_")
         super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
         self._flush = flush
-        self.initial_entities = self.c_wrap.num_entities
+        self.initial_entities = self.c_wrap.collection.num_entities
         self.inserted_data = []
         self.scale = 1 * 10 ** 6
         self.start_time_stamp = int(time.time() * self.scale)  # us
@@ -917,7 +911,7 @@ class InsertFreshnessChecker(Checker):
             collection_name = cf.gen_unique_str("InsertChecker_")
         super().__init__(collection_name=collection_name, shards_num=shards_num, schema=schema)
         self._flush = flush
-        self.initial_entities = self.c_wrap.num_entities
+        self.initial_entities = self.c_wrap.collection.num_entities
         self.inserted_data = []
         self.scale = 1 * 10 ** 6
         self.start_time_stamp = int(time.time() * self.scale)  # us
diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index 56b50a4dbd..b47d630108 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -80,6 +80,72 @@ class ParamInfo:
 
 param_info = ParamInfo()
 
+en_vocabularies_distribution = {
+    "hello": 0.01,
+    "milvus": 0.01,
+    "vector": 0.01,
+    "database": 0.01
+}
+
+zh_vocabularies_distribution = {
+    "你好": 0.01,
+    "向量": 0.01,
+    "数据": 0.01,
+    "库": 0.01
+}
+
+def patch_faker_text(fake_instance, vocabularies_distribution):
+    """
+    Monkey patch the text() method of a Faker instance to include custom vocabulary.
+    Each word in vocabularies_distribution has an independent chance to be inserted.
+
+    Args:
+        fake_instance: Faker instance to patch
+        vocabularies_distribution: Dictionary where:
+            - key: word to insert
+            - value: probability (0-1) of inserting this word into each sentence
+
+    Example:
+        vocabularies_distribution = {
+            "hello": 0.1,    # 10% chance to insert "hello" in each sentence
+            "milvus": 0.1,   # 10% chance to insert "milvus" in each sentence
+        }
+    """
+    original_text = fake_instance.text
+
+    def new_text(nb_sentences=100, *args, **kwargs):
+        sentences = []
+        # Split original text into sentences
+        original_sentences = original_text(nb_sentences).split('.')
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        for base_sentence in original_sentences:
+            words = base_sentence.split()
+
+            # Independently decide whether to insert each word
+            for word, probability in vocabularies_distribution.items():
+                if random.random() < probability:
+                    # Choose random position to insert the word
+                    insert_pos = random.randint(0, len(words))
+                    words.insert(insert_pos, word)
+
+            # Reconstruct the sentence
+            base_sentence = ' '.join(words)
+
+            # Ensure proper capitalization
+            base_sentence = base_sentence[0].upper() + base_sentence[1:]
+            sentences.append(base_sentence)
+
+        return '. '.join(sentences) + '.'
+
+
+
+    # Replace the original text method with our custom one
+    fake_instance.text = new_text
+
+
+
+
 
 def get_bm25_ground_truth(corpus, queries, top_k=100, language="en"):
     """
@@ -147,6 +213,14 @@ def custom_tokenizer(language="en"):
         )
     return tokenizer
 
+def manual_check_text_match(df, word, col):
+    id_list = []
+    for i in range(len(df)):
+        row = df.iloc[i]
+        # log.info(f"word :{word}, row: {row[col]}")
+        if word in row[col]:
+            id_list.append(row["id"])
+    return id_list
 
 def analyze_documents(texts, language="en"):
 
@@ -188,8 +262,8 @@ def check_token_overlap(text_a, text_b, language="en"):
 
 def split_dataframes(df, fields, language="en"):
     df_copy = df.copy()
-    tokenizer = custom_tokenizer(language)
     for col in fields:
+        tokenizer = custom_tokenizer(language)
         texts = df[col].to_list()
         tokenized = tokenizer.tokenize(texts, return_as="tuple")
         new_texts = []
diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py
index 608d547690..c54933c7e7 100644
--- a/tests/python_client/testcases/test_full_text_search.py
+++ b/tests/python_client/testcases/test_full_text_search.py
@@ -15,6 +15,11 @@ from faker import Faker
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 prefix = "full_text_search_collection"
@@ -2214,6 +2219,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 if i + batch_size < len(df)
                 else data[i: len(df)]
             )
+        collection_w.flush()
         collection_w.create_index(
             "emb",
             {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@@ -2429,9 +2435,10 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         limit = 100
-        search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)]
+        token = random.choice(tokens)
+        search_data = [fake.text().lower() + " " + token for _ in range(nq)]
         if expr == "text_match":
-            filter = f"text_match(text, '{tokens[0]}')"
+            filter = f"text_match(text, '{token}')"
             res, _ = collection_w.query(
                 expr=filter,
             )
@@ -2488,7 +2495,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 result_text = r.text
                 # verify search result satisfies the filter
                 if expr == "text_match":
-                    assert tokens[0] in result_text
+                    assert token in result_text
                 if expr == "id_range":
                     assert _id < data_size // 2
                 # verify search result has overlap with search text
@@ -2497,7 +2504,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 assert len(
                     overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
 
-
     @pytest.mark.tags(CaseLabel.L1)
     @pytest.mark.parametrize("nq", [2])
     @pytest.mark.parametrize("empty_percent", [0])
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index cb7257efe9..dc1e6622c2 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -29,6 +29,11 @@ Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
 fake_de = Faker("de_DE")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 
@@ -4436,8 +4441,8 @@ class TestQueryTextMatch(TestcaseBase):
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("enable_partition_key", [True, False])
     @pytest.mark.parametrize("enable_inverted_index", [True, False])
-    @pytest.mark.parametrize("tokenizer", ["jieba", "default"])
-    def test_query_text_match_normal(
+    @pytest.mark.parametrize("tokenizer", ["default"])
+    def test_query_text_match_en_normal(
         self, tokenizer, enable_inverted_index, enable_partition_key
     ):
         """
@@ -4569,6 +4574,145 @@ class TestQueryTextMatch(TestcaseBase):
             for r in res:
                 assert any([token in r[field] for token in top_10_tokens])
 
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.parametrize("enable_partition_key", [True, False])
+    @pytest.mark.parametrize("enable_inverted_index", [True, False])
+    @pytest.mark.parametrize("tokenizer", ["jieba"])
+    @pytest.mark.xfail(reason="unstable")
+    def test_query_text_match_zh_normal(
+            self, tokenizer, enable_inverted_index, enable_partition_key
+    ):
+        """
+        target: test text match normal
+        method: 1. enable text match and insert data with varchar
+                2. get the most common words and query with text match
+                3. verify the result
+        expected: text match successfully and result is correct
+        """
+        tokenizer_params = {
+            "tokenizer": tokenizer,
+        }
+        dim = 128
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="word",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                is_partition_key=enable_partition_key,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(
+                name="paragraph",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(
+                name="text",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+                enable_match=True,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        schema = CollectionSchema(fields=fields, description="test collection")
+        data_size = 3000
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=schema
+        )
+        fake = fake_en
+        if tokenizer == "jieba":
+            language = "zh"
+            fake = fake_zh
+        else:
+            language = "en"
+
+        data = [
+            {
+                "id": i,
+                "word": fake.word().lower(),
+                "sentence": fake.sentence().lower(),
+                "paragraph": fake.paragraph().lower(),
+                "text": fake.text().lower(),
+                "emb": [random.random() for _ in range(dim)],
+            }
+            for i in range(data_size)
+        ]
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        batch_size = 5000
+        for i in range(0, len(df), batch_size):
+            collection_w.insert(
+                data[i: i + batch_size]
+                if i + batch_size < len(df)
+                else data[i: len(df)]
+            )
+        # only if the collection is flushed, the inverted index ca be applied.
+        # growing segment may be not applied, although in strong consistency.
+        collection_w.flush()
+        collection_w.create_index(
+            "emb",
+            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
+        )
+        if enable_inverted_index:
+            collection_w.create_index("word", {"index_type": "INVERTED"})
+        collection_w.load()
+        # analyze the croup
+        text_fields = ["word", "sentence", "paragraph", "text"]
+        wf_map = {}
+        for field in text_fields:
+            wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
+        # query single field for one token
+        for field in text_fields:
+            token = wf_map[field].most_common()[0][0]
+            expr = f"text_match({field}, '{token}')"
+            log.info(f"expr: {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            assert len(res) > 0
+            log.info(f"res len {len(res)}")
+            for r in res:
+                assert token in r[field]
+
+            # verify inverted index
+            if enable_inverted_index:
+                if field == "word":
+                    expr = f"{field} == '{token}'"
+                    log.info(f"expr: {expr}")
+                    res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+                    log.info(f"res len {len(res)}")
+                    for r in res:
+                        assert r[field] == token
+        # query single field for multi-word
+        for field in text_fields:
+            # match top 10 most common words
+            top_10_tokens = []
+            for word, count in wf_map[field].most_common(10):
+                top_10_tokens.append(word)
+            string_of_top_10_words = " ".join(top_10_tokens)
+            expr = f"text_match({field}, '{string_of_top_10_words}')"
+            log.info(f"expr {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            log.info(f"res len {len(res)}")
+            for r in res:
+                assert any([token in r[field] for token in top_10_tokens])
+
+
+
     @pytest.mark.skip("unimplemented")
     @pytest.mark.tags(CaseLabel.L0)
     def test_query_text_match_custom_analyzer(self):
@@ -4787,6 +4931,7 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
 
         df_new = cf.split_dataframes(df, fields=text_fields)
+        log.info(f"df \n{df}")
         log.info(f"new df \n{df_new}")
         for field in text_fields:
             expr_list = []
@@ -4796,16 +4941,15 @@ class TestQueryTextMatch(TestcaseBase):
                 tmp = f"text_match({field}, '{word}')"
                 log.info(f"tmp expr {tmp}")
                 expr_list.append(tmp)
-                manual_result = df_new[
-                    df_new.apply(lambda row: word in row[field], axis=1)
-                ]
-                tmp_res = set(manual_result["id"].tolist())
-                log.info(f"manual check result for  {tmp} {len(manual_result)}")
+                tmp_res = cf.manual_check_text_match(df_new, word, field)
+                log.info(f"manual check result for  {tmp} {len(tmp_res)}")
                 pd_tmp_res_list.append(tmp_res)
+            log.info(f"manual res {len(pd_tmp_res_list)}, {pd_tmp_res_list}")
             final_res = set(pd_tmp_res_list[0])
             for i in range(1, len(pd_tmp_res_list)):
                 final_res = final_res.intersection(set(pd_tmp_res_list[i]))
             log.info(f"intersection res {len(final_res)}")
+            log.info(f"final res {final_res}")
             and_expr = " and ".join(expr_list)
             log.info(f"expr: {and_expr}")
             res, _ = collection_w.query(expr=and_expr, output_fields=text_fields)
diff --git a/tests/python_client/testcases/test_search.py b/tests/python_client/testcases/test_search.py
index 634a83f167..ee21195a10 100644
--- a/tests/python_client/testcases/test_search.py
+++ b/tests/python_client/testcases/test_search.py
@@ -29,6 +29,11 @@ from faker import Faker
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+
+# patch faker to generate text with specific distribution
+cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
+cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution)
+
 pd.set_option("expand_frame_repr", False)
 
 
@@ -13285,8 +13290,8 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("enable_partition_key", [True, False])
     @pytest.mark.parametrize("enable_inverted_index", [True, False])
-    @pytest.mark.parametrize("tokenizer", ["jieba", "default"])
-    def test_search_with_text_match_filter_normal(
+    @pytest.mark.parametrize("tokenizer", ["default"])
+    def test_search_with_text_match_filter_normal_en(
         self, tokenizer, enable_inverted_index, enable_partition_key
     ):
         """
@@ -13442,3 +13447,163 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
                         r = r.to_dict()
                         assert any([token in r["entity"][field] for token in top_10_tokens])
 
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.parametrize("enable_partition_key", [True, False])
+    @pytest.mark.parametrize("enable_inverted_index", [True, False])
+    @pytest.mark.parametrize("tokenizer", ["jieba"])
+    @pytest.mark.xfail(reason="unstable case")
+    def test_search_with_text_match_filter_normal_zh(
+        self, tokenizer, enable_inverted_index, enable_partition_key
+    ):
+        """
+        target: test text match normal
+        method: 1. enable text match and insert data with varchar
+                2. get the most common words and query with text match
+                3. verify the result
+        expected: text match successfully and result is correct
+        """
+        tokenizer_params = {
+            "tokenizer": tokenizer,
+        }
+        dim = 128
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="word",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+				enable_match=True,
+                is_partition_key=enable_partition_key,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+				enable_match=True,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(
+                name="paragraph",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+				enable_match=True,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(
+                name="text",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_tokenizer=True,
+				enable_match=True,
+                tokenizer_params=tokenizer_params,
+            ),
+            FieldSchema(name="float32_emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+            FieldSchema(name="sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(fields=fields, description="test collection")
+        data_size = 5000
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=schema
+        )
+        log.info(f"collection {collection_w.describe()}")
+        fake = fake_en
+        if tokenizer == "jieba":
+            language = "zh"
+            fake = fake_zh
+        else:
+            language = "en"
+
+        data = [
+            {
+                "id": i,
+                "word": fake.word().lower(),
+                "sentence": fake.sentence().lower(),
+                "paragraph": fake.paragraph().lower(),
+                "text": fake.text().lower(),
+                "float32_emb": [random.random() for _ in range(dim)],
+                "sparse_emb": cf.gen_sparse_vectors(1, dim=10000)[0],
+            }
+            for i in range(data_size)
+        ]
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        batch_size = 5000
+        for i in range(0, len(df), batch_size):
+            collection_w.insert(
+                data[i : i + batch_size]
+                if i + batch_size < len(df)
+                else data[i : len(df)]
+            )
+            collection_w.flush()
+        collection_w.create_index(
+            "float32_emb",
+            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+        )
+        collection_w.create_index(
+            "sparse_emb",
+            {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"},
+        )
+        if enable_inverted_index:
+            collection_w.create_index("word", {"index_type": "INVERTED"})
+        collection_w.load()
+        # analyze the croup
+        text_fields = ["word", "sentence", "paragraph", "text"]
+        wf_map = {}
+        for field in text_fields:
+            wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
+        # search with filter single field for one token
+        df_split = cf.split_dataframes(df, text_fields, language=language)
+        log.info(f"df_split\n{df_split}")
+        for ann_field in ["float32_emb", "sparse_emb"]:
+            log.info(f"ann_field {ann_field}")
+            if ann_field == "float32_emb":
+                search_data = [[random.random() for _ in range(dim)]]
+            elif ann_field == "sparse_emb":
+                search_data = cf.gen_sparse_vectors(1,dim=10000)
+            else:
+                search_data = [[random.random() for _ in range(dim)]]
+            for field in text_fields:
+                token = wf_map[field].most_common()[0][0]
+                expr = f"text_match({field}, '{token}')"
+                manual_result = df_split[
+                    df_split.apply(lambda row: token in row[field], axis=1)
+                ]
+                log.info(f"expr: {expr}, manual_check_result: {len(manual_result)}")
+                res_list, _ = collection_w.search(
+                    data=search_data,
+                    anns_field=ann_field,
+                    param={},
+                    limit=100,
+                    expr=expr, output_fields=["id", field])
+                for res in res_list:
+                    log.info(f"res len {len(res)} res {res}")
+                    assert len(res) > 0
+                    for r in res:
+                        r = r.to_dict()
+                        assert token in r["entity"][field]
+
+            # search with filter single field for multi-token
+            for field in text_fields:
+                # match top 10 most common words
+                top_10_tokens = []
+                for word, count in wf_map[field].most_common(10):
+                    top_10_tokens.append(word)
+                string_of_top_10_words = " ".join(top_10_tokens)
+                expr = f"text_match({field}, '{string_of_top_10_words}')"
+                log.info(f"expr {expr}")
+                res_list, _ = collection_w.search(
+                    data=search_data,
+                    anns_field=ann_field,
+                    param={},
+                    limit=100,
+                    expr=expr, output_fields=["id", field])
+                for res in res_list:
+                    log.info(f"res len {len(res)} res {res}")
+                    assert len(res) > 0
+                    for r in res:
+                        r = r.to_dict()
+                        assert any([token in r["entity"][field] for token in top_10_tokens])
\ No newline at end of file
diff --git a/tests/restful_client_v2/base/testbase.py b/tests/restful_client_v2/base/testbase.py
index d4556aefeb..85762fb338 100644
--- a/tests/restful_client_v2/base/testbase.py
+++ b/tests/restful_client_v2/base/testbase.py
@@ -3,7 +3,7 @@ import sys
 import pytest
 import time
 import uuid
-from pymilvus import connections, db
+from pymilvus import connections, db, MilvusClient
 from utils.util_log import test_log as logger
 from api.milvus import (VectorClient, CollectionClient, PartitionClient, IndexClient, AliasClient,
                         UserClient, RoleClient, ImportJobClient, StorageClient, Requests)
@@ -33,6 +33,7 @@ class Base:
     role_client = None
     import_job_client = None
     storage_client = None
+    milvus_client = None
 
 
 class TestBase(Base):
@@ -171,5 +172,11 @@ class TestBase(Base):
         self.vector_client.db_name = db_name
         self.import_job_client.db_name = db_name
 
-
-
+    def wait_load_completed(self, collection_name, db_name="default", timeout=60):
+        t0 = time.time()
+        while True and time.time() - t0 < timeout:
+            rsp = self.collection_client.collection_describe(collection_name, db_name=db_name)
+            if "data" in rsp and "load" in rsp["data"] and rsp["data"]["load"] == "LoadStateLoaded":
+                break
+            else:
+                time.sleep(5)
diff --git a/tests/restful_client_v2/testcases/test_jobs_operation.py b/tests/restful_client_v2/testcases/test_jobs_operation.py
index 9cbe9b9688..f51f07ab26 100644
--- a/tests/restful_client_v2/testcases/test_jobs_operation.py
+++ b/tests/restful_client_v2/testcases/test_jobs_operation.py
@@ -6,7 +6,7 @@ from sklearn import preprocessing
 from pathlib import Path
 import pandas as pd
 import numpy as np
-from pymilvus import Collection
+from pymilvus import Collection, utility
 from utils.utils import gen_collection_name
 from utils.util_log import test_log as logger
 import pytest
@@ -50,8 +50,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
-
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
         # upload file to storage
         data = []
         for i in range(insert_num):
@@ -100,7 +100,7 @@ class TestCreateImportJob(TestBase):
                 if time.time() - t0 > IMPORT_TIMEOUT:
                     assert False, "import job timeout"
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -140,8 +140,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
-
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
         # upload file to storage
         data = []
         for i in range(insert_num):
@@ -190,7 +190,7 @@ class TestCreateImportJob(TestBase):
                 if time.time() - t0 > IMPORT_TIMEOUT:
                     assert False, "import job timeout"
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -229,7 +229,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         data = []
@@ -282,7 +283,7 @@ class TestCreateImportJob(TestBase):
                 if time.time() - t0 > IMPORT_TIMEOUT:
                     assert False, "import job timeout"
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -321,7 +322,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         file_nums = 2
@@ -373,7 +375,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -402,7 +404,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         file_nums = 2
@@ -454,7 +457,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -483,7 +486,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         file_nums = 2
@@ -540,7 +544,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -569,7 +573,8 @@ class TestCreateImportJob(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         file_nums = 2
@@ -665,7 +670,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == 6000
         # assert import data can be queried
         payload = {
@@ -722,8 +727,8 @@ class TestCreateImportJob(TestBase):
                 {"fieldName": "image_emb", "indexName": "image_emb", "metricType": "L2"}
             ]
         }
-        rsp = self.collection_client.collection_create(payload)
-        assert rsp['code'] == 0
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
         # create restore collection
         restore_collection_name = f"{name}_restore"
         payload["collectionName"] = restore_collection_name
@@ -848,7 +853,8 @@ class TestImportJobAdvance(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         file_nums = 10
@@ -916,7 +922,7 @@ class TestImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == file_nums * batch_size
         # assert import data can be queried
         payload = {
@@ -948,7 +954,8 @@ class TestCreateImportJobAdvance(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         task_num = 48
@@ -1009,7 +1016,7 @@ class TestCreateImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == file_nums * batch_size * task_num
         # assert import data can be queried
         payload = {
@@ -1038,7 +1045,8 @@ class TestCreateImportJobAdvance(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         task_num = 1000
@@ -1099,7 +1107,7 @@ class TestCreateImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
-        c.load(_refresh=True)
+        c.load(_refresh=True, timeou=120)
         assert c.num_entities == file_nums * batch_size * task_num
         # assert import data can be queried
         payload = {
@@ -1140,7 +1148,8 @@ class TestCreateImportJobNegative(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # upload file to storage
         data = []
@@ -1211,7 +1220,8 @@ class TestCreateImportJobNegative(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # create import job
         payload = {
@@ -1265,7 +1275,8 @@ class TestCreateImportJobNegative(TestBase):
             },
             "indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector", "metricType": "L2"}]
         }
-        rsp = self.collection_client.collection_create(payload)
+        self.collection_client.collection_create(payload)
+        self.wait_load_completed(name)
 
         # create import job
         payload = {
@@ -1520,8 +1531,8 @@ class TestCreateImportJobNegative(TestBase):
                 if time.time() - t0 > IMPORT_TIMEOUT:
                     assert False, "import job timeout"
         c = Collection(name)
-        c.load(_refresh=True)
         time.sleep(10)
+        c.load(_refresh=True, timeou=120)
         res = c.query(
             expr="",
             output_fields=["count(*)"],