test: update jieba tokenizer in test (#37199)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
zhuwenxing 2024-10-28 19:22:22 +08:00 committed by GitHub
parent f75660456d
commit 4c108b1564
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 12 additions and 24 deletions

View File

@ -127,7 +127,7 @@ def custom_tokenizer(language="en"):
# Tokenize the corpus
def jieba_split(text):
text_without_punctuation = remove_punctuation(text)
return jieba.lcut(text_without_punctuation)
return jieba.cut_for_search(text_without_punctuation)
def blank_space_split(text):
text_without_punctuation = remove_punctuation(text)
@ -169,8 +169,13 @@ def analyze_documents(texts, language="en"):
# Convert token ids back to words
word_freq = Counter({id_to_word[token_id]: count for token_id, count in freq.items()})
log.debug(f"word freq {word_freq.most_common(10)}")
# if language is in ["zh", "cn", "chinese"], remove the long words
# this is a trick to keep the text-match test case verification simple, because a long word can still be split further
if language in ["zh", "cn", "chinese"]:
word_freq = Counter({word: count for word, count in word_freq.items() if 1< len(word) <= 3})
log.info(f"word freq {word_freq.most_common(10)}")
return word_freq

View File

@ -506,7 +506,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -658,7 +657,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
if i + batch_size < len(data)
else data[i: len(data)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -800,7 +798,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(df[i: i + batch_size])
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -938,7 +935,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
num_entities = collection_w.num_entities
# query with count(*)
res, _ = collection_w.query(
@ -1190,7 +1186,6 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -1348,7 +1343,6 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -1486,7 +1480,6 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -1651,7 +1644,6 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -1775,7 +1767,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -1884,7 +1875,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -1993,8 +1983,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
collection_w.create_index(
"emb",
@ -2091,7 +2079,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -2227,7 +2214,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -2316,7 +2302,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nq", [10])
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0.5])
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@ -2409,7 +2395,10 @@ class TestSearchWithFullTextSearch(TestcaseBase):
log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
tokens = []
for item in word_freq.most_common(20):
if len(item[0]) == 2:
tokens.append(item[0])
if len(tokens) == 0:
log.info(f"empty tokens, add a dummy token")
tokens = ["dummy"]
@ -2420,7 +2409,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -2612,7 +2600,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -2778,7 +2765,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -2925,7 +2911,6 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -3062,7 +3047,6 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
@ -3200,7 +3184,6 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"dense_emb",
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},