mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 03:48:37 +08:00
test: update jieba tokenizer in test (#37199)
/kind improvement --------- Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
parent
f75660456d
commit
4c108b1564
@ -127,7 +127,7 @@ def custom_tokenizer(language="en"):
|
||||
# Tokenize the corpus
|
||||
def jieba_split(text):
|
||||
text_without_punctuation = remove_punctuation(text)
|
||||
return jieba.lcut(text_without_punctuation)
|
||||
return jieba.cut_for_search(text_without_punctuation)
|
||||
|
||||
def blank_space_split(text):
|
||||
text_without_punctuation = remove_punctuation(text)
|
||||
@ -169,8 +169,13 @@ def analyze_documents(texts, language="en"):
|
||||
|
||||
# Convert token ids back to words
|
||||
word_freq = Counter({id_to_word[token_id]: count for token_id, count in freq.items()})
|
||||
log.debug(f"word freq {word_freq.most_common(10)}")
|
||||
|
||||
|
||||
# If language is one of ["zh", "cn", "chinese"], keep only words that are 2 or 3 characters long (drop single characters and longer words).
|
||||
# This is a trick to keep text-match test case verification simple, because a long word can still be split into shorter words.
|
||||
if language in ["zh", "cn", "chinese"]:
|
||||
word_freq = Counter({word: count for word, count in word_freq.items() if 1< len(word) <= 3})
|
||||
log.info(f"word freq {word_freq.most_common(10)}")
|
||||
return word_freq
|
||||
|
||||
|
||||
|
@ -506,7 +506,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -658,7 +657,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(data)
|
||||
else data[i: len(data)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -800,7 +798,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
|
||||
batch_size = 5000
|
||||
for i in range(0, len(df), batch_size):
|
||||
collection_w.insert(df[i: i + batch_size])
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -938,7 +935,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
num_entities = collection_w.num_entities
|
||||
# query with count(*)
|
||||
res, _ = collection_w.query(
|
||||
@ -1190,7 +1186,6 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -1348,7 +1343,6 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -1486,7 +1480,6 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -1651,7 +1644,6 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -1775,7 +1767,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -1884,7 +1875,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -1993,8 +1983,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
|
||||
error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
@ -2091,7 +2079,6 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -2227,7 +2214,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -2316,7 +2302,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L0)
|
||||
@pytest.mark.parametrize("nq", [10])
|
||||
@pytest.mark.parametrize("nq", [2])
|
||||
@pytest.mark.parametrize("empty_percent", [0.5])
|
||||
@pytest.mark.parametrize("enable_partition_key", [True])
|
||||
@pytest.mark.parametrize("enable_inverted_index", [True])
|
||||
@ -2409,7 +2395,10 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
log.info(f"dataframe\n{df}")
|
||||
texts = df["text"].to_list()
|
||||
word_freq = cf.analyze_documents(texts, language=language)
|
||||
tokens = list(word_freq.keys())
|
||||
tokens = []
|
||||
for item in word_freq.most_common(20):
|
||||
if len(item[0]) == 2:
|
||||
tokens.append(item[0])
|
||||
if len(tokens) == 0:
|
||||
log.info(f"empty tokens, add a dummy token")
|
||||
tokens = ["dummy"]
|
||||
@ -2420,7 +2409,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -2612,7 +2600,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -2778,7 +2765,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -2925,7 +2911,6 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -3062,7 +3047,6 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
@ -3200,7 +3184,6 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
|
||||
if i + batch_size < len(df)
|
||||
else data[i: len(df)]
|
||||
)
|
||||
collection_w.flush()
|
||||
collection_w.create_index(
|
||||
"dense_emb",
|
||||
{"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
|
||||
|
Loading…
Reference in New Issue
Block a user