test:[cherry-pick]Update tests for range search and add test for query with dup ids (#34069)

related issue: https://github.com/milvus-io/milvus/issues/33883
pr: #34057

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
This commit is contained in:
yanliang567 2024-06-24 11:34:03 +08:00 committed by GitHub
parent 22e6807e9a
commit 59d910320d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 83 additions and 48 deletions

View File

@ -2275,6 +2275,41 @@ class TestQueryOperation(TestcaseBase):
collection_w.query(term_expr, output_fields=["*"], check_items=CheckTasks.check_query_results, collection_w.query(term_expr, output_fields=["*"], check_items=CheckTasks.check_query_results,
check_task={exp_res: res}) check_task={exp_res: res})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("with_growing", [True])
def test_query_to_get_latest_entity_with_dup_ids(self, with_growing):
    """
    target: test query to get latest entity with duplicate primary keys
    method: 1.create collection and insert dup primary key = 0
            2.query with expr=dup_id
    expected: return the latest entity; verify the result is same as dedup entities
    """
    # NOTE(review): parametrized with [True] only, so the `not with_growing`
    # flush branch below never runs in this configuration — confirm whether a
    # sealed-segment (flushed) variant is also meant to be covered.
    collection_w = self.init_collection_general(prefix, dim=16, is_flush=False, insert_data=False, is_index=False,
                                                vector_data_type=ct.float_type, with_json=False)[0]
    nb = 50      # rows inserted per round
    rounds = 10  # insert rounds; each round reuses a single pk value
    for i in range(rounds):
        # Generate nb rows, then overwrite the pk column so that every row of
        # this round shares primary key == i (nb duplicates per pk value).
        df = cf.gen_default_dataframe_data(dim=16, nb=nb, start=i * nb, with_json=False)
        df[ct.default_int64_field_name] = i
        collection_w.insert(df)
        # re-insert the last piece of data in df to refresh the timestamp
        last_piece = df.iloc[-1:]
        collection_w.insert(last_piece)
    if not with_growing:
        collection_w.flush()
    collection_w.create_index(ct.default_float_vec_field_name, index_params=ct.default_index)
    collection_w.load()
    # verify the result returns the latest entity if there are duplicate primary keys
    expr = f'{ct.default_int64_field_name} == 0'
    res = collection_w.query(expr=expr, output_fields=[ct.default_int64_field_name, ct.default_float_field_name])[0]
    # assumes the generated float field equals the row offset, so the last
    # (re-inserted, hence newest) row of round 0 carries (nb - 1) * 1.0 —
    # TODO confirm against cf.gen_default_dataframe_data
    assert len(res) == 1 and res[0][ct.default_float_field_name] == (nb - 1) * 1.0
    # verify the result is same as dedup entities
    expr = f'{ct.default_int64_field_name} >= 0'
    res = collection_w.query(expr=expr, output_fields=[ct.default_int64_field_name, ct.default_float_field_name])[0]
    # dedup by pk should leave exactly one entity per round
    assert len(res) == rounds
@pytest.mark.tags(CaseLabel.L0) @pytest.mark.tags(CaseLabel.L0)
def test_query_after_index(self): def test_query_after_index(self):
""" """

View File

@ -6926,20 +6926,22 @@ class TestCollectionRangeSearch(TestcaseBase):
""" """
@pytest.mark.tags(CaseLabel.L0) @pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("vector_data_type", ct.all_dense_vector_types) @pytest.mark.parametrize("vector_data_type", ct.all_dense_vector_types)
def test_range_search_default(self, index_type, metric, vector_data_type): @pytest.mark.parametrize("with_growing", [False, True])
def test_range_search_default(self, index_type, metric, vector_data_type, with_growing):
""" """
target: verify the range search returns correct results target: verify the range search returns correct results
method: 1. create collection, insert 8000 vectors, method: 1. create collection, insert 10k vectors,
2. search with topk=1000 2. search with topk=1000
3. range search from the 30th-330th distance as filter 3. range search from the 30th-330th distance as filter
4. verified the range search results is same as the search results in the range 4. verified the range search results is same as the search results in the range
""" """
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False, collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
vector_data_type=vector_data_type, with_json=False)[0] vector_data_type=vector_data_type, with_json=False)[0]
nb = 2000 nb = 1000
for i in range(3): rounds = 10
data = cf.gen_general_default_list_data(nb=nb, auto_id=True, for i in range(rounds):
vector_data_type=vector_data_type, with_json=False) data = cf.gen_general_default_list_data(nb=nb, auto_id=True, vector_data_type=vector_data_type,
with_json=False, start=i*nb)
collection_w.insert(data) collection_w.insert(data)
collection_w.flush() collection_w.flush()
@ -6947,51 +6949,49 @@ class TestCollectionRangeSearch(TestcaseBase):
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params) collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
collection_w.load() collection_w.load()
for i in range(2): if with_growing is True:
with_growing = bool(i % 2) # add some growing segments
if with_growing is True: for j in range(rounds//2):
# add some growing segments data = cf.gen_general_default_list_data(nb=nb, auto_id=True, vector_data_type=vector_data_type,
for _ in range(2): with_json=False, start=(rounds+j)*nb)
data = cf.gen_general_default_list_data(nb=nb, auto_id=True, collection_w.insert(data)
vector_data_type=vector_data_type, with_json=False)
collection_w.insert(data)
search_params = {"params": {}} search_params = {"params": {}}
nq = 1 nq = 1
search_vectors = cf.gen_vectors(nq, ct.default_dim, vector_data_type=vector_data_type) search_vectors = cf.gen_vectors(nq, ct.default_dim, vector_data_type=vector_data_type)
search_res = collection_w.search(search_vectors, default_search_field, search_res = collection_w.search(search_vectors, default_search_field,
search_params, limit=1000)[0] search_params, limit=1000)[0]
assert len(search_res[0].ids) == 1000 assert len(search_res[0].ids) == 1000
log.debug(f"search topk=1000 returns {len(search_res[0].ids)}") log.debug(f"search topk=1000 returns {len(search_res[0].ids)}")
check_topk = 300 check_topk = 300
check_from = 30 check_from = 30
ids = search_res[0].ids[check_from:check_from + check_topk] ids = search_res[0].ids[check_from:check_from + check_topk]
radius = search_res[0].distances[check_from + check_topk] radius = search_res[0].distances[check_from + check_topk]
range_filter = search_res[0].distances[check_from] range_filter = search_res[0].distances[check_from]
# rebuild the collection with test target index # rebuild the collection with test target index
collection_w.release() collection_w.release()
collection_w.indexes[0].drop() collection_w.indexes[0].drop()
_index_params = {"index_type": index_type, "metric_type": metric, _index_params = {"index_type": index_type, "metric_type": metric,
"params": cf.get_index_params_params(index_type)} "params": cf.get_index_params_params(index_type)}
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params) collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
collection_w.load() collection_w.load()
params = cf.get_search_params_params(index_type) params = cf.get_search_params_params(index_type)
params.update({"radius": radius, "range_filter": range_filter}) params.update({"radius": radius, "range_filter": range_filter})
if index_type == "HNSW": if index_type == "HNSW":
params.update({"ef": check_topk+100}) params.update({"ef": check_topk+100})
if index_type == "IVF_PQ": if index_type == "IVF_PQ":
params.update({"max_empty_result_buckets": 100}) params.update({"max_empty_result_buckets": 100})
range_search_params = {"params": params} range_search_params = {"params": params}
range_res = collection_w.search(search_vectors, default_search_field, range_res = collection_w.search(search_vectors, default_search_field,
range_search_params, limit=check_topk)[0] range_search_params, limit=check_topk)[0]
range_ids = range_res[0].ids range_ids = range_res[0].ids
# assert len(range_ids) == check_topk # assert len(range_ids) == check_topk
log.debug(f"range search radius={radius}, range_filter={range_filter}, range results num: {len(range_ids)}") log.debug(f"range search radius={radius}, range_filter={range_filter}, range results num: {len(range_ids)}")
hit_rate = round(len(set(ids).intersection(set(range_ids))) / len(set(ids)), 2) hit_rate = round(len(set(ids).intersection(set(range_ids))) / len(set(ids)), 2)
log.debug(f"range search results with growing {bool(i % 2)} hit rate: {hit_rate}") log.debug(f"{vector_data_type} range search results {index_type} {metric} with_growing {with_growing} hit_rate: {hit_rate}")
assert hit_rate >= 0.2 # issue #32630 to improve the accuracy assert hit_rate >= 0.2 # issue #32630 to improve the accuracy
@pytest.mark.tags(CaseLabel.L2) @pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("range_filter", [1000, 1000.0]) @pytest.mark.parametrize("range_filter", [1000, 1000.0])