mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-04 04:49:08 +08:00
test: [cherry-pick] Update tests for range search and add test for query with dup ids (#34069)
related issue: https://github.com/milvus-io/milvus/issues/33883
pr: #34057
Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
This commit is contained in:
parent
22e6807e9a
commit
59d910320d
@ -2275,6 +2275,41 @@ class TestQueryOperation(TestcaseBase):
|
|||||||
collection_w.query(term_expr, output_fields=["*"], check_items=CheckTasks.check_query_results,
|
collection_w.query(term_expr, output_fields=["*"], check_items=CheckTasks.check_query_results,
|
||||||
check_task={exp_res: res})
|
check_task={exp_res: res})
|
||||||
|
|
||||||
|
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("with_growing", [True])
def test_query_to_get_latest_entity_with_dup_ids(self, with_growing):
    """
    target: test query to get latest entity with duplicate primary keys
    method: 1. create a collection without data or index
            2. for each round, insert nb rows that all share the round number as
               primary key, then insert the last row of that batch once more
            3. optionally flush (only when with_growing is False), then index and load
            4. query with expr "pk == 0" and with expr "pk >= 0"
    expected: "pk == 0" returns exactly one row whose float value is nb - 1;
              "pk >= 0" returns one row per round
    """
    pk_field = ct.default_int64_field_name
    float_field = ct.default_float_field_name
    wanted_fields = [pk_field, float_field]

    collection_w = self.init_collection_general(prefix, dim=16, is_flush=False, insert_data=False, is_index=False,
                                                vector_data_type=ct.float_type, with_json=False)[0]
    nb = 50
    rounds = 10
    for round_idx in range(rounds):
        frame = cf.gen_default_dataframe_data(dim=16, nb=nb, start=round_idx * nb, with_json=False)
        # overwrite the primary key column so every row of this batch is a duplicate
        frame[pk_field] = round_idx
        collection_w.insert(frame)
        # re-insert the last piece of data in the batch to refresh the timestamp
        newest_row = frame.iloc[-1:]
        collection_w.insert(newest_row)

    if not with_growing:
        collection_w.flush()
    collection_w.create_index(ct.default_float_vec_field_name, index_params=ct.default_index)
    collection_w.load()

    # verify the result returns the latest entity if there are duplicate primary keys
    latest_rows = collection_w.query(expr=f'{pk_field} == 0', output_fields=wanted_fields)[0]
    assert len(latest_rows) == 1
    assert latest_rows[0][float_field] == (nb - 1) * 1.0

    # verify the result is same as dedup entities
    dedup_rows = collection_w.query(expr=f'{pk_field} >= 0', output_fields=wanted_fields)[0]
    assert len(dedup_rows) == rounds
|
||||||
|
|
||||||
@pytest.mark.tags(CaseLabel.L0)
|
@pytest.mark.tags(CaseLabel.L0)
|
||||||
def test_query_after_index(self):
|
def test_query_after_index(self):
|
||||||
"""
|
"""
|
||||||
|
@ -6926,20 +6926,22 @@ class TestCollectionRangeSearch(TestcaseBase):
|
|||||||
"""
|
"""
|
||||||
@pytest.mark.tags(CaseLabel.L0)
|
@pytest.mark.tags(CaseLabel.L0)
|
||||||
@pytest.mark.parametrize("vector_data_type", ct.all_dense_vector_types)
|
@pytest.mark.parametrize("vector_data_type", ct.all_dense_vector_types)
|
||||||
def test_range_search_default(self, index_type, metric, vector_data_type):
|
@pytest.mark.parametrize("with_growing", [False, True])
|
||||||
|
def test_range_search_default(self, index_type, metric, vector_data_type, with_growing):
|
||||||
"""
|
"""
|
||||||
target: verify the range search returns correct results
|
target: verify the range search returns correct results
|
||||||
method: 1. create collection, insert 8000 vectors,
|
method: 1. create collection, insert 10k vectors,
|
||||||
2. search with topk=1000
|
2. search with topk=1000
|
||||||
3. range search from the 30th-330th distance as filter
|
3. range search from the 30th-330th distance as filter
|
||||||
4. verified the range search results is same as the search results in the range
|
4. verified the range search results is same as the search results in the range
|
||||||
"""
|
"""
|
||||||
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
|
collection_w = self.init_collection_general(prefix, auto_id=True, insert_data=False, is_index=False,
|
||||||
vector_data_type=vector_data_type, with_json=False)[0]
|
vector_data_type=vector_data_type, with_json=False)[0]
|
||||||
nb = 2000
|
nb = 1000
|
||||||
for i in range(3):
|
rounds = 10
|
||||||
data = cf.gen_general_default_list_data(nb=nb, auto_id=True,
|
for i in range(rounds):
|
||||||
vector_data_type=vector_data_type, with_json=False)
|
data = cf.gen_general_default_list_data(nb=nb, auto_id=True, vector_data_type=vector_data_type,
|
||||||
|
with_json=False, start=i*nb)
|
||||||
collection_w.insert(data)
|
collection_w.insert(data)
|
||||||
|
|
||||||
collection_w.flush()
|
collection_w.flush()
|
||||||
@ -6947,51 +6949,49 @@ class TestCollectionRangeSearch(TestcaseBase):
|
|||||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
|
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
|
||||||
collection_w.load()
|
collection_w.load()
|
||||||
|
|
||||||
for i in range(2):
|
if with_growing is True:
|
||||||
with_growing = bool(i % 2)
|
# add some growing segments
|
||||||
if with_growing is True:
|
for j in range(rounds//2):
|
||||||
# add some growing segments
|
data = cf.gen_general_default_list_data(nb=nb, auto_id=True, vector_data_type=vector_data_type,
|
||||||
for _ in range(2):
|
with_json=False, start=(rounds+j)*nb)
|
||||||
data = cf.gen_general_default_list_data(nb=nb, auto_id=True,
|
collection_w.insert(data)
|
||||||
vector_data_type=vector_data_type, with_json=False)
|
|
||||||
collection_w.insert(data)
|
|
||||||
|
|
||||||
search_params = {"params": {}}
|
search_params = {"params": {}}
|
||||||
nq = 1
|
nq = 1
|
||||||
search_vectors = cf.gen_vectors(nq, ct.default_dim, vector_data_type=vector_data_type)
|
search_vectors = cf.gen_vectors(nq, ct.default_dim, vector_data_type=vector_data_type)
|
||||||
search_res = collection_w.search(search_vectors, default_search_field,
|
search_res = collection_w.search(search_vectors, default_search_field,
|
||||||
search_params, limit=1000)[0]
|
search_params, limit=1000)[0]
|
||||||
assert len(search_res[0].ids) == 1000
|
assert len(search_res[0].ids) == 1000
|
||||||
log.debug(f"search topk=1000 returns {len(search_res[0].ids)}")
|
log.debug(f"search topk=1000 returns {len(search_res[0].ids)}")
|
||||||
check_topk = 300
|
check_topk = 300
|
||||||
check_from = 30
|
check_from = 30
|
||||||
ids = search_res[0].ids[check_from:check_from + check_topk]
|
ids = search_res[0].ids[check_from:check_from + check_topk]
|
||||||
radius = search_res[0].distances[check_from + check_topk]
|
radius = search_res[0].distances[check_from + check_topk]
|
||||||
range_filter = search_res[0].distances[check_from]
|
range_filter = search_res[0].distances[check_from]
|
||||||
|
|
||||||
# rebuild the collection with test target index
|
# rebuild the collection with test target index
|
||||||
collection_w.release()
|
collection_w.release()
|
||||||
collection_w.indexes[0].drop()
|
collection_w.indexes[0].drop()
|
||||||
_index_params = {"index_type": index_type, "metric_type": metric,
|
_index_params = {"index_type": index_type, "metric_type": metric,
|
||||||
"params": cf.get_index_params_params(index_type)}
|
"params": cf.get_index_params_params(index_type)}
|
||||||
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
|
collection_w.create_index(ct.default_float_vec_field_name, index_params=_index_params)
|
||||||
collection_w.load()
|
collection_w.load()
|
||||||
|
|
||||||
params = cf.get_search_params_params(index_type)
|
params = cf.get_search_params_params(index_type)
|
||||||
params.update({"radius": radius, "range_filter": range_filter})
|
params.update({"radius": radius, "range_filter": range_filter})
|
||||||
if index_type == "HNSW":
|
if index_type == "HNSW":
|
||||||
params.update({"ef": check_topk+100})
|
params.update({"ef": check_topk+100})
|
||||||
if index_type == "IVF_PQ":
|
if index_type == "IVF_PQ":
|
||||||
params.update({"max_empty_result_buckets": 100})
|
params.update({"max_empty_result_buckets": 100})
|
||||||
range_search_params = {"params": params}
|
range_search_params = {"params": params}
|
||||||
range_res = collection_w.search(search_vectors, default_search_field,
|
range_res = collection_w.search(search_vectors, default_search_field,
|
||||||
range_search_params, limit=check_topk)[0]
|
range_search_params, limit=check_topk)[0]
|
||||||
range_ids = range_res[0].ids
|
range_ids = range_res[0].ids
|
||||||
# assert len(range_ids) == check_topk
|
# assert len(range_ids) == check_topk
|
||||||
log.debug(f"range search radius={radius}, range_filter={range_filter}, range results num: {len(range_ids)}")
|
log.debug(f"range search radius={radius}, range_filter={range_filter}, range results num: {len(range_ids)}")
|
||||||
hit_rate = round(len(set(ids).intersection(set(range_ids))) / len(set(ids)), 2)
|
hit_rate = round(len(set(ids).intersection(set(range_ids))) / len(set(ids)), 2)
|
||||||
log.debug(f"range search results with growing {bool(i % 2)} hit rate: {hit_rate}")
|
log.debug(f"{vector_data_type} range search results {index_type} {metric} with_growing {with_growing} hit_rate: {hit_rate}")
|
||||||
assert hit_rate >= 0.2 # issue #32630 to improve the accuracy
|
assert hit_rate >= 0.2 # issue #32630 to improve the accuracy
|
||||||
|
|
||||||
@pytest.mark.tags(CaseLabel.L2)
|
@pytest.mark.tags(CaseLabel.L2)
|
||||||
@pytest.mark.parametrize("range_filter", [1000, 1000.0])
|
@pytest.mark.parametrize("range_filter", [1000, 1000.0])
|
||||||
|
Loading…
Reference in New Issue
Block a user