test: add null and default test cases (#36539)

issue: #36129

Signed-off-by: binbin lv <binbin.lv@zilliz.com>
binbin 2024-09-29 09:39:22 +08:00 committed by GitHub
parent 8ed34dce84
commit d1d5a50014
7 changed files with 276 additions and 19 deletions
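For orientation, the schema features exercised by these tests look roughly like the sketch below; a minimal illustration assuming a pymilvus version where FieldSchema accepts nullable and default_value (field names are made up for the example).

from pymilvus import CollectionSchema, FieldSchema, DataType

# Minimal sketch (assumed pymilvus API): one nullable field and one field with a default value.
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True),
    # nullable=True allows None to be inserted for this field
    FieldSchema(name="float_field", dtype=DataType.FLOAT, nullable=True),
    # default_value is used when the field is omitted at insert time
    FieldSchema(name="string_field", dtype=DataType.VARCHAR, max_length=64, default_value="data"),
    FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=128),
]
schema = CollectionSchema(fields=fields, description="nullable and default value demo")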


@ -589,40 +589,39 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
else:
data = pd.Series([json.dumps({
gen_unique_str(): None}) for _ in range(start, rows + start)])
data = [json.dumps({gen_unique_str(): None}) for _ in range(start, rows + start)]
elif data_field == DataField.array_bool_field:
if not nullable:
data = pd.Series(
[np.array([random.choice([True, False]) for _ in range(array_length)], dtype=np.dtype("bool"))
for i in range(start, rows + start)])
else:
data = pd.Series(
[None for i in range(start, rows + start)])
data = [None for _ in range(start, rows + start)]
elif data_field == DataField.array_int_field:
if not nullable:
data = pd.Series(
[np.array([random.randint(-999999, 9999999) for _ in range(array_length)], dtype=np.dtype("int64"))
for i in range(start, rows + start)])
else:
data = pd.Series(
[None for i in range(start, rows + start)])
data = [None for _ in range(start, rows + start)]
elif data_field == DataField.array_float_field:
if not nullable:
data = pd.Series(
[np.array([random.random() for _ in range(array_length)], dtype=np.dtype("float32"))
for i in range(start, rows + start)])
else:
data = pd.Series(
[None for i in range(start, rows + start)])
data = [None for _ in range(start, rows + start)]
elif data_field == DataField.array_string_field:
if not nullable:
data = pd.Series(
[np.array([gen_unique_str(str(i)) for _ in range(array_length)], dtype=np.dtype("str"))
for i in range(start, rows + start)])
else:
data = pd.Series(
[None for i in range(start, rows + start)])
data = [None for _ in range(start, rows + start)]
else:
raise Exception("unsupported field name")
return data
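The nullable branches above now emit plain Python lists of None rather than pd.Series; a condensed sketch of the pattern, with an illustrative helper name:

import random
import numpy as np

def gen_int_array_column(rows, array_length=10, nullable=False, start=0):
    # Non-nullable: one numpy int64 array per row; nullable: a None entry per row.
    if not nullable:
        return [np.array([random.randint(-999999, 9999999) for _ in range(array_length)],
                         dtype=np.dtype("int64"))
                for _ in range(start, rows + start)]
    return [None for _ in range(start, rows + start)]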


@ -676,7 +676,7 @@ def gen_array_collection_schema(description=ct.default_desc, primary_field=ct.de
gen_array_field(name=ct.default_float_array_field_name, element_type=DataType.FLOAT,
max_capacity=max_capacity),
gen_array_field(name=ct.default_string_array_field_name, element_type=DataType.VARCHAR,
max_capacity=max_capacity, max_length=max_length)]
max_capacity=max_capacity, max_length=max_length, nullable=True)]
if with_json is False:
fields.remove(gen_json_field())
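Adding nullable=True to the string array field above roughly corresponds to a FieldSchema like the one below; a sketch assuming a pymilvus version with ARRAY and nullable support (the field name is illustrative):

from pymilvus import FieldSchema, DataType

# Sketch (assumed pymilvus API): a VARCHAR array field that also accepts None values.
string_array_field = FieldSchema(name="string_array", dtype=DataType.ARRAY,
                                 element_type=DataType.VARCHAR,
                                 max_capacity=100, max_length=100,
                                 nullable=True)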
@ -2934,7 +2934,7 @@ def gen_sparse_vectors(nb, dim=1000, sparse_format="dok"):
return vectors
def gen_vectors_based_on_vector_type(num, dim, vector_data_type):
def gen_vectors_based_on_vector_type(num, dim, vector_data_type=ct.float_type):
"""
generate float16 vector data
raw_vectors : the vectors


@ -887,6 +887,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
expr_field = df.string_field
expr = f"{expr_field} >= '0'"
else:
res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field, df.int_field])
assert len(res) == 0
expr_field = df.pk_field
expr = f"{expr_field} >= 0"
@ -925,7 +927,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
if enable_dynamic_field is False and include_meta is True:
pytest.skip("include_meta only works with enable_dynamic_field")
if nullable is True:
pytest.skip("issue #36241")
pytest.skip("not support bulk insert numpy files in field which set nullable == true")
float_vec_field_dim = dim
binary_vec_field_dim = ((dim+random.randint(-16, 32)) // 8) * 8
bf16_vec_field_dim = dim+random.randint(-16, 32)
@ -1201,18 +1203,26 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
assert "name" in fields_from_search
assert "address" in fields_from_search
# query data
res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field])
if nullable is False:
assert len(res) == entities
query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
if nullable is False:
assert len(res) == len(query_data)
if not nullable:
expr_field = df.string_field
expr = f"{expr_field} >= '0'"
else:
res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field])
assert len(res) == 0
expr_field = df.pk_field
expr = f"{expr_field} >= 0"
res, _ = self.collection_wrap.query(expr=f"{expr}", output_fields=[df.string_field])
assert len(res) == entities
query_data = [r[expr_field] for r in res][:len(self.collection_wrap.partitions)]
res, _ = self.collection_wrap.query(expr=f"{expr_field} in {query_data}", output_fields=[expr_field])
assert len(res) == len(query_data)
res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
if nullable is False:
if not nullable:
assert len(res) == entities
else:
assert 0 < len(res) < entities
if enable_partition_key:
assert len(self.collection_wrap.partitions) > 1
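Two expectations drive the rewritten block: scalar comparison filters do not match rows whose value is null (so an all-null string column yields an empty result and the follow-up queries fall back to the primary key), and TextMatch only hits non-null rows, so a partially-null text column matches strictly fewer rows than the total. A rough expectation sketch, assuming nulls are injected uniformly at a fixed ratio:

# Rough expectation sketch (assumed: nulls injected uniformly at null_data_percent).
entities = 3000
null_data_percent = 0.5
non_null_rows = int(entities * (1 - null_data_percent))
# TextMatch can hit at most the non-null rows; the test only asserts the loose
# bound 0 < len(res) < entities because which rows contain 'milvus' depends on the data.
assert 0 < non_null_rows < entities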


@ -4664,3 +4664,48 @@ class TestCollectionDefaultValueInvalid(TestcaseBase):
self.field_schema_wrap.init_field_schema(name="int8_null", dtype=DataType.INT8, default_value=None,
check_task=CheckTasks.err_res, check_items=error)
class TestCollectionDefaultValueValid(TestcaseBase):
""" Test case of collection interface """
"""
******************************************************************
# The following are valid cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue 36457")
def test_create_collection_default_value_twice(self):
"""
target: test create collection with default value field twice
method: create collection with the same schema (containing a default value field) twice
expected: both creations succeed
"""
self._connect()
int_fields = []
c_name = cf.gen_unique_str(prefix)
# add fields: primary key, a float field with a default value, and a vector field
int_fields.append(cf.gen_int64_field(is_primary=True))
int_fields.append(cf.gen_float_field(default_value=numpy.float32(10.0)))
int_fields.append(cf.gen_float_vec_field())
schema = cf.gen_collection_schema(fields=int_fields)
self.collection_wrap.init_collection(c_name, schema=schema)
self.collection_wrap.init_collection(c_name, schema=schema)
@pytest.mark.tags(CaseLabel.L1)
def test_create_collection_none_twice(self):
"""
target: test create collection with nullable field twice
method: create collection with nullable field twice
expected: both creations succeed
"""
self._connect()
int_fields = []
c_name = cf.gen_unique_str(prefix)
int_fields.append(cf.gen_int64_field(is_primary=True))
int_fields.append(cf.gen_float_field(nullable=True))
int_fields.append(cf.gen_float_vec_field())
schema = cf.gen_collection_schema(fields=int_fields)
self.collection_wrap.init_collection(c_name, schema=schema)
self.collection_wrap.init_collection(c_name, schema=schema)


@ -2336,3 +2336,84 @@ class TestDeleteComplexExpr(TestcaseBase):
check_task=CheckTasks.check_query_results,
check_items={'count(*)': nb - len(filter_ids)})
class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
"""
Test case of delete interface with None data
"""
@pytest.fixture(scope="function", params=[0, 0.5, 1])
def null_data_percent(self, request):
yield request.param
@pytest.mark.tags(CaseLabel.L1)
def test_delete_search_with_none_data(self, null_data_percent):
"""
target: test delete and search when there is None data
method: search entities after it was deleted
expected: deleted entity is not in the search result
"""
# init collection with nb default data
collection_w, _, _, ids = self.init_collection_general(prefix, insert_data=True,
nullable_fields={ct.default_float_field_name: null_data_percent},
default_value_fields={ct.default_string_field_name: "data"})[0:4]
entity, _ = collection_w.query(tmp_expr, output_fields=["*"])
search_res, _ = collection_w.search([entity[0][ct.default_float_vec_field_name]],
ct.default_float_vec_field_name,
ct.default_search_params, ct.default_limit)
# assert the search result contains the queried entity
assert 0 in search_res[0].ids
expr = f'{ct.default_int64_field_name} in {ids[:ct.default_nb // 2]}'
collection_w.delete(expr)
search_res_2, _ = collection_w.search([entity[0][ct.default_float_vec_field_name]],
ct.default_float_vec_field_name,
ct.default_search_params, ct.default_limit)
# assert the deleted entities no longer appear in the search result
log.debug(f"Second search result ids: {search_res_2[0].ids}")
inter = set(ids[:ct.default_nb // 2]
).intersection(set(search_res_2[0].ids))
# Using bounded staleness, the search may still return the "deleted" entities,
# since the search request can reach the query nodes before they consume the delete requests.
assert len(inter) == 0
@pytest.mark.tags(CaseLabel.L2)
def test_delete_entities_repeatedly_with_string_none_data(self, null_data_percent):
"""
target: test delete entities twice with string expr
method: delete with same expr twice
expected: No exception for second deletion
"""
# init collection with nb default data
collection_w = \
self.init_collection_general(prefix, nb=tmp_nb, insert_data=True, primary_field=ct.default_string_field_name,
nullable_fields={ct.default_float_field_name: null_data_percent},
default_value_fields={ct.default_int64_field_name: 100})[0]
# assert delete successfully and no exception
collection_w.delete(expr=default_string_expr)
collection_w.num_entities
collection_w.query(default_string_expr,
check_task=CheckTasks.check_query_empty)
collection_w.delete(expr=default_string_expr)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="waiting for the expr code part to be merged")
def test_delete_entities_repeatedly_with_expr_on_none_fields(self, null_data_percent):
"""
target: test delete entities twice with expr on nullable fields
method: delete with same expr twice
expected: No exception for second deletion
"""
# init collection with nb default data
collection_w = \
self.init_collection_general(prefix, nb=tmp_nb, insert_data=True, primary_field=ct.default_string_field_name,
nullable_fields={ct.default_float_field_name: null_data_percent},
default_value_fields={ct.default_int64_field_name: 100})[0]
# assert delete successfully and no exception
collection_w.delete(expr=default_string_expr)
collection_w.num_entities
collection_w.query(default_string_expr,
check_task=CheckTasks.check_query_empty)
collection_w.delete(expr=default_string_expr)


@ -3965,6 +3965,49 @@ class TestQueryCount(TestcaseBase):
check_items={"count": ct.default_nb,
"batch_size": batch_size})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.repeat(3)
@pytest.mark.skip(reason="issue #36538")
def test_count_query_search_after_release_partition_load(self):
"""
target: test query count(*) after release collection and load partition
method: 1. create a collection with 2 partitions
2. insert data
3. load one partition
4. delete half data in each partition
5. release the collection and load one partition
6. search
expected: No exception
"""
# insert data
collection_w = self.init_collection_general(prefix, True, 200, partition_num=1, is_index=True)[0]
collection_w.query(expr='', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{ct.default_count_output: 200}]})
collection_w.release()
partition_w1, partition_w2 = collection_w.partitions
# load
partition_w1.load()
# delete data
delete_ids = [i for i in range(50, 150)]
collection_w.delete(f"int64 in {delete_ids}")
# release
collection_w.release()
# partition_w1.load()
collection_w.load(partition_names=[partition_w1.name])
# query count(*) on the collection and partition1, then search partition2 (not loaded, expect an error)
collection_w.query(expr='', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{ct.default_count_output: 50}]})
partition_w1.query(expr='', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{ct.default_count_output: 50}]})
vectors = [[random.random() for _ in range(ct.default_dim)] for _ in range(ct.default_nq)]
collection_w.search(vectors[:1], ct.default_float_vec_field_name, ct.default_search_params, 200,
partition_names=[partition_w2.name],
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: 'not loaded'})
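The expected count of 50 follows from the setup, assuming init_collection_general distributes the 200 rows evenly across the two partitions and assigns ids in order (ids 0-99 in the loaded partition):

# Expected-count sketch under the assumptions above.
rows_per_partition = 200 // 2                                  # 100 rows per partition
deleted_ids = set(range(50, 150))                              # spans both partitions
loaded_partition_ids = set(range(0, rows_per_partition))       # ids 0-99
remaining_visible = len(loaded_partition_ids - deleted_ids)    # 50
assert remaining_visible == 50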
class TestQueryIterator(TestcaseBase):
"""
@ -4503,6 +4546,51 @@ class TestQueryNoneAndDefaultData(TestcaseBase):
collection_w.query(term_expr, output_fields=[ct.default_int64_field_name, default_float_field_name],
check_task=CheckTasks.check_query_results, check_items={exp_res: res})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="issue #36538")
def test_query_none_count(self, null_data_percent):
"""
target: test query count(*) with None and default data
method: 1. create a collection and 2 partitions with nullable and default value fields
2. insert data
3. load one partition
4. delete half data in each partition
5. release the collection and load one partition
6. search
expected: No exception
"""
# insert data
collection_w = self.init_collection_general(prefix, True, 200, partition_num=1, is_index=True,
nullable_fields={ct.default_float_field_name: null_data_percent},
default_value_fields={ct.default_string_field_name: "data"})[0]
collection_w.query(expr='', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{ct.default_count_output: 200}]})
collection_w.release()
partition_w1, partition_w2 = collection_w.partitions
# load
partition_w1.load()
# delete data
delete_ids = [i for i in range(50, 150)]
collection_w.delete(f"int64 in {delete_ids}")
# release
collection_w.release()
# partition_w1.load()
collection_w.load(partition_names=[partition_w1.name])
# query count(*) on the collection and partition1, then search partition2 (not loaded, expect an error)
collection_w.query(expr='', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{ct.default_count_output: 50}]})
partition_w1.query(expr='', output_fields=[ct.default_count_output],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{ct.default_count_output: 50}]})
vectors = [[random.random() for _ in range(ct.default_dim)] for _ in range(ct.default_nq)]
collection_w.search(vectors[:1], ct.default_float_vec_field_name, ct.default_search_params, 200,
partition_names=[partition_w2.name],
check_task=CheckTasks.err_res,
check_items={ct.err_code: 1, ct.err_msg: 'not loaded'})
class TestQueryTextMatch(TestcaseBase):
"""
******************************************************************


@ -13113,6 +13113,39 @@ class TestCollectionSearchNoneAndDefaultData(TestcaseBase):
"output_fields": [default_int64_field_name,
default_float_field_name]})
@pytest.mark.tags(CaseLabel.L2)
def test_search_none_data_partial_load(self, is_flush, enable_dynamic_field, null_data_percent):
"""
target: test search with None data after partial load
method: create connection and collection with nullable fields, insert data including None, release and reload with partial fields (load_fields), then search
expected: search successfully with limit(topK)
"""
# 1. initialize with data
collection_w, _, _, insert_ids, time_stamp = \
self.init_collection_general(prefix, True, is_flush=is_flush,
enable_dynamic_field=enable_dynamic_field,
nullable_fields={ct.default_float_field_name: null_data_percent})[0:5]
# 2. release and partial load again
collection_w.release()
loaded_fields = [default_int64_field_name, ct.default_float_vec_field_name]
if not enable_dynamic_field:
loaded_fields.append(default_float_field_name)
collection_w.load(load_fields=loaded_fields)
# 3. generate search data
vectors = cf.gen_vectors_based_on_vector_type(default_nq, default_dim)
# 4. search after partial load field with None data
output_fields = [default_int64_field_name, default_float_field_name]
collection_w.search(vectors[:default_nq], default_search_field,
default_search_params, default_limit,
default_search_exp,
output_fields=output_fields,
check_task=CheckTasks.check_search_results,
check_items={"nq": default_nq,
"ids": insert_ids,
"limit": default_limit,
"output_fields": output_fields})
class TestSearchWithTextMatchFilter(TestcaseBase):
"""
******************************************************************
@ -13259,3 +13292,4 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
for r in res:
r = r.to_dict()
assert any([token in r["entity"][field] for token in top_10_tokens])