test: Avoid unstable case of bulkinsert (#28679)

test: There are too many test cases for bulkinsert + partition_key. Each
case creates 10 bulkinsert tasks, each importing a file with 100~200 rows.
The default num_partitions for a partition-key collection is 64, so each
task generates 64 tiny segments. With 10 cases, 10 tasks per case, and 64
tiny segments per task, 6400 tiny segments are generated in total. Every
one of these segments has fewer than 1024 rows, so none of them needs an
index build and all of them take part in compaction. This produces a large
number of compaction tasks, and processing them takes too long; eventually
some cases time out after waiting 5 minutes for their segments to become
ready, and those cases fail.

Specifying a small value for num_partitions avoids this problem.
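
For reference, a minimal pymilvus sketch of creating a partition-key collection with a small num_partitions; the test suite does the equivalent through collection_wrap.init_collection, and the connection details, field names, and dimension below are illustrative only:

```python
from pymilvus import (
    connections, Collection, CollectionSchema, FieldSchema, DataType,
)

# Illustrative local connection; adjust host/port for your deployment.
connections.connect(host="localhost", port="19530")

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    # Marking a scalar field as the partition key enables partition-key routing.
    FieldSchema(name="key", dtype=DataType.INT64, is_partition_key=True),
    FieldSchema(name="vec", dtype=DataType.FLOAT_VECTOR, dim=13),
]
schema = CollectionSchema(fields=fields)

# Without num_partitions, a partition-key collection gets the default
# partition count (64 in this setup); capping it at 10 keeps each
# bulkinsert task from fanning out into 64 tiny segments.
collection = Collection("bulk_insert_demo", schema=schema, num_partitions=10)
assert len(collection.partitions) == 10
```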

```
[2023-11-21T03:41:16.187Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_json_file[int_scalar-True-True] PASSED [ 54%]
[2023-11-21T03:41:42.796Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_json_file[int_scalar-False-True] PASSED [ 57%]
[2023-11-21T03:42:04.694Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_json_file[string_scalar-True-True] PASSED [ 60%]
[2023-11-21T03:42:31.205Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_json_file[string_scalar-False-True] PASSED [ 63%]
[2023-11-21T03:43:38.876Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_multi_numpy_files[10-150-13-True] XPASS [ 66%]
[2023-11-21T03:49:00.357Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_multi_numpy_files[10-150-13-False] XFAIL [ 69%]
[2023-11-21T03:53:51.811Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_csv_file[int_scalar-True] FAILED [ 72%]
[2023-11-21T03:58:58.283Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_csv_file[int_scalar-False] FAILED [ 75%]
[2023-11-21T04:02:04.696Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_csv_file[string_scalar-True] PASSED [ 78%]
[2023-11-21T04:02:26.608Z] testcases/test_bulk_insert.py::TestBulkInsert::test_partition_key_on_csv_file[string_scalar-False] PASSED [ 81%]
```

Signed-off-by: yhmo <yihua.mo@zilliz.com>
Author: groot
Date: 2023-11-28 10:34:31 +08:00 (committed by GitHub)
Parent: ce2436127c
Commit: 9c9ab08f54

testcases/test_bulk_insert.py

```
@@ -783,8 +783,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
             cf.gen_float_field(name=df.float_field),
         ]
         schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-        assert len(self.collection_wrap.partitions) == ct.default_partition_num
+        self.collection_wrap.init_collection(c_name, schema=schema, num_partitions=10)
+        assert len(self.collection_wrap.partitions) == 10
         # import data
         t0 = time.time()
@@ -858,7 +858,6 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
     @pytest.mark.parametrize("dim", [13])
     @pytest.mark.parametrize("entities", [150])
     @pytest.mark.parametrize("file_nums", [10])
-    @pytest.mark.skip(reason="issue #28209")
     def test_partition_key_on_multi_numpy_files(
         self, auto_id, dim, entities, file_nums
     ):
@@ -880,7 +879,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
             cf.gen_float_vec_field(name=df.vec_field, dim=dim),
         ]
         schema = cf.gen_collection_schema(fields=fields)
-        self.collection_wrap.init_collection(c_name, schema=schema)
+        self.collection_wrap.init_collection(c_name, schema=schema, num_partitions=10)
         # build index
         index_params = ct.default_index
         self.collection_wrap.create_index(
@@ -976,8 +975,8 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
             cf.gen_float_field(name=df.float_field),
         ]
         schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-        assert len(self.collection_wrap.partitions) == ct.default_partition_num
+        self.collection_wrap.init_collection(c_name, schema=schema, num_partitions=10)
+        assert len(self.collection_wrap.partitions) == 10
         # import data
         t0 = time.time()
```