// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #include #include #include #include "knowhere/index/vector_index/adapter/VectorAdapter.h" #include "segcore/SegmentSealedImpl.h" #include "test_utils/DataGen.h" #include "index/IndexFactory.h" #include "segcore/segcore_init_c.h" using namespace milvus; using namespace milvus::query; using namespace milvus::segcore; using milvus::index::LoadIndexInfo; const int64_t ROW_COUNT = 100 * 1000; TEST(Sealed, without_predicate) { using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); auto dim = 16; auto topK = 5; auto metric_type = knowhere::metric::L2; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto float_fid = schema->AddDebugField("age", DataType::FLOAT); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); std::string dsl = R"({ "bool": { "must": [ { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": 3 } } } ] } })"; auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(fake_id); for (int64_t i = 0; i < 1000 * dim; ++i) { vec_col.push_back(0); } auto query_ptr = vec_col.data() + 4200 * dim; auto segment = CreateGrowingSegment(schema); segment->PreInsert(N); segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); Timestamp time = 1000000; std::vector ph_group_arr = {ph_group.get()}; auto sr = segment->Search(plan.get(), ph_group.get(), time); auto pre_result = SearchResultToJson(*sr); milvus::index::CreateIndexInfo create_index_info; create_index_info.field_type = DataType::VECTOR_FLOAT; create_index_info.metric_type = knowhere::metric::L2; create_index_info.index_type = knowhere::IndexEnum::INDEX_FAISS_IVFFLAT; auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(create_index_info, nullptr); auto build_conf = knowhere::Config{{knowhere::meta::METRIC_TYPE, knowhere::metric::L2}, {knowhere::meta::DIM, std::to_string(dim)}, {knowhere::indexparam::NLIST, "100"}}; auto search_conf = knowhere::Config{{knowhere::indexparam::NPROBE, 10}}; auto database = knowhere::GenDataset(N, dim, vec_col.data() + 1000 * dim); indexing->BuildWithDataset(database, build_conf); auto vec_index = dynamic_cast(indexing.get()); EXPECT_EQ(vec_index->Count(), N); EXPECT_EQ(vec_index->GetDim(), dim); auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr); milvus::SearchInfo searchInfo; searchInfo.topk_ = topK; searchInfo.metric_type_ = knowhere::metric::L2; searchInfo.search_params_ = search_conf; auto result = vec_index->Query(query_dataset, searchInfo, nullptr); auto ref_result = SearchResultToJson(*result); LoadIndexInfo load_info; load_info.field_id = fake_id.get(); load_info.index = std::move(indexing); load_info.index_params["metric_type"] = "L2"; // load index for vec field, load raw data for scalar filed auto sealed_segment = SealedCreator(schema, dataset); sealed_segment->DropFieldData(fake_id); sealed_segment->LoadIndex(load_info); sr = sealed_segment->Search(plan.get(), ph_group.get(), time); auto post_result = SearchResultToJson(*sr); std::cout << "ref_result" << std::endl; std::cout << ref_result.dump(1) << std::endl; std::cout << "post_result" << std::endl; std::cout << post_result.dump(1); // ASSERT_EQ(ref_result.dump(1), post_result.dump(1)); sr = sealed_segment->Search(plan.get(), ph_group.get(), 0); EXPECT_EQ(sr->get_total_result_count(), 0); } TEST(Sealed, with_predicate) { using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); auto dim = 16; auto topK = 5; auto metric_type = knowhere::metric::L2; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); std::string dsl = R"({ "bool": { "must": [ { "range": { "counter": { "GE": 42000, "LT": 42005 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": 6 } } } ] } })"; auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(fake_id); auto query_ptr = vec_col.data() + 42000 * dim; auto segment = CreateGrowingSegment(schema); segment->PreInsert(N); segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); Timestamp time = 10000000; std::vector ph_group_arr = {ph_group.get()}; auto sr = segment->Search(plan.get(), ph_group.get(), time); milvus::index::CreateIndexInfo create_index_info; create_index_info.field_type = DataType::VECTOR_FLOAT; create_index_info.metric_type = knowhere::metric::L2; create_index_info.index_type = knowhere::IndexEnum::INDEX_FAISS_IVFFLAT; auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(create_index_info, nullptr); auto build_conf = knowhere::Config{{knowhere::meta::METRIC_TYPE, knowhere::metric::L2}, {knowhere::meta::DIM, std::to_string(dim)}, {knowhere::indexparam::NLIST, "100"}}; auto database = knowhere::GenDataset(N, dim, vec_col.data()); indexing->BuildWithDataset(database, build_conf); auto vec_index = dynamic_cast(indexing.get()); EXPECT_EQ(vec_index->Count(), N); EXPECT_EQ(vec_index->GetDim(), dim); auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr); auto search_conf = knowhere::Config{{knowhere::meta::METRIC_TYPE, knowhere::metric::L2}, {knowhere::indexparam::NPROBE, 10}}; milvus::SearchInfo searchInfo; searchInfo.topk_ = topK; searchInfo.metric_type_ = knowhere::metric::L2; searchInfo.search_params_ = search_conf; auto result = vec_index->Query(query_dataset, searchInfo, nullptr); LoadIndexInfo load_info; load_info.field_id = fake_id.get(); load_info.index = std::move(indexing); load_info.index_params["metric_type"] = "L2"; // load index for vec field, load raw data for scalar filed auto sealed_segment = SealedCreator(schema, dataset); sealed_segment->DropFieldData(fake_id); sealed_segment->LoadIndex(load_info); sr = sealed_segment->Search(plan.get(), ph_group.get(), time); for (int i = 0; i < num_queries; ++i) { auto offset = i * topK; ASSERT_EQ(sr->seg_offsets_[offset], 42000 + i); ASSERT_EQ(sr->distances_[offset], 0.0); } } TEST(Sealed, with_predicate_filter_all) { using namespace milvus::query; using namespace milvus::segcore; auto schema = std::make_shared(); auto dim = 16; auto topK = 5; // auto metric_type = MetricType::METRIC_L2; auto metric_type = knowhere::metric::L2; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); std::string dsl = R"({ "bool": { "must": [ { "range": { "counter": { "GE": 42000, "LT": 41999 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": 6 } } } ] } })"; auto N = ROW_COUNT; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(fake_id); auto query_ptr = vec_col.data() + 42000 * dim; auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); Timestamp time = 10000000; std::vector ph_group_arr = {ph_group.get()}; milvus::index::CreateIndexInfo create_index_info; create_index_info.field_type = DataType::VECTOR_FLOAT; create_index_info.metric_type = knowhere::metric::L2; create_index_info.index_type = knowhere::IndexEnum::INDEX_FAISS_IVFFLAT; auto ivf_indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(create_index_info, nullptr); auto ivf_build_conf = knowhere::Config{{knowhere::meta::DIM, std::to_string(dim)}, {knowhere::indexparam::NLIST, "100"}, {knowhere::meta::METRIC_TYPE, knowhere::metric::L2}}; auto database = knowhere::GenDataset(N, dim, vec_col.data()); ivf_indexing->BuildWithDataset(database, ivf_build_conf); auto ivf_vec_index = dynamic_cast(ivf_indexing.get()); EXPECT_EQ(ivf_vec_index->Count(), N); EXPECT_EQ(ivf_vec_index->GetDim(), dim); LoadIndexInfo load_info; load_info.field_id = fake_id.get(); load_info.index = std::move(ivf_indexing); load_info.index_params["metric_type"] = "L2"; // load index for vec field, load raw data for scalar filed auto ivf_sealed_segment = SealedCreator(schema, dataset); ivf_sealed_segment->DropFieldData(fake_id); ivf_sealed_segment->LoadIndex(load_info); auto sr = ivf_sealed_segment->Search(plan.get(), ph_group.get(), time); EXPECT_EQ(sr->get_total_result_count(), 0); auto hnsw_conf = knowhere::Config{{knowhere::meta::DIM, std::to_string(dim)}, {knowhere::indexparam::HNSW_M, "16"}, {knowhere::indexparam::EFCONSTRUCTION, "200"}, {knowhere::indexparam::EF, "200"}, {knowhere::meta::METRIC_TYPE, knowhere::metric::L2}}; create_index_info.field_type = DataType::VECTOR_FLOAT; create_index_info.metric_type = knowhere::metric::L2; create_index_info.index_type = knowhere::IndexEnum::INDEX_HNSW; auto hnsw_indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(create_index_info, nullptr); hnsw_indexing->BuildWithDataset(database, hnsw_conf); auto hnsw_vec_index = dynamic_cast(hnsw_indexing.get()); EXPECT_EQ(hnsw_vec_index->Count(), N); EXPECT_EQ(hnsw_vec_index->GetDim(), dim); LoadIndexInfo hnsw_load_info; hnsw_load_info.field_id = fake_id.get(); hnsw_load_info.index = std::move(hnsw_indexing); hnsw_load_info.index_params["metric_type"] = "L2"; // load index for vec field, load raw data for scalar filed auto hnsw_sealed_segment = SealedCreator(schema, dataset); hnsw_sealed_segment->DropFieldData(fake_id); hnsw_sealed_segment->LoadIndex(hnsw_load_info); auto sr2 = hnsw_sealed_segment->Search(plan.get(), ph_group.get(), time); EXPECT_EQ(sr2->get_total_result_count(), 0); } TEST(Sealed, LoadFieldData) { auto dim = 16; auto topK = 5; auto N = ROW_COUNT; auto metric_type = knowhere::metric::L2; auto schema = std::make_shared(); auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto counter_id = schema->AddDebugField("counter", DataType::INT64); auto double_id = schema->AddDebugField("double", DataType::DOUBLE); auto nothing_id = schema->AddDebugField("nothing", DataType::INT32); auto str_id = schema->AddDebugField("str", DataType::VARCHAR); schema->set_primary_field_id(counter_id); auto dataset = DataGen(schema, N); auto fakevec = dataset.get_col(fakevec_id); auto indexing = GenVecIndexing(N, dim, fakevec.data()); auto segment = CreateSealedSegment(schema); std::string dsl = R"({ "bool": { "must": [ { "range": { "double": { "GE": -1, "LT": 1 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": 3 } } } ] } })"; Timestamp time = 1000000; auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); ASSERT_ANY_THROW(segment->Search(plan.get(), ph_group.get(), time)); SealedLoadFieldData(dataset, *segment); segment->DropFieldData(nothing_id); segment->Search(plan.get(), ph_group.get(), time); segment->DropFieldData(fakevec_id); ASSERT_ANY_THROW(segment->Search(plan.get(), ph_group.get(), time)); LoadIndexInfo vec_info; vec_info.field_id = fakevec_id.get(); vec_info.index = std::move(indexing); vec_info.index_params["metric_type"] = knowhere::metric::L2; segment->LoadIndex(vec_info); ASSERT_EQ(segment->num_chunk(), 1); ASSERT_EQ(segment->num_chunk_index(double_id), 0); ASSERT_EQ(segment->num_chunk_index(str_id), 0); auto chunk_span1 = segment->chunk_data(counter_id, 0); auto chunk_span2 = segment->chunk_data(double_id, 0); auto chunk_span3 = segment->chunk_data(str_id, 0); auto ref1 = dataset.get_col(counter_id); auto ref2 = dataset.get_col(double_id); auto ref3 = dataset.get_col(str_id)->scalars().string_data().data(); for (int i = 0; i < N; ++i) { ASSERT_EQ(chunk_span1[i], ref1[i]); ASSERT_EQ(chunk_span2[i], ref2[i]); ASSERT_EQ(chunk_span3[i], ref3[i]); } auto sr = segment->Search(plan.get(), ph_group.get(), time); auto json = SearchResultToJson(*sr); std::cout << json.dump(1); segment->DropIndex(fakevec_id); ASSERT_ANY_THROW(segment->Search(plan.get(), ph_group.get(), time)); // segment->LoadIndex(vec_info); // auto sr2 = segment->Search(plan.get(), ph_group.get(), time); // auto json2 = SearchResultToJson(*sr); // ASSERT_EQ(json.dump(-2), json2.dump(-2)); // segment->DropFieldData(double_id); // ASSERT_ANY_THROW(segment->Search(plan.get(), ph_group.get(), time)); //#ifdef __linux__ // auto std_json = Json::parse(R"( //[ // [ // ["982->0.000000", "25315->4.742000", "57893->4.758000", "48201->6.075000", "53853->6.223000"], // ["41772->10.111000", "74859->11.790000", "79777->11.842000", "3785->11.983000", "35888->12.193000"], // ["59251->2.543000", "65551->4.454000", "72204->5.332000", "96905->5.479000", "87833->5.765000"], // ["59219->5.458000", "21995->6.078000", "97922->6.764000", "25710->7.158000", "14048->7.294000"], // ["66353->5.696000", "30664->5.881000", "41087->5.917000", "10393->6.633000", "90215->7.202000"] // ] //])"); //#else // for mac // auto std_json = Json::parse(R"( //[ // [ // ["982->0.000000", "31864->4.270000", "18916->4.651000", "71547->5.125000", "86706->5.991000"], // ["96984->4.192000", "65514->6.011000", "89328->6.138000", "80284->6.526000", "68218->6.563000"], // ["30119->2.464000", "82365->4.725000", "74834->5.009000", "79995->5.725000", "33359->5.816000"], // ["99625->6.129000", "86582->6.900000", "85934->7.792000", "60450->8.087000", "19257->8.530000"], // ["37759->3.581000", "31292->5.780000", "98124->6.216000", "63535->6.439000", "11707->6.553000"] // ] //])"); //#endif // ASSERT_EQ(std_json.dump(-2), json.dump(-2)); } TEST(Sealed, LoadScalarIndex) { auto dim = 16; auto N = ROW_COUNT; auto metric_type = knowhere::metric::L2; auto schema = std::make_shared(); auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto counter_id = schema->AddDebugField("counter", DataType::INT64); auto double_id = schema->AddDebugField("double", DataType::DOUBLE); auto nothing_id = schema->AddDebugField("nothing", DataType::INT32); schema->set_primary_field_id(counter_id); auto dataset = DataGen(schema, N); auto fakevec = dataset.get_col(fakevec_id); auto indexing = GenVecIndexing(N, dim, fakevec.data()); auto segment = CreateSealedSegment(schema); std::string dsl = R"({ "bool": { "must": [ { "range": { "double": { "GE": -1, "LT": 1 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": 3 } } } ] } })"; Timestamp time = 1000000; auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); LoadFieldDataInfo row_id_info; FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64); auto array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta); row_id_info.field_data = array.release(); row_id_info.row_count = dataset.row_ids_.size(); row_id_info.field_id = RowFieldID.get(); // field id for RowId segment->LoadFieldData(row_id_info); LoadFieldDataInfo ts_info; FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64); array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta); ts_info.field_data = array.release(); ts_info.row_count = dataset.timestamps_.size(); ts_info.field_id = TimestampFieldID.get(); segment->LoadFieldData(ts_info); LoadIndexInfo vec_info; vec_info.field_id = fakevec_id.get(); vec_info.field_type = DataType::VECTOR_FLOAT; vec_info.index = std::move(indexing); vec_info.index_params["metric_type"] = knowhere::metric::L2; segment->LoadIndex(vec_info); LoadIndexInfo counter_index; counter_index.field_id = counter_id.get(); counter_index.field_type = DataType::INT64; counter_index.index_params["index_type"] = "sort"; auto counter_data = dataset.get_col(counter_id); counter_index.index = std::move(GenScalarIndexing(N, counter_data.data())); segment->LoadIndex(counter_index); LoadIndexInfo double_index; double_index.field_id = double_id.get(); double_index.field_type = DataType::DOUBLE; double_index.index_params["index_type"] = "sort"; auto double_data = dataset.get_col(double_id); double_index.index = std::move(GenScalarIndexing(N, double_data.data())); segment->LoadIndex(double_index); LoadIndexInfo nothing_index; nothing_index.field_id = nothing_id.get(); nothing_index.field_type = DataType::INT32; nothing_index.index_params["index_type"] = "sort"; auto nothing_data = dataset.get_col(nothing_id); nothing_index.index = std::move(GenScalarIndexing(N, nothing_data.data())); segment->LoadIndex(nothing_index); auto sr = segment->Search(plan.get(), ph_group.get(), time); auto json = SearchResultToJson(*sr); std::cout << json.dump(1); } TEST(Sealed, Delete) { auto dim = 16; auto topK = 5; auto N = 10; auto metric_type = knowhere::metric::L2; auto schema = std::make_shared(); auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto counter_id = schema->AddDebugField("counter", DataType::INT64); auto double_id = schema->AddDebugField("double", DataType::DOUBLE); auto nothing_id = schema->AddDebugField("nothing", DataType::INT32); schema->set_primary_field_id(counter_id); auto dataset = DataGen(schema, N); auto fakevec = dataset.get_col(fakevec_id); auto segment = CreateSealedSegment(schema); std::string dsl = R"({ "bool": { "must": [ { "range": { "double": { "GE": -1, "LT": 1 } } }, { "vector": { "fakevec": { "metric_type": "L2", "params": { "nprobe": 10 }, "query": "$0", "topk": 5, "round_decimal": 3 } } } ] } })"; Timestamp time = 1000000; auto plan = CreatePlan(*schema, dsl); auto num_queries = 5; auto ph_group_raw = CreatePlaceholderGroup(num_queries, 16, 1024); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); ASSERT_ANY_THROW(segment->Search(plan.get(), ph_group.get(), time)); SealedLoadFieldData(dataset, *segment); int64_t row_count = 5; std::vector pks{1, 2, 3, 4, 5}; auto ids = std::make_unique(); ids->mutable_int_id()->mutable_data()->Add(pks.begin(), pks.end()); std::vector timestamps{10, 10, 10, 10, 10}; LoadDeletedRecordInfo info = {timestamps.data(), ids.get(), row_count}; segment->LoadDeletedRecord(info); std::vector tmp_block{0, 0}; BitsetType bitset(N, false); segment->mask_with_delete(bitset, 10, 11); ASSERT_EQ(bitset.count(), pks.size()); int64_t new_count = 3; std::vector new_pks{6, 7, 8}; auto new_ids = std::make_unique(); new_ids->mutable_int_id()->mutable_data()->Add(new_pks.begin(), new_pks.end()); std::vector new_timestamps{10, 10, 10}; auto reserved_offset = segment->PreDelete(new_count); ASSERT_EQ(reserved_offset, row_count); segment->Delete(reserved_offset, new_count, new_ids.get(), reinterpret_cast(new_timestamps.data())); } auto GenMaxFloatVecs(int N, int dim) { std::vector vecs; for (int i = 0; i < N; i++) { for (int j = 0; j < dim; j++) { vecs.push_back(std::numeric_limits::max()); } } return vecs; } auto GenRandomFloatVecs(int N, int dim) { std::vector vecs; srand(time(NULL)); for (int i = 0; i < N; i++) { for (int j = 0; j < dim; j++) { vecs.push_back(static_cast(rand()) / static_cast(RAND_MAX)); } } return vecs; } auto GenQueryVecs(int N, int dim) { std::vector vecs; for (int i = 0; i < N; i++) { for (int j = 0; j < dim; j++) { vecs.push_back(1); } } return vecs; } auto transfer_to_fields_data(const std::vector& vecs) { auto arr = std::make_unique(); *(arr->mutable_vectors()->mutable_float_vector()->mutable_data()) = {vecs.begin(), vecs.end()}; return arr; } TEST(Sealed, BF) { auto schema = std::make_shared(); auto dim = 128; auto metric_type = "L2"; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); int64_t N = 100000; auto base = GenRandomFloatVecs(N, dim); auto base_arr = transfer_to_fields_data(base); base_arr->set_type(proto::schema::DataType::FloatVector); LoadFieldDataInfo load_info{100, base_arr.get(), N}; auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); std::cout << fake_id.get() << std::endl; SealedLoadFieldData(dataset, *segment, {fake_id.get()}); segment->LoadFieldData(load_info); auto topK = 1; auto fmt = boost::format(R"(vector_anns: < field_id: 100 query_info: < topk: %1% metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0"> output_field_ids: 101)") % topK; auto serialized_expr_plan = fmt.str(); auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan.data()); auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size()); auto num_queries = 10; auto query = GenQueryVecs(num_queries, dim); auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, query); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); auto result = segment->Search(plan.get(), ph_group.get(), MAX_TIMESTAMP); auto ves = SearchResultToVector(*result); // first: offset, second: distance EXPECT_GT(ves[0].first, 0); EXPECT_LE(ves[0].first, N); EXPECT_LE(ves[0].second, dim); auto result2 = segment->Search(plan.get(), ph_group.get(), 0); EXPECT_EQ(result2->get_total_result_count(), 0); } TEST(Sealed, BF_Overflow) { auto schema = std::make_shared(); auto dim = 128; auto metric_type = "L2"; auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type); auto i64_fid = schema->AddDebugField("counter", DataType::INT64); schema->set_primary_field_id(i64_fid); int64_t N = 10; auto base = GenMaxFloatVecs(N, dim); auto base_arr = transfer_to_fields_data(base); base_arr->set_type(proto::schema::DataType::FloatVector); LoadFieldDataInfo load_info{100, base_arr.get(), N}; auto dataset = DataGen(schema, N); auto segment = CreateSealedSegment(schema); std::cout << fake_id.get() << std::endl; SealedLoadFieldData(dataset, *segment, {fake_id.get()}); segment->LoadFieldData(load_info); auto topK = 1; auto fmt = boost::format(R"(vector_anns: < field_id: 100 query_info: < topk: %1% metric_type: "L2" search_params: "{\"nprobe\": 10}" > placeholder_tag: "$0"> output_field_ids: 101)") % topK; auto serialized_expr_plan = fmt.str(); auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan.data()); auto plan = CreateSearchPlanByExpr(*schema, binary_plan.data(), binary_plan.size()); auto num_queries = 10; auto query = GenQueryVecs(num_queries, dim); auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, query); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); auto result = segment->Search(plan.get(), ph_group.get(), MAX_TIMESTAMP); auto ves = SearchResultToVector(*result); for (int i = 0; i < num_queries; ++i) { EXPECT_EQ(ves[0].first, -1); } } TEST(Sealed, DeleteCount) { auto schema = std::make_shared(); auto pk = schema->AddDebugField("pk", DataType::INT64); schema->set_primary_field_id(pk); auto segment = CreateSealedSegment(schema); int64_t c = 10; auto offset = segment->PreDelete(c); ASSERT_EQ(offset, 0); Timestamp begin_ts = 100; auto tss = GenTss(c, begin_ts); auto pks = GenPKs(c, 0); auto status = segment->Delete(offset, c, pks.get(), tss.data()); ASSERT_TRUE(status.ok()); auto cnt = segment->get_deleted_count(); ASSERT_EQ(cnt, c); } TEST(Sealed, RealCount) { auto schema = std::make_shared(); auto pk = schema->AddDebugField("pk", DataType::INT64); schema->set_primary_field_id(pk); auto segment = CreateSealedSegment(schema); int64_t c = 10; auto dataset = DataGen(schema, c); auto pks = dataset.get_col(pk); SealedLoadFieldData(dataset, *segment); // no delete. ASSERT_EQ(c, segment->get_real_count()); // delete half. auto half = c / 2; auto del_offset1 = segment->PreDelete(half); ASSERT_EQ(del_offset1, 0); auto del_ids1 = GenPKs(pks.begin(), pks.begin() + half); auto del_tss1 = GenTss(half, c); auto status = segment->Delete(del_offset1, half, del_ids1.get(), del_tss1.data()); ASSERT_TRUE(status.ok()); ASSERT_EQ(c - half, segment->get_real_count()); // delete duplicate. auto del_offset2 = segment->PreDelete(half); ASSERT_EQ(del_offset2, half); auto del_tss2 = GenTss(half, c + half); status = segment->Delete(del_offset2, half, del_ids1.get(), del_tss2.data()); ASSERT_TRUE(status.ok()); ASSERT_EQ(c - half, segment->get_real_count()); // delete all. auto del_offset3 = segment->PreDelete(c); ASSERT_EQ(del_offset3, half * 2); auto del_ids3 = GenPKs(pks.begin(), pks.end()); auto del_tss3 = GenTss(c, c + half * 2); status = segment->Delete(del_offset3, c, del_ids3.get(), del_tss3.data()); ASSERT_TRUE(status.ok()); ASSERT_EQ(0, segment->get_real_count()); }