// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <boost/format.hpp>
#include <chrono>
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <iostream>
#include <random>
#include <string>
#include <unordered_set>

#include "common/LoadInfo.h"
|
|
#include "common/memory_c.h"
|
|
#include "knowhere/index/VecIndexFactory.h"
|
|
#include "knowhere/index/vector_index/IndexIVFPQ.h"
|
|
#include "knowhere/index/vector_index/helpers/IndexParameter.h"
|
|
#include "knowhere/index/vector_index/adapter/VectorAdapter.h"
|
|
#include "pb/plan.pb.h"
|
|
#include "query/ExprImpl.h"
|
|
#include "segcore/Collection.h"
|
|
#include "segcore/reduce_c.h"
|
|
#include "segcore/Reduce.h"
|
|
#include "test_utils/DataGen.h"
|
|
|
|
namespace chrono = std::chrono;

using namespace milvus;
using namespace milvus::segcore;
using namespace knowhere;

namespace {
const int DIM = 16;
const int64_t ROW_COUNT = 100 * 1000;

const char*
get_default_schema_config() {
    static std::string conf = R"(name: "default-collection"
                                fields: <
                                  fieldID: 100
                                  name: "fakevec"
                                  data_type: FloatVector
                                  type_params: <
                                    key: "dim"
                                    value: "16"
                                  >
                                  index_params: <
                                    key: "metric_type"
                                    value: "L2"
                                  >
                                >
                                fields: <
                                  fieldID: 101
                                  name: "age"
                                  data_type: Int64
                                  is_primary_key: true
                                >)";
    return conf.c_str();
}

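// Generate N rows of raw data matching the default test schema: each row is a DIM-dim
// float vector followed by an int "age", packed back-to-back into raw_data. Also returns
// per-row timestamps (all zero) and uids (10 * N + i).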
auto
generate_data(int N) {
    std::vector<char> raw_data;
    std::vector<uint64_t> timestamps;
    std::vector<int64_t> uids;
    std::default_random_engine e(42);
    std::normal_distribution<> dis(0.0, 1.0);
    for (int i = 0; i < N; ++i) {
        uids.push_back(10 * N + i);
        timestamps.push_back(0);
        float vec[DIM];
        for (auto& x : vec) {
            x = dis(e);
        }
        raw_data.insert(raw_data.end(), (const char*)std::begin(vec), (const char*)std::end(vec));
        int age = e() % 100;
        raw_data.insert(raw_data.end(), (const char*)&age, ((const char*)&age) + sizeof(age));
    }
    return std::make_tuple(raw_data, timestamps, uids);
}

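// Build a serialized PlaceholderGroup carrying nq random float query vectors under tag
// "$0", ready to be bound to a search plan via ParsePlaceholderGroup.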
std::string
generate_query_data(int nq) {
    namespace ser = milvus::proto::common;
    std::default_random_engine e(67);
    int dim = DIM;
    std::normal_distribution<double> dis(0.0, 1.0);
    ser::PlaceholderGroup raw_group;
    auto value = raw_group.add_placeholders();
    value->set_tag("$0");
    value->set_type(ser::PlaceholderType::FloatVector);
    for (int i = 0; i < nq; ++i) {
        std::vector<float> vec;
        for (int d = 0; d < dim; ++d) {
            vec.push_back(dis(e));
        }
        value->add_values(vec.data(), vec.size() * sizeof(float));
    }
    auto blob = raw_group.SerializeAsString();
    return blob;
}

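// Build a text-format CollectionSchema with a vector field "fakevec" (id 100, float or
// binary), an int64 primary-key field "counter" (id 101), and a double field (id 102).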
std::string
generate_collection_schema(std::string metric_type, int dim, bool is_binary) {
    namespace schema = milvus::proto::schema;
    schema::CollectionSchema collection_schema;
    collection_schema.set_name("collection_test");

    auto vec_field_schema = collection_schema.add_fields();
    vec_field_schema->set_name("fakevec");
    vec_field_schema->set_fieldid(100);
    if (is_binary) {
        vec_field_schema->set_data_type(schema::DataType::BinaryVector);
    } else {
        vec_field_schema->set_data_type(schema::DataType::FloatVector);
    }
    auto metric_type_param = vec_field_schema->add_index_params();
    metric_type_param->set_key("metric_type");
    metric_type_param->set_value(metric_type);
    auto dim_param = vec_field_schema->add_type_params();
    dim_param->set_key("dim");
    dim_param->set_value(std::to_string(dim));

    auto other_field_schema = collection_schema.add_fields();
    other_field_schema->set_name("counter");
    other_field_schema->set_fieldid(101);
    other_field_schema->set_data_type(schema::DataType::Int64);
    other_field_schema->set_is_primary_key(true);

    auto other_field_schema2 = collection_schema.add_fields();
    other_field_schema2->set_name("doubleField");
    other_field_schema2->set_fieldid(102);
    other_field_schema2->set_data_type(schema::DataType::Double);

    std::string schema_string;
    auto marshal = google::protobuf::TextFormat::PrintToString(collection_schema, &schema_string);
    assert(marshal == true);
    return schema_string;
}

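// Train a CPU knowhere index of the given type over N raw vectors and sanity-check its
// row count and dimension before returning it.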
VecIndexPtr
generate_index(void* raw_data, knowhere::Config conf, int64_t dim, int64_t topK, int64_t N, knowhere::IndexType index_type) {
    auto indexing = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_type, knowhere::IndexMode::MODE_CPU);

    auto database = knowhere::GenDataset(N, dim, raw_data);
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);
    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), dim);
    return indexing;
}
}  // namespace

TEST(CApiTest, CollectionTest) {
    auto collection = NewCollection(get_default_schema_config());
    DeleteCollection(collection);
}

TEST(CApiTest, GetCollectionNameTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto name = GetCollectionName(collection);
    assert(strcmp(name, "default-collection") == 0);
    DeleteCollection(collection);
}

TEST(CApiTest, SegmentTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

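// Serialize a protobuf message into a byte vector; used to pass insert/delete payloads
// and id arrays across the C API boundary.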
template <typename Message>
std::vector<uint8_t>
serialize(const Message* msg) {
    auto l = msg->ByteSize();
    std::vector<uint8_t> ret(l);
    auto ok = msg->SerializeToArray(ret.data(), l);
    assert(ok);
    return ret;
}

#ifdef __linux__

// TEST(Common, Memory_benchmark) {
//     auto run_times = 1000000;
//     auto start = std::chrono::high_resolution_clock::now();
//
//     for (int i = 0; i < run_times; i++) {
//         PurgeMemory(UINT64_MAX /*never malloc_trim*/);
//     }
//
//     auto stop = std::chrono::high_resolution_clock::now();
//     auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
//
//     std::cout << run_times << " times taken by PurgeMemory: " << duration.count() << " milliseconds" << std::endl;
//     // 1000000 times taken by PurgeMemory: 8307 milliseconds
// }

TEST(Common, Memory) {
    auto res = PurgeMemory(UINT64_MAX /*never malloc_trim*/);
    assert(res.error_code == Success);
    res = PurgeMemory(0);
    assert(res.error_code == Success);
}

#endif

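// Growing-segment writes use a reserve-then-write pattern (seen throughout these tests):
// PreInsert(segment, N, &offset) reserves N row slots and yields the write offset, and
// Insert() then fills those slots with row ids, timestamps, and the serialized rows.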
TEST(CApiTest, InsertTest) {
    auto c_collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(c_collection, Growing, -1);
    auto col = (milvus::segcore::Collection*)c_collection;

    int N = 10000;
    auto dataset = DataGen(col->get_schema(), N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                      insert_data.size());
    assert(res.error_code == Success);

    DeleteCollection(c_collection);
    DeleteSegment(segment);
}

TEST(CApiTest, DeleteTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);

    std::vector<int64_t> delete_row_ids = {100000, 100001, 100002};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end());
    auto delete_data = serialize(ids.get());
    uint64_t delete_timestamps[] = {0, 0, 0};

    auto offset = PreDelete(segment, 3);

    auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps);
    assert(del_res.error_code == Success);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

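// Delete-visibility scenario on a growing segment: insert pks 0-9, delete pk 1 and check
// it is no longer retrievable at the same timestamp, confirm pk 2 is still visible, then
// delete pk 2 and verify it disappears as well.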
TEST(CApiTest, MultiDeleteGrowingSegment) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);
    auto col = (milvus::segcore::Collection*)collection;

    int N = 10;
    auto dataset = DataGen(col->get_schema(), N);
    auto insert_data = serialize(dataset.raw_);

    // insert, pks = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    int64_t offset;
    PreInsert(segment, N, &offset);
    auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                      insert_data.size());
    assert(res.error_code == Success);

    // delete data pks = {1}
    std::vector<int64_t> delete_pks = {1};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end());
    auto delete_data = serialize(ids.get());
    std::vector<uint64_t> delete_timestamps(1, dataset.timestamps_[N - 1]);
    offset = PreDelete(segment, 1);
    auto del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // retrieve pks = {1}
    std::vector<int64_t> retrieve_pks = {1};
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_pks);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    // retrieve pks = {2}
    retrieve_pks = {2};
    term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_pks);
    plan->plan_node_->predicate_ = std::move(term_expr);
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 1);

    // delete pks = {2}
    delete_pks = {2};
    ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end());
    delete_data = serialize(ids.get());
    offset = PreDelete(segment, 1);
    del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // retrieve pks in {2}
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, MultiDeleteSealedSegment) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Sealed, -1);
    auto col = (milvus::segcore::Collection*)collection;

    int N = 10;
    auto dataset = DataGen(col->get_schema(), N);

    // load field data
    for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) {
        auto array = dataset.get_col(field_id);
        auto data = serialize(array.get());

        auto load_info = CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N};

        auto res = LoadFieldData(segment, load_info);
        assert(res.error_code == Success);
        auto count = GetRowCount(segment);
        assert(count == N);
    }

    // load timestamps
    FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto ts_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta);
    auto ts_data = serialize(ts_array.get());
    auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N};
    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    auto count = GetRowCount(segment);
    assert(count == N);

    // load rowID
    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_id_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
    auto row_id_data = serialize(row_id_array.get());
    load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N};
    res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    count = GetRowCount(segment);
    assert(count == N);

    // delete data pks = {1}
    std::vector<int64_t> delete_pks = {1};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end());
    auto delete_data = serialize(ids.get());
    std::vector<uint64_t> delete_timestamps(1, dataset.timestamps_[N - 1]);
    auto offset = PreDelete(segment, 1);
    auto del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // retrieve pks = {1}
    std::vector<int64_t> retrieve_pks = {1};
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_pks);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    // retrieve pks = {2}
    retrieve_pks = {2};
    term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_pks);
    plan->plan_node_->predicate_ = std::move(term_expr);
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 1);

    // delete pks = {2}
    delete_pks = {2};
    ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_pks.begin(), delete_pks.end());
    delete_data = serialize(ids.get());
    offset = PreDelete(segment, 1);
    del_res = Delete(segment, offset, 1, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // retrieve pks in {2}
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, DeleteRepeatedPksFromGrowingSegment) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);
    auto col = (milvus::segcore::Collection*)collection;

    int N = 10;
    auto dataset = DataGen(col->get_schema(), N);

    auto insert_data = serialize(dataset.raw_);

    // first insert, pks = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    int64_t offset;
    PreInsert(segment, N, &offset);
    auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                      insert_data.size());
    assert(res.error_code == Success);

    // second insert, pks = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    PreInsert(segment, N, &offset);
    res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                 insert_data.size());
    assert(res.error_code == Success);

    // create retrieve plan: pks in {1, 2, 3}
    std::vector<int64_t> retrieve_row_ids = {1, 2, 3};
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_row_ids);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 6);

    // delete data pks = {1, 2, 3}
    std::vector<int64_t> delete_row_ids = {1, 2, 3};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end());
    auto delete_data = serialize(ids.get());
    std::vector<uint64_t> delete_timestamps(3, dataset.timestamps_[N - 1]);

    offset = PreDelete(segment, 3);
    auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // retrieve pks in {1, 2, 3}
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);

    query_result = std::make_unique<proto::segcore::RetrieveResults>();
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, DeleteRepeatedPksFromSealedSegment) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Sealed, -1);
    auto col = (milvus::segcore::Collection*)collection;

    int N = 20;
    auto dataset = DataGen(col->get_schema(), N, 42, 0, 2);

    for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) {
        auto array = dataset.get_col(field_id);
        auto data = serialize(array.get());

        auto load_info = CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N};

        auto res = LoadFieldData(segment, load_info);
        assert(res.error_code == Success);
        auto count = GetRowCount(segment);
        assert(count == N);
    }

    FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto ts_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta);
    auto ts_data = serialize(ts_array.get());
    auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N};
    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    auto count = GetRowCount(segment);
    assert(count == N);

    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_id_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
    auto row_id_data = serialize(row_id_array.get());
    load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N};
    res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    count = GetRowCount(segment);
    assert(count == N);

    // create retrieve plan: pks in {1, 2, 3}
    std::vector<int64_t> retrieve_row_ids = {1, 2, 3};
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_row_ids);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 6);

    // delete data pks = {1, 2, 3}
    std::vector<int64_t> delete_row_ids = {1, 2, 3};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end());
    auto delete_data = serialize(ids.get());
    std::vector<uint64_t> delete_timestamps(3, dataset.timestamps_[N - 1]);

    auto offset = PreDelete(segment, 3);

    auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // retrieve pks in {1, 2, 3}
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);

    query_result = std::make_unique<proto::segcore::RetrieveResults>();
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, InsertSamePkAfterDeleteOnGrowingSegment) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);
    auto col = (milvus::segcore::Collection*)collection;

    int N = 10;
    auto dataset = DataGen(col->get_schema(), N);
    auto insert_data = serialize(dataset.raw_);

    // first insert:
    // pks = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, timestamps = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    int64_t offset;
    PreInsert(segment, N, &offset);
    auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                      insert_data.size());
    assert(res.error_code == Success);

    // delete data pks = {1, 2, 3}, timestamps = {9, 9, 9}
    std::vector<int64_t> delete_row_ids = {1, 2, 3};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end());
    auto delete_data = serialize(ids.get());
    std::vector<uint64_t> delete_timestamps(3, dataset.timestamps_[N - 1]);

    offset = PreDelete(segment, 3);

    auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // create retrieve plan: pks in {1, 2, 3}, timestamp = 9
    std::vector<int64_t> retrieve_row_ids = {1, 2, 3};
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_row_ids);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 0);

    // second insert:
    // pks = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, timestamps = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19}
    dataset = DataGen(col->get_schema(), N, 42, N);
    insert_data = serialize(dataset.raw_);
    PreInsert(segment, N, &offset);
    res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                 insert_data.size());
    assert(res.error_code == Success);

    // retrieve pks in {1, 2, 3}, timestamp = 19
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);

    query_result = std::make_unique<proto::segcore::RetrieveResults>();
    suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 3);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, InsertSamePkAfterDeleteOnSealedSegment) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Sealed, -1);
    auto col = (milvus::segcore::Collection*)collection;

    int N = 10;
    auto dataset = DataGen(col->get_schema(), N, 42, 0, 2);

    // load data with pks = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4}, timestamps = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
    for (auto& [field_id, field_meta] : col->get_schema()->get_fields()) {
        auto array = dataset.get_col(field_id);
        auto data = serialize(array.get());

        auto load_info = CLoadFieldDataInfo{field_id.get(), data.data(), data.size(), N};

        auto res = LoadFieldData(segment, load_info);
        assert(res.error_code == Success);
        auto count = GetRowCount(segment);
        assert(count == N);
    }

    FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto ts_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, ts_field_meta);
    auto ts_data = serialize(ts_array.get());
    auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N};
    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    auto count = GetRowCount(segment);
    assert(count == N);

    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_id_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
    auto row_id_data = serialize(row_id_array.get());
    load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N};
    res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    count = GetRowCount(segment);
    assert(count == N);

    // delete data pks = {1, 2, 3}, timestamps = {4, 4, 4}
    std::vector<int64_t> delete_row_ids = {1, 2, 3};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end());
    auto delete_data = serialize(ids.get());
    std::vector<uint64_t> delete_timestamps(3, dataset.timestamps_[4]);

    auto offset = PreDelete(segment, 3);

    auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps.data());
    assert(del_res.error_code == Success);

    // create retrieve plan: pks in {1, 2, 3}, timestamp = 9
    std::vector<int64_t> retrieve_row_ids = {1, 2, 3};
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, retrieve_row_ids);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), dataset.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->ids().int_id().data().size(), 3);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

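// Search via the JSON DSL path: CreateSearchPlan parses the DSL and the query vectors are
// bound to placeholder tag "$0" through ParsePlaceholderGroup; the two Search calls below
// run at different timestamps to cover both visibility windows.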
TEST(CApiTest, SearchTest) {
    auto c_collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(c_collection, Growing, -1);
    auto col = (milvus::segcore::Collection*)c_collection;

    int N = 10000;
    auto dataset = DataGen(col->get_schema(), N);
    int64_t ts_offset = 1000;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    ASSERT_EQ(ins_res.error_code, Success);

    const char* dsl_string = R"(
    {
        "bool": {
            "vector": {
                "fakevec": {
                    "metric_type": "L2",
                    "params": {
                        "nprobe": 10
                    },
                    "query": "$0",
                    "topk": 10,
                    "round_decimal": 3
                }
            }
        }
    })";

    int num_queries = 10;
    auto blob = generate_query_data(num_queries);

    void* plan = nullptr;
    auto status = CreateSearchPlan(c_collection, dsl_string, &plan);
    ASSERT_EQ(status.error_code, Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    ASSERT_EQ(status.error_code, Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);

    CSearchResult search_result;
    auto res = Search(segment, plan, placeholderGroup, N + ts_offset, &search_result, -1);
    ASSERT_EQ(res.error_code, Success);

    CSearchResult search_result2;
    auto res2 = Search(segment, plan, placeholderGroup, ts_offset, &search_result2, -1);
    ASSERT_EQ(res2.error_code, Success);

    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(search_result);
    DeleteSearchResult(search_result2);
    DeleteCollection(c_collection);
    DeleteSegment(segment);
}

TEST(CApiTest, SearchTestWithExpr) {
    auto c_collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(c_collection, Growing, -1);
    auto col = (milvus::segcore::Collection*)c_collection;

    int N = 10000;
    auto dataset = DataGen(col->get_schema(), N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    ASSERT_EQ(ins_res.error_code, Success);

    const char* serialized_expr_plan = R"(vector_anns: <
                                            field_id: 100
                                            query_info: <
                                              topk: 10
                                              metric_type: "L2"
                                              search_params: "{\"nprobe\": 10}"
                                            >
                                            placeholder_tag: "$0"
                                          >)";

    int num_queries = 10;
    auto blob = generate_query_data(num_queries);

    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(c_collection, binary_plan.data(), binary_plan.size(), &plan);
    ASSERT_EQ(status.error_code, Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    ASSERT_EQ(status.error_code, Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    dataset.timestamps_.clear();
    dataset.timestamps_.push_back(1);

    CSearchResult search_result;
    auto res = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &search_result, -1);
    ASSERT_EQ(res.error_code, Success);

    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(search_result);
    DeleteCollection(c_collection);
    DeleteSegment(segment);
}

TEST(CApiTest, RetrieveTestWithExpr) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);
    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto plan = std::make_unique<query::RetrievePlan>(*schema);

    int N = 10000;
    auto dataset = DataGen(schema, N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    ASSERT_EQ(ins_res.error_code, Success);

    // create retrieve plan "age in [0]"
    std::vector<int64_t> values(1, 0);
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(FieldId(101), DataType::INT64, values);

    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    plan->plan_node_->predicate_ = std::move(term_expr);
    std::vector<FieldId> target_field_ids{FieldId(100), FieldId(101)};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    auto res = Retrieve(segment, plan.get(), dataset.timestamps_[0], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, GetMemoryUsageInBytesTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);

    auto old_memory_usage_size = GetMemoryUsageInBytes(segment);
    // std::cout << "old_memory_usage_size = " << old_memory_usage_size << std::endl;
    assert(old_memory_usage_size == 0);

    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    int N = 10000;
    auto dataset = DataGen(schema, N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                      insert_data.size());
    assert(res.error_code == Success);

    auto memory_usage_size = GetMemoryUsageInBytes(segment);
    // std::cout << "new_memory_usage_size = " << memory_usage_size << std::endl;
    // TODO:: assert
    // assert(memory_usage_size == 2785280);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, GetDeletedCountTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);

    std::vector<int64_t> delete_row_ids = {100000, 100001, 100002};
    auto ids = std::make_unique<IdArray>();
    ids->mutable_int_id()->mutable_data()->Add(delete_row_ids.begin(), delete_row_ids.end());
    auto delete_data = serialize(ids.get());
    uint64_t delete_timestamps[] = {0, 0, 0};

    auto offset = PreDelete(segment, 3);

    auto del_res = Delete(segment, offset, 3, delete_data.data(), delete_data.size(), delete_timestamps);
    assert(del_res.error_code == Success);

    // TODO: assert(deleted_count == len(delete_row_ids))
    auto deleted_count = GetDeletedCount(segment);
    assert(deleted_count == 0);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, GetRowCountTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);

    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    int N = 10000;
    auto dataset = DataGen(schema, N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                      insert_data.size());
    assert(res.error_code == Success);

    auto row_count = GetRowCount(segment);
    assert(row_count == N);

    DeleteCollection(collection);
    DeleteSegment(segment);
}

// TEST(CApiTest, SchemaTest) {
//    std::string schema_string =
//        "id: 6873737669791618215\nname: \"collection0\"\nschema: \u003c\n   "
//        "field_metas: \u003c\n     field_name: \"age\"\n     type: INT32\n     dim: 1\n   \u003e\n   "
//        "field_metas: \u003c\n     field_name: \"field_1\"\n     type: VECTOR_FLOAT\n     dim: 16\n   \u003e\n"
//        "\u003e\ncreate_time: 1600416765\nsegment_ids: 6873737669791618215\npartition_tags: \"default\"\n";
//
//    auto collection = NewCollection(schema_string.data());
//    auto segment = NewSegment(collection, Growing, -1);
//    DeleteCollection(collection);
//    DeleteSegment(segment);
//}

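// Gather the primary keys of several partial search results into one buffer keyed by
// their reduce offsets; the per-query duplicate check itself is currently commented out.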
void
CheckSearchResultDuplicate(const std::vector<CSearchResult>& results) {
    auto sr = (SearchResult*)results[0];
    auto topk = sr->unity_topK_;
    auto num_queries = sr->total_nq_;

    // fill primary keys
    std::vector<PkType> result_pks(num_queries * topk);
    for (int i = 0; i < results.size(); i++) {
        auto search_result = (SearchResult*)results[i];
        auto size = search_result->result_offsets_.size();
        if (size == 0) {
            continue;
        }
        for (int j = 0; j < size; j++) {
            auto offset = search_result->result_offsets_[j];
            result_pks[offset] = search_result->primary_keys_[j];
        }
    }

    // check primary key duplicates
    // int64_t cnt = 0;
    // std::unordered_set<PkType> pk_set;
    // for (int qi = 0; qi < num_queries; qi++) {
    //     pk_set.clear();
    //     for (int k = 0; k < topk; k++) {
    //         int64_t idx = topk * qi + k;
    //         pk_set.insert(result_pks[idx]);
    //     }
    //     cnt += pk_set.size();
    // }
    // assert(cnt == topk * num_queries);
}

TEST(CApiTest, ReduceRemoveDuplicates) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);

    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    int N = 10000;
    auto dataset = DataGen(schema, N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* dsl_string = R"(
    {
        "bool": {
            "vector": {
                "fakevec": {
                    "metric_type": "L2",
                    "params": {
                        "nprobe": 10
                    },
                    "query": "$0",
                    "topk": 10,
                    "round_decimal": 3
                }
            }
        }
    })";

    int num_queries = 10;
    int topK = 10;

    auto blob = generate_query_data(num_queries);

    void* plan = nullptr;
    auto status = CreateSearchPlan(collection, dsl_string, &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    dataset.timestamps_.clear();
    dataset.timestamps_.push_back(1);

    {
        auto slice_nqs = std::vector<int32_t>{num_queries / 2, num_queries / 2};
        auto slice_topKs = std::vector<int32_t>{topK / 2, topK};
        std::vector<CSearchResult> results;
        CSearchResult res1, res2;
        status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res1, -1);
        assert(status.error_code == Success);
        status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res2, -1);
        assert(status.error_code == Success);
        results.push_back(res1);
        results.push_back(res2);

        CSearchResultDataBlobs cSearchResultData;
        status = ReduceSearchResultsAndFillData(&cSearchResultData, plan, results.data(), results.size(),
                                                slice_nqs.data(), slice_topKs.data(), slice_nqs.size());
        assert(status.error_code == Success);
        // TODO:: insert no duplicate pks and check reduce results
        CheckSearchResultDuplicate(results);

        // free the reduced blobs
        DeleteSearchResultDataBlobs(cSearchResultData);
        DeleteSearchResult(res1);
        DeleteSearchResult(res2);
    }
    {
        int nq1 = num_queries / 3;
        int nq2 = num_queries / 3;
        int nq3 = num_queries - nq1 - nq2;
        auto slice_nqs = std::vector<int32_t>{nq1, nq2, nq3};
        auto slice_topKs = std::vector<int32_t>{topK / 2, topK, topK};
        std::vector<CSearchResult> results;
        CSearchResult res1, res2, res3;
        status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res1, -1);
        assert(status.error_code == Success);
        status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res2, -1);
        assert(status.error_code == Success);
        status = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res3, -1);
        assert(status.error_code == Success);
        results.push_back(res1);
        results.push_back(res2);
        results.push_back(res3);
        CSearchResultDataBlobs cSearchResultData;
        status = ReduceSearchResultsAndFillData(&cSearchResultData, plan, results.data(), results.size(),
                                                slice_nqs.data(), slice_topKs.data(), slice_nqs.size());
        assert(status.error_code == Success);
        // TODO:: insert no duplicate pks and check reduce results
        CheckSearchResultDuplicate(results);

        // free the reduced blobs
        DeleteSearchResultDataBlobs(cSearchResultData);
        DeleteSearchResult(res1);
        DeleteSearchResult(res2);
        DeleteSearchResult(res3);
    }

    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

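// Run the same expr-based search twice, reduce the partial results into per-request blobs
// sliced by slice_nqs / slice_topKs, then decode each blob and validate its nq, topK,
// score count, and id count.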
void
testReduceSearchWithExpr(int N, int topK, int num_queries) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Growing, -1);

    auto schema = ((milvus::segcore::Collection*)collection)->get_schema();
    auto dataset = DataGen(schema, N);

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    auto fmt = boost::format(R"(vector_anns: <
                                  field_id: 100
                                  query_info: <
                                    topk: %1%
                                    metric_type: "L2"
                                    search_params: "{\"nprobe\": 10}"
                                  >
                                  placeholder_tag: "$0">
                                output_field_ids: 100)") %
               topK;

    auto serialized_expr_plan = fmt.str();
    auto blob = generate_query_data(num_queries);

    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan.data());
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    dataset.timestamps_.clear();
    dataset.timestamps_.push_back(1);

    std::vector<CSearchResult> results;
    CSearchResult res1;
    CSearchResult res2;
    auto res = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res1, -1);
    assert(res.error_code == Success);
    res = Search(segment, plan, placeholderGroup, dataset.timestamps_[0], &res2, -1);
    assert(res.error_code == Success);
    results.push_back(res1);
    results.push_back(res2);

    auto slice_nqs = std::vector<int32_t>{num_queries / 2, num_queries / 2};
    if (num_queries == 1) {
        slice_nqs = std::vector<int32_t>{num_queries};
    }
    auto slice_topKs = std::vector<int32_t>{topK / 2, topK};
    if (topK == 1) {
        slice_topKs = std::vector<int32_t>{topK, topK};
    }

    // 1. reduce
    CSearchResultDataBlobs cSearchResultData;
    status = ReduceSearchResultsAndFillData(&cSearchResultData, plan, results.data(), results.size(), slice_nqs.data(),
                                            slice_topKs.data(), slice_nqs.size());
    assert(status.error_code == Success);

    auto search_result_data_blobs = reinterpret_cast<milvus::segcore::SearchResultDataBlobs*>(cSearchResultData);

    // check result
    for (int i = 0; i < slice_nqs.size(); i++) {
        milvus::proto::schema::SearchResultData search_result_data;
        auto suc = search_result_data.ParseFromArray(search_result_data_blobs->blobs[i].data(),
                                                     search_result_data_blobs->blobs[i].size());
        assert(suc);
        assert(search_result_data.num_queries() == slice_nqs[i]);
        assert(search_result_data.top_k() == slice_topKs[i]);
        assert(search_result_data.scores().size() == slice_topKs[i] * slice_nqs[i]);
        assert(search_result_data.ids().int_id().data_size() == slice_topKs[i] * slice_nqs[i]);

        // check topKs
        assert(search_result_data.topks().size() == slice_nqs[i]);
        for (int j = 0; j < search_result_data.topks().size(); j++) {
            assert(search_result_data.topks().at(j) == slice_topKs[i]);
        }
    }

    DeleteSearchResultDataBlobs(cSearchResultData);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(res1);
    DeleteSearchResult(res2);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, ReduceSearchWithExpr) {
    testReduceSearchWithExpr(100, 1, 1);
    testReduceSearchWithExpr(100, 10, 10);
    testReduceSearchWithExpr(10000, 1, 1);
    testReduceSearchWithExpr(10000, 10, 10);
}

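// Build an IVF_PQ index directly with knowhere, then exercise the C API plumbing that
// registers it: NewLoadIndexInfo + AppendIndexParam + AppendFieldInfo + AppendIndex.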
TEST(CApiTest, LoadIndexInfo) {
    // generate index
    constexpr auto TOPK = 10;

    auto N = 1024 * 10;
    auto [raw_data, timestamps, uids] = generate_data(N);
    auto indexing = std::make_shared<knowhere::IVFPQ>();
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 4},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0},
    };

    auto database = knowhere::GenDataset(N, DIM, raw_data.data());
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);
    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), DIM);
    auto binary_set = indexing->Serialize(conf);
    CBinarySet c_binary_set = (CBinarySet)&binary_set;

    void* c_load_index_info = nullptr;
    auto status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_param_key1 = "index_type";
    std::string index_param_value1 = "IVF_PQ";
    status = AppendIndexParam(c_load_index_info, index_param_key1.data(), index_param_value1.data());
    assert(status.error_code == Success);
    std::string index_param_key2 = "index_mode";
    std::string index_param_value2 = "cpu";
    status = AppendIndexParam(c_load_index_info, index_param_key2.data(), index_param_value2.data());
    assert(status.error_code == Success);
    status = AppendFieldInfo(c_load_index_info, 0, CDataType::FloatVector);
    assert(status.error_code == Success);
    status = AppendIndex(c_load_index_info, c_binary_set);
    assert(status.error_code == Success);
    DeleteLoadIndexInfo(c_load_index_info);
}

TEST(CApiTest, LoadIndex_Search) {
    // generate index
    constexpr auto TOPK = 10;

    auto N = 1024 * 1024;
    auto num_query = 100;
    auto [raw_data, timestamps, uids] = generate_data(N);
    auto indexing = std::make_shared<knowhere::IVFPQ>();
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 4},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0},
    };

    auto database = knowhere::GenDataset(N, DIM, raw_data.data());
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);

    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), DIM);

    // serialize index to a BinarySet
    auto binary_set = indexing->Serialize(conf);

    // fill LoadIndexInfo
    LoadIndexInfo load_index_info;
    auto& index_params = load_index_info.index_params;
    index_params["index_type"] = "IVF_PQ";
    index_params["index_mode"] = "CPU";
    auto mode = knowhere::IndexMode::MODE_CPU;
    load_index_info.index = knowhere::VecIndexFactory::GetInstance().CreateVecIndex(index_params["index_type"], mode);
    load_index_info.index->Load(binary_set);

    // search
    auto query_dataset = knowhere::GenDataset(num_query, DIM, raw_data.data() + DIM * 4200);

    auto result = indexing->Query(query_dataset, conf, nullptr);

    auto ids = knowhere::GetDatasetIDs(result);
    auto dis = knowhere::GetDatasetDistance(result);
    // for (int i = 0; i < std::min(num_query * K, 100); ++i) {
    //     std::cout << ids[i] << "->" << dis[i] << std::endl;
    // }
}

TEST(CApiTest, Indexing_Without_Predicate) {
|
|
// insert data to segment
|
|
constexpr auto TOPK = 5;
|
|
|
|
std::string schema_string = generate_collection_schema("L2", DIM, false);
|
|
auto collection = NewCollection(schema_string.c_str());
|
|
auto schema = ((segcore::Collection*)collection)->get_schema();
|
|
auto segment = NewSegment(collection, Growing, -1);
|
|
|
|
auto N = ROW_COUNT;
|
|
auto dataset = DataGen(schema, N);
|
|
auto vec_col = dataset.get_col<float>(FieldId(100));
|
|
auto query_ptr = vec_col.data() + 42000 * DIM;
|
|
|
|
int64_t offset;
|
|
PreInsert(segment, N, &offset);
|
|
|
|
auto insert_data = serialize(dataset.raw_);
|
|
auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
|
|
insert_data.size());
|
|
assert(ins_res.error_code == Success);
|
|
|
|
const char* dsl_string = R"(
|
|
{
|
|
"bool": {
|
|
"vector": {
|
|
"fakevec": {
|
|
"metric_type": "L2",
|
|
"params": {
|
|
"nprobe": 10
|
|
},
|
|
"query": "$0",
|
|
"topk": 5,
|
|
"round_decimal": -1
|
|
}
|
|
}
|
|
}
|
|
})";
|
|
|
|
// create place_holder_group
|
|
int num_queries = 5;
|
|
auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
|
|
auto blob = raw_group.SerializeAsString();
|
|
|
|
// search on segment's small index
|
|
void* plan = nullptr;
|
|
auto status = CreateSearchPlan(collection, dsl_string, &plan);
|
|
assert(status.error_code == Success);
|
|
|
|
void* placeholderGroup = nullptr;
|
|
status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
|
|
assert(status.error_code == Success);
|
|
|
|
std::vector<CPlaceholderGroup> placeholderGroups;
|
|
placeholderGroups.push_back(placeholderGroup);
|
|
Timestamp time = 10000000;
|
|
|
|
CSearchResult c_search_result_on_smallIndex;
|
|
auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
|
|
assert(res_before_load_index.error_code == Success);
|
|
|
|
// load index to segment
|
|
auto conf = knowhere::Config{
|
|
{knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
|
|
{knowhere::meta::DIM, DIM},
|
|
{knowhere::meta::TOPK, TOPK},
|
|
{knowhere::indexparam::NLIST, 100},
|
|
{knowhere::indexparam::NPROBE, 10},
|
|
{knowhere::indexparam::M, 4},
|
|
{knowhere::indexparam::NBITS, 8},
|
|
{knowhere::meta::DEVICE_ID, 0}
|
|
};
|
|
auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);
|
|
|
|
// gen query dataset
|
|
auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
|
|
auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
|
|
auto ids = knowhere::GetDatasetIDs(result_on_index);
|
|
auto dis = knowhere::GetDatasetDistance(result_on_index);
|
|
std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
|
|
std::vector<float> vec_dis;
|
|
for (int j = 0; j < TOPK * num_queries; ++j) {
|
|
vec_dis.push_back(dis[j] * -1);
|
|
}

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_raw_index_json = SearchResultToJson(*search_result_on_raw_index);
    auto search_result_on_bigIndex_json = SearchResultToJson((*(SearchResult*)c_search_result_on_bigIndex));
    // std::cout << search_result_on_raw_index_json.dump(1) << std::endl;
    // std::cout << search_result_on_bigIndex_json.dump(1) << std::endl;

    ASSERT_EQ(search_result_on_raw_index_json.dump(1), search_result_on_bigIndex_json.dump(1));

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_Expr_Without_Predicate) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* serialized_expr_plan = R"(vector_anns: <
        field_id: 100
        query_info: <
            topk: 5
            round_decimal: -1
            metric_type: "L2"
            search_params: "{\"nprobe\": 10}"
        >
        placeholder_tag: "$0"
    >)";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}
    };
    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_raw_index_json = SearchResultToJson(*search_result_on_raw_index);
    auto search_result_on_bigIndex_json = SearchResultToJson((*(SearchResult*)c_search_result_on_bigIndex));
    // std::cout << search_result_on_raw_index_json.dump(1) << std::endl;
    // std::cout << search_result_on_bigIndex_json.dump(1) << std::endl;

    ASSERT_EQ(search_result_on_raw_index_json.dump(1), search_result_on_bigIndex_json.dump(1));

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_With_float_Predicate_Range) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* dsl_string = R"({
        "bool": {
            "must": [
                {
                    "range": {
                        "counter": {
                            "GE": 42000,
                            "LT": 42010
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "L2",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5,
                            "round_decimal": -1
                        }
                    }
                }
            ]
        }
    })";

    // create place_holder_group
    int num_queries = 10;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto status = CreateSearchPlan(collection, dsl_string, &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}};

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_Expr_With_float_Predicate_Range) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = 1000 * 1000;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 420000 * DIM;

    {
        int64_t offset;
        PreInsert(segment, N, &offset);

        auto insert_data = serialize(dataset.raw_);
        auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(),
                              insert_data.data(), insert_data.size());
        assert(ins_res.error_code == Success);
    }

    const char* serialized_expr_plan = R"(vector_anns: <
        field_id: 100
        predicates: <
            binary_expr: <
                op: LogicalAnd
                left: <
                    unary_range_expr: <
                        column_info: <
                            field_id: 101
                            data_type: Int64
                        >
                        op: GreaterEqual
                        value: <
                            int64_val: 420000
                        >
                    >
                >
                right: <
                    unary_range_expr: <
                        column_info: <
                            field_id: 101
                            data_type: Int64
                        >
                        op: LessThan
                        value: <
                            int64_val: 420010
                        >
                    >
                >
            >
        >
        query_info: <
            topk: 5
            round_decimal: -1
            metric_type: "L2"
            search_params: "{\"nprobe\": 10}"
        >
        placeholder_tag: "$0"
    >)";

    // create place_holder_group
    int num_queries = 10;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 420000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_With_float_Predicate_Term) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* dsl_string = R"({
        "bool": {
            "must": [
                {
                    "term": {
                        "counter": {
                            "values": [42000, 42001, 42002, 42003, 42004]
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "L2",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5,
                            "round_decimal": -1
                        }
                    }
                }
            ]
        }
    })";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto status = CreateSearchPlan(collection, dsl_string, &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_Expr_With_float_Predicate_Term) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = 1000 * 1000;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 420000 * DIM;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* serialized_expr_plan = R"(
        vector_anns: <
            field_id: 100
            predicates: <
                term_expr: <
                    column_info: <
                        field_id: 101
                        data_type: Int64
                    >
                    values: <
                        int64_val: 420000
                    >
                    values: <
                        int64_val: 420001
                    >
                    values: <
                        int64_val: 420002
                    >
                    values: <
                        int64_val: 420003
                    >
                    values: <
                        int64_val: 420004
                    >
                >
            >
            query_info: <
                topk: 5
                round_decimal: -1
                metric_type: "L2"
                search_params: "{\"nprobe\": 10}"
            >
            placeholder_tag: "$0"
        >)";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 420000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_With_binary_Predicate_Range) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("JACCARD", DIM, true);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = 1000 * 1000;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<uint8_t>(FieldId(100));
    auto query_ptr = vec_col.data() + 420000 * DIM / 8;
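
    // Binary vectors pack DIM bits into DIM / 8 bytes, which is why the
    // column is read as uint8_t and the pointer to row 420000 advances by
    // DIM / 8 bytes per row.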

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* dsl_string = R"({
        "bool": {
            "must": [
                {
                    "range": {
                        "counter": {
                            "GE": 420000,
                            "LT": 420010
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "JACCARD",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5,
                            "round_decimal": -1
                        }
                    }
                }
            ]
        }
    })";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto status = CreateSearchPlan(collection, dsl_string, &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::JACCARD},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
    };
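
    // Only NLIST/NPROBE matter for BIN_IVF_FLAT; M and NBITS are PQ
    // parameters and are presumably ignored by a flat binary index. They
    // appear to be carried over from the float IVF_PQ tests above.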

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "BIN_IVF_FLAT";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "JACCARD";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 420000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Range) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("JACCARD", DIM, true);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<uint8_t>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM / 8;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* serialized_expr_plan = R"(vector_anns: <
        field_id: 100
        predicates: <
            binary_expr: <
                op: LogicalAnd
                left: <
                    unary_range_expr: <
                        column_info: <
                            field_id: 101
                            data_type: Int64
                        >
                        op: GreaterEqual
                        value: <
                            int64_val: 42000
                        >
                    >
                >
                right: <
                    unary_range_expr: <
                        column_info: <
                            field_id: 101
                            data_type: Int64
                        >
                        op: LessThan
                        value: <
                            int64_val: 42010
                        >
                    >
                >
            >
        >
        query_info: <
            topk: 5
            round_decimal: -1
            metric_type: "JACCARD"
            search_params: "{\"nprobe\": 10}"
        >
        placeholder_tag: "$0"
    >)";

    // create place_holder_group
    int num_queries = 5;
    auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    ASSERT_TRUE(res_before_load_index.error_code == Success) << res_before_load_index.error_msg;

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::JACCARD},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "BIN_IVF_FLAT";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "JACCARD";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[offset]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_With_binary_Predicate_Term) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("JACCARD", DIM, true);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<uint8_t>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM / 8;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* dsl_string = R"({
        "bool": {
            "must": [
                {
                    "term": {
                        "counter": {
                            "values": [42000, 42001, 42002, 42003, 42004]
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "JACCARD",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5,
                            "round_decimal": -1
                        }
                    }
                }
            ]
        }
    })";

    // create place_holder_group
    int num_queries = 5;
    int topK = 5;
    auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto status = CreateSearchPlan(collection, dsl_string, &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::JACCARD},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "BIN_IVF_FLAT";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "JACCARD";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    std::vector<CSearchResult> results;
    results.push_back(c_search_result_on_bigIndex);

    auto slice_nqs = std::vector<int32_t>{num_queries};
    auto slice_topKs = std::vector<int32_t>{topK};

    CSearchResultDataBlobs cSearchResultData;
    status = ReduceSearchResultsAndFillData(&cSearchResultData, plan, results.data(), results.size(), slice_nqs.data(),
                                            slice_topKs.data(), slice_nqs.size());
    assert(status.error_code == Success);
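
    // ReduceSearchResultsAndFillData merges the per-segment results and
    // serializes them into blobs, one per (nq, topK) slice; here there is a
    // single result and a single slice. The reduce pass also fills the
    // per-query offset bookkeeping that the get_result_count() calls below
    // rely on.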

    // status = ReduceSearchResultsAndFillData(plan, results.data(), results.size());
    // assert(status.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = search_result_on_bigIndex->get_result_count(i);
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[i * TOPK]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, Indexing_Expr_With_binary_Predicate_Term) {
    // insert data into segment
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("JACCARD", DIM, true);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Growing, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<uint8_t>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM / 8;

    int64_t offset;
    PreInsert(segment, N, &offset);

    auto insert_data = serialize(dataset.raw_);
    auto ins_res = Insert(segment, offset, N, dataset.row_ids_.data(), dataset.timestamps_.data(), insert_data.data(),
                          insert_data.size());
    assert(ins_res.error_code == Success);

    const char* serialized_expr_plan = R"(vector_anns: <
        field_id: 100
        predicates: <
            term_expr: <
                column_info: <
                    field_id: 101
                    data_type: Int64
                >
                values: <
                    int64_val: 42000
                >
                values: <
                    int64_val: 42001
                >
                values: <
                    int64_val: 42002
                >
                values: <
                    int64_val: 42003
                >
                values: <
                    int64_val: 42004
                >
            >
        >
        query_info: <
            topk: 5
            round_decimal: -1
            metric_type: "JACCARD"
            search_params: "{\"nprobe\": 10}"
        >
        placeholder_tag: "$0"
    >)";

    // create place_holder_group
    int num_queries = 5;
    int topK = 5;
    auto raw_group = CreateBinaryPlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    CSearchResult c_search_result_on_smallIndex;
    auto res_before_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_smallIndex, -1);
    assert(res_before_load_index.error_code == Success);

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::JACCARD},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_BIN_IVFFLAT);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto search_result_on_raw_index = (SearchResult*)c_search_result_on_smallIndex;
    search_result_on_raw_index->seg_offsets_ = vec_ids;
    search_result_on_raw_index->distances_ = vec_dis;

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "BIN_IVF_FLAT";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "JACCARD";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::BinaryVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    std::vector<CSearchResult> results;
    results.push_back(c_search_result_on_bigIndex);

    auto slice_nqs = std::vector<int32_t>{num_queries};
    auto slice_topKs = std::vector<int32_t>{topK};

    CSearchResultDataBlobs cSearchResultData;
    status = ReduceSearchResultsAndFillData(&cSearchResultData, plan, results.data(), results.size(), slice_nqs.data(),
                                            slice_topKs.data(), slice_nqs.size());
    assert(status.error_code == Success);

    // status = ReduceSearchResultsAndFillData(plan, results.data(), results.size());
    // assert(status.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = search_result_on_bigIndex->get_result_count(i);
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
        ASSERT_EQ(search_result_on_bigIndex->distances_[offset], search_result_on_raw_index->distances_[i * TOPK]);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_smallIndex);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, SealedSegmentTest) {
    auto collection = NewCollection(get_default_schema_config());
    auto segment = NewSegment(collection, Sealed, -1);

    int N = 10000;
    std::default_random_engine e(67);
    auto ages = std::vector<int64_t>(N);
    for (auto& age : ages) {
        age = e() % 2000;
    }
    auto blob = (void*)(&ages[0]);
    FieldMeta field_meta(FieldName("age"), FieldId(101), DataType::INT64);
    auto array = CreateScalarDataArrayFrom(ages.data(), N, field_meta);
    auto age_data = serialize(array.get());

    auto load_info = CLoadFieldDataInfo{101, age_data.data(), age_data.size(), N};

    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    auto count = GetRowCount(segment);
    assert(count == N);
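
    // Sealed segments skip the PreInsert/Insert path entirely: whole column
    // blobs go in through LoadFieldData, and the segment derives its row
    // count from the loaded field data.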

    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, SealedSegment_search_float_Predicate_Range) {
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Sealed, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    auto counter_col = dataset.get_col<int64_t>(FieldId(101));
    FieldMeta counter_field_meta(FieldName("counter"), FieldId(101), DataType::INT64);
    auto count_array = CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta);
    auto counter_data = serialize(count_array.get());

    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_ids_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
    auto row_ids_data = serialize(row_ids_array.get());

    FieldMeta timestamp_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto timestamps_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, timestamp_field_meta);
    auto timestamps_data = serialize(timestamps_array.get());

    const char* dsl_string = R"({
        "bool": {
            "must": [
                {
                    "range": {
                        "counter": {
                            "GE": 42000,
                            "LT": 42010
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "L2",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5,
                            "round_decimal": -1
                        }
                    }
                }
            ]
        }
    })";

    // create place_holder_group
    int num_queries = 10;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto status = CreateSearchPlan(collection, dsl_string, &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    auto load_index_info = (LoadIndexInfo*)c_load_index_info;
    auto query_dataset2 = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto index = std::dynamic_pointer_cast<knowhere::VecIndex>(load_index_info->index);
    auto result_on_index2 = index->Query(query_dataset2, conf, nullptr);
    auto ids2 = knowhere::GetDatasetIDs(result_on_index2);
    auto dis2 = knowhere::GetDatasetDistance(result_on_index2);
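
    // Sanity probe: query again through the index instance held by
    // LoadIndexInfo to confirm the AppendIndex round trip preserved it;
    // ids2 and dis2 are left unchecked here.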

    auto c_counter_field_data = CLoadFieldDataInfo{
        101,
        counter_data.data(),
        counter_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_counter_field_data);
    assert(status.error_code == Success);

    auto c_id_field_data = CLoadFieldDataInfo{
        0,
        row_ids_data.data(),
        row_ids_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_id_field_data);
    assert(status.error_code == Success);

    auto c_ts_field_data = CLoadFieldDataInfo{
        1,
        timestamps_data.data(),
        timestamps_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_ts_field_data);
    assert(status.error_code == Success);

    // load index for vec field, load raw data for scalar field
    auto sealed_segment = SealedCreator(schema, dataset);
    sealed_segment->DropFieldData(FieldId(100));
    sealed_segment->LoadIndex(*(LoadIndexInfo*)c_load_index_info);
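
    // Typical sealed-segment setup: SealedCreator presumably populates the segment
    // with the dataset's raw fields; the vector field is then dropped and replaced
    // by the prebuilt IVF_PQ index, so the search below answers the vector clause
    // from the index and the range filter from the raw scalar data.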

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index =
        Search(sealed_segment.get(), plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, SealedSegment_search_without_predicates) {
    constexpr auto TOPK = 5;
    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Sealed, -1);

    auto N = ROW_COUNT;
    uint64_t ts_offset = 1000;
    auto dataset = DataGen(schema, N, ts_offset);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    auto vec_array = dataset.get_col(FieldId(100));
    auto vec_data = serialize(vec_array.get());

    auto counter_col = dataset.get_col<int64_t>(FieldId(101));
    FieldMeta counter_field_meta(FieldName("counter"), FieldId(101), DataType::INT64);
    auto count_array = CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta);
    auto counter_data = serialize(count_array.get());

    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_ids_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
    auto row_ids_data = serialize(row_ids_array.get());

    FieldMeta timestamp_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto timestamps_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, timestamp_field_meta);
    auto timestamps_data = serialize(timestamps_array.get());

    const char* dsl_string = R"(
    {
        "bool": {
            "vector": {
                "fakevec": {
                    "metric_type": "L2",
                    "params": {
                        "nprobe": 10
                    },
                    "query": "$0",
                    "topk": 5,
                    "round_decimal": -1
                }
            }
        }
    })";

    auto c_vec_field_data = CLoadFieldDataInfo{
        100,
        vec_data.data(),
        vec_data.size(),
        N,
    };
    auto status = LoadFieldData(segment, c_vec_field_data);
    assert(status.error_code == Success);

    auto c_counter_field_data = CLoadFieldDataInfo{
        101,
        counter_data.data(),
        counter_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_counter_field_data);
    assert(status.error_code == Success);

    auto c_id_field_data = CLoadFieldDataInfo{
        0,
        row_ids_data.data(),
        row_ids_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_id_field_data);
    assert(status.error_code == Success);

    auto c_ts_field_data = CLoadFieldDataInfo{
        1,
        timestamps_data.data(),
        timestamps_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_ts_field_data);
    assert(status.error_code == Success);

    int num_queries = 10;
    auto blob = generate_query_data(num_queries);

    void* plan = nullptr;
    status = CreateSearchPlan(collection, dsl_string, &plan);
    ASSERT_EQ(status.error_code, Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    ASSERT_EQ(status.error_code, Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
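
    // Two searches at different guarantee timestamps: DataGen stamped the rows
    // starting at ts_offset, so searching at N + ts_offset should see every row,
    // while searching at ts_offset (below) should see almost none. Both calls are
    // still expected to return Success.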
    CSearchResult search_result;
    auto res = Search(segment, plan, placeholderGroup, N + ts_offset, &search_result, -1);
    std::cout << res.error_msg << std::endl;
    ASSERT_EQ(res.error_code, Success);

    CSearchResult search_result2;
    auto res2 = Search(segment, plan, placeholderGroup, ts_offset, &search_result2, -1);
    ASSERT_EQ(res2.error_code, Success);

    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(search_result);
    DeleteSearchResult(search_result2);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, SealedSegment_search_float_With_Expr_Predicate_Range) {
    constexpr auto TOPK = 5;

    std::string schema_string = generate_collection_schema("L2", DIM, false);
    auto collection = NewCollection(schema_string.c_str());
    auto schema = ((segcore::Collection*)collection)->get_schema();
    auto segment = NewSegment(collection, Sealed, -1);

    auto N = ROW_COUNT;
    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(FieldId(100));
    auto query_ptr = vec_col.data() + 42000 * DIM;

    auto counter_col = dataset.get_col<int64_t>(FieldId(101));
    FieldMeta counter_field_meta(FieldName("counter"), FieldId(101), DataType::INT64);
    auto count_array = CreateScalarDataArrayFrom(counter_col.data(), N, counter_field_meta);
    auto counter_data = serialize(count_array.get());

    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_ids_array = CreateScalarDataArrayFrom(dataset.row_ids_.data(), N, row_id_field_meta);
    auto row_ids_data = serialize(row_ids_array.get());

    FieldMeta timestamp_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto timestamps_array = CreateScalarDataArrayFrom(dataset.timestamps_.data(), N, timestamp_field_meta);
    auto timestamps_data = serialize(timestamps_array.get());

    const char* serialized_expr_plan = R"(vector_anns: <
        field_id: 100
        predicates: <
            binary_expr: <
                op: LogicalAnd
                left: <
                    unary_range_expr: <
                        column_info: <
                            field_id: 101
                            data_type: Int64
                        >
                        op: GreaterEqual
                        value: <
                            int64_val: 42000
                        >
                    >
                >
                right: <
                    unary_range_expr: <
                        column_info: <
                            field_id: 101
                            data_type: Int64
                        >
                        op: LessThan
                        value: <
                            int64_val: 42010
                        >
                    >
                >
            >
        >
        query_info: <
            topk: 5
            round_decimal: -1
            metric_type: "L2"
            search_params: "{\"nprobe\": 10}"
        >
        placeholder_tag: "$0"
    >)";

    // create place_holder_group
    int num_queries = 10;
    auto raw_group = CreatePlaceholderGroupFromBlob(num_queries, DIM, query_ptr);
    auto blob = raw_group.SerializeAsString();

    // search on segment's small index
    void* plan = nullptr;
    auto binary_plan = translate_text_plan_to_binary_plan(serialized_expr_plan);
    auto status = CreateSearchPlanByExpr(collection, binary_plan.data(), binary_plan.size(), &plan);
    assert(status.error_code == Success);

    void* placeholderGroup = nullptr;
    status = ParsePlaceholderGroup(plan, blob.data(), blob.length(), &placeholderGroup);
    assert(status.error_code == Success);

    std::vector<CPlaceholderGroup> placeholderGroups;
    placeholderGroups.push_back(placeholderGroup);
    Timestamp time = 10000000;

    // load index to segment
    auto conf = knowhere::Config{
        {knowhere::meta::METRIC_TYPE, knowhere::metric::L2},
        {knowhere::meta::DIM, DIM},
        {knowhere::meta::TOPK, TOPK},
        {knowhere::indexparam::NLIST, 100},
        {knowhere::indexparam::NPROBE, 10},
        {knowhere::indexparam::M, 4},
        {knowhere::indexparam::NBITS, 8},
        {knowhere::meta::DEVICE_ID, 0}
    };

    auto indexing = generate_index(vec_col.data(), conf, DIM, TOPK, N, IndexEnum::INDEX_FAISS_IVFPQ);

    auto binary_set = indexing->Serialize(conf);
    void* c_load_index_info = nullptr;
    status = NewLoadIndexInfo(&c_load_index_info);
    assert(status.error_code == Success);
    std::string index_type_key = "index_type";
    std::string index_type_value = "IVF_PQ";
    std::string index_mode_key = "index_mode";
    std::string index_mode_value = "cpu";
    std::string metric_type_key = "metric_type";
    std::string metric_type_value = "L2";

    AppendIndexParam(c_load_index_info, index_type_key.c_str(), index_type_value.c_str());
    AppendIndexParam(c_load_index_info, index_mode_key.c_str(), index_mode_value.c_str());
    AppendIndexParam(c_load_index_info, metric_type_key.c_str(), metric_type_value.c_str());
    AppendFieldInfo(c_load_index_info, 100, CDataType::FloatVector);
    AppendIndex(c_load_index_info, (CBinarySet)&binary_set);

    // load vec index
    auto load_index_info = (LoadIndexInfo*)c_load_index_info;
    auto query_dataset2 = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto index = std::dynamic_pointer_cast<knowhere::VecIndex>(load_index_info->index);
    auto result_on_index2 = index->Query(query_dataset2, conf, nullptr);
    auto ids2 = knowhere::GetDatasetIDs(result_on_index2);
    auto dis2 = knowhere::GetDatasetDistance(result_on_index2);
    status = UpdateSealedSegmentIndex(segment, c_load_index_info);
    assert(status.error_code == Success);
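
    // Unlike the earlier sealed-segment test, which called LoadIndex() on the C++
    // segment object directly, here the index is attached through the C API via
    // UpdateSealedSegmentIndex before the scalar fields are loaded.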

    // load raw data
    auto c_counter_field_data = CLoadFieldDataInfo{
        101,
        counter_data.data(),
        counter_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_counter_field_data);
    assert(status.error_code == Success);

    auto c_id_field_data = CLoadFieldDataInfo{
        0,
        row_ids_data.data(),
        row_ids_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_id_field_data);
    assert(status.error_code == Success);

    auto c_ts_field_data = CLoadFieldDataInfo{
        1,
        timestamps_data.data(),
        timestamps_data.size(),
        N,
    };
    status = LoadFieldData(segment, c_ts_field_data);
    assert(status.error_code == Success);

    // gen query dataset
    auto query_dataset = knowhere::GenDataset(num_queries, DIM, query_ptr);
    auto result_on_index = indexing->Query(query_dataset, conf, nullptr);
    auto ids = knowhere::GetDatasetIDs(result_on_index);
    auto dis = knowhere::GetDatasetDistance(result_on_index);
    std::vector<int64_t> vec_ids(ids, ids + TOPK * num_queries);
    std::vector<float> vec_dis;
    for (int j = 0; j < TOPK * num_queries; ++j) {
        vec_dis.push_back(dis[j] * -1);
    }

    CSearchResult c_search_result_on_bigIndex;
    auto res_after_load_index = Search(segment, plan, placeholderGroup, time, &c_search_result_on_bigIndex, -1);
    assert(res_after_load_index.error_code == Success);

    auto search_result_on_bigIndex = (SearchResult*)c_search_result_on_bigIndex;
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * TOPK;
        ASSERT_EQ(search_result_on_bigIndex->seg_offsets_[offset], 42000 + i);
    }

    DeleteLoadIndexInfo(c_load_index_info);
    DeleteSearchPlan(plan);
    DeletePlaceholderGroup(placeholderGroup);
    DeleteSearchResult(c_search_result_on_bigIndex);
    DeleteCollection(collection);
    DeleteSegment(segment);
}

TEST(CApiTest, RetrieveScalarFieldFromSealedSegmentWithIndex) {
    auto schema = std::make_shared<Schema>();
    auto i8_fid = schema->AddDebugField("age8", DataType::INT8);
    auto i16_fid = schema->AddDebugField("age16", DataType::INT16);
    auto i32_fid = schema->AddDebugField("age32", DataType::INT32);
    auto i64_fid = schema->AddDebugField("age64", DataType::INT64);
    auto float_fid = schema->AddDebugField("age_float", DataType::FLOAT);
    auto double_fid = schema->AddDebugField("age_double", DataType::DOUBLE);
    schema->set_primary_field_id(i64_fid);

    auto segment = CreateSealedSegment(schema).release();

    int N = ROW_COUNT;
    auto raw_data = DataGen(schema, N);
    LoadIndexInfo load_index_info;
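
    // This one LoadIndexInfo is reused for every scalar field below: build a sorted
    // scalar index over each raw column, repoint field_id / field_type / index, and
    // hand it to the segment. Since no raw scalar data is loaded, Retrieve has to
    // read the values back through these indexes.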

    // load timestamp field
    FieldMeta ts_field_meta(FieldName("Timestamp"), TimestampFieldID, DataType::INT64);
    auto ts_array = CreateScalarDataArrayFrom(raw_data.timestamps_.data(), N, ts_field_meta);
    auto ts_data = serialize(ts_array.get());
    auto load_info = CLoadFieldDataInfo{TimestampFieldID.get(), ts_data.data(), ts_data.size(), N};
    auto res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    auto count = GetRowCount(segment);
    assert(count == N);

    // load rowid field
    FieldMeta row_id_field_meta(FieldName("RowID"), RowFieldID, DataType::INT64);
    auto row_id_array = CreateScalarDataArrayFrom(raw_data.row_ids_.data(), N, row_id_field_meta);
    auto row_id_data = serialize(row_id_array.get());
    load_info = CLoadFieldDataInfo{RowFieldID.get(), row_id_data.data(), row_id_data.size(), N};
    res = LoadFieldData(segment, load_info);
    assert(res.error_code == Success);
    count = GetRowCount(segment);
    assert(count == N);

    // load index for int8 field
    auto age8_col = raw_data.get_col<int8_t>(i8_fid);
    GenScalarIndexing(N, age8_col.data());
    auto age8_index = milvus::scalar::CreateScalarIndexSort<int8_t>();
    age8_index->Build(N, age8_col.data());
    load_index_info.field_id = i8_fid.get();
    load_index_info.field_type = Int8;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int8_t>>(age8_index.release());
    segment->LoadIndex(load_index_info);

    // load index for int16 field
    auto age16_col = raw_data.get_col<int16_t>(i16_fid);
    GenScalarIndexing(N, age16_col.data());
    auto age16_index = milvus::scalar::CreateScalarIndexSort<int16_t>();
    age16_index->Build(N, age16_col.data());
    load_index_info.field_id = i16_fid.get();
    load_index_info.field_type = Int16;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int16_t>>(age16_index.release());
    segment->LoadIndex(load_index_info);

    // load index for int32 field
    auto age32_col = raw_data.get_col<int32_t>(i32_fid);
    GenScalarIndexing(N, age32_col.data());
    auto age32_index = milvus::scalar::CreateScalarIndexSort<int32_t>();
    age32_index->Build(N, age32_col.data());
    load_index_info.field_id = i32_fid.get();
    load_index_info.field_type = Int32;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int32_t>>(age32_index.release());
    segment->LoadIndex(load_index_info);

    // load index for int64 field
    auto age64_col = raw_data.get_col<int64_t>(i64_fid);
    GenScalarIndexing(N, age64_col.data());
    auto age64_index = milvus::scalar::CreateScalarIndexSort<int64_t>();
    age64_index->Build(N, age64_col.data());
    load_index_info.field_id = i64_fid.get();
    load_index_info.field_type = Int64;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<int64_t>>(age64_index.release());
    segment->LoadIndex(load_index_info);

    // load index for float field
    auto age_float_col = raw_data.get_col<float>(float_fid);
    GenScalarIndexing(N, age_float_col.data());
    auto age_float_index = milvus::scalar::CreateScalarIndexSort<float>();
    age_float_index->Build(N, age_float_col.data());
    load_index_info.field_id = float_fid.get();
    load_index_info.field_type = Float;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<float>>(age_float_index.release());
    segment->LoadIndex(load_index_info);

    // load index for double field
    auto age_double_col = raw_data.get_col<double>(double_fid);
    GenScalarIndexing(N, age_double_col.data());
    auto age_double_index = milvus::scalar::CreateScalarIndexSort<double>();
    age_double_index->Build(N, age_double_col.data());
    load_index_info.field_id = double_fid.get();
    load_index_info.field_type = Double;
    load_index_info.index = std::shared_ptr<milvus::scalar::ScalarIndexSort<double>>(age_double_index.release());
    segment->LoadIndex(load_index_info);

    // create retrieve plan
    auto plan = std::make_unique<query::RetrievePlan>(*schema);
    plan->plan_node_ = std::make_unique<query::RetrievePlanNode>();
    std::vector<int64_t> retrieve_row_ids = {age64_col[0]};
    auto term_expr = std::make_unique<query::TermExprImpl<int64_t>>(i64_fid, DataType::INT64, retrieve_row_ids);
    plan->plan_node_->predicate_ = std::move(term_expr);
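
    // The plan selects rows whose int64 primary key equals age64_col[0] via a
    // single-value term expression; the assertions below only inspect the first
    // value of each returned field column.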
    std::vector<FieldId> target_field_ids;

    // retrieve value
    target_field_ids = {i8_fid, i16_fid, i32_fid, i64_fid, float_fid, double_fid};
    plan->field_ids_ = target_field_ids;

    CRetrieveResult retrieve_result;
    res = Retrieve(segment, plan.get(), raw_data.timestamps_[N - 1], &retrieve_result);
    ASSERT_EQ(res.error_code, Success);
    auto query_result = std::make_unique<proto::segcore::RetrieveResults>();
    auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size);
    ASSERT_TRUE(suc);
    ASSERT_EQ(query_result->fields_data().size(), 6);
    auto fields_data = query_result->fields_data();
    for (auto iter = fields_data.begin(); iter < fields_data.end(); ++iter) {
        switch (iter->type()) {
            case proto::schema::DataType::Int8: {
                ASSERT_EQ(iter->scalars().int_data().data(0), age8_col[0]);
                break;
            }
            case proto::schema::DataType::Int16: {
                ASSERT_EQ(iter->scalars().int_data().data(0), age16_col[0]);
                break;
            }
            case proto::schema::DataType::Int32: {
                ASSERT_EQ(iter->scalars().int_data().data(0), age32_col[0]);
                break;
            }
            case proto::schema::DataType::Int64: {
                ASSERT_EQ(iter->scalars().long_data().data(0), age64_col[0]);
                break;
            }
            case proto::schema::DataType::Float: {
                ASSERT_EQ(iter->scalars().float_data().data(0), age_float_col[0]);
                break;
            }
            case proto::schema::DataType::Double: {
                ASSERT_EQ(iter->scalars().double_data().data(0), age_double_col[0]);
                break;
            }
            default: {
                PanicInfo("not supported type");
            }
        }
    }

    DeleteRetrievePlan(plan.release());
    DeleteRetrieveResult(&retrieve_result);

    DeleteSegment(segment);
}