2020-12-29 11:56:37 +08:00
|
|
|
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
|
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
|
|
|
|
|
|
|
//
|
|
|
|
// Created by mike on 12/28/20.
|
|
|
|
//
|
|
|
|
#include "test_utils/DataGen.h"
|
|
|
|
#include <gtest/gtest.h>
|
|
|
|
#include <knowhere/index/vector_index/VecIndex.h>
|
|
|
|
#include <knowhere/index/vector_index/adapter/VectorAdapter.h>
|
|
|
|
#include <knowhere/index/vector_index/VecIndexFactory.h>
|
|
|
|
#include <knowhere/index/vector_index/IndexIVF.h>
|
2021-01-20 10:15:43 +08:00
|
|
|
#include "segcore/SegmentSealedImpl.h"
|
2020-12-29 11:56:37 +08:00
|
|
|
|
|
|
|
using namespace milvus;
|
|
|
|
using namespace milvus::segcore;
|
2021-01-21 15:29:52 +08:00
|
|
|
using namespace milvus::query;
|
2020-12-29 11:56:37 +08:00
|
|
|
|
|
|
|
// Verifies that a sealed segment served by a pre-built IVF index answers a
// predicate-free vector search exactly like the raw index does.
//
// Flow:
//   1. generate N rows and feed them to a growing segment (baseline search),
//   2. build an IVF index over an N-row window of the vector column,
//   3. query the raw index directly to obtain the reference answer,
//   4. load the same index into a sealed segment and search through the plan,
//   5. compare the JSON rendering of both results.
TEST(Sealed, without_predicate) {
    using namespace milvus::query;
    using namespace milvus::segcore;

    auto schema = std::make_shared<Schema>();
    auto dim = 16;
    auto topK = 5;
    auto metric_type = MetricType::METRIC_L2;
    auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
    schema->AddDebugField("age", DataType::FLOAT);

    // Plan with a single vector clause; "topk" must stay in sync with topK above.
    std::string dsl = R"({
        "bool": {
            "must": [
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "L2",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5
                        }
                    }
                }
            ]
        }
    })";

    int64_t N = 1000 * 1000;

    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(0);

    // Append 1000 zero rows: the index below is built over the window that starts
    // at row 1000, so the padding keeps that N-row window inside the buffer
    // (assuming get_col returned N * dim floats -- TODO confirm against DataGen).
    for (int64_t i = 0; i < 1000 * dim; ++i) {
        vec_col.push_back(0);
    }

    // Taken after the push_back loop so that a reallocation cannot invalidate it.
    auto query_ptr = vec_col.data() + 4200 * dim;

    auto segment = CreateGrowingSegment(schema);
    segment->PreInsert(N);
    segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);

    auto plan = CreatePlan(*schema, dsl);
    auto num_queries = 5;
    // Use dim rather than a repeated literal so the placeholder always matches the schema.
    auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, dim, query_ptr);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    Timestamp time = 1000000;

    // Baseline search on the growing segment. The JSON is not compared with
    // anything; the conversion is only exercised here.
    SearchResult sr = segment->Search(plan.get(), *ph_group, time);
    auto pre_result = SearchResultToJson(sr);

    auto indexing = std::make_shared<knowhere::IVF>();

    auto conf = knowhere::Config{{knowhere::meta::DIM, dim},
                                 {knowhere::meta::TOPK, topK},
                                 {knowhere::IndexParams::nlist, 100},
                                 {knowhere::IndexParams::nprobe, 10},
                                 {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
                                 {knowhere::meta::DEVICEID, 0}};

    // The index covers rows [1000, 1000 + N) of the padded column.
    auto database = knowhere::GenDataset(N, dim, vec_col.data() + 1000 * dim);
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);

    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), dim);

    auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr);

    auto result = indexing->Query(query_dataset, conf, nullptr);

    auto ids = result->Get<int64_t*>(milvus::knowhere::meta::IDS);     // for comparison
    auto dis = result->Get<float*>(milvus::knowhere::meta::DISTANCE);  // for comparison
    std::vector<int64_t> vec_ids(ids, ids + topK * num_queries);
    std::vector<float> vec_dis(dis, dis + topK * num_queries);

    // Reuse sr as the container for the reference answer: only offsets and
    // distances are overwritten, so both results are rendered the same way.
    sr.internal_seg_offsets_ = vec_ids;
    sr.result_distances_ = vec_dis;
    auto ref_result = SearchResultToJson(sr);

    LoadIndexInfo load_info;
    load_info.field_id = fake_id.get();
    load_info.index = indexing;
    load_info.index_params["metric_type"] = "L2";

    auto sealed_segment = SealedCreator(schema, dataset, load_info);
    sr = sealed_segment->Search(plan.get(), *ph_group, time);

    auto post_result = SearchResultToJson(sr);
    std::cout << ref_result.dump(1);
    std::cout << post_result.dump(1);
    ASSERT_EQ(ref_result.dump(2), post_result.dump(2));
}
|
|
|
|
|
|
|
|
// Verifies that a sealed segment honours a scalar range predicate together
// with a vector search.
//
// The query vectors are rows 420000..420004 of the generated vector column and
// the predicate restricts "counter" to [420000, 420005). The test expects the
// top hit of query i to be offset 420000 + i at distance 0.
TEST(Sealed, with_predicate) {
    using namespace milvus::query;
    using namespace milvus::segcore;

    auto schema = std::make_shared<Schema>();
    auto dim = 16;
    auto topK = 5;
    auto metric_type = MetricType::METRIC_L2;
    auto fake_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
    schema->AddDebugField("counter", DataType::INT64);

    // Range clause on "counter" plus a vector clause; "topk" mirrors topK above.
    std::string dsl = R"({
        "bool": {
            "must": [
                {
                    "range": {
                        "counter": {
                            "GE": 420000,
                            "LT": 420005
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "L2",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5
                        }
                    }
                }
            ]
        }
    })";

    int64_t N = 1000 * 1000;

    auto dataset = DataGen(schema, N);
    auto vec_col = dataset.get_col<float>(0);
    // Queries are taken straight from the data set, starting at row 420000.
    auto query_ptr = vec_col.data() + 420000 * dim;

    auto segment = CreateGrowingSegment(schema);
    segment->PreInsert(N);
    segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_);

    auto plan = CreatePlan(*schema, dsl);
    auto num_queries = 5;
    // Use dim rather than a repeated literal so the placeholder always matches the schema.
    auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, dim, query_ptr);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    Timestamp time = 10000000;

    // Baseline search on the growing segment; the result is overwritten below
    // and is not inspected.
    SearchResult sr = segment->Search(plan.get(), *ph_group, time);

    auto indexing = std::make_shared<knowhere::IVF>();

    auto conf = knowhere::Config{{knowhere::meta::DIM, dim},
                                 {knowhere::meta::TOPK, topK},
                                 {knowhere::IndexParams::nlist, 100},
                                 {knowhere::IndexParams::nprobe, 10},
                                 {knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
                                 {knowhere::meta::DEVICEID, 0}};

    auto database = knowhere::GenDataset(N, dim, vec_col.data());
    indexing->Train(database, conf);
    indexing->AddWithoutIds(database, conf);

    EXPECT_EQ(indexing->Count(), N);
    EXPECT_EQ(indexing->Dim(), dim);

    auto query_dataset = knowhere::GenDataset(num_queries, dim, query_ptr);

    // Smoke query against the raw index; its output is not inspected.
    indexing->Query(query_dataset, conf, nullptr);

    LoadIndexInfo load_info;
    load_info.field_id = fake_id.get();
    load_info.index = indexing;
    load_info.index_params["metric_type"] = "L2";

    auto sealed_segment = SealedCreator(schema, dataset, load_info);
    sr = sealed_segment->Search(plan.get(), *ph_group, time);

    // Only the best hit of each query is checked: slot i * topK is the first
    // entry of query i in the flattened result arrays.
    for (int i = 0; i < num_queries; ++i) {
        auto offset = i * topK;
        ASSERT_EQ(sr.internal_seg_offsets_[offset], 420000 + i);
        ASSERT_EQ(sr.result_distances_[offset], 0.0);
    }
}
|
|
|
|
|
|
|
|
// Walks a sealed segment through loading and dropping field data and a vector
// index, checking at every step whether a search is expected to succeed or to
// throw, and that results stay stable across a drop/reload of the index.
TEST(Sealed, LoadFieldData) {
    auto dim = 16;
    int64_t N = 1000 * 1000;
    auto metric_type = MetricType::METRIC_L2;
    auto schema = std::make_shared<Schema>();
    auto fakevec_id = schema->AddDebugField("fakevec", DataType::VECTOR_FLOAT, dim, metric_type);
    // The "counter" field id is never referenced; the column is read back via
    // FieldOffset(1) below.
    schema->AddDebugField("counter", DataType::INT64);
    auto double_id = schema->AddDebugField("double", DataType::DOUBLE);
    auto nothing_id = schema->AddDebugField("nothing", DataType::INT32);

    auto dataset = DataGen(schema, N);

    auto fakevec = dataset.get_col<float>(0);

    auto indexing = GenIndexing(N, dim, fakevec.data());

    auto segment = CreateSealedSegment(schema);

    // The plan reads only "double" (range clause) and "fakevec" (vector clause).
    std::string dsl = R"({
        "bool": {
            "must": [
                {
                    "range": {
                        "double": {
                            "GE": -1,
                            "LT": 1
                        }
                    }
                },
                {
                    "vector": {
                        "fakevec": {
                            "metric_type": "L2",
                            "params": {
                                "nprobe": 10
                            },
                            "query": "$0",
                            "topk": 5
                        }
                    }
                }
            ]
        }
    })";

    Timestamp time = 1000000;
    auto plan = CreatePlan(*schema, dsl);
    auto num_queries = 5;
    // Use dim rather than a repeated literal so the placeholder always matches
    // the schema. The seed (1024) pins the expected results at the end of the test.
    auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, 1024);
    auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    // Nothing has been loaded yet, so the search is expected to throw.
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    SealedLoader(dataset, *segment);
    // Dropping a field the plan does not touch must leave the search working;
    // an exception thrown here fails the test.
    segment->DropFieldData(nothing_id);
    segment->Search(plan.get(), *ph_group, time);

    // Without the vector field data (and with no index loaded) the search must fail.
    segment->DropFieldData(fakevec_id);
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    LoadIndexInfo vec_info;
    vec_info.field_id = fakevec_id.get();
    vec_info.index = indexing;
    vec_info.index_params["metric_type"] = milvus::knowhere::Metric::L2;
    segment->LoadIndex(vec_info);

    // The scalar columns must be stored in a single chunk and match the generated data.
    ASSERT_EQ(segment->num_chunk(), 1);
    auto chunk_span1 = segment->chunk_data<int64_t>(FieldOffset(1), 0);
    auto chunk_span2 = segment->chunk_data<double>(FieldOffset(2), 0);
    auto ref1 = dataset.get_col<int64_t>(1);
    auto ref2 = dataset.get_col<double>(2);
    // int64_t index: N is int64_t, so avoid a narrower loop counter.
    for (int64_t i = 0; i < N; ++i) {
        ASSERT_EQ(chunk_span1[i], ref1[i]);
        ASSERT_EQ(chunk_span2[i], ref2[i]);
    }

    // With the index loaded the search works again.
    auto sr = segment->Search(plan.get(), *ph_group, time);
    auto json = SearchResultToJson(sr);
    std::cout << json.dump(1);

    // Dropping the index breaks the search; reloading it must restore the very
    // same answer.
    segment->DropIndex(fakevec_id);
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));
    segment->LoadIndex(vec_info);
    auto sr2 = segment->Search(plan.get(), *ph_group, time);
    // Render the result obtained AFTER the reload (sr2). Rendering sr again
    // would compare the first result with itself and always pass.
    auto json2 = SearchResultToJson(sr2);
    ASSERT_EQ(json.dump(-2), json2.dump(-2));

    // The plan filters on "double", so dropping that field must break the search.
    segment->DropFieldData(double_id);
    ASSERT_ANY_THROW(segment->Search(plan.get(), *ph_group, time));

    // Golden result for the first successful indexed search ("offset->distance").
    auto std_json = Json::parse(R"(
    [
        [
            [
                "982->0.000000",
                "25315->4.741588",
                "551029->5.078479",
                "455002->5.134716",
                "504754->5.329021"
            ],
            [
                "287136->8.409121",
                "528353->8.740297",
                "935763->9.422906",
                "794649->9.436665",
                "192031->9.832053"
            ],
            [
                "59251->2.542610",
                "433044->3.424016",
                "797884->3.663446",
                "430441->3.692723",
                "697705->3.944479"
            ],
            [
                "611544->3.463480",
                "642941->3.753775",
                "967504->3.885163",
                "232724->4.574215",
                "507245->5.040902"
            ],
            [
                "351788->4.453843",
                "410227->4.699380",
                "501497->4.805948",
                "715061->5.166959",
                "414882->5.179897"
            ]
        ]
    ])");
    ASSERT_EQ(std_json.dump(-2), json.dump(-2));
}
|