// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #include #include #include #include #include "pb/plan.pb.h" #include "query/Expr.h" #include "query/generated/PlanNodeVisitor.h" #include "query/generated/ExecExprVisitor.h" #include "segcore/SegmentGrowingImpl.h" #include "test_utils/DataGen.h" #include "query/PlanProto.h" #include "query/Utils.h" #include "query/SearchBruteForce.h" using namespace milvus; namespace { template auto GenGenericValue(T value) { auto generic = new proto::plan::GenericValue(); if constexpr (std::is_same_v) { generic->set_bool_val(static_cast(value)); } else if constexpr (std::is_integral_v) { generic->set_int64_val(static_cast(value)); } else if constexpr (std::is_floating_point_v) { generic->set_float_val(static_cast(value)); } else if constexpr (std::is_same_v) { generic->set_string_val(static_cast(value)); } else { static_assert(always_false); } return generic; } auto GenColumnInfo(int64_t field_id, proto::schema::DataType field_type, bool auto_id, bool is_pk) { auto column_info = new proto::plan::ColumnInfo(); column_info->set_field_id(field_id); column_info->set_data_type(field_type); column_info->set_is_autoid(auto_id); column_info->set_is_primary_key(is_pk); return column_info; } auto GenQueryInfo(int64_t topk, std::string metric_type, std::string search_params, int64_t round_decimal = -1) { auto query_info = new proto::plan::QueryInfo(); query_info->set_topk(topk); query_info->set_metric_type(metric_type); query_info->set_search_params(search_params); query_info->set_round_decimal(round_decimal); return query_info; } auto GenAnns(proto::plan::Expr* predicate, bool is_binary, int64_t field_id, std::string placeholder_tag = "$0") { auto query_info = GenQueryInfo(10, "L2", "{\"nprobe\": 10}", -1); auto anns = new proto::plan::VectorANNS(); anns->set_is_binary(is_binary); anns->set_field_id(field_id); anns->set_allocated_predicates(predicate); anns->set_allocated_query_info(query_info); anns->set_placeholder_tag(placeholder_tag); return anns; } template auto GenTermExpr(const std::vector& values) { auto term_expr = new proto::plan::TermExpr(); for (int i = 0; i < values.size(); i++) { auto add_value = term_expr->add_values(); if constexpr (std::is_same_v) { add_value->set_bool_val(static_cast(values[i])); } else if constexpr (std::is_integral_v) { add_value->set_int64_val(static_cast(values[i])); } else if constexpr (std::is_floating_point_v) { add_value->set_float_val(static_cast(values[i])); } else if constexpr (std::is_same_v) { add_value->set_string_val(static_cast(values[i])); } else { static_assert(always_false); } } return term_expr; } auto GenCompareExpr(proto::plan::OpType op) { auto compare_expr = new proto::plan::CompareExpr(); compare_expr->set_op(op); return compare_expr; } template auto GenUnaryRangeExpr(proto::plan::OpType op, T value) { auto unary_range_expr = new proto::plan::UnaryRangeExpr(); unary_range_expr->set_op(op); auto generic = GenGenericValue(value); unary_range_expr->set_allocated_value(generic); return unary_range_expr; } template auto GenBinaryRangeExpr(bool lb_inclusive, bool ub_inclusive, T lb, T ub) { auto binary_range_expr = new proto::plan::BinaryRangeExpr(); binary_range_expr->set_lower_inclusive(lb_inclusive); binary_range_expr->set_upper_inclusive(ub_inclusive); auto lb_generic = GenGenericValue(lb); auto ub_generic = GenGenericValue(ub); binary_range_expr->set_allocated_lower_value(lb_generic); binary_range_expr->set_allocated_upper_value(ub_generic); return binary_range_expr; } auto GenNotExpr() { auto not_expr = new proto::plan::UnaryExpr(); not_expr->set_op(proto::plan::UnaryExpr_UnaryOp_Not); return not_expr; } auto GenExpr() { return std::make_unique(); } auto GenPlanNode() { return std::make_unique(); } void SetTargetEntry(std::unique_ptr& plan_node, const std::vector& output_fields) { for (auto id : output_fields) { plan_node->add_output_field_ids(id); } } auto GenTermPlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta, const std::vector& strs) -> std::unique_ptr { auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false); auto term_expr = GenTermExpr(strs); term_expr->set_allocated_column_info(column_info); auto expr = GenExpr().release(); expr->set_allocated_term_expr(term_expr); auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0"); auto plan_node = GenPlanNode(); plan_node->set_allocated_vector_anns(anns); return std::move(plan_node); } auto GenAlwaysFalseExpr(const FieldMeta& fvec_meta, const FieldMeta& str_meta) { auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false); auto term_expr = GenTermExpr({}); // in empty set, always false. term_expr->set_allocated_column_info(column_info); auto expr = GenExpr().release(); expr->set_allocated_term_expr(term_expr); return expr; } auto GenAlwaysTrueExpr(const FieldMeta& fvec_meta, const FieldMeta& str_meta) { auto always_false_expr = GenAlwaysFalseExpr(fvec_meta, str_meta); auto not_expr = GenNotExpr(); not_expr->set_allocated_child(always_false_expr); auto expr = GenExpr().release(); expr->set_allocated_unary_expr(not_expr); return expr; } auto GenAlwaysFalsePlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta) { auto always_false_expr = GenAlwaysFalseExpr(fvec_meta, str_meta); auto anns = GenAnns(always_false_expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0"); auto plan_node = GenPlanNode(); plan_node->set_allocated_vector_anns(anns); return std::move(plan_node); } auto GenAlwaysTruePlan(const FieldMeta& fvec_meta, const FieldMeta& str_meta) { auto always_true_expr = GenAlwaysTrueExpr(fvec_meta, str_meta); auto anns = GenAnns(always_true_expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0"); auto plan_node = GenPlanNode(); plan_node->set_allocated_vector_anns(anns); return std::move(plan_node); } SchemaPtr GenTestSchema() { auto schema = std::make_shared(); schema->AddDebugField("str", DataType::VARCHAR); schema->AddDebugField("another_str", DataType::VARCHAR); schema->AddDebugField("fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); auto pk = schema->AddDebugField("int64", DataType::INT64); schema->set_primary_field_id(pk); return schema; } SchemaPtr GenStrPKSchema() { auto schema = std::make_shared(); auto pk = schema->AddDebugField("str", DataType::VARCHAR); schema->AddDebugField("another_str", DataType::VARCHAR); schema->AddDebugField("fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2); schema->AddDebugField("int64", DataType::INT64); schema->set_primary_field_id(pk); return schema; } } // namespace TEST(StringExpr, Term) { using namespace milvus::query; using namespace milvus::segcore; auto schema = GenTestSchema(); const auto& fvec_meta = schema->operator[](FieldName("fvec")); const auto& str_meta = schema->operator[](FieldName("str")); auto vec_2k_3k = []() -> std::vector { std::vector ret; for (int i = 2000; i < 3000; i++) { ret.push_back(std::to_string(i)); } return ret; }(); std::map> terms = { {0, {"2000", "3000"}}, {1, {"2000"}}, {2, {"3000"}}, {3, {}}, {4, {vec_2k_3k}}, }; auto seg = CreateGrowingSegment(schema); int N = 1000; std::vector str_col; int num_iters = 100; for (int iter = 0; iter < num_iters; ++iter) { auto raw_data = DataGen(schema, N, iter); auto new_str_col = raw_data.get_col(str_meta.get_id()); auto begin = new_str_col->scalars().string_data().data().begin(); auto end = new_str_col->scalars().string_data().data().end(); str_col.insert(str_col.end(), begin, end); seg->PreInsert(N); seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_); } auto seg_promote = dynamic_cast(seg.get()); ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP); for (const auto& [_, term] : terms) { auto plan_proto = GenTermPlan(fvec_meta, str_meta, term); auto plan = ProtoParser(*schema).CreatePlan(*plan_proto); auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); EXPECT_EQ(final.size(), N * num_iters); for (int i = 0; i < N * num_iters; ++i) { auto ans = final[i]; auto val = str_col[i]; auto ref = std::find(term.begin(), term.end(), val) != term.end(); ASSERT_EQ(ans, ref) << "@" << i << "!!" << val; } } } TEST(StringExpr, Compare) { using namespace milvus::query; using namespace milvus::segcore; auto schema = GenTestSchema(); const auto& fvec_meta = schema->operator[](FieldName("fvec")); const auto& str_meta = schema->operator[](FieldName("str")); const auto& another_str_meta = schema->operator[](FieldName("another_str")); auto gen_compare_plan = [&, fvec_meta, str_meta, another_str_meta](proto::plan::OpType op) -> std::unique_ptr { auto str_col_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false); auto another_str_col_info = GenColumnInfo(another_str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false); auto compare_expr = GenCompareExpr(op); compare_expr->set_allocated_left_column_info(str_col_info); compare_expr->set_allocated_right_column_info(another_str_col_info); auto expr = GenExpr().release(); expr->set_allocated_compare_expr(compare_expr); auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0"); auto plan_node = std::make_unique(); plan_node->set_allocated_vector_anns(anns); return std::move(plan_node); }; std::vector>> testcases{ {proto::plan::OpType::GreaterThan, [](std::string v1, std::string v2) { return v1 > v2; }}, {proto::plan::OpType::GreaterEqual, [](std::string v1, std::string v2) { return v1 >= v2; }}, {proto::plan::OpType::LessThan, [](std::string v1, std::string v2) { return v1 < v2; }}, {proto::plan::OpType::LessEqual, [](std::string v1, std::string v2) { return v1 <= v2; }}, {proto::plan::OpType::Equal, [](std::string v1, std::string v2) { return v1 == v2; }}, {proto::plan::OpType::NotEqual, [](std::string v1, std::string v2) { return v1 != v2; }}, {proto::plan::OpType::PrefixMatch, [](std::string v1, std::string v2) { return PrefixMatch(v1, v2); }}, }; auto seg = CreateGrowingSegment(schema); int N = 1000; std::vector str_col; std::vector another_str_col; int num_iters = 100; for (int iter = 0; iter < num_iters; ++iter) { auto raw_data = DataGen(schema, N, iter); auto reserve_col = [&, raw_data](const FieldMeta& field_meta, std::vector& str_col) { auto new_str_col = raw_data.get_col(field_meta.get_id()); auto begin = new_str_col->scalars().string_data().data().begin(); auto end = new_str_col->scalars().string_data().data().end(); str_col.insert(str_col.end(), begin, end); }; reserve_col(str_meta, str_col); reserve_col(another_str_meta, another_str_col); { seg->PreInsert(N); seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_); } } auto seg_promote = dynamic_cast(seg.get()); ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP); for (const auto& [op, ref_func] : testcases) { auto plan_proto = gen_compare_plan(op); auto plan = ProtoParser(*schema).CreatePlan(*plan_proto); auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); EXPECT_EQ(final.size(), N * num_iters); for (int i = 0; i < N * num_iters; ++i) { auto ans = final[i]; auto val = str_col[i]; auto another_val = another_str_col[i]; auto ref = ref_func(val, another_val); ASSERT_EQ(ans, ref) << "@" << op << "@" << i << "!!" << val; } } } TEST(StringExpr, UnaryRange) { using namespace milvus::query; using namespace milvus::segcore; auto schema = GenTestSchema(); const auto& fvec_meta = schema->operator[](FieldName("fvec")); const auto& str_meta = schema->operator[](FieldName("str")); auto gen_unary_range_plan = [&, fvec_meta, str_meta](proto::plan::OpType op, std::string value) -> std::unique_ptr { auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false); auto unary_range_expr = GenUnaryRangeExpr(op, value); unary_range_expr->set_allocated_column_info(column_info); auto expr = GenExpr().release(); expr->set_allocated_unary_range_expr(unary_range_expr); auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0"); auto plan_node = std::make_unique(); plan_node->set_allocated_vector_anns(anns); return std::move(plan_node); }; std::vector>> testcases{ {proto::plan::OpType::GreaterThan, "2000", [](std::string val) { return val > "2000"; }}, {proto::plan::OpType::GreaterEqual, "2000", [](std::string val) { return val >= "2000"; }}, {proto::plan::OpType::LessThan, "3000", [](std::string val) { return val < "3000"; }}, {proto::plan::OpType::LessEqual, "3000", [](std::string val) { return val <= "3000"; }}, {proto::plan::OpType::PrefixMatch, "a", [](std::string val) { return PrefixMatch(val, "a"); }}, }; auto seg = CreateGrowingSegment(schema); int N = 1000; std::vector str_col; int num_iters = 100; for (int iter = 0; iter < num_iters; ++iter) { auto raw_data = DataGen(schema, N, iter); auto new_str_col = raw_data.get_col(str_meta.get_id()); auto begin = new_str_col->scalars().string_data().data().begin(); auto end = new_str_col->scalars().string_data().data().end(); str_col.insert(str_col.end(), begin, end); seg->PreInsert(N); seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_); } auto seg_promote = dynamic_cast(seg.get()); ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP); for (const auto& [op, value, ref_func] : testcases) { auto plan_proto = gen_unary_range_plan(op, value); auto plan = ProtoParser(*schema).CreatePlan(*plan_proto); auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); EXPECT_EQ(final.size(), N * num_iters); for (int i = 0; i < N * num_iters; ++i) { auto ans = final[i]; auto val = str_col[i]; auto ref = ref_func(val); ASSERT_EQ(ans, ref) << "@" << op << "@" << value << "@" << i << "!!" << val; } } } TEST(StringExpr, BinaryRange) { using namespace milvus::query; using namespace milvus::segcore; auto schema = GenTestSchema(); const auto& fvec_meta = schema->operator[](FieldName("fvec")); const auto& str_meta = schema->operator[](FieldName("str")); auto gen_binary_range_plan = [&, fvec_meta, str_meta](bool lb_inclusive, bool ub_inclusive, std::string lb, std::string ub) -> std::unique_ptr { auto column_info = GenColumnInfo(str_meta.get_id().get(), proto::schema::DataType::VarChar, false, false); auto binary_range_expr = GenBinaryRangeExpr(lb_inclusive, ub_inclusive, lb, ub); binary_range_expr->set_allocated_column_info(column_info); auto expr = GenExpr().release(); expr->set_allocated_binary_range_expr(binary_range_expr); auto anns = GenAnns(expr, fvec_meta.get_data_type() == DataType::VECTOR_BINARY, fvec_meta.get_id().get(), "$0"); auto plan_node = std::make_unique(); plan_node->set_allocated_vector_anns(anns); return std::move(plan_node); }; // bool lb_inclusive, bool ub_inclusive, std::string lb, std::string ub std::vector>> testcases{ {false, false, "2000", "3000", [](std::string val) { return val > "2000" && val < "3000"; }}, {false, true, "2000", "3000", [](std::string val) { return val > "2000" && val <= "3000"; }}, {true, false, "2000", "3000", [](std::string val) { return val >= "2000" && val < "3000"; }}, {true, true, "2000", "3000", [](std::string val) { return val >= "2000" && val <= "3000"; }}, {true, true, "2000", "1000", [](std::string val) { return false; }}, }; auto seg = CreateGrowingSegment(schema); int N = 1000; std::vector str_col; int num_iters = 100; for (int iter = 0; iter < num_iters; ++iter) { auto raw_data = DataGen(schema, N, iter); auto new_str_col = raw_data.get_col(str_meta.get_id()); auto begin = new_str_col->scalars().string_data().data().begin(); auto end = new_str_col->scalars().string_data().data().end(); str_col.insert(str_col.end(), begin, end); seg->PreInsert(N); seg->Insert(iter * N, N, raw_data.row_ids_.data(), raw_data.timestamps_.data(), raw_data.raw_); } auto seg_promote = dynamic_cast(seg.get()); ExecExprVisitor visitor(*seg_promote, seg_promote->get_row_count(), MAX_TIMESTAMP); for (const auto& [lb_inclusive, ub_inclusive, lb, ub, ref_func] : testcases) { auto plan_proto = gen_binary_range_plan(lb_inclusive, ub_inclusive, lb, ub); auto plan = ProtoParser(*schema).CreatePlan(*plan_proto); auto final = visitor.call_child(*plan->plan_node_->predicate_.value()); EXPECT_EQ(final.size(), N * num_iters); for (int i = 0; i < N * num_iters; ++i) { auto ans = final[i]; auto val = str_col[i]; auto ref = ref_func(val); ASSERT_EQ(ans, ref) << "@" << lb_inclusive << "@" << ub_inclusive << "@" << lb << "@" << ub << "@" << i << "!!" << val; } } } TEST(AlwaysTrueStringPlan, SearchWithOutputFields) { using namespace milvus::query; using namespace milvus::segcore; auto schema = GenStrPKSchema(); const auto& fvec_meta = schema->operator[](FieldName("fvec")); const auto& str_meta = schema->operator[](FieldName("str")); auto N = 100000; auto dim = fvec_meta.get_dim(); auto round_decimal = -1; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(fvec_meta.get_id()); auto str_col = dataset.get_col(str_meta.get_id())->scalars().string_data().data(); auto query_ptr = vec_col.data(); auto segment = CreateGrowingSegment(schema); segment->disable_small_index(); // brute-force search. segment->PreInsert(N); segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); auto plan_proto = GenAlwaysTruePlan(fvec_meta, str_meta); SetTargetEntry(plan_proto, {str_meta.get_id().get()}); auto plan = ProtoParser(*schema).CreatePlan(*plan_proto); auto num_queries = 5; auto topk = 10; auto ph_group_raw = CreatePlaceholderGroupFromBlob(num_queries, 16, query_ptr); auto ph_group = ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString()); Timestamp time = MAX_TIMESTAMP; std::vector ph_group_arr = {ph_group.get()}; query::dataset::SearchDataset search_dataset{ knowhere::metric::L2, // num_queries, // topk, // round_decimal, dim, // query_ptr // }; auto sub_result = BruteForceSearch(search_dataset, vec_col.data(), N, nullptr); auto sr = segment->Search(plan.get(), ph_group.get(), time); segment->FillPrimaryKeys(plan.get(), *sr); segment->FillTargetEntry(plan.get(), *sr); ASSERT_EQ(sr->pk_type_, DataType::VARCHAR); ASSERT_TRUE(sr->output_fields_data_.find(str_meta.get_id()) != sr->output_fields_data_.end()); auto retrieved_str_col = sr->output_fields_data_[str_meta.get_id()]->scalars().string_data().data(); for (auto q = 0; q < num_queries; q++) { for (auto k = 0; k < topk; k++) { auto offset = q * topk + k; auto seg_offset = sub_result.get_seg_offsets()[offset]; ASSERT_EQ(std::get(sr->primary_keys_[offset]), str_col[seg_offset]); ASSERT_EQ(retrieved_str_col[offset], str_col[seg_offset]); } } } TEST(AlwaysTrueStringPlan, QueryWithOutputFields) { using namespace milvus::query; using namespace milvus::segcore; auto schema = GenStrPKSchema(); const auto& fvec_meta = schema->operator[](FieldName("fvec")); const auto& str_meta = schema->operator[](FieldName("str")); auto N = 100000; auto dataset = DataGen(schema, N); auto vec_col = dataset.get_col(fvec_meta.get_id()); auto str_col = dataset.get_col(str_meta.get_id())->scalars().string_data().data(); auto segment = CreateGrowingSegment(schema); segment->disable_small_index(); // brute-force search. segment->PreInsert(N); segment->Insert(0, N, dataset.row_ids_.data(), dataset.timestamps_.data(), dataset.raw_); auto expr_proto = GenAlwaysTrueExpr(fvec_meta, str_meta); auto plan_proto = GenPlanNode(); plan_proto->set_allocated_predicates(expr_proto); SetTargetEntry(plan_proto, {str_meta.get_id().get()}); auto plan = ProtoParser(*schema).CreateRetrievePlan(*plan_proto); Timestamp time = MAX_TIMESTAMP; auto retrieved = segment->Retrieve(plan.get(), time); ASSERT_EQ(retrieved->ids().str_id().data().size(), N); ASSERT_EQ(retrieved->offset().size(), N); ASSERT_EQ(retrieved->fields_data().size(), 1); ASSERT_EQ(retrieved->fields_data(0).scalars().string_data().data().size(), N); }