enhance: support null in text match index (#37517)

#37508

Signed-off-by: lixinguo <xinguo.li@zilliz.com>
Co-authored-by: lixinguo <xinguo.li@zilliz.com>
smellthemoon 2024-11-13 11:08:29 +08:00 committed by GitHub
parent 30d396f476
commit 3389a6b500
9 changed files with 603 additions and 125 deletions
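
For orientation, here is a minimal standalone sketch of the semantics this change implements (an editor's illustration, not code from this PR: TargetBitmap is modelled with std::vector<uint8_t> and every name below is made up). A NULL row must match neither TextMatch nor NOT TextMatch, so evaluation clears both the result bit and the valid bit for null rows, which is exactly what the new ApplyValidData helper in the first hunk does.

#include <cassert>
#include <cstdint>
#include <vector>

// Model of the two bitmaps used during expression evaluation:
//   res[i]   -> does row i satisfy the predicate
//   valid[i] -> is row i non-null
struct EvalResult {
    std::vector<uint8_t> res;
    std::vector<uint8_t> valid;
};

// Mirror of the ApplyValidData idea: clear both bits for null rows.
void ApplyValidDataModel(const bool* valid_data, EvalResult& r) {
    if (valid_data == nullptr) {
        return;  // field is not nullable, nothing to clear
    }
    for (size_t i = 0; i < r.res.size(); i++) {
        if (!valid_data[i]) {
            r.res[i] = r.valid[i] = 0;
        }
    }
}

int main() {
    // rows: "football ...", NULL, "swimming, football"
    bool valid_data[] = {true, false, true};
    EvalResult match{{1, 1, 1}, {1, 1, 1}};  // pretend every row matched "football"
    ApplyValidDataModel(valid_data, match);
    assert(match.res[1] == 0);  // the NULL row does not match

    // NOT TextMatch: negate only the valid rows; NULL rows stay false,
    // matching the GrowingNaiveNullable / SealedNaiveNullable tests below.
    std::vector<uint8_t> not_match(3);
    for (size_t i = 0; i < 3; i++) {
        not_match[i] = match.valid[i] && !match.res[i];
    }
    assert(not_match[1] == 0);  // the NULL row does not match NOT(...) either
    return 0;
}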


@ -214,6 +214,20 @@ class SegmentExpr : public Expr {
}
}
void
ApplyValidData(const bool* valid_data,
TargetBitmapView res,
TargetBitmapView valid_res,
const int size) {
if (valid_data != nullptr) {
for (int i = 0; i < size; i++) {
if (!valid_data[i]) {
res[i] = valid_res[i] = false;
}
}
}
}
int64_t
GetNextBatchSize() {
auto current_chunk = is_index_mode_ && use_index_ ? current_index_chunk_
@ -254,9 +268,9 @@ class SegmentExpr : public Expr {
std::min(active_count_ - current_data_chunk_pos_, batch_size_);
auto& skip_index = segment_->GetSkipIndex();
auto views_info = segment_->get_batch_views<T>(
field_id_, 0, current_data_chunk_pos_, need_size);
if (!skip_func || !skip_func(skip_index, field_id_, 0)) {
auto views_info = segment_->get_batch_views<T>(
field_id_, 0, current_data_chunk_pos_, need_size);
// views_info.first is the raw data, views_info.second is the valid_data
// use valid_data to check whether a row is null
func(views_info.first.data(),
@ -265,6 +279,8 @@ class SegmentExpr : public Expr {
res,
valid_res,
values...);
} else {
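// range pruned by the skip index: no row here can match, but null rows
// still need res/valid_res cleared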
ApplyValidData(views_info.second.data(), res, valid_res, need_size);
}
current_data_chunk_pos_ += need_size;
return need_size;
@ -303,19 +319,24 @@ class SegmentExpr : public Expr {
size = std::min(size, batch_size_ - processed_size);
auto& skip_index = segment_->GetSkipIndex();
auto chunk = segment_->chunk_data<T>(field_id_, i);
const bool* valid_data = chunk.valid_data();
if (valid_data != nullptr) {
valid_data += data_pos;
}
if (!skip_func || !skip_func(skip_index, field_id_, i)) {
auto chunk = segment_->chunk_data<T>(field_id_, i);
const T* data = chunk.data() + data_pos;
const bool* valid_data = chunk.valid_data();
if (valid_data != nullptr) {
valid_data += data_pos;
}
func(data,
valid_data,
size,
res + processed_size,
valid_res + processed_size,
values...);
} else {
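// skipped chunk: still propagate the null information for this range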
ApplyValidData(valid_data,
res + processed_size,
valid_res + processed_size,
size);
}
processed_size += size;
@ -390,6 +411,27 @@ class SegmentExpr : public Expr {
valid_res + processed_size,
values...);
}
} else {
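// skipped chunk: only the validity is needed; sealed string_view/Json
// columns expose it through batch views, other columns through chunk_data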
const bool* valid_data;
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json>) {
if (segment_->type() == SegmentType::Sealed) {
valid_data = segment_
->get_batch_views<T>(
field_id_, i, data_pos, size)
.second.data();
}
} else {
auto chunk = segment_->chunk_data<T>(field_id_, i);
valid_data = chunk.valid_data();
if (valid_data != nullptr) {
valid_data += data_pos;
}
}
ApplyValidData(valid_data,
res + processed_size,
valid_res + processed_size,
size);
}
processed_size += size;


@ -11,6 +11,7 @@
#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <memory>
#include "index/TextMatchIndex.h"
#include "index/InvertedIndexUtil.h"
@ -103,6 +104,13 @@ TextMatchIndex::Upload(const Config& config) {
for (auto& file : remote_paths_to_size) {
ret.Append(file.first, nullptr, file.second);
}
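// also upload the serialized in-memory part, which is expected to carry the
// null-offset data that Load reads back as "index_null_offset"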
auto binary_set = Serialize(config);
mem_file_manager_->AddFile(binary_set);
auto remote_mem_path_to_size =
mem_file_manager_->GetRemotePathsToFileSize();
for (auto& file : remote_mem_path_to_size) {
ret.Append(file.first, nullptr, file.second);
}
return ret;
}
@ -114,21 +122,76 @@ TextMatchIndex::Load(const Config& config) {
AssertInfo(index_files.has_value(),
"index file paths is empty when load text log index");
auto prefix = disk_file_manager_->GetLocalTextIndexPrefix();
disk_file_manager_->CacheTextLogToDisk(index_files.value());
auto files_value = index_files.value();
auto it = std::find_if(
files_value.begin(), files_value.end(), [](const std::string& file) {
return file.substr(file.find_last_of('/') + 1) ==
"index_null_offset";
});
if (it != files_value.end()) {
std::vector<std::string> file;
file.push_back(*it);
files_value.erase(it);
auto index_datas = mem_file_manager_->LoadIndexToMemory(file);
AssembleIndexDatas(index_datas);
BinarySet binary_set;
for (auto& [key, data] : index_datas) {
auto size = data->DataSize();
auto deleter = [&](uint8_t*) {}; // avoid repeated destruction
auto buf = std::shared_ptr<uint8_t[]>(
(uint8_t*)const_cast<void*>(data->Data()), deleter);
binary_set.Append(key, buf, size);
}
auto index_valid_data = binary_set.GetByName("index_null_offset");
null_offset.resize((size_t)index_valid_data->size / sizeof(size_t));
memcpy(null_offset.data(),
index_valid_data->data.get(),
(size_t)index_valid_data->size);
}
disk_file_manager_->CacheTextLogToDisk(files_value);
AssertInfo(
tantivy_index_exist(prefix.c_str()), "index not exist: {}", prefix);
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str());
}
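
As a side note, the null-offset handling in Load boils down to a plain byte round trip. A self-contained sketch (editor's illustration; the real code goes through BinarySet and the file managers, replaced here by a byte buffer):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    // Offsets of the NULL rows, as recorded by AddNull / AddTexts.
    std::vector<size_t> null_offset = {1, 4, 7};

    // "Serialize": write the offsets out as raw bytes
    // (conceptually the "index_null_offset" entry).
    std::vector<uint8_t> blob(null_offset.size() * sizeof(size_t));
    std::memcpy(blob.data(), null_offset.data(), blob.size());

    // "Load": size the vector from the byte count, then copy back,
    // mirroring the resize + memcpy in TextMatchIndex::Load.
    std::vector<size_t> loaded(blob.size() / sizeof(size_t));
    std::memcpy(loaded.data(), blob.data(), blob.size());

    assert(loaded == null_offset);
    return 0;
}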
void
TextMatchIndex::AddText(const std::string& text, int64_t offset) {
AddTexts(1, &text, offset);
TextMatchIndex::AddText(const std::string& text,
const bool valid,
int64_t offset) {
if (!valid) {
AddNull(offset);
if (shouldTriggerCommit()) {
Commit();
}
return;
}
wrapper_->add_data(&text, 1, offset);
if (shouldTriggerCommit()) {
Commit();
}
}
void
TextMatchIndex::AddNull(int64_t offset) {
null_offset.push_back(offset);
// still need to add a placeholder for null so that offsets stay correct
std::string empty = "";
wrapper_->add_multi_data(&empty, 0, offset);
}
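
The empty placeholder in AddNull matters because the position of a document in the tantivy index has to stay aligned with the segment row offset. A toy illustration of why skipping nulls would break that alignment (editor's sketch, plain std::vector instead of TantivyIndexWrapper):

#include <cassert>
#include <optional>
#include <string>
#include <vector>

int main() {
    // Row data for a nullable text field; row 1 is NULL.
    std::vector<std::optional<std::string>> rows = {
        "football", std::nullopt, "swimming"};

    // Keep one index entry per row, inserting a placeholder for NULL,
    // so that index position == segment row offset.
    std::vector<std::string> aligned;
    for (const auto& r : rows) {
        aligned.push_back(r.value_or(""));  // placeholder keeps offsets aligned
    }
    assert(aligned[2] == "swimming");  // row 2 is still found at offset 2

    // If NULL rows were simply skipped, every later row would shift.
    std::vector<std::string> skipped;
    for (const auto& r : rows) {
        if (r.has_value()) {
            skipped.push_back(*r);
        }
    }
    assert(skipped[1] == "swimming");  // "swimming" now sits at offset 1, not 2
    return 0;
}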
void
TextMatchIndex::AddTexts(size_t n,
const std::string* texts,
const bool* valids,
int64_t offset_begin) {
if (valids != nullptr) {
for (int i = 0; i < n; i++) {
auto offset = i + offset_begin;
if (!valids[i]) {
null_offset.push_back(offset);
}
}
}
wrapper_->add_data(texts, n, offset_begin);
if (shouldTriggerCommit()) {
Commit();


@ -45,10 +45,16 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
public:
void
AddText(const std::string& text, int64_t offset);
AddText(const std::string& text, const bool valid, int64_t offset);
void
AddTexts(size_t n, const std::string* texts, int64_t offset_begin);
AddNull(int64_t offset);
void
AddTexts(size_t n,
const std::string* texts,
const bool* valids,
int64_t offset_begin);
void
Finish();


@ -1538,7 +1538,8 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
field_id.get());
auto n = column->NumRows();
for (size_t i = 0; i < n; i++) {
index->AddText(std::string(column->RawAt(i)), i);
index->AddText(
std::string(column->RawAt(i)), column->IsValid(i), i);
}
} else { // fetch raw data from index.
auto field_index_iter = scalar_indexings_.find(field_id);
@ -1557,9 +1558,9 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
for (size_t i = 0; i < n; i++) {
auto raw = impl->Reverse_Lookup(i);
if (!raw.has_value()) {
continue;
index->AddNull(i);
}
index->AddText(raw.value(), i);
index->AddText(raw.value(), true, i);
}
}
}


@ -155,7 +155,18 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
.string_data()
.data()
.end());
AddTexts(field_id, texts.data(), num_rows, reserved_offset);
FixedVector<bool> texts_valid_data(
insert_record_proto->fields_data(data_offset)
.valid_data()
.begin(),
insert_record_proto->fields_data(data_offset)
.valid_data()
.end());
AddTexts(field_id,
texts.data(),
texts_valid_data.data(),
num_rows,
reserved_offset);
}
// update average row data size
@ -880,12 +891,13 @@ SegmentGrowingImpl::CreateTextIndexes() {
void
SegmentGrowingImpl::AddTexts(milvus::FieldId field_id,
const std::string* texts,
const bool* texts_valid_data,
size_t n,
int64_t offset_begin) {
std::unique_lock lock(mutex_);
auto iter = text_indexes_.find(field_id);
AssertInfo(iter != text_indexes_.end(), "text index not found");
iter->second->AddTexts(n, texts, offset_begin);
iter->second->AddTexts(n, texts, texts_valid_data, offset_begin);
}
} // namespace milvus::segcore


@ -368,6 +368,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
void
AddTexts(FieldId field_id,
const std::string* texts,
const bool* texts_valid_data,
size_t n,
int64_t offset_begin);


@ -2035,7 +2035,8 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
field_id.get());
auto n = column->NumRows();
for (size_t i = 0; i < n; i++) {
index->AddText(std::string(column->RawAt(i)), i);
index->AddText(
std::string(column->RawAt(i)), column->IsValid(i), i);
}
} else { // fetch raw data from index.
auto field_index_iter = scalar_indexings_.find(field_id);
@ -2054,9 +2055,9 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
for (size_t i = 0; i < n; i++) {
auto raw = impl->Reverse_Lookup(i);
if (!raw.has_value()) {
continue;
index->AddNull(i);
}
index->AddText(raw.value(), i);
index->AddText(raw.value(), true, i);
}
}
}


@ -3883,8 +3883,6 @@ TEST(Expr, TestExprNOT) {
FixedVector<bool> valid_data) {
query::ExecPlanNodeVisitor visitor(*seg, MAX_TIMESTAMP);
BitsetType final;
return std::make_shared<expr::LogicalUnaryExpr>(
expr::LogicalUnaryExpr::OpType::LogicalNot, expr);
auto plan =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, expr);
auto start = std::chrono::steady_clock::now();


@ -19,6 +19,7 @@
#include "test_utils/GenExprProto.h"
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
#include "expr/ITypeExpr.h"
using namespace milvus;
using namespace milvus::query;
@ -26,7 +27,8 @@ using namespace milvus::segcore;
namespace {
SchemaPtr
GenTestSchema(std::map<std::string, std::string> params = {}) {
GenTestSchema(std::map<std::string, std::string> params = {},
bool nullable = false) {
auto schema = std::make_shared<Schema>();
{
FieldMeta f(FieldName("pk"), FieldId(100), DataType::INT64, false);
@ -38,7 +40,7 @@ GenTestSchema(std::map<std::string, std::string> params = {}) {
FieldId(101),
DataType::VARCHAR,
65536,
false,
nullable,
true,
true,
params);
@ -55,6 +57,41 @@ GenTestSchema(std::map<std::string, std::string> params = {}) {
}
return schema;
}
std::shared_ptr<milvus::plan::FilterBitsNode>
GetTextMatchExpr(SchemaPtr schema, const std::string& query) {
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr = test::GenUnaryRangeExpr(OpType::TextMatch, query);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
return parsed;
};
std::shared_ptr<milvus::plan::FilterBitsNode>
GetNotTextMatchExpr(SchemaPtr schema, const std::string& query) {
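// build NOT(TextMatch(str, query)) directly from typed expressions:
// a UnaryRangeFilterExpr(TextMatch) wrapped in LogicalUnaryExpr(LogicalNot),
// instead of going through the proto parser as GetTextMatchExpr does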
const auto& str_meta = schema->operator[](FieldName("str"));
proto::plan::GenericValue val;
val.set_string_val(query);
auto child_expr = std::make_shared<expr::UnaryRangeFilterExpr>(
milvus::expr::ColumnInfo(str_meta.get_id(), DataType::VARCHAR),
proto::plan::OpType::TextMatch,
val);
auto expr = std::make_shared<expr::LogicalUnaryExpr>(
expr::LogicalUnaryExpr::OpType::LogicalNot, child_expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, expr);
return parsed;
};
} // namespace
TEST(ParseJson, Naive) {
@ -94,13 +131,23 @@ TEST(TextMatch, Index) {
auto index = std::make_unique<Index>(
std::numeric_limits<int64_t>::max(), "milvus_tokenizer", "{}");
index->CreateReader();
index->AddText("football, basketball, pingpang", 0);
index->AddText("swimming, football", 1);
index->AddText("football, basketball, pingpang", true, 0);
index->AddText("", false, 1);
index->AddText("swimming, football", true, 2);
index->Commit();
index->Reload();
auto res = index->MatchQuery("football");
ASSERT_TRUE(res[0]);
ASSERT_TRUE(res[1]);
ASSERT_FALSE(res[1]);
ASSERT_TRUE(res[2]);
auto res1 = index->IsNull();
ASSERT_FALSE(res1[0]);
ASSERT_TRUE(res1[1]);
ASSERT_FALSE(res1[2]);
auto res2 = index->IsNotNull();
ASSERT_TRUE(res2[0]);
ASSERT_FALSE(res2[1]);
ASSERT_TRUE(res2[2]);
}
TEST(TextMatch, GrowingNaive) {
@ -130,50 +177,127 @@ TEST(TextMatch, GrowingNaive) {
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto get_text_match_expr = [&schema](const std::string& query) -> auto {
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr =
test::GenUnaryRangeExpr(OpType::TextMatch, query);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, typed_expr);
return parsed;
};
{
auto expr = get_text_match_expr("football");
auto expr = GetTextMatchExpr(schema, "football");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "football");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = get_text_match_expr("swimming");
auto expr = GetTextMatchExpr(schema, "swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = get_text_match_expr("basketball, swimming");
auto expr = GetTextMatchExpr(schema, "basketball, swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "basketball, swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
TEST(TextMatch, GrowingNaiveNullable) {
auto schema = GenTestSchema({}, true);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {
"football, basketball, pingpang", "swimming, football", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
seg->PreInsert(N);
seg->Insert(0,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
{
auto expr = GetTextMatchExpr(schema, "football");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "football");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "basketball, swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "basketball, swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
@ -198,50 +322,121 @@ TEST(TextMatch, SealedNaive) {
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
auto get_text_match_expr = [&schema](const std::string& query) -> auto {
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr =
test::GenUnaryRangeExpr(OpType::TextMatch, query);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, typed_expr);
return parsed;
};
{
auto expr = get_text_match_expr("football");
auto expr = GetTextMatchExpr(schema, "football");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "football");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = get_text_match_expr("swimming");
auto expr = GetTextMatchExpr(schema, "swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = get_text_match_expr("basketball, swimming");
auto expr = GetTextMatchExpr(schema, "basketball, swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "basketball, swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
TEST(TextMatch, SealedNaiveNullable) {
auto schema = GenTestSchema({}, true);
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {
"football, basketball, pingpang", "swimming, football", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
{
auto expr = GetTextMatchExpr(schema, "football");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "football");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "basketball, swimming");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "basketball, swimming");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
@ -275,50 +470,132 @@ TEST(TextMatch, GrowingJieBa) {
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
auto get_text_match_expr = [&schema](const std::string& query) -> auto {
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr =
test::GenUnaryRangeExpr(OpType::TextMatch, query);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, typed_expr);
return parsed;
};
{
auto expr = get_text_match_expr("青铜");
auto expr = GetTextMatchExpr(schema, "青铜");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = get_text_match_expr("黄金");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
auto expr1 = GetNotTextMatchExpr(schema, "青铜");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
}
{
auto expr = get_text_match_expr("时代");
auto expr = GetTextMatchExpr(schema, "黄金");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "黄金");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = GetTextMatchExpr(schema, "时代");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "时代");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
TEST(TextMatch, GrowingJieBaNullable) {
auto schema = GenTestSchema(
{
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
},
true);
auto seg = CreateGrowingSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
seg->PreInsert(N);
seg->Insert(0,
N,
raw_data.row_ids_.data(),
raw_data.timestamps_.data(),
raw_data.raw_);
std::this_thread::sleep_for(std::chrono::milliseconds(200) * 2);
{
auto expr = GetTextMatchExpr(schema, "青铜");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "青铜");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "黄金");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "黄金");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "时代");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "时代");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}
@ -346,49 +623,126 @@ TEST(TextMatch, SealedJieBa) {
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
auto get_text_match_expr = [&schema](const std::string& query) -> auto {
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr =
test::GenUnaryRangeExpr(OpType::TextMatch, query);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, typed_expr);
return parsed;
};
{
auto expr = get_text_match_expr("青铜");
auto expr = GetTextMatchExpr(schema, "青铜");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = get_text_match_expr("黄金");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
auto expr1 = GetNotTextMatchExpr(schema, "青铜");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
}
{
auto expr = get_text_match_expr("时代");
auto expr = GetTextMatchExpr(schema, "黄金");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "黄金");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
}
{
auto expr = GetTextMatchExpr(schema, "时代");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
auto expr1 = GetNotTextMatchExpr(schema, "时代");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
}
}
TEST(TextMatch, SealedJieBaNullable) {
auto schema = GenTestSchema(
{
{"enable_match", "true"},
{"enable_tokenizer", "true"},
{"analyzer_params", R"({"tokenizer": "jieba"})"},
},
true);
auto seg = CreateSealedSegment(schema, empty_index_meta);
std::vector<std::string> raw_str = {"青铜时代", "黄金时代", ""};
std::vector<bool> raw_str_valid = {true, true, false};
int64_t N = 3;
uint64_t seed = 19190504;
auto raw_data = DataGen(schema, N, seed);
auto str_col = raw_data.raw_->mutable_fields_data()
->at(1)
.mutable_scalars()
->mutable_string_data()
->mutable_data();
for (int64_t i = 0; i < N; i++) {
str_col->at(i) = raw_str[i];
}
auto str_col_valid =
raw_data.raw_->mutable_fields_data()->at(1).mutable_valid_data();
for (int64_t i = 0; i < N; i++) {
str_col_valid->at(i) = raw_str_valid[i];
}
SealedLoadFieldData(raw_data, *seg);
seg->CreateTextIndex(FieldId(101));
{
auto expr = GetTextMatchExpr(schema, "青铜");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "青铜");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "黄金");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "黄金");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
{
auto expr = GetTextMatchExpr(schema, "时代");
BitsetType final;
final = ExecuteQueryExpr(expr, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_TRUE(final[0]);
ASSERT_TRUE(final[1]);
ASSERT_FALSE(final[2]);
auto expr1 = GetNotTextMatchExpr(schema, "时代");
final = ExecuteQueryExpr(expr1, seg.get(), N, MAX_TIMESTAMP);
ASSERT_EQ(final.size(), N);
ASSERT_FALSE(final[0]);
ASSERT_FALSE(final[1]);
ASSERT_FALSE(final[2]);
}
}