From c19fe95154b4950c695d3295906ba6154c7d2548 Mon Sep 17 00:00:00 2001 From: zhagnlu <1542303831@qq.com> Date: Wed, 7 Aug 2024 09:54:22 +0800 Subject: [PATCH] fix: support string match for hybrid and bitmap index (#35294) #34841 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/exec/expression/UnaryExpr.h | 9 +- internal/core/src/index/BitmapIndex.cpp | 73 +++++++++++++++ internal/core/src/index/BitmapIndex.h | 19 ++++ internal/core/src/index/HybridScalarIndex.h | 22 +++++ .../core/src/index/InvertedIndexTantivy.cpp | 5 +- .../core/src/index/InvertedIndexTantivy.h | 10 +- internal/core/src/index/ScalarIndex.h | 5 + tests/python_client/testcases/test_query.py | 93 +++++++++++++++++++ 8 files changed, 228 insertions(+), 8 deletions(-) diff --git a/internal/core/src/exec/expression/UnaryExpr.h b/internal/core/src/exec/expression/UnaryExpr.h index 2792cc3f93..7167abec56 100644 --- a/internal/core/src/exec/expression/UnaryExpr.h +++ b/internal/core/src/exec/expression/UnaryExpr.h @@ -207,12 +207,8 @@ struct UnaryIndexFuncForMatch { !std::is_same_v) { PanicInfo(Unsupported, "regex query is only supported on string"); } else { - PatternMatchTranslator translator; - auto regex_pattern = translator(val); - RegexMatcher matcher(regex_pattern); - if (index->SupportRegexQuery()) { - return index->RegexQuery(regex_pattern); + return index->PatternMatch(val); } if (!index->HasRawData()) { PanicInfo(Unsupported, @@ -223,6 +219,9 @@ struct UnaryIndexFuncForMatch { // retrieve raw data to do brute force query, may be very slow. 
auto cnt = index->Count(); TargetBitmap res(cnt); + PatternMatchTranslator translator; + auto regex_pattern = translator(val); + RegexMatcher matcher(regex_pattern); for (int64_t i = 0; i < cnt; i++) { auto raw = index->Reverse_Lookup(i); res[i] = matcher(raw); diff --git a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index 95e51e676e..a2576610a9 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -25,6 +25,7 @@ #include "index/ScalarIndex.h" #include "index/Utils.h" #include "storage/Util.h" +#include "query/Utils.h" namespace milvus { namespace index { @@ -806,6 +807,78 @@ BitmapIndex::ShouldSkip(const T lower_value, return true; } +template +const TargetBitmap +BitmapIndex::Query(const DatasetPtr& dataset) { + return ScalarIndex::Query(dataset); +} + +template <> +const TargetBitmap +BitmapIndex::Query(const DatasetPtr& dataset) { + AssertInfo(is_built_, "index has not been built"); + + auto op = dataset->Get(OPERATOR_TYPE); + if (op == OpType::PrefixMatch) { + auto prefix = dataset->Get(PREFIX_VALUE); + TargetBitmap res(total_num_rows_, false); + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + for (auto it = data_.begin(); it != data_.end(); ++it) { + const auto& key = it->first; + if (milvus::query::Match(key, prefix, op)) { + for (const auto& v : it->second) { + res.set(v); + } + } + } + } else { + for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) { + const auto& key = it->first; + if (milvus::query::Match(key, prefix, op)) { + res |= it->second; + } + } + } + + return res; + } else { + PanicInfo(OpTypeInvalid, + fmt::format("unsupported op_type:{} for bitmap query", op)); + } +} + +template +const TargetBitmap +BitmapIndex::RegexQuery(const std::string& regex_pattern) { + return ScalarIndex::RegexQuery(regex_pattern); +} + +template <> +const TargetBitmap +BitmapIndex::RegexQuery(const std::string& regex_pattern) { + AssertInfo(is_built_, "index has not 
been built"); + RegexMatcher matcher(regex_pattern); + TargetBitmap res(total_num_rows_, false); + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + for (auto it = data_.begin(); it != data_.end(); ++it) { + const auto& key = it->first; + if (matcher(key)) { + for (const auto& v : it->second) { + res.set(v); + } + } + } + } else { + for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) { + const auto& key = it->first; + if (matcher(key)) { + res |= it->second; + } + } + } + return res; +} + template class BitmapIndex; template class BitmapIndex; template class BitmapIndex; diff --git a/internal/core/src/index/BitmapIndex.h b/internal/core/src/index/BitmapIndex.h index 0a60181d60..a29f401f75 100644 --- a/internal/core/src/index/BitmapIndex.h +++ b/internal/core/src/index/BitmapIndex.h @@ -21,6 +21,7 @@ #include #include +#include "common/RegexQuery.h" #include "index/ScalarIndex.h" #include "storage/FileManager.h" #include "storage/DiskFileManagerImpl.h" @@ -113,6 +114,24 @@ class BitmapIndex : public ScalarIndex { LoadWithoutAssemble(const BinarySet& binary_set, const Config& config) override; + const TargetBitmap + Query(const DatasetPtr& dataset) override; + + const TargetBitmap + PatternMatch(const std::string& pattern) override { + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + return RegexQuery(regex_pattern); + } + + bool + SupportRegexQuery() const override { + return true; + } + + const TargetBitmap + RegexQuery(const std::string& regex_pattern) override; + public: int64_t Cardinality() { diff --git a/internal/core/src/index/HybridScalarIndex.h b/internal/core/src/index/HybridScalarIndex.h index 22107f2b4b..4a9c60d6bd 100644 --- a/internal/core/src/index/HybridScalarIndex.h +++ b/internal/core/src/index/HybridScalarIndex.h @@ -87,6 +87,28 @@ class HybridScalarIndex : public ScalarIndex { return internal_index_->NotIn(n, values); } + const TargetBitmap + Query(const DatasetPtr& dataset) override { + return 
internal_index_->Query(dataset); + } + + const TargetBitmap + PatternMatch(const std::string& pattern) override { + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + return RegexQuery(regex_pattern); + } + + bool + SupportRegexQuery() const override { + return internal_index_->SupportRegexQuery(); + } + + const TargetBitmap + RegexQuery(const std::string& pattern) override { + return internal_index_->RegexQuery(pattern); + } + const TargetBitmap Range(T value, OpType op) override { return internal_index_->Range(value, op); diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index ea638a2915..6de712ecaa 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -11,6 +11,7 @@ #include "tantivy-binding.h" #include "common/Slice.h" +#include "common/RegexQuery.h" #include "storage/LocalChunkManagerSingleton.h" #include "index/InvertedIndexTantivy.h" #include "log/Log.h" @@ -316,9 +317,9 @@ InvertedIndexTantivy::Query(const DatasetPtr& dataset) { template const TargetBitmap -InvertedIndexTantivy::RegexQuery(const std::string& pattern) { +InvertedIndexTantivy::RegexQuery(const std::string& regex_pattern) { TargetBitmap bitset(Count()); - auto array = wrapper_->regex_query(pattern); + auto array = wrapper_->regex_query(regex_pattern); apply_hits(bitset, array, true); return bitset; } diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h index 14a34ddb3c..98114a7914 100644 --- a/internal/core/src/index/InvertedIndexTantivy.h +++ b/internal/core/src/index/InvertedIndexTantivy.h @@ -11,6 +11,7 @@ #pragma once +#include "common/RegexQuery.h" #include "index/Index.h" #include "storage/FileManager.h" #include "storage/DiskFileManagerImpl.h" @@ -146,13 +147,20 @@ class InvertedIndexTantivy : public ScalarIndex { const TargetBitmap Query(const DatasetPtr& dataset) override; 
+ const TargetBitmap + PatternMatch(const std::string& pattern) override { + PatternMatchTranslator translator; + auto regex_pattern = translator(pattern); + return RegexQuery(regex_pattern); + } + bool SupportRegexQuery() const override { return true; } const TargetBitmap - RegexQuery(const std::string& pattern) override; + RegexQuery(const std::string& regex_pattern) override; void BuildWithFieldData(const std::vector& datas) override; diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index 023f101192..badff11383 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -114,6 +114,11 @@ class ScalarIndex : public IndexBase { virtual const TargetBitmap Query(const DatasetPtr& dataset); + virtual const TargetBitmap + PatternMatch(const std::string& pattern) { + PanicInfo(Unsupported, "pattern match is not supported"); + } + virtual int64_t Size() = 0; diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py index 866fc90a21..e8b7c50d00 100644 --- a/tests/python_client/testcases/test_query.py +++ b/tests/python_client/testcases/test_query.py @@ -2699,6 +2699,99 @@ class TestQueryString(TestcaseBase): collection_w.query(expression, output_fields=output_fields, check_task=CheckTasks.check_query_results, check_items={exp_res: res}) + @pytest.mark.tags(CaseLabel.L1) + def test_query_string_expr_with_prefixes_auto_index(self): + """ + target: test query with prefix string expression and indexed with auto index + expected: verify query successfully + """ + collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + primary_field=default_int_field_name)[0:2] + + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index("varchar", index_name="varchar_auto_index") + time.sleep(1) + collection_w.load() + expression = 'varchar like 
"0%"' + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len = len(result) + collection_w.release() + collection_w.drop_index(index_name="varchar_auto_index") + collection_w.load() + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len_1 = len(result) + assert res_len_1 == res_len + + @pytest.mark.tags(CaseLabel.L1) + def test_query_string_expr_with_prefixes_bitmap(self): + """ + target: test query with prefix string expression and indexed with bitmap + expected: verify query successfully + """ + collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + primary_field=default_int_field_name)[0:2] + + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index("varchar", index_name="bitmap_auto_index") + time.sleep(1) + collection_w.load() + expression = 'varchar like "0%"' + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len = len(result) + collection_w.release() + collection_w.drop_index(index_name="bitmap_auto_index") + collection_w.load() + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len_1 = len(result) + assert res_len_1 == res_len + + @pytest.mark.tags(CaseLabel.L1) + def test_query_string_expr_with_match_auto_index(self): + """ + target: test query with match string expression and indexed with auto index + expected: verify query successfully + """ + collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + primary_field=default_int_field_name)[0:2] + + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index("varchar", index_name="varchar_auto_index") + time.sleep(1) + collection_w.load() + expression = 'varchar like "%0%"' + result , _ = collection_w.query(expression, output_fields=['varchar']) + 
res_len = len(result) + collection_w.release() + collection_w.drop_index(index_name="varchar_auto_index") + collection_w.load() + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len_1 = len(result) + assert res_len_1 == res_len + + @pytest.mark.tags(CaseLabel.L1) + def test_query_string_expr_with_match_bitmap(self): + """ + target: test query with match string expression and indexed with bitmap + expected: verify query successfully + """ + collection_w, vectors = self.init_collection_general(prefix, insert_data=True,is_index=False, + primary_field=default_int_field_name)[0:2] + + collection_w.create_index(ct.default_float_vec_field_name, default_index_params, index_name="query_expr_pre_index") + collection_w.create_index("varchar", index_name="bitmap_auto_index") + time.sleep(1) + collection_w.load() + expression = 'varchar like "%0%"' + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len = len(result) + collection_w.release() + collection_w.drop_index(index_name="bitmap_auto_index") + collection_w.load() + result , _ = collection_w.query(expression, output_fields=['varchar']) + res_len_1 = len(result) + assert res_len_1 == res_len + + @pytest.mark.tags(CaseLabel.L1) def test_query_string_with_invalid_prefix_expr(self): """