fix: support string match for hybrid and bitmap index (#35294)

#34841

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2024-08-07 09:54:22 +08:00 committed by GitHub
parent e75efa7359
commit c19fe95154
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 228 additions and 8 deletions

View File

@ -207,12 +207,8 @@ struct UnaryIndexFuncForMatch {
!std::is_same_v<T, std::string>) {
PanicInfo(Unsupported, "regex query is only supported on string");
} else {
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
if (index->SupportRegexQuery()) {
return index->RegexQuery(regex_pattern);
return index->PatternMatch(val);
}
if (!index->HasRawData()) {
PanicInfo(Unsupported,
@ -223,6 +219,9 @@ struct UnaryIndexFuncForMatch {
// retrieve raw data to do brute force query, may be very slow.
auto cnt = index->Count();
TargetBitmap res(cnt);
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
for (int64_t i = 0; i < cnt; i++) {
auto raw = index->Reverse_Lookup(i);
res[i] = matcher(raw);

View File

@ -25,6 +25,7 @@
#include "index/ScalarIndex.h"
#include "index/Utils.h"
#include "storage/Util.h"
#include "query/Utils.h"
namespace milvus {
namespace index {
@ -806,6 +807,78 @@ BitmapIndex<T>::ShouldSkip(const T lower_value,
return true;
}
// Generic (non-string) bitmap indexes have no dedicated dataset query path;
// the request is forwarded unchanged to the ScalarIndex<T> implementation.
template <typename T>
const TargetBitmap
BitmapIndex<T>::Query(const DatasetPtr& dataset) {
    return this->ScalarIndex<T>::Query(dataset);
}
// String specialization of the dataset-driven query.
//
// Reads the operator from `dataset`. For OpType::PrefixMatch, every index key
// accepted by milvus::query::Match(key, prefix, op) contributes its rows to the
// returned bitmap, which is sized to total_num_rows_ and starts all-false.
// Any other operator is rejected through PanicInfo(OpTypeInvalid, ...).
template <>
const TargetBitmap
BitmapIndex<std::string>::Query(const DatasetPtr& dataset) {
    AssertInfo(is_built_, "index has not been built");

    auto op = dataset->Get<OpType>(OPERATOR_TYPE);
    if (op != OpType::PrefixMatch) {
        PanicInfo(OpTypeInvalid,
                  fmt::format("unsupported op_type:{} for bitmap query", op));
    }

    auto prefix = dataset->Get<std::string>(PREFIX_VALUE);
    TargetBitmap res(total_num_rows_, false);

    if (build_mode_ == BitmapIndexBuildMode::ROARING) {
        // Roaring mode: each key maps to a set of row offsets, set one by one.
        for (auto& [key, rows] : data_) {
            if (!milvus::query::Match(key, prefix, op)) {
                continue;
            }
            for (const auto& row : rows) {
                res.set(row);
            }
        }
    } else {
        // Bitset mode: each key maps to a full-width bitmap, OR-ed into res.
        for (auto& [key, bits] : bitsets_) {
            if (milvus::query::Match(key, prefix, op)) {
                res |= bits;
            }
        }
    }
    return res;
}
// Generic (non-string) bitmap indexes do not evaluate regexes themselves; the
// call is forwarded unchanged to the ScalarIndex<T> implementation.
template <typename T>
const TargetBitmap
BitmapIndex<T>::RegexQuery(const std::string& regex_pattern) {
    return this->ScalarIndex<T>::RegexQuery(regex_pattern);
}
// String specialization of the regex query.
//
// Builds a RegexMatcher from `regex_pattern` and tests it against every key
// stored in the index; the rows of each matching key are merged into the
// returned bitmap, which is sized to total_num_rows_ and starts all-false.
template <>
const TargetBitmap
BitmapIndex<std::string>::RegexQuery(const std::string& regex_pattern) {
    AssertInfo(is_built_, "index has not been built");

    RegexMatcher matcher(regex_pattern);
    TargetBitmap res(total_num_rows_, false);

    if (build_mode_ == BitmapIndexBuildMode::ROARING) {
        // Roaring mode: each key maps to a set of row offsets, set one by one.
        for (auto& [key, rows] : data_) {
            if (!matcher(key)) {
                continue;
            }
            for (const auto& row : rows) {
                res.set(row);
            }
        }
    } else {
        // Bitset mode: each key maps to a full-width bitmap, OR-ed into res.
        for (auto& [key, bits] : bitsets_) {
            if (matcher(key)) {
                res |= bits;
            }
        }
    }
    return res;
}
template class BitmapIndex<bool>;
template class BitmapIndex<int8_t>;
template class BitmapIndex<int16_t>;

View File

@ -21,6 +21,7 @@
#include <string>
#include <roaring/roaring.hh>
#include "common/RegexQuery.h"
#include "index/ScalarIndex.h"
#include "storage/FileManager.h"
#include "storage/DiskFileManagerImpl.h"
@ -113,6 +114,24 @@ class BitmapIndex : public ScalarIndex<T> {
LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) override;
// Evaluates the query described by `dataset`. The std::string specialization
// serves OpType::PrefixMatch from the bitmap keys and panics on any other
// operator; other element types defer to ScalarIndex<T>::Query.
const TargetBitmap
Query(const DatasetPtr& dataset) override;
// Converts the match `pattern` into a regex with PatternMatchTranslator and
// evaluates that regex through RegexQuery.
const TargetBitmap
PatternMatch(const std::string& pattern) override {
    PatternMatchTranslator to_regex;
    return RegexQuery(to_regex(pattern));
}
// Always true: the bitmap index advertises regex support, so callers that
// check this flag route pattern matching to the index.
bool
SupportRegexQuery() const override {
return true;
}
// Returns the rows whose value matches `regex_pattern`. The std::string
// specialization tests the regex against each key stored in the index; other
// element types defer to ScalarIndex<T>::RegexQuery.
const TargetBitmap
RegexQuery(const std::string& regex_pattern) override;
public:
int64_t
Cardinality() {

View File

@ -87,6 +87,28 @@ class HybridScalarIndex : public ScalarIndex<T> {
return internal_index_->NotIn(n, values);
}
// Forwards the dataset-described query to the wrapped internal index.
const TargetBitmap
Query(const DatasetPtr& dataset) override {
return internal_index_->Query(dataset);
}
// Converts the match `pattern` into a regex with PatternMatchTranslator and
// evaluates that regex through this class's RegexQuery.
const TargetBitmap
PatternMatch(const std::string& pattern) override {
    PatternMatchTranslator to_regex;
    return RegexQuery(to_regex(pattern));
}
// Regex support is decided by the wrapped internal index.
bool
SupportRegexQuery() const override {
return internal_index_->SupportRegexQuery();
}
// Forwards the regex `pattern` unchanged to the wrapped internal index.
const TargetBitmap
RegexQuery(const std::string& pattern) override {
return internal_index_->RegexQuery(pattern);
}
const TargetBitmap
Range(T value, OpType op) override {
return internal_index_->Range(value, op);

View File

@ -11,6 +11,7 @@
#include "tantivy-binding.h"
#include "common/Slice.h"
#include "common/RegexQuery.h"
#include "storage/LocalChunkManagerSingleton.h"
#include "index/InvertedIndexTantivy.h"
#include "log/Log.h"
@ -316,9 +317,9 @@ InvertedIndexTantivy<std::string>::Query(const DatasetPtr& dataset) {
template <typename T>
const TargetBitmap
InvertedIndexTantivy<T>::RegexQuery(const std::string& pattern) {
InvertedIndexTantivy<T>::RegexQuery(const std::string& regex_pattern) {
TargetBitmap bitset(Count());
auto array = wrapper_->regex_query(pattern);
auto array = wrapper_->regex_query(regex_pattern);
apply_hits(bitset, array, true);
return bitset;
}

View File

@ -11,6 +11,7 @@
#pragma once
#include "common/RegexQuery.h"
#include "index/Index.h"
#include "storage/FileManager.h"
#include "storage/DiskFileManagerImpl.h"
@ -146,13 +147,20 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
const TargetBitmap
Query(const DatasetPtr& dataset) override;
// Converts the match `pattern` into a regex with PatternMatchTranslator and
// evaluates that regex through RegexQuery.
const TargetBitmap
PatternMatch(const std::string& pattern) override {
    PatternMatchTranslator to_regex;
    return RegexQuery(to_regex(pattern));
}
// Always true: the inverted index advertises regex support, so callers that
// check this flag route pattern matching to the index.
bool
SupportRegexQuery() const override {
return true;
}
const TargetBitmap
RegexQuery(const std::string& pattern) override;
RegexQuery(const std::string& regex_pattern) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;

View File

@ -114,6 +114,11 @@ class ScalarIndex : public IndexBase {
virtual const TargetBitmap
Query(const DatasetPtr& dataset);
// Default pattern-match entry point: panics with Unsupported. Index types
// that can serve pattern matching override this method.
virtual const TargetBitmap
PatternMatch(const std::string& pattern) {
PanicInfo(Unsupported, "pattern match is not supported");
}
virtual int64_t
Size() = 0;

View File

@ -2699,6 +2699,99 @@ class TestQueryString(TestcaseBase):
collection_w.query(expression, output_fields=output_fields,
check_task=CheckTasks.check_query_results, check_items={exp_res: res})
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_prefixes_auto_index(self):
    """
    target: test query with prefix string expression and indexed with auto index
    method: query once with an index on the varchar field, drop that index,
            reload, and run the same query again
    expected: both queries return the same number of rows
    """
    collection_w, vectors = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    collection_w.create_index("varchar", index_name="varchar_auto_index")
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "0%"'

    # Row count while the varchar index is in place.
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len = len(result)

    # Row count after the varchar index has been dropped.
    collection_w.release()
    collection_w.drop_index(index_name="varchar_auto_index")
    collection_w.load()
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len_1 = len(result)

    assert res_len_1 == res_len
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_prefixes_bitmap(self):
    """
    target: test query with prefix string expression and indexed with bitmap
    method: 1. build a BITMAP index on the varchar field and run a prefix query
            2. drop that same index, reload, and run the query again
    expected: both queries return the same number of rows
    """
    # One name for create and drop, so the index that was built is the one removed.
    bitmap_index_name = "varchar_bitmap_index"

    collection_w, vectors = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    # Request BITMAP explicitly; with no index_params this call would be the
    # same as the one the auto-index variant of this test makes.
    collection_w.create_index("varchar", index_params={"index_type": "BITMAP"},
                              index_name=bitmap_index_name)
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "0%"'

    # Row count while the bitmap index is in place.
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len = len(result)

    # Row count after the bitmap index has been dropped.
    collection_w.release()
    collection_w.drop_index(index_name=bitmap_index_name)
    collection_w.load()
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len_1 = len(result)

    assert res_len_1 == res_len
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_match_auto_index(self):
    """
    target: test query with match string expression and indexed with auto index
    method: query once with an index on the varchar field, drop that index,
            reload, and run the same query again
    expected: both queries return the same number of rows
    """
    collection_w, vectors = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    collection_w.create_index("varchar", index_name="varchar_auto_index")
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "%0%"'

    # Row count while the varchar index is in place.
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len = len(result)

    # Row count after the varchar index has been dropped.
    collection_w.release()
    collection_w.drop_index(index_name="varchar_auto_index")
    collection_w.load()
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len_1 = len(result)

    assert res_len_1 == res_len
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_match_bitmap(self):
    """
    target: test query with match string expression and indexed with bitmap
    method: 1. build a BITMAP index on the varchar field and run an infix match query
            2. drop that same index, reload, and run the query again
    expected: both queries return the same number of rows
    """
    # One name for create and drop, so the index that was built is the one removed.
    bitmap_index_name = "varchar_bitmap_index"

    collection_w, vectors = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    # Request BITMAP explicitly; with no index_params this call would be the
    # same as the one the auto-index variant of this test makes.
    collection_w.create_index("varchar", index_params={"index_type": "BITMAP"},
                              index_name=bitmap_index_name)
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "%0%"'

    # Row count while the bitmap index is in place.
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len = len(result)

    # Row count after the bitmap index has been dropped.
    collection_w.release()
    collection_w.drop_index(index_name=bitmap_index_name)
    collection_w.load()
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len_1 = len(result)

    assert res_len_1 == res_len
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_with_invalid_prefix_expr(self):
"""