mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 11:59:00 +08:00
fix: support string match for hybrid and bitmap index (#35294)
#34841 Signed-off-by: luzhang <luzhang@zilliz.com> Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
parent
e75efa7359
commit
c19fe95154
@ -207,12 +207,8 @@ struct UnaryIndexFuncForMatch {
|
||||
!std::is_same_v<T, std::string>) {
|
||||
PanicInfo(Unsupported, "regex query is only supported on string");
|
||||
} else {
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(val);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
|
||||
if (index->SupportRegexQuery()) {
|
||||
return index->RegexQuery(regex_pattern);
|
||||
return index->PatternMatch(val);
|
||||
}
|
||||
if (!index->HasRawData()) {
|
||||
PanicInfo(Unsupported,
|
||||
@ -223,6 +219,9 @@ struct UnaryIndexFuncForMatch {
|
||||
// retrieve raw data to do brute force query, may be very slow.
|
||||
auto cnt = index->Count();
|
||||
TargetBitmap res(cnt);
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(val);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
for (int64_t i = 0; i < cnt; i++) {
|
||||
auto raw = index->Reverse_Lookup(i);
|
||||
res[i] = matcher(raw);
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "index/ScalarIndex.h"
|
||||
#include "index/Utils.h"
|
||||
#include "storage/Util.h"
|
||||
#include "query/Utils.h"
|
||||
|
||||
namespace milvus {
|
||||
namespace index {
|
||||
@ -806,6 +807,78 @@ BitmapIndex<T>::ShouldSkip(const T lower_value,
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::Query(const DatasetPtr& dataset) {
|
||||
return ScalarIndex<T>::Query(dataset);
|
||||
}
|
||||
|
||||
template <>
|
||||
const TargetBitmap
|
||||
BitmapIndex<std::string>::Query(const DatasetPtr& dataset) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
|
||||
auto op = dataset->Get<OpType>(OPERATOR_TYPE);
|
||||
if (op == OpType::PrefixMatch) {
|
||||
auto prefix = dataset->Get<std::string>(PREFIX_VALUE);
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
for (auto it = data_.begin(); it != data_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (milvus::query::Match(key, prefix, op)) {
|
||||
for (const auto& v : it->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (milvus::query::Match(key, prefix, op)) {
|
||||
res |= it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
} else {
|
||||
PanicInfo(OpTypeInvalid,
|
||||
fmt::format("unsupported op_type:{} for bitmap query", op));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::RegexQuery(const std::string& regex_pattern) {
|
||||
return ScalarIndex<T>::RegexQuery(regex_pattern);
|
||||
}
|
||||
|
||||
template <>
|
||||
const TargetBitmap
|
||||
BitmapIndex<std::string>::RegexQuery(const std::string& regex_pattern) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
for (auto it = data_.begin(); it != data_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (matcher(key)) {
|
||||
for (const auto& v : it->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (matcher(key)) {
|
||||
res |= it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template class BitmapIndex<bool>;
|
||||
template class BitmapIndex<int8_t>;
|
||||
template class BitmapIndex<int16_t>;
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include <string>
|
||||
#include <roaring/roaring.hh>
|
||||
|
||||
#include "common/RegexQuery.h"
|
||||
#include "index/ScalarIndex.h"
|
||||
#include "storage/FileManager.h"
|
||||
#include "storage/DiskFileManagerImpl.h"
|
||||
@ -113,6 +114,24 @@ class BitmapIndex : public ScalarIndex<T> {
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
const TargetBitmap
|
||||
Query(const DatasetPtr& dataset) override;
|
||||
|
||||
// Converts `pattern` to a regex with PatternMatchTranslator and evaluates it
// through RegexQuery.
const TargetBitmap
PatternMatch(const std::string& pattern) override {
    const auto as_regex = PatternMatchTranslator{}(pattern);
    return RegexQuery(as_regex);
}
|
||||
|
||||
bool
|
||||
SupportRegexQuery() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
RegexQuery(const std::string& regex_pattern) override;
|
||||
|
||||
public:
|
||||
int64_t
|
||||
Cardinality() {
|
||||
|
@ -87,6 +87,28 @@ class HybridScalarIndex : public ScalarIndex<T> {
|
||||
return internal_index_->NotIn(n, values);
|
||||
}
|
||||
|
||||
// Forwards the dataset-driven query to the wrapped internal index.
const TargetBitmap
Query(const DatasetPtr& dataset) override {
    auto& inner = *internal_index_;
    return inner.Query(dataset);
}
|
||||
|
||||
// Converts `pattern` to a regex with PatternMatchTranslator and evaluates it
// through this class's RegexQuery.
const TargetBitmap
PatternMatch(const std::string& pattern) override {
    const auto as_regex = PatternMatchTranslator{}(pattern);
    return RegexQuery(as_regex);
}
|
||||
|
||||
bool
|
||||
SupportRegexQuery() const override {
|
||||
return internal_index_->SupportRegexQuery();
|
||||
}
|
||||
|
||||
// Forwards the regex query, unchanged, to the wrapped internal index.
const TargetBitmap
RegexQuery(const std::string& pattern) override {
    auto& inner = *internal_index_;
    return inner.RegexQuery(pattern);
}
|
||||
|
||||
const TargetBitmap
|
||||
Range(T value, OpType op) override {
|
||||
return internal_index_->Range(value, op);
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "common/Slice.h"
|
||||
#include "common/RegexQuery.h"
|
||||
#include "storage/LocalChunkManagerSingleton.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "log/Log.h"
|
||||
@ -316,9 +317,9 @@ InvertedIndexTantivy<std::string>::Query(const DatasetPtr& dataset) {
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::RegexQuery(const std::string& pattern) {
|
||||
InvertedIndexTantivy<T>::RegexQuery(const std::string& regex_pattern) {
|
||||
TargetBitmap bitset(Count());
|
||||
auto array = wrapper_->regex_query(pattern);
|
||||
auto array = wrapper_->regex_query(regex_pattern);
|
||||
apply_hits(bitset, array, true);
|
||||
return bitset;
|
||||
}
|
||||
|
@ -11,6 +11,7 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "common/RegexQuery.h"
|
||||
#include "index/Index.h"
|
||||
#include "storage/FileManager.h"
|
||||
#include "storage/DiskFileManagerImpl.h"
|
||||
@ -146,13 +147,20 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||
const TargetBitmap
|
||||
Query(const DatasetPtr& dataset) override;
|
||||
|
||||
// Converts `pattern` to a regex with PatternMatchTranslator and evaluates it
// through RegexQuery.
const TargetBitmap
PatternMatch(const std::string& pattern) override {
    const auto as_regex = PatternMatchTranslator{}(pattern);
    return RegexQuery(as_regex);
}
|
||||
|
||||
bool
|
||||
SupportRegexQuery() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Evaluates `regex_pattern` against the index; defined out of line.
// The member was declared twice (old and new parameter name); a single
// declaration using the new name is kept.
const TargetBitmap
RegexQuery(const std::string& regex_pattern) override;
|
||||
|
||||
void
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
|
||||
|
@ -114,6 +114,11 @@ class ScalarIndex : public IndexBase {
|
||||
virtual const TargetBitmap
|
||||
Query(const DatasetPtr& dataset);
|
||||
|
||||
// Base-class default: pattern matching is rejected with an Unsupported
// panic. Index types that can serve it override this method.
virtual const TargetBitmap
PatternMatch(const std::string& pattern) {
    PanicInfo(Unsupported, "pattern match is not supported");
}
|
||||
|
||||
virtual int64_t
|
||||
Size() = 0;
|
||||
|
||||
|
@ -2699,6 +2699,99 @@ class TestQueryString(TestcaseBase):
|
||||
collection_w.query(expression, output_fields=output_fields,
|
||||
check_task=CheckTasks.check_query_results, check_items={exp_res: res})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_prefixes_auto_index(self):
    """
    target: query a varchar field with a prefix LIKE expression while the field
            carries an index created without explicit index params
    expected: the row count with the index equals the row count after the
              index has been dropped
    """
    collection_w, _ = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    collection_w.create_index("varchar", index_name="varchar_auto_index")
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "0%"'
    indexed_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    indexed_count = len(indexed_rows)

    # Drop the scalar index and repeat the same query as the baseline.
    collection_w.release()
    collection_w.drop_index(index_name="varchar_auto_index")
    collection_w.load()

    baseline_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    baseline_count = len(baseline_rows)
    assert baseline_count == indexed_count
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_prefixes_bitmap(self):
    """
    target: test query with prefix string expression and indexed with bitmap
    expected: verify query successfully; the row count with the bitmap index
              equals the row count after the index has been dropped
    """
    # Use one name for both create and drop. Previously the index was created
    # as "bitmap_auto_index" but the drop targeted "varchar_bitmap_index", so
    # the drop did not target the index under test.
    varchar_index_name = "varchar_bitmap_index"

    collection_w, _ = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    # Request a BITMAP index explicitly; without index params the test did not
    # ask for the index type its name and docstring describe.
    # NOTE(review): confirm BITMAP is accepted for this varchar field on the
    # target server version.
    collection_w.create_index("varchar", {"index_type": "BITMAP"},
                              index_name=varchar_index_name)
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "0%"'
    indexed_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    indexed_count = len(indexed_rows)

    # Drop the scalar index and repeat the same query as the baseline.
    collection_w.release()
    collection_w.drop_index(index_name=varchar_index_name)
    collection_w.load()

    baseline_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    baseline_count = len(baseline_rows)
    assert baseline_count == indexed_count
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_match_auto_index(self):
    """
    target: query a varchar field with an infix LIKE expression ("%0%") while
            the field carries an index created without explicit index params
    expected: the row count with the index equals the row count after the
              index has been dropped
    """
    collection_w, _ = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    collection_w.create_index("varchar", index_name="varchar_auto_index")
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "%0%"'
    indexed_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    indexed_count = len(indexed_rows)

    # Drop the scalar index and repeat the same query as the baseline.
    collection_w.release()
    collection_w.drop_index(index_name="varchar_auto_index")
    collection_w.load()

    baseline_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    baseline_count = len(baseline_rows)
    assert baseline_count == indexed_count
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_match_bitmap(self):
    """
    target: test query with match string expression and indexed with bitmap
    expected: verify query successfully; the row count with the bitmap index
              equals the row count after the index has been dropped
    """
    # Use one name for both create and drop. Previously the index was created
    # as "bitmap_auto_index" but the drop targeted "varchar_bitmap_index", so
    # the drop did not target the index under test.
    varchar_index_name = "varchar_bitmap_index"

    collection_w, _ = self.init_collection_general(
        prefix, insert_data=True, is_index=False,
        primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    # Request a BITMAP index explicitly; without index params the test did not
    # ask for the index type its name and docstring describe.
    # NOTE(review): confirm BITMAP is accepted for this varchar field on the
    # target server version.
    collection_w.create_index("varchar", {"index_type": "BITMAP"},
                              index_name=varchar_index_name)
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "%0%"'
    indexed_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    indexed_count = len(indexed_rows)

    # Drop the scalar index and repeat the same query as the baseline.
    collection_w.release()
    collection_w.drop_index(index_name=varchar_index_name)
    collection_w.load()

    baseline_rows, _ = collection_w.query(expression, output_fields=['varchar'])
    baseline_count = len(baseline_rows)
    assert baseline_count == indexed_count
|
||||
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_query_string_with_invalid_prefix_expr(self):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user