feat: Tokenizer support build with params and clone for concurrency (#37048)
Related: https://github.com/milvus-io/milvus/issues/35853, https://github.com/milvus-io/milvus/issues/36751
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
parent 8714774305, commit d67853fa89
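For orientation, a minimal Go sketch of the API shape this commit moves to: tokenizer params travel as a raw JSON string instead of a map. The JSON value and the import path are illustrative assumptions; only NewTokenizer's string parameter and Destroy/NewTokenStream come from the diff below.

package example

import "github.com/milvus-io/milvus/internal/util/ctokenizer"

func newStandardTokenizer() error {
    // Params are now a single JSON string rather than a map[string]string.
    tk, err := ctokenizer.NewTokenizer(`{"tokenizer": "standard"}`)
    if err != nil {
        return err
    }
    defer tk.Destroy()
    _ = tk.NewTokenStream("hello milvus")
    return nil
}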
@@ -22,18 +22,9 @@ TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("tokenizer_params");
if (iter == params.end()) {
return {};
return "{}";
}
nlohmann::json j = nlohmann::json::parse(iter->second);
std::map<std::string, std::string> ret;
for (const auto& [k, v] : j.items()) {
try {
ret[k] = v.get<std::string>();
} catch (std::exception& e) {
ret[k] = v.dump();
}
}
return ret;
return iter->second;
}

bool
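The net effect is that ParseTokenizerParams now forwards the raw tokenizer_params JSON unchanged, falling back to "{}" when the type param is absent. The Go side of this commit does the equivalent lookup; a minimal sketch of that idea (the helper name here is illustrative):

// tokenizerParamsOf returns the raw JSON value of the "tokenizer_params"
// type param, or "{}" when it is absent (which selects the standard analyzer).
func tokenizerParamsOf(typeParams map[string]string) string {
    if v, ok := typeParams["tokenizer_params"]; ok {
        return v
    }
    return "{}"
}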
@@ -25,7 +25,7 @@

namespace milvus {
using TypeParams = std::map<std::string, std::string>;
using TokenizerParams = std::map<std::string, std::string>;
using TokenizerParams = std::string;

TokenizerParams
ParseTokenizerParams(const TypeParams& params);
@@ -19,10 +19,9 @@
namespace milvus::index {
constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";

TextMatchIndex::TextMatchIndex(
int64_t commit_interval_in_ms,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Text;
@@ -31,10 +30,9 @@ TextMatchIndex::TextMatchIndex(
field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
}

TextMatchIndex::TextMatchIndex(
const std::string& path,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
path_ = path;
@@ -47,10 +45,9 @@ TextMatchIndex::TextMatchIndex(
tokenizer_params);
}

TextMatchIndex::TextMatchIndex(
const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
schema_ = ctx.fieldDataMeta.field_schema;
@@ -174,9 +171,8 @@ TextMatchIndex::CreateReader() {
}

void
TextMatchIndex::RegisterTokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params) {
TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
}
@@ -22,20 +22,17 @@ using stdclock = std::chrono::high_resolution_clock;
class TextMatchIndex : public InvertedIndexTantivy<std::string> {
public:
// for growing segment.
explicit TextMatchIndex(
int64_t commit_interval_in_ms,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params);
// for sealed segment.
explicit TextMatchIndex(
const std::string& path,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params);
// for building index.
explicit TextMatchIndex(
const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params);
// for loading index
explicit TextMatchIndex(const storage::FileManagerContext& ctx);

@@ -67,9 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
CreateReader();

void
RegisterTokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);

TargetBitmap
MatchQuery(const std::string& query);
@@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set,
auto index = std::make_unique<index::TextMatchIndex>(
fileManagerContext,
"milvus_tokenizer",
field_schema.get_tokenizer_params());
field_schema.get_tokenizer_params().c_str());
index->Build(config);
auto binary =
std::make_unique<knowhere::BinarySet>(index->Upload(config));
@@ -1511,13 +1511,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
}

{
@@ -1567,7 +1567,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();

index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());

text_indexes_[field_id] = std::move(index);
}
@@ -1578,7 +1578,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex(
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
"cannot create text index on non-string type");
// todo: make this(200) configurable.
auto index = std::make_unique<index::TextMatchIndex>(
200, "milvus_tokenizer", field_meta.get_tokenizer_params());
200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str());
index->Commit();
index->CreateReader();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@@ -2014,13 +2014,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
}

{
@@ -2069,7 +2069,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();

index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());

text_indexes_[field_id] = std::move(index);
}
@@ -2080,7 +2080,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License

#include "segcore/tokenizer_c.h"
#include <memory>
#include "common/FieldMeta.h"
#include "common/protobuf_utils.h"
#include "pb/schema.pb.h"
@@ -19,10 +20,9 @@
using Map = std::map<std::string, std::string>;

CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
create_tokenizer(const char* params, CTokenizer* tokenizer) {
try {
auto mm = reinterpret_cast<Map*>(m);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
*tokenizer = impl.release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
@@ -30,6 +30,17 @@ create_tokenizer(CMap m, CTokenizer* tokenizer) {
}
}

CStatus
clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst) {
try {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(*tokenizer);
*rst = impl->Clone().release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(&e);
}
}

void
free_tokenizer(CTokenizer tokenizer) {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);
@@ -24,7 +24,10 @@ extern "C" {
typedef void* CTokenizer;

CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer);
create_tokenizer(const char* params, CTokenizer* tokenizer);

CStatus
clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);

void
free_tokenizer(CTokenizer tokenizer);
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock (generated, vendored; 6 lines changed)
@@ -1021,11 +1021,12 @@ dependencies = [

[[package]]
name = "serde_json"
version = "1.0.115"
version = "1.0.128"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@@ -1166,6 +1167,7 @@ dependencies = [
"libc",
"log",
"scopeguard",
"serde_json",
"tantivy",
"tantivy-jieba",
"zstd-sys",

@@ -15,6 +15,7 @@ env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"
serde_json = "1.0.128"

[build-dependencies]
cbindgen = "0.26.0"
@@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);

RustArray tantivy_match_query(void *ptr, const char *query);

void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params);
void tantivy_register_tokenizer(void *ptr,
const char *tokenizer_name,
const char *tokenizer_params);

void *tantivy_create_index(const char *field_name,
TantivyDataType data_type,
@@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
void *tantivy_create_text_writer(const char *field_name,
const char *path,
const char *tokenizer_name,
void *tokenizer_params,
const char *tokenizer_params,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
@@ -157,7 +159,9 @@ bool tantivy_token_stream_advance(void *token_stream);

const char *tantivy_token_stream_get_token(void *token_stream);

void *tantivy_create_tokenizer(void *tokenizer_params);
void *tantivy_create_tokenizer(const char *tokenizer_params);

void *tantivy_clone_tokenizer(void *ptr);

void tantivy_free_tokenizer(void *tokenizer);
internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs (new file, vendored; 40 lines)
@@ -0,0 +1,40 @@
use serde_json as json;

#[derive(Debug)]
pub struct TantivyError{
reason: String,
}

impl TantivyError{
fn new(reason:String) -> Self{
TantivyError{reason:reason}
}

pub fn reason(&self) -> String{
return self.reason.clone()
}
}

impl From<&str> for TantivyError{
fn from(value: &str) -> Self {
Self::new(value.to_string())
}
}

impl From<String> for TantivyError{
fn from(value: String) -> Self {
Self::new(value)
}
}

impl From<json::Error> for TantivyError{
fn from(value: json::Error) -> Self {
Self::new(value.to_string())
}
}

impl ToString for TantivyError{
fn to_string(&self) -> String {
return self.reason()
}
}
@@ -4,7 +4,7 @@ use tantivy::{
Term,
};

use crate::{index_reader::IndexReaderWrapper, tokenizer::default_tokenizer};
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};

impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer,
@@ -14,7 +14,7 @@ impl IndexReaderWrapper {
let mut tokenizer = self
.index
.tokenizer_for_field(self.field)
.unwrap_or(default_tokenizer())
.unwrap_or(standard_analyzer(vec![]))
.clone();
let mut token_stream = tokenizer.token_stream(q);
let mut terms: Vec<Term> = Vec::new();
@@ -1,8 +1,14 @@
use std::{collections::HashMap, ffi::CStr};
use std::{ffi::CStr};

use libc::{c_char, c_void};

use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer};
use crate::{
array::RustArray,
string_c::c_str_to_str,
index_reader::IndexReaderWrapper,
tokenizer::create_tokenizer,
log::init_log,
};

#[no_mangle]
pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
@@ -18,23 +24,22 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
pub extern "C" fn tantivy_register_tokenizer(
ptr: *mut c_void,
tokenizer_name: *const c_char,
tokenizer_params: *mut c_void,
tokenizer_params: *const c_char,
) {
init_log();
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => unsafe {
Ok(text_analyzer) => unsafe {
(*real).register_tokenizer(
String::from(tokenizer_name_str.to_str().unwrap()),
text_analyzer,
);
},
None => {
panic!("unsupported tokenizer");
}
Err(err) => {
panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
},
}
}
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::ffi::c_char;
use std::ffi::c_void;
use std::ffi::CStr;
@@ -6,26 +5,27 @@ use std::ffi::CStr;
use crate::index_writer::IndexWriterWrapper;
use crate::tokenizer::create_tokenizer;
use crate::util::create_binding;
use crate::string_c::c_str_to_str;
use crate::log::init_log;

#[no_mangle]
pub extern "C" fn tantivy_create_text_writer(
field_name: *const c_char,
path: *const c_char,
tokenizer_name: *const c_char,
tokenizer_params: *mut c_void,
tokenizer_params: *const c_char,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> *mut c_void {
init_log();
let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => {
Ok(text_analyzer) => {
let wrapper = IndexWriterWrapper::create_text_writer(
String::from(field_name_str),
String::from(path_str),
@@ -37,8 +37,9 @@ pub extern "C" fn tantivy_create_text_writer(
);
create_binding(wrapper)
}
None => {
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
}
},
}
}
@@ -15,8 +15,10 @@ mod log;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod tokenizer_filter;
mod tokenizer_c;
mod util;
mod error;
mod util_c;
mod vec_collector;
@@ -1,54 +1,254 @@
use lazy_static::lazy_static;
use log::{info, warn};
use log::warn;
use std::collections::HashMap;
use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::log::init_log;
use tantivy::tokenizer::*;
use serde_json as json;

lazy_static! {
static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
use crate::tokenizer_filter::*;
use crate::error::TantivyError;
use crate::util::*;

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(40));

if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
}

builder.build()
}

pub(crate) fn default_tokenizer() -> TextAnalyzer {
DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
fn standard_builder() -> TextAnalyzerBuilder{
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}

fn jieba_tokenizer() -> TextAnalyzer {
tantivy_jieba::JiebaTokenizer {}.into()
fn whitespace_builder()-> TextAnalyzerBuilder{
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
init_log();

match params.get("tokenizer") {
Some(tokenizer_name) => match tokenizer_name.as_str() {
"default" => {
Some(default_tokenizer())
}
"jieba" => {
Some(jieba_tokenizer())
}
s => {
warn!("unsupported tokenizer: {}", s);
None
}
},
None => {
Some(default_tokenizer())
fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
match name.as_str() {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
other => {
warn!("unsupported tokenizer: {}", other);
Err(format!("unsupported tokenizer: {}", other).into())
}
}
}

struct AnalyzerBuilder<'a>{
// builder: TextAnalyzerBuilder
filters:HashMap<String, SystemFilter>,
params:&'a json::Map<String, json::Value>
}

impl AnalyzerBuilder<'_>{
fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder{
AnalyzerBuilder{
filters: HashMap::new(),
params:params,
}
}

fn get_tokenizer_name(&self) -> Result<String, TantivyError>{
let tokenizer=self.params.get("tokenizer");
if tokenizer.is_none(){
return Ok("standard".to_string());
}
if !tokenizer.unwrap().is_string(){
return Err(format!("tokenizer name should be string").into());
}

Ok(tokenizer.unwrap().as_str().unwrap().to_string())
}

fn add_custom_filter(&mut self, name: &String, params: &json::Map<String, json::Value>) -> Result<(),TantivyError>{
match SystemFilter::try_from(params){
Ok(filter) => {
self.filters.insert(name.to_string(), filter);
Ok(())
},
Err(e) => {Err(e)},
}
}

fn add_custom_filters(&mut self, params:&json::Map<String, json::Value>) -> Result<(),TantivyError>{
for (name, value) in params{
if !value.is_object(){
continue;
}
self.add_custom_filter(name, value.as_object().unwrap())?;
}
Ok(())
}

fn build_filter(&mut self,mut builder: TextAnalyzerBuilder, params: &json::Value) -> Result<TextAnalyzerBuilder, TantivyError>{
if !params.is_array(){
return Err("filter params should be array".into());
}

let filters = params.as_array().unwrap();
for filter in filters{
if filter.is_string(){
let filter_name = filter.as_str().unwrap();
let costum = self.filters.remove(filter_name);
if !costum.is_none(){
builder = costum.unwrap().transform(builder);
continue;
}

// check if filter was system filter
let system = SystemFilter::from(filter_name);
match system {
SystemFilter::Invalid => {
return Err(format!("build analyzer failed, filter not found :{}", filter_name).into())
}
other => {
builder = other.transform(builder);
},
}
}else if filter.is_object(){
let filter=SystemFilter::try_from(filter.as_object().unwrap())?;
builder = filter.transform(builder);
}
};
Ok(builder)
}

fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder, TantivyError>{
for (key, value) in self.params{
match key.as_str(){
"tokenizer" => {},
"filter" => {
// build with filter if filter param exist
builder=self.build_filter(builder, value)?;
},
"max_token_length" => {
if !value.is_u64(){
return Err("max token length should be int type".into());
}
builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
}
other => return Err(format!("unknown analyzer option key: {}", other).into()),
}
}
Ok(builder)
}

fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
match type_{
"standard" => {
let value = self.params.get("stop_words");
match value{
Some(value)=>{
let str_list = get_string_list(value, "filter stop_words")?;
Ok(standard_analyzer(str_list))
}
None => Ok(standard_analyzer(vec![]))
}
},
other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
}
}

fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
// build base build-in analyzer
match self.params.get("type"){
Some(type_) =>{
if !type_.is_string(){
return Err(format!("analyzer type shoud be string").into())
}
return self.build_template(type_.as_str().unwrap());
},
None => {}
};

//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;

// jieba analyzer can't add filter.
if tokenizer_name == "jieba"{
return Ok(tantivy_jieba::JiebaTokenizer{}.into());
}

let mut builder=get_builder_by_name(&tokenizer_name)?;

// build with option
builder = self.build_option(builder)?;
Ok(builder.build())
}
}

pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer, TantivyError> {
match json::from_str::<json::Value>(&params){
Ok(value) =>{
if value.is_null(){
return Ok(standard_analyzer(vec![]));
}
if !value.is_object(){
return Err("tokenizer params should be a json map".into());
}
let json_params = value.as_object().unwrap();

// create builder
let analyzer_params=json_params.get("analyzer");
if analyzer_params.is_none(){
return Ok(standard_analyzer(vec![]));
}
if !analyzer_params.unwrap().is_object(){
return Err("analyzer params should be a json map".into());
}
let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap());

// build custom filter
let filter_params=json_params.get("filter");
if !filter_params.is_none() && filter_params.unwrap().is_object(){
builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
}

// build analyzer
builder.build()
},
Err(err) => Err(err.into()),
}
}

pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
if params.len()==0{
return Ok(standard_analyzer(vec![]));
}
create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}

#[cfg(test)]
mod tests {
use std::collections::HashMap;
use crate::tokenizer::create_tokenizer;

#[test]
fn test_create_tokenizer() {
let mut params : HashMap<String, String> = HashMap::new();
params.insert("tokenizer".parse().unwrap(), "jieba".parse().unwrap());
let params = r#"{"tokenizer": "standard"}"#;

let tokenizer = create_tokenizer(&params);
assert!(tokenizer.is_some());
let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
}

#[test]
fn test_jieba_tokenizer() {
let params = r#"{"tokenizer": "jieba"}"#;

let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
let mut bining = tokenizer.unwrap();

let mut stream = bining.token_stream("系统安全");
while stream.advance(){
let token = stream.token();
let text = token.text.clone();
print!("test token :{}\n", text.as_str())
}
}
}
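Based on the keys handled by AnalyzerBuilder above, the tokenizer_params documents accepted by the new create_tokenizer appear to look roughly like the following. The Go constants are purely illustrative; the authoritative validation is the Rust code above.

package example

// Illustrative tokenizer_params payloads, passed down to the Rust
// create_tokenizer as raw JSON strings.
const (
    // Pick a built-in tokenizer; "standard" is also used when params are empty.
    standardParams = `{"tokenizer": "standard"}`
    jiebaParams    = `{"tokenizer": "jieba"}`

    // Built-in analyzer template with stop words.
    templateParams = `{"type": "standard", "stop_words": ["a", "an", "the"]}`

    // Custom analyzer: tokenizer plus a filter chain and a length option.
    customParams = `{
        "tokenizer": "whitespace",
        "filter": ["lowercase", {"type": "stemmer", "language": "english"}],
        "max_token_length": 40
    }`
)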
@@ -1,25 +1,34 @@
use std::collections::HashMap;

use libc::c_void;
use libc::{c_void,c_char};
use tantivy::tokenizer::TextAnalyzer;

use crate::{
string_c::c_str_to_str,
tokenizer::create_tokenizer,
util::{create_binding, free_binding},
log::init_log,
};

#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
init_log();
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => create_binding(text_analyzer),
None => std::ptr::null_mut(),
Ok(text_analyzer) => create_binding(text_analyzer),
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
},
}
}

#[no_mangle]
pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void {
let analyzer=ptr as *mut TextAnalyzer;
let clone = unsafe {(*analyzer).clone()};
create_binding(clone)
}

#[no_mangle]
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
free_binding::<TextAnalyzer>(tokenizer);
internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs (new file, vendored; 154 lines)
@@ -0,0 +1,154 @@
use tantivy::tokenizer::*;
use serde_json as json;

use crate::error::TantivyError;
use crate::util::*;

pub(crate) enum SystemFilter{
Invalid,
LowerCase(LowerCaser),
AsciiFolding(AsciiFoldingFilter),
AlphaNumOnly(AlphaNumOnlyFilter),
Length(RemoveLongFilter),
Stop(StopWordFilter),
Decompounder(SplitCompoundWords),
Stemmer(Stemmer)
}

impl SystemFilter{
pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder{
match self{
Self::LowerCase(filter) => builder.filter(filter).dynamic(),
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::Length(filter) => builder.filter(filter).dynamic(),
Self::Stop(filter) => builder.filter(filter).dynamic(),
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
Self::Stemmer(filter) => builder.filter(filter).dynamic(),
Self::Invalid => builder,
}
}
}

// create length filter from params
// {
//     "type": "length",
//     "max": 10, // length
// }
// TODO support min length
fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter, TantivyError>{
let limit_str = params.get("max");
if limit_str.is_none() || !limit_str.unwrap().is_u64(){
return Err("lenth max param was none or not uint".into())
}
let limit = limit_str.unwrap().as_u64().unwrap() as usize;
Ok(SystemFilter::Length(RemoveLongFilter::limit(limit)))
}

fn get_stop_words_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("stop_words");
if value.is_none(){
return Err("stop filter stop_words can't be empty".into());
}
let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
Ok(SystemFilter::Stop(StopWordFilter::remove(str_list)))
}

fn get_decompounder_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("word_list");
if value.is_none() || !value.unwrap().is_array(){
return Err("decompounder word list should be array".into())
}

let stop_words = value.unwrap().as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err("decompounder word list item should be string".into())
}
};

match SplitCompoundWords::from_dictionary(str_list){
Ok(f) => Ok(SystemFilter::Decompounder(f)),
Err(e) => Err(format!("create decompounder failed: {}", e.to_string()).into())
}
}

fn get_stemmer_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("language");
if value.is_none() || !value.unwrap().is_string(){
return Err("stemmer language field should be string".into())
}

match value.unwrap().as_str().unwrap().into_language(){
Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
Err(e) => Err(format!("create stemmer failed : {}", e.to_string()).into()),
}
}

trait LanguageParser {
type Error;
fn into_language(self) -> Result<Language, Self::Error>;
}

impl LanguageParser for &str {
type Error = TantivyError;
fn into_language(self) -> Result<Language, Self::Error> {
match self.to_lowercase().as_str() {
"arabig" => Ok(Language::Arabic),
"danish" => Ok(Language::Danish),
"dutch" => Ok(Language::Dutch),
"english" => Ok(Language::English),
"finnish" => Ok(Language::Finnish),
"french" => Ok(Language::French),
"german" => Ok(Language::German),
"greek" => Ok(Language::Greek),
"hungarian" => Ok(Language::Hungarian),
"italian" => Ok(Language::Italian),
"norwegian" => Ok(Language::Norwegian),
"portuguese" => Ok(Language::Portuguese),
"romanian" => Ok(Language::Romanian),
"russian" => Ok(Language::Russian),
"spanish" => Ok(Language::Spanish),
"swedish" => Ok(Language::Swedish),
"tamil" => Ok(Language::Tamil),
"turkish" => Ok(Language::Turkish),
other => Err(format!("unsupport language: {}", other).into()),
}
}
}

impl From<&str> for SystemFilter{
fn from(value: &str) -> Self {
match value{
"lowercase" => Self::LowerCase(LowerCaser),
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
_ => Self::Invalid,
}
}
}

impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
type Error = TantivyError;

fn try_from(params: &json::Map<String, json::Value>) -> Result<Self, Self::Error> {
match params.get(&"type".to_string()){
Some(value) =>{
if !value.is_string(){
return Err("filter type should be string".into());
};

match value.as_str().unwrap(){
"length" => get_length_filter(params),
"stop" => get_stop_words_filter(params),
"decompounder" => get_decompounder_filter(params),
"stemmer" => get_stemmer_filter(params),
other=> Err(format!("unsupport filter type: {}", other).into()),
}
}
None => Err("no type field in filter params".into()),
}
}
}
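For reference, the filter objects these constructors parse look roughly as follows; field names are taken from the code above, and the Go constants are shown only as illustrations of the JSON shape.

package example

const (
    lengthFilter       = `{"type": "length", "max": 10}`
    stopFilter         = `{"type": "stop", "stop_words": ["of", "to"]}`
    decompounderFilter = `{"type": "decompounder", "word_list": ["foot", "ball"]}`
    stemmerFilter      = `{"type": "stemmer", "language": "english"}`
)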
@@ -1,5 +1,7 @@
use std::ffi::c_void;
use std::ops::Bound;
use serde_json as json;
use crate::error::TantivyError;

use tantivy::{directory::MmapDirectory, Index};

@@ -28,3 +30,19 @@ pub fn free_binding<T>(ptr: *mut c_void) {
drop(Box::from_raw(real));
}
}

pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>, TantivyError>{
if !value.is_array(){
return Err(format!("{} should be array", label).into())
}

let stop_words = value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err(format!("{} list item should be string", label).into())
}
};
Ok(str_list)
}
@@ -14,7 +14,7 @@ namespace milvus::tantivy {
using Map = std::map<std::string, std::string>;

static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
static Map DEFAULT_TOKENIZER_PARAMS = {};
static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
@@ -101,17 +101,14 @@ struct TantivyIndexWrapper {
bool in_ram,
const char* path,
const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
const std::map<std::string, std::string>&
tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
RustHashMap m;
m.from(tokenizer_params);
writer_ = tantivy_create_text_writer(field_name,
path,
tokenizer_name,
m.get_pointer(),
tokenizer_params,
num_threads,
overall_memory_budget_in_bytes,
in_ram);
@@ -134,14 +131,11 @@ struct TantivyIndexWrapper {
}

void
register_tokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params) {
RustHashMap m;
m.from(tokenizer_params);
register_tokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
if (reader_ != nullptr) {
tantivy_register_tokenizer(
reader_, tokenizer_name, m.get_pointer());
reader_, tokenizer_name, tokenizer_params);
}
}
internal/core/thirdparty/tantivy/tokenizer.h (vendored; 16 lines changed)
@@ -11,15 +11,17 @@ struct Tokenizer {
public:
NO_COPY_OR_ASSIGN(Tokenizer);

explicit Tokenizer(const std::map<std::string, std::string>& params) {
RustHashMap m;
m.from(params);
ptr_ = tantivy_create_tokenizer(m.get_pointer());
explicit Tokenizer(std::string&& params) {
auto shared_params = std::make_shared<std::string>(std::move(params));
ptr_ = tantivy_create_tokenizer(shared_params->c_str());
if (ptr_ == nullptr) {
throw std::invalid_argument("invalid tokenizer parameters");
}
}

explicit Tokenizer(void* _ptr) : ptr_(_ptr) {
}

~Tokenizer() {
if (ptr_ != nullptr) {
tantivy_free_tokenizer(ptr_);
@@ -34,6 +36,12 @@ struct Tokenizer {
return std::make_unique<TokenStream>(token_stream, shared_text);
}

std::unique_ptr<Tokenizer>
Clone() {
auto newptr = tantivy_clone_tokenizer(ptr_);
return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
}

// CreateTokenStreamCopyText will copy the text and then create token stream based on the text.
std::unique_ptr<TokenStream>
CreateTokenStreamCopyText(const std::string& text) {
@@ -47,12 +47,10 @@ set_cmap(CMap m, const std::string& key, const std::string& value) {
}

TEST(CTokenizer, Default) {
auto m = create_cmap();
set_cmap(m, "tokenizer", "default");

auto tokenizer_params = R"({"tokenizer": "standard"})";
CTokenizer tokenizer;
{
auto status = create_tokenizer(m, &tokenizer);
auto status = create_tokenizer(tokenizer_params, &tokenizer);
ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
}

@@ -71,5 +69,4 @@ TEST(CTokenizer, Default) {

free_token_stream(token_stream);
free_tokenizer(tokenizer);
free_cmap(m);
}
@@ -10,9 +10,9 @@
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <gtest/gtest.h>
#include <string>

#include "common/Schema.h"
#include "segcore/segment_c.h"
#include "segcore/SegmentGrowing.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
@@ -80,23 +80,19 @@ TEST(ParseJson, Naive) {
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(0, p.size());
ASSERT_EQ("{}", std::string(p));
}

TEST(ParseTokenizerParams, Default) {
TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
TypeParams params{{"tokenizer_params", R"({"tokenizer": "standard"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(1, p.size());
auto iter = p.find("tokenizer");
ASSERT_NE(p.end(), iter);
ASSERT_EQ("default", iter->second);
ASSERT_EQ(params.at("tokenizer_params"), p);
}

TEST(TextMatch, Index) {
using Index = index::TextMatchIndex;
auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
std::map<std::string, std::string>{});
auto index = std::make_unique<Index>(
std::numeric_limits<int64_t>::max(), "milvus_tokenizer", "{}");
index->CreateReader();
index->AddText("football, basketball, pingpang", 0);
index->AddText("swimming, football", 1);
@@ -297,7 +297,6 @@ func (t *queryTask) CanSkipAllocTimestamp() bool {
}
consistencyLevel = collectionInfo.consistencyLevel
}

return consistencyLevel != commonpb.ConsistencyLevel_Strong
}

@@ -111,7 +111,6 @@ func (t *searchTask) CanSkipAllocTimestamp() bool {
}
consistencyLevel = collectionInfo.consistencyLevel
}

return consistencyLevel != commonpb.ConsistencyLevel_Strong
}
@@ -33,6 +33,15 @@ func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
return NewCTokenStream(ptr)
}

func (impl *CTokenizer) Clone() (tokenizerapi.Tokenizer, error) {
var newptr C.CTokenizer
status := C.clone_tokenizer(&impl.ptr, &newptr)
if err := HandleCStatus(&status, "failed to clone tokenizer"); err != nil {
return nil, err
}
return NewCTokenizer(newptr), nil
}

func (impl *CTokenizer) Destroy() {
C.free_tokenizer(impl.ptr)
}
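Since clone_tokenizer releases a newly built C-side tokenizer, each Go clone appears to own its own native object and needs its own Destroy. A minimal sketch of the intended usage (names illustrative):

func cloneExample() error {
    base, err := ctokenizer.NewTokenizer(`{"tokenizer": "standard"}`)
    if err != nil {
        return err
    }
    defer base.Destroy()

    worker, err := base.Clone()
    if err != nil {
        return err
    }
    defer worker.Destroy() // the clone owns a separate C-side tokenizer
    _ = worker.NewTokenStream("some text")
    return nil
}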
@@ -9,16 +9,17 @@ package ctokenizer
import "C"

import (
"unsafe"

"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)

func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
mm := NewCMap()
defer mm.Destroy()
mm.From(m)
func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) {
paramPtr := C.CString(param)
defer C.free(unsafe.Pointer(paramPtr))

var ptr C.CTokenizer
status := C.create_tokenizer(mm.GetPointer(), &ptr)
status := C.create_tokenizer(paramPtr, &ptr)
if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
return nil, err
}
@@ -10,7 +10,7 @@ import (
func TestTokenizer(t *testing.T) {
// default tokenizer.
{
m := make(map[string]string)
m := "{\"tokenizer\": \"standard\"}"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()
@@ -24,8 +24,7 @@ func TestTokenizer(t *testing.T) {

// jieba tokenizer.
{
m := make(map[string]string)
m["tokenizer"] = "jieba"
m := "{\"tokenizer\": \"jieba\"}"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()
@@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "default"}`},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
},
},
{
@@ -41,7 +41,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "jieba"}`},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
},
},
}
@@ -40,6 +40,15 @@ type BM25FunctionRunner struct {
concurrency int
}

func getTokenizerParams(field *schemapb.FieldSchema) string {
for _, param := range field.GetTypeParams() {
if param.Key == "tokenizer_params" {
return param.Value
}
}
return "{}"
}

func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.FunctionSchema) (*BM25FunctionRunner, error) {
if len(schema.GetOutputFieldIds()) != 1 {
return nil, fmt.Errorf("bm25 function should only have one output field, but now %d", len(schema.GetOutputFieldIds()))
@@ -49,17 +58,22 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
schema: schema,
concurrency: 8,
}
var params string
for _, field := range coll.GetFields() {
if field.GetFieldID() == schema.GetOutputFieldIds()[0] {
runner.outputField = field
break
}

if field.GetFieldID() == schema.GetInputFieldIds()[0] {
params = getTokenizerParams(field)
}
}

if runner.outputField == nil {
return nil, fmt.Errorf("no output field")
}
tokenizer, err := ctokenizer.NewTokenizer(map[string]string{})
tokenizer, err := ctokenizer.NewTokenizer(params)
if err != nil {
return nil, err
}
@@ -69,8 +83,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
}

func (v *BM25FunctionRunner) run(data []string, dst []map[uint32]float32) error {
// TODO AOIASD Support single Tokenizer concurrency
tokenizer, err := ctokenizer.NewTokenizer(map[string]string{})
tokenizer, err := v.tokenizer.Clone()
if err != nil {
return err
}
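Putting the pieces together, the runner now derives the params from the BM25 input field, builds one tokenizer up front, and clones it for each run. As a rough sketch of how a batch can then be fanned out across goroutines, each with its own clone (a hypothetical helper, not the actual run() body; imports assumed: sync and the tokenizerapi package shown below):

func tokenizeBatch(base tokenizerapi.Tokenizer, texts []string) {
    var wg sync.WaitGroup
    for _, text := range texts {
        wg.Add(1)
        go func(t string) {
            defer wg.Done()
            tk, err := base.Clone() // one clone per goroutine, no shared mutable state
            if err != nil {
                return
            }
            defer tk.Destroy()
            _ = tk.NewTokenStream(t)
        }(text)
    }
    wg.Wait()
}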
@@ -3,5 +3,6 @@ package tokenizerapi

//go:generate mockery --name=Tokenizer --with-expecter
type Tokenizer interface {
NewTokenStream(text string) TokenStream
Clone() (Tokenizer, error)
Destroy()
}
@@ -778,7 +778,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.
def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
                                       auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
    tokenizer_params = {
        "tokenizer": "default",
        "tokenizer": "standard",
    }
    fields = [
        gen_int64_field(),
@@ -33,7 +33,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
    """

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_collection_for_full_text_search(self, tokenizer):
        """
        target: test create collection with full text search
@@ -97,7 +97,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
        assert len(res["functions"]) == len(text_fields)

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer):
        """
        target: test create collection with full text search twice with same schema
@@ -175,7 +175,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("tokenizer", ["unsupported"])
    @pytest.mark.xfail(reason="")
    @pytest.mark.skip(reason="check not implement may cause panic")
    def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer):
        """
        target: test create collection with full text search with unsupported tokenizer
@@ -249,7 +249,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
        expected: create collection failed
        """
        tokenizer_params = {
            "tokenizer": "default",
            "tokenizer": "standard",
        }
        dim = 128
        fields = [
@@ -327,7 +327,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
        expected: create collection failed
        """
        tokenizer_params = {
            "tokenizer": "default",
            "tokenizer": "standard",
        }
        dim = 128
        fields = [
@@ -397,7 +397,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nullable", [False, True])
    @pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullable):
        """
        target: test insert data with full text search
@@ -542,7 +542,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
    @pytest.mark.parametrize("enable_dynamic_field", [True])
    @pytest.mark.parametrize("nullable", [False])
    @pytest.mark.parametrize("text_lang", ["en"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field):
        """
        target: test insert data with full text search and enable dynamic field
@@ -692,7 +692,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nullable", [True])
    @pytest.mark.parametrize("text_lang", ["en"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable):
        """
        target: test insert data for full text search with dataframe
@@ -831,7 +831,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
        assert len(data) == count

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
        """
        target: test insert data with full text search with part of empty string
@@ -990,7 +990,7 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nullable", [True])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable):
        """
        target: test insert data with full text search with non varchar data
@@ -1089,7 +1089,7 @@ class TestUpsertWithFullTextSearch(TestcaseBase):

    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nullable", [False, True])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021")
    def test_upsert_for_full_text_search(self, tokenizer, nullable):
        """
@@ -1260,7 +1260,7 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nullable", [False])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021")
    def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable):
        """
@@ -1402,7 +1402,7 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
    """

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_delete_for_full_text_search(self, tokenizer):
        """
        target: test delete data for full text search
@@ -1564,7 +1564,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
    @pytest.mark.parametrize("b", [0.1])
    @pytest.mark.parametrize("k", [1.2])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_index_for_full_text_search_default(
            self, tokenizer, index_type, k, b
    ):
@@ -1688,7 +1688,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
    @pytest.mark.parametrize("b", [0.5])
    @pytest.mark.parametrize("k", [1.5])
    @pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_full_text_search_with_invalid_index_type(
            self, tokenizer, index_type, k, b
    ):
@@ -1796,7 +1796,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
    @pytest.mark.parametrize("k", [1.5])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_full_text_search_index_with_invalid_metric_type(
            self, tokenizer, index_type, metric_type, k, b
    ):
@@ -1903,7 +1903,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
    @pytest.mark.parametrize("b", [0.5])
    @pytest.mark.parametrize("k", [1.5])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
            self, tokenizer, index_type, k, b
    ):
@@ -2000,7 +2000,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
    @pytest.mark.parametrize("b", [-1])
    @pytest.mark.parametrize("k", [-1])
    @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
    @pytest.mark.parametrize("tokenizer", ["default"])
    @pytest.mark.parametrize("tokenizer", ["standard"])
    def test_create_full_text_search_with_invalid_bm25_params(
            self, tokenizer, index_type, k, b
    ):
@ -2121,7 +2121,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
@pytest.mark.parametrize("enable_inverted_index", [True])
|
||||
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
|
||||
@pytest.mark.parametrize("expr", ["text_match", "id_range"])
|
||||
@pytest.mark.parametrize("tokenizer", ["default"])
|
||||
@pytest.mark.parametrize("tokenizer", ["standard"])
|
||||
@pytest.mark.parametrize("offset", [10, 0])
|
||||
def test_full_text_search_default(
|
||||
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
|
||||
@ -2317,7 +2317,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
@pytest.mark.parametrize("expr", ["text_match"])
|
||||
@pytest.mark.parametrize("offset", [10])
|
||||
@pytest.mark.parametrize("tokenizer", ["jieba"])
|
||||
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
|
||||
def test_full_text_search_with_jieba_tokenizer(
|
||||
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
|
||||
):
|
||||
@ -2329,7 +2328,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
expected: full text search successfully and result is correct
|
||||
"""
|
||||
tokenizer_params = {
|
||||
"tokenizer": tokenizer,
|
||||
"tokenizer": tokenizer,
|
||||
}
|
||||
dim = 128
|
||||
fields = [
|
||||
@ -2511,7 +2510,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_range_search(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2676,7 +2675,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_search_iterator(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2829,7 +2828,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("invalid_search_data", ["empty_text"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022")
def test_search_for_full_text_search_with_empty_string_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
@ -2959,7 +2958,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_for_full_text_search_with_invalid_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
):
@ -3106,7 +3105,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_hybrid_search_with_full_text_search(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type
):

@ -4441,7 +4441,7 @@ class TestQueryTextMatch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_query_text_match_en_normal(
self, tokenizer, enable_inverted_index, enable_partition_key
):
@ -4724,24 +4724,16 @@ class TestQueryTextMatch(TestcaseBase):
expected: get the correct token, text match successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "standard",
"alpha_num_only": True,
"ascii_folding": True,
"lower_case": True,
"max_token_length": 40,
"split_compound_words": [
"dampf",
"schiff",
"fahrt",
"brot",
"backen",
"automat",
],
"stemmer": "English",
"stop": {
"language": "English",
"words": ["an", "the"],
},
"tokenizer": "standard",
# "lowercase", "asciifolding", "alphanumonly" are built-in (system) filters
"filter":["lowercase", "asciifolding", "alphanumonly",
{
"type": "stop",
"stop_words": ["in", "of"],
}, {
"type": "stemmer",
"language": "english",
}],
}
dim = 128
fields = [
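Note: the hunk above drops the old flat tokenizer options (alpha_num_only, ascii_folding, lower_case, stemmer, stop) in favor of the shape introduced by this change: a base tokenizer plus an ordered "filter" list, where lowercase/asciifolding/alphanumonly are built-in filters and stop/stemmer are given as filter objects. A minimal Python sketch of the new-style dict and of attaching it to a VARCHAR field follows; the FieldSchema keyword names (enable_tokenizer, enable_match, tokenizer_params) are assumptions about how the test helpers pass these through, not something stated in this diff.

from pymilvus import DataType, FieldSchema

# New-style params, mirroring the hunk above: one tokenizer plus a filter chain.
tokenizer_params = {
    "tokenizer": "standard",
    "filter": [
        "lowercase",       # built-in filters are referenced by name
        "asciifolding",
        "alphanumonly",
        {"type": "stop", "stop_words": ["in", "of"]},
        {"type": "stemmer", "language": "english"},
    ],
}

# Assumed usage: the params travel with the VARCHAR field as pass-through
# type params; the kwarg names below are hypothetical.
text_field = FieldSchema(
    name="text",
    dtype=DataType.VARCHAR,
    max_length=65535,
    enable_tokenizer=True,   # assumed flag enabling analysis on this field
    enable_match=True,       # assumed flag enabling text-match filtering
    tokenizer_params=tokenizer_params,
)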
@ -4852,7 +4844,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: query successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -4966,7 +4958,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: query successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5109,7 +5101,7 @@ class TestQueryTextMatch(TestcaseBase):

# 1. initialize with data
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5254,7 +5246,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
fake_en = Faker("en_US")
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
default_fields = [
@ -5481,7 +5473,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128

@ -13290,7 +13290,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_with_text_match_filter_normal_en(
self, tokenizer, enable_inverted_index, enable_partition_key
):

@ -1881,7 +1881,7 @@ class TestSearchVector(TestBase):
assert len(res) == limit


@pytest.mark.parametrize("tokenizer", ["jieba", "default"])
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
def test_search_vector_with_text_match_filter(self, tokenizer):
"""
Query a vector with a simple payload
@ -2718,7 +2718,7 @@ class TestQueryVector(TestBase):
if "like" in filter_expr:
assert name.startswith(prefix)

@pytest.mark.parametrize("tokenizer", ["jieba", "default"])
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
def test_query_vector_with_text_match_filter(self, tokenizer):
"""
Query a vector with a simple payload
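Note: the last hunks rename the tokenizer used by the text-match filter tests from "default" to "standard". A minimal sketch of such a filtered query is shown below; the collection and field names, the local URI, and the lowercase text_match expression spelling are illustrative assumptions, not guaranteed by this diff.

from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")  # assumed local deployment

# Assumed collection/field names; the filter applies a text-match expression
# to a VARCHAR field whose analyzer uses the "standard" tokenizer.
res = client.query(
    collection_name="text_match_demo",
    filter="text_match(word, 'tokenizer')",
    output_fields=["id", "word"],
)
print(res)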