feat: Tokenizer support build with params and clone for concurrency (#37048)

Related issues:
https://github.com/milvus-io/milvus/issues/35853
https://github.com/milvus-io/milvus/issues/36751

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd 2024-11-06 17:48:24 +08:00 committed by GitHub
parent 8714774305
commit d67853fa89
39 changed files with 667 additions and 228 deletions

View File

@ -22,18 +22,9 @@ TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("tokenizer_params");
if (iter == params.end()) {
return {};
return "{}";
}
nlohmann::json j = nlohmann::json::parse(iter->second);
std::map<std::string, std::string> ret;
for (const auto& [k, v] : j.items()) {
try {
ret[k] = v.get<std::string>();
} catch (std::exception& e) {
ret[k] = v.dump();
}
}
return ret;
return iter->second;
}
bool

View File

@ -25,7 +25,7 @@
namespace milvus {
using TypeParams = std::map<std::string, std::string>;
using TokenizerParams = std::map<std::string, std::string>;
using TokenizerParams = std::string;
TokenizerParams
ParseTokenizerParams(const TypeParams& params);

View File

@ -19,10 +19,9 @@
namespace milvus::index {
constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
TextMatchIndex::TextMatchIndex(
int64_t commit_interval_in_ms,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Text;
@ -31,10 +30,9 @@ TextMatchIndex::TextMatchIndex(
field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
}
TextMatchIndex::TextMatchIndex(
const std::string& path,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
path_ = path;
@ -47,10 +45,9 @@ TextMatchIndex::TextMatchIndex(
tokenizer_params);
}
TextMatchIndex::TextMatchIndex(
const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
schema_ = ctx.fieldDataMeta.field_schema;
@ -174,9 +171,8 @@ TextMatchIndex::CreateReader() {
}
void
TextMatchIndex::RegisterTokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params) {
TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
}

View File

@ -22,20 +22,17 @@ using stdclock = std::chrono::high_resolution_clock;
class TextMatchIndex : public InvertedIndexTantivy<std::string> {
public:
// for growing segment.
explicit TextMatchIndex(
int64_t commit_interval_in_ms,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params);
// for sealed segment.
explicit TextMatchIndex(
const std::string& path,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params);
// for building index.
explicit TextMatchIndex(
const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params);
// for loading index
explicit TextMatchIndex(const storage::FileManagerContext& ctx);
@ -67,9 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
CreateReader();
void
RegisterTokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);
TargetBitmap
MatchQuery(const std::string& query);

View File

@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set,
auto index = std::make_unique<index::TextMatchIndex>(
fileManagerContext,
"milvus_tokenizer",
field_schema.get_tokenizer_params());
field_schema.get_tokenizer_params().c_str());
index->Build(config);
auto binary =
std::make_unique<knowhere::BinarySet>(index->Upload(config));

View File

@ -1511,13 +1511,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
}
{
@ -1567,7 +1567,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@ -1578,7 +1578,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex(
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
"cannot create text index on non-string type");
// todo: make this(200) configurable.
auto index = std::make_unique<index::TextMatchIndex>(
200, "milvus_tokenizer", field_meta.get_tokenizer_params());
200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str());
index->Commit();
index->CreateReader();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -2014,13 +2014,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
}
{
@ -2069,7 +2069,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@ -2080,7 +2080,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "segcore/tokenizer_c.h"
#include <memory>
#include "common/FieldMeta.h"
#include "common/protobuf_utils.h"
#include "pb/schema.pb.h"
@ -19,10 +20,9 @@
using Map = std::map<std::string, std::string>;
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
create_tokenizer(const char* params, CTokenizer* tokenizer) {
try {
auto mm = reinterpret_cast<Map*>(m);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
*tokenizer = impl.release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
@ -30,6 +30,17 @@ create_tokenizer(CMap m, CTokenizer* tokenizer) {
}
}
CStatus
clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst) {
try {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(*tokenizer);
*rst = impl->Clone().release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(&e);
}
}
void
free_tokenizer(CTokenizer tokenizer) {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);

View File

@ -24,7 +24,10 @@ extern "C" {
typedef void* CTokenizer;
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer);
create_tokenizer(const char* params, CTokenizer* tokenizer);
CStatus
clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
void
free_tokenizer(CTokenizer tokenizer);

View File

@ -1021,11 +1021,12 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.115"
version = "1.0.128"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@ -1166,6 +1167,7 @@ dependencies = [
"libc",
"log",
"scopeguard",
"serde_json",
"tantivy",
"tantivy-jieba",
"zstd-sys",

View File

@ -15,6 +15,7 @@ env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"
serde_json = "1.0.128"
[build-dependencies]
cbindgen = "0.26.0"

View File

@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
RustArray tantivy_match_query(void *ptr, const char *query);
void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params);
void tantivy_register_tokenizer(void *ptr,
const char *tokenizer_name,
const char *tokenizer_params);
void *tantivy_create_index(const char *field_name,
TantivyDataType data_type,
@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
void *tantivy_create_text_writer(const char *field_name,
const char *path,
const char *tokenizer_name,
void *tokenizer_params,
const char *tokenizer_params,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
@ -157,7 +159,9 @@ bool tantivy_token_stream_advance(void *token_stream);
const char *tantivy_token_stream_get_token(void *token_stream);
void *tantivy_create_tokenizer(void *tokenizer_params);
void *tantivy_create_tokenizer(const char *tokenizer_params);
void *tantivy_clone_tokenizer(void *ptr);
void tantivy_free_tokenizer(void *tokenizer);

View File

@ -0,0 +1,40 @@
use serde_json as json;
#[derive(Debug)]
pub struct TantivyError{
reason: String,
}
impl TantivyError{
fn new(reason:String) -> Self{
TantivyError{reason:reason}
}
pub fn reason(&self) -> String{
return self.reason.clone()
}
}
impl From<&str> for TantivyError{
fn from(value: &str) -> Self {
Self::new(value.to_string())
}
}
impl From<String> for TantivyError{
fn from(value: String) -> Self {
Self::new(value)
}
}
impl From<json::Error> for TantivyError{
fn from(value: json::Error) -> Self {
Self::new(value.to_string())
}
}
impl ToString for TantivyError{
fn to_string(&self) -> String {
return self.reason()
}
}

View File

@ -4,7 +4,7 @@ use tantivy::{
Term,
};
use crate::{index_reader::IndexReaderWrapper, tokenizer::default_tokenizer};
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer,
@ -14,7 +14,7 @@ impl IndexReaderWrapper {
let mut tokenizer = self
.index
.tokenizer_for_field(self.field)
.unwrap_or(default_tokenizer())
.unwrap_or(standard_analyzer(vec![]))
.clone();
let mut token_stream = tokenizer.token_stream(q);
let mut terms: Vec<Term> = Vec::new();

View File

@ -1,8 +1,14 @@
use std::{collections::HashMap, ffi::CStr};
use std::{ffi::CStr};
use libc::{c_char, c_void};
use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer};
use crate::{
array::RustArray,
string_c::c_str_to_str,
index_reader::IndexReaderWrapper,
tokenizer::create_tokenizer,
log::init_log,
};
#[no_mangle]
pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
@ -18,23 +24,22 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
pub extern "C" fn tantivy_register_tokenizer(
ptr: *mut c_void,
tokenizer_name: *const c_char,
tokenizer_params: *mut c_void,
tokenizer_params: *const c_char,
) {
init_log();
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => unsafe {
Ok(text_analyzer) => unsafe {
(*real).register_tokenizer(
String::from(tokenizer_name_str.to_str().unwrap()),
text_analyzer,
);
},
None => {
panic!("unsupported tokenizer");
}
Err(err) => {
panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
},
}
}

View File

@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::ffi::c_char;
use std::ffi::c_void;
use std::ffi::CStr;
@ -6,26 +5,27 @@ use std::ffi::CStr;
use crate::index_writer::IndexWriterWrapper;
use crate::tokenizer::create_tokenizer;
use crate::util::create_binding;
use crate::string_c::c_str_to_str;
use crate::log::init_log;
#[no_mangle]
pub extern "C" fn tantivy_create_text_writer(
field_name: *const c_char,
path: *const c_char,
tokenizer_name: *const c_char,
tokenizer_params: *mut c_void,
tokenizer_params: *const c_char,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> *mut c_void {
init_log();
let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => {
Ok(text_analyzer) => {
let wrapper = IndexWriterWrapper::create_text_writer(
String::from(field_name_str),
String::from(path_str),
@ -37,8 +37,9 @@ pub extern "C" fn tantivy_create_text_writer(
);
create_binding(wrapper)
}
None => {
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
}
},
}
}

View File

@ -15,8 +15,10 @@ mod log;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod tokenizer_filter;
mod tokenizer_c;
mod util;
mod error;
mod util_c;
mod vec_collector;

View File

@ -1,54 +1,254 @@
use lazy_static::lazy_static;
use log::{info, warn};
use log::warn;
use std::collections::HashMap;
use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::log::init_log;
use tantivy::tokenizer::*;
use serde_json as json;
lazy_static! {
static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
use crate::tokenizer_filter::*;
use crate::error::TantivyError;
use crate::util::*;
// default built-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(40));
if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
pub(crate) fn default_tokenizer() -> TextAnalyzer {
DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
fn standard_builder() -> TextAnalyzerBuilder{
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}
fn jieba_tokenizer() -> TextAnalyzer {
tantivy_jieba::JiebaTokenizer {}.into()
fn whitespace_builder()-> TextAnalyzerBuilder{
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}
pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
init_log();
match params.get("tokenizer") {
Some(tokenizer_name) => match tokenizer_name.as_str() {
"default" => {
Some(default_tokenizer())
}
"jieba" => {
Some(jieba_tokenizer())
}
s => {
warn!("unsupported tokenizer: {}", s);
None
}
},
None => {
Some(default_tokenizer())
fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
match name.as_str() {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
other => {
warn!("unsupported tokenizer: {}", other);
Err(format!("unsupported tokenizer: {}", other).into())
}
}
}
struct AnalyzerBuilder<'a>{
// builder: TextAnalyzerBuilder
filters:HashMap<String, SystemFilter>,
params:&'a json::Map<String, json::Value>
}
impl AnalyzerBuilder<'_>{
fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder{
AnalyzerBuilder{
filters: HashMap::new(),
params:params,
}
}
fn get_tokenizer_name(&self) -> Result<String, TantivyError>{
let tokenizer=self.params.get("tokenizer");
if tokenizer.is_none(){
return Ok("standard".to_string());
}
if !tokenizer.unwrap().is_string(){
return Err(format!("tokenizer name should be string").into());
}
Ok(tokenizer.unwrap().as_str().unwrap().to_string())
}
fn add_custom_filter(&mut self, name: &String, params: &json::Map<String, json::Value>) -> Result<(),TantivyError>{
match SystemFilter::try_from(params){
Ok(filter) => {
self.filters.insert(name.to_string(), filter);
Ok(())
},
Err(e) => {Err(e)},
}
}
fn add_custom_filters(&mut self, params:&json::Map<String, json::Value>) -> Result<(),TantivyError>{
for (name, value) in params{
if !value.is_object(){
continue;
}
self.add_custom_filter(name, value.as_object().unwrap())?;
}
Ok(())
}
fn build_filter(&mut self,mut builder: TextAnalyzerBuilder, params: &json::Value) -> Result<TextAnalyzerBuilder, TantivyError>{
if !params.is_array(){
return Err("filter params should be array".into());
}
let filters = params.as_array().unwrap();
for filter in filters{
if filter.is_string(){
let filter_name = filter.as_str().unwrap();
let custom = self.filters.remove(filter_name);
if let Some(filter) = custom {
builder = filter.transform(builder);
continue;
}
// check whether the filter is a built-in system filter
let system = SystemFilter::from(filter_name);
match system {
SystemFilter::Invalid => {
return Err(format!("build analyzer failed, filter not found :{}", filter_name).into())
}
other => {
builder = other.transform(builder);
},
}
}else if filter.is_object(){
let filter=SystemFilter::try_from(filter.as_object().unwrap())?;
builder = filter.transform(builder);
}
};
Ok(builder)
}
fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder, TantivyError>{
for (key, value) in self.params{
match key.as_str(){
"tokenizer" => {},
"filter" => {
// build with filters if the filter param exists
builder=self.build_filter(builder, value)?;
},
"max_token_length" => {
if !value.is_u64(){
return Err("max token length should be int type".into());
}
builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
}
other => return Err(format!("unknown analyzer option key: {}", other).into()),
}
}
Ok(builder)
}
fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
match type_{
"standard" => {
let value = self.params.get("stop_words");
match value{
Some(value)=>{
let str_list = get_string_list(value, "filter stop_words")?;
Ok(standard_analyzer(str_list))
}
None => Ok(standard_analyzer(vec![]))
}
},
other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
}
}
fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
// build base build-in analyzer
match self.params.get("type"){
Some(type_) =>{
if !type_.is_string(){
return Err(format!("analyzer type shoud be string").into())
}
return self.build_template(type_.as_str().unwrap());
},
None => {}
};
//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;
// the jieba analyzer does not support additional filters.
if tokenizer_name == "jieba"{
return Ok(tantivy_jieba::JiebaTokenizer{}.into());
}
let mut builder=get_builder_by_name(&tokenizer_name)?;
// build with option
builder = self.build_option(builder)?;
Ok(builder.build())
}
}
pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer, TantivyError> {
match json::from_str::<json::Value>(&params){
Ok(value) =>{
if value.is_null(){
return Ok(standard_analyzer(vec![]));
}
if !value.is_object(){
return Err("tokenizer params should be a json map".into());
}
let json_params = value.as_object().unwrap();
// create builder
let analyzer_params=json_params.get("analyzer");
if analyzer_params.is_none(){
return Ok(standard_analyzer(vec![]));
}
if !analyzer_params.unwrap().is_object(){
return Err("analyzer params should be a json map".into());
}
let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap());
// build custom filter
let filter_params=json_params.get("filter");
if !filter_params.is_none() && filter_params.unwrap().is_object(){
builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
}
// build analyzer
builder.build()
},
Err(err) => Err(err.into()),
}
}
pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
if params.len()==0{
return Ok(standard_analyzer(vec![]));
}
create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use crate::tokenizer::create_tokenizer;
#[test]
fn test_create_tokenizer() {
let mut params : HashMap<String, String> = HashMap::new();
params.insert("tokenizer".parse().unwrap(), "jieba".parse().unwrap());
let params = r#"{"tokenizer": "standard"}"#;
let tokenizer = create_tokenizer(&params);
assert!(tokenizer.is_some());
let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
}
}
#[test]
fn test_jieba_tokenizer() {
let params = r#"{"tokenizer": "jieba"}"#;
let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
let mut analyzer = tokenizer.unwrap();
let mut stream = analyzer.token_stream("系统安全");
while stream.advance(){
let token = stream.token();
let text = token.text.clone();
print!("test token :{}\n", text.as_str())
}
}
}

View File

@ -1,25 +1,34 @@
use std::collections::HashMap;
use libc::c_void;
use libc::{c_void,c_char};
use tantivy::tokenizer::TextAnalyzer;
use crate::{
string_c::c_str_to_str,
tokenizer::create_tokenizer,
util::{create_binding, free_binding},
log::init_log,
};
#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
init_log();
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => create_binding(text_analyzer),
None => std::ptr::null_mut(),
Ok(text_analyzer) => create_binding(text_analyzer),
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
},
}
}
#[no_mangle]
pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void {
let analyzer=ptr as *mut TextAnalyzer;
let clone = unsafe {(*analyzer).clone()};
create_binding(clone)
}
#[no_mangle]
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
free_binding::<TextAnalyzer>(tokenizer);

View File

@ -0,0 +1,154 @@
use tantivy::tokenizer::*;
use serde_json as json;
use crate::error::TantivyError;
use crate::util::*;
pub(crate) enum SystemFilter{
Invalid,
LowerCase(LowerCaser),
AsciiFolding(AsciiFoldingFilter),
AlphaNumOnly(AlphaNumOnlyFilter),
Length(RemoveLongFilter),
Stop(StopWordFilter),
Decompounder(SplitCompoundWords),
Stemmer(Stemmer)
}
impl SystemFilter{
pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder{
match self{
Self::LowerCase(filter) => builder.filter(filter).dynamic(),
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::Length(filter) => builder.filter(filter).dynamic(),
Self::Stop(filter) => builder.filter(filter).dynamic(),
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
Self::Stemmer(filter) => builder.filter(filter).dynamic(),
Self::Invalid => builder,
}
}
}
// create length filter from params
// {
// "type": "length",
// "max": 10, // length
// }
// TODO support min length
fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter, TantivyError>{
let limit_str = params.get("max");
if limit_str.is_none() || !limit_str.unwrap().is_u64(){
return Err("lenth max param was none or not uint".into())
}
let limit = limit_str.unwrap().as_u64().unwrap() as usize;
Ok(SystemFilter::Length(RemoveLongFilter::limit(limit)))
}
fn get_stop_words_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("stop_words");
if value.is_none(){
return Err("stop filter stop_words can't be empty".into());
}
let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
Ok(SystemFilter::Stop(StopWordFilter::remove(str_list)))
}
fn get_decompounder_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("word_list");
if value.is_none() || !value.unwrap().is_array(){
return Err("decompounder word list should be array".into())
}
let stop_words = value.unwrap().as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err("decompounder word list item should be string".into())
}
};
match SplitCompoundWords::from_dictionary(str_list){
Ok(f) => Ok(SystemFilter::Decompounder(f)),
Err(e) => Err(format!("create decompounder failed: {}", e.to_string()).into())
}
}
fn get_stemmer_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("language");
if value.is_none() || !value.unwrap().is_string(){
return Err("stemmer language field should be string".into())
}
match value.unwrap().as_str().unwrap().into_language(){
Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
Err(e) => Err(format!("create stemmer failed : {}", e.to_string()).into()),
}
}
trait LanguageParser {
type Error;
fn into_language(self) -> Result<Language, Self::Error>;
}
impl LanguageParser for &str {
type Error = TantivyError;
fn into_language(self) -> Result<Language, Self::Error> {
match self.to_lowercase().as_str() {
"arabig" => Ok(Language::Arabic),
"danish" => Ok(Language::Danish),
"dutch" => Ok(Language::Dutch),
"english" => Ok(Language::English),
"finnish" => Ok(Language::Finnish),
"french" => Ok(Language::French),
"german" => Ok(Language::German),
"greek" => Ok(Language::Greek),
"hungarian" => Ok(Language::Hungarian),
"italian" => Ok(Language::Italian),
"norwegian" => Ok(Language::Norwegian),
"portuguese" => Ok(Language::Portuguese),
"romanian" => Ok(Language::Romanian),
"russian" => Ok(Language::Russian),
"spanish" => Ok(Language::Spanish),
"swedish" => Ok(Language::Swedish),
"tamil" => Ok(Language::Tamil),
"turkish" => Ok(Language::Turkish),
other => Err(format!("unsupport language: {}", other).into()),
}
}
}
impl From<&str> for SystemFilter{
fn from(value: &str) -> Self {
match value{
"lowercase" => Self::LowerCase(LowerCaser),
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
_ => Self::Invalid,
}
}
}
impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
type Error = TantivyError;
fn try_from(params: &json::Map<String, json::Value>) -> Result<Self, Self::Error> {
match params.get(&"type".to_string()){
Some(value) =>{
if !value.is_string(){
return Err("filter type should be string".into());
};
match value.as_str().unwrap(){
"length" => get_length_filter(params),
"stop" => get_stop_words_filter(params),
"decompounder" => get_decompounder_filter(params),
"stemmer" => get_stemmer_filter(params),
other=> Err(format!("unsupport filter type: {}", other).into()),
}
}
None => Err("no type field in filter params".into()),
}
}
}

View File

@ -1,5 +1,7 @@
use std::ffi::c_void;
use std::ops::Bound;
use serde_json as json;
use crate::error::TantivyError;
use tantivy::{directory::MmapDirectory, Index};
@ -28,3 +30,19 @@ pub fn free_binding<T>(ptr: *mut c_void) {
drop(Box::from_raw(real));
}
}
pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>, TantivyError>{
if !value.is_array(){
return Err(format!("{} should be array", label).into())
}
let stop_words = value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err(format!("{} list item should be string", label).into())
}
};
Ok(str_list)
}

View File

@ -14,7 +14,7 @@ namespace milvus::tantivy {
using Map = std::map<std::string, std::string>;
static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
static Map DEFAULT_TOKENIZER_PARAMS = {};
static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
@ -101,17 +101,14 @@ struct TantivyIndexWrapper {
bool in_ram,
const char* path,
const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
const std::map<std::string, std::string>&
tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
RustHashMap m;
m.from(tokenizer_params);
writer_ = tantivy_create_text_writer(field_name,
path,
tokenizer_name,
m.get_pointer(),
tokenizer_params,
num_threads,
overall_memory_budget_in_bytes,
in_ram);
@ -134,14 +131,11 @@ struct TantivyIndexWrapper {
}
void
register_tokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params) {
RustHashMap m;
m.from(tokenizer_params);
register_tokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
if (reader_ != nullptr) {
tantivy_register_tokenizer(
reader_, tokenizer_name, m.get_pointer());
reader_, tokenizer_name, tokenizer_params);
}
}

View File

@ -11,15 +11,17 @@ struct Tokenizer {
public:
NO_COPY_OR_ASSIGN(Tokenizer);
explicit Tokenizer(const std::map<std::string, std::string>& params) {
RustHashMap m;
m.from(params);
ptr_ = tantivy_create_tokenizer(m.get_pointer());
explicit Tokenizer(std::string&& params) {
auto shared_params = std::make_shared<std::string>(std::move(params));
ptr_ = tantivy_create_tokenizer(shared_params->c_str());
if (ptr_ == nullptr) {
throw std::invalid_argument("invalid tokenizer parameters");
}
}
explicit Tokenizer(void* _ptr) : ptr_(_ptr) {
}
~Tokenizer() {
if (ptr_ != nullptr) {
tantivy_free_tokenizer(ptr_);
@ -34,6 +36,12 @@ struct Tokenizer {
return std::make_unique<TokenStream>(token_stream, shared_text);
}
std::unique_ptr<Tokenizer>
Clone() {
auto newptr = tantivy_clone_tokenizer(ptr_);
return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
}
// CreateTokenStreamCopyText will copy the text and then create token stream based on the text.
std::unique_ptr<TokenStream>
CreateTokenStreamCopyText(const std::string& text) {

View File

@ -47,12 +47,10 @@ set_cmap(CMap m, const std::string& key, const std::string& value) {
}
TEST(CTokenizer, Default) {
auto m = create_cmap();
set_cmap(m, "tokenizer", "default");
auto tokenizer_params = R"({"tokenizer": "standard"})";
CTokenizer tokenizer;
{
auto status = create_tokenizer(m, &tokenizer);
auto status = create_tokenizer(tokenizer_params, &tokenizer);
ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
}
@ -71,5 +69,4 @@ TEST(CTokenizer, Default) {
free_token_stream(token_stream);
free_tokenizer(tokenizer);
free_cmap(m);
}

View File

@ -10,9 +10,9 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <string>
#include "common/Schema.h"
#include "segcore/segment_c.h"
#include "segcore/SegmentGrowing.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
@ -80,23 +80,19 @@ TEST(ParseJson, Naive) {
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(0, p.size());
ASSERT_EQ("{}", std::string(p));
}
TEST(ParseTokenizerParams, Default) {
TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
TypeParams params{{"tokenizer_params", R"({"tokenizer": "standard"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(1, p.size());
auto iter = p.find("tokenizer");
ASSERT_NE(p.end(), iter);
ASSERT_EQ("default", iter->second);
ASSERT_EQ(params.at("tokenizer_params"), p);
}
TEST(TextMatch, Index) {
using Index = index::TextMatchIndex;
auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
std::map<std::string, std::string>{});
auto index = std::make_unique<Index>(
std::numeric_limits<int64_t>::max(), "milvus_tokenizer", "{}");
index->CreateReader();
index->AddText("football, basketball, pingpang", 0);
index->AddText("swimming, football", 1);

View File

@ -297,7 +297,6 @@ func (t *queryTask) CanSkipAllocTimestamp() bool {
}
consistencyLevel = collectionInfo.consistencyLevel
}
return consistencyLevel != commonpb.ConsistencyLevel_Strong
}

View File

@ -111,7 +111,6 @@ func (t *searchTask) CanSkipAllocTimestamp() bool {
}
consistencyLevel = collectionInfo.consistencyLevel
}
return consistencyLevel != commonpb.ConsistencyLevel_Strong
}

View File

@ -33,6 +33,15 @@ func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
return NewCTokenStream(ptr)
}
func (impl *CTokenizer) Clone() (tokenizerapi.Tokenizer, error) {
var newptr C.CTokenizer
status := C.clone_tokenizer(&impl.ptr, &newptr)
if err := HandleCStatus(&status, "failed to clone tokenizer"); err != nil {
return nil, err
}
return NewCTokenizer(newptr), nil
}
func (impl *CTokenizer) Destroy() {
C.free_tokenizer(impl.ptr)
}

View File

@ -9,16 +9,17 @@ package ctokenizer
import "C"
import (
"unsafe"
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)
func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
mm := NewCMap()
defer mm.Destroy()
mm.From(m)
func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) {
paramPtr := C.CString(param)
defer C.free(unsafe.Pointer(paramPtr))
var ptr C.CTokenizer
status := C.create_tokenizer(mm.GetPointer(), &ptr)
status := C.create_tokenizer(paramPtr, &ptr)
if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
return nil, err
}
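For illustration, a minimal Go sketch of the new call pattern (the import path, the surrounding main function, and the sample text are assumptions for this example; only code inside the milvus module can actually import these internal packages). Tokenizer params are now a single JSON string rather than a map[string]string, and the filter list below follows the shape exercised in the full-text-search tests later in this diff.

package main

import (
    "log"

    "github.com/milvus-io/milvus/internal/util/ctokenizer"
)

func main() {
    // Params are forwarded as raw JSON; an empty string falls back to the
    // built-in standard analyzer on the Rust side.
    params := `{"tokenizer": "standard", "filter": ["lowercase", {"type": "stop", "stop_words": ["in", "of"]}]}`

    tokenizer, err := ctokenizer.NewTokenizer(params)
    if err != nil {
        log.Fatalf("create tokenizer failed: %v", err)
    }
    defer tokenizer.Destroy()

    // Token iteration over the returned tokenizerapi.TokenStream is omitted here.
    _ = tokenizer.NewTokenStream("Milvus tokenizer with params")
}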

View File

@ -10,7 +10,7 @@ import (
func TestTokenizer(t *testing.T) {
// default tokenizer.
{
m := make(map[string]string)
m := "{\"tokenizer\": \"standard\"}"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()
@ -24,8 +24,7 @@ func TestTokenizer(t *testing.T) {
// jieba tokenizer.
{
m := make(map[string]string)
m["tokenizer"] = "jieba"
m := "{\"tokenizer\": \"jieba\"}"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()

View File

@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "default"}`},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
},
},
{
@ -41,7 +41,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "jieba"}`},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
},
},
}

View File

@ -40,6 +40,15 @@ type BM25FunctionRunner struct {
concurrency int
}
func getTokenizerParams(field *schemapb.FieldSchema) string {
for _, param := range field.GetTypeParams() {
if param.Key == "tokenizer_params" {
return param.Value
}
}
return "{}"
}
func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.FunctionSchema) (*BM25FunctionRunner, error) {
if len(schema.GetOutputFieldIds()) != 1 {
return nil, fmt.Errorf("bm25 function should only have one output field, but now %d", len(schema.GetOutputFieldIds()))
@ -49,17 +58,22 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
schema: schema,
concurrency: 8,
}
var params string
for _, field := range coll.GetFields() {
if field.GetFieldID() == schema.GetOutputFieldIds()[0] {
runner.outputField = field
break
}
if field.GetFieldID() == schema.GetInputFieldIds()[0] {
params = getTokenizerParams(field)
}
}
if runner.outputField == nil {
return nil, fmt.Errorf("no output field")
}
tokenizer, err := ctokenizer.NewTokenizer(map[string]string{})
tokenizer, err := ctokenizer.NewTokenizer(params)
if err != nil {
return nil, err
}
@ -69,8 +83,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
}
func (v *BM25FunctionRunner) run(data []string, dst []map[uint32]float32) error {
// TODO AOIASD Support single Tokenizer concurrency
tokenizer, err := ctokenizer.NewTokenizer(map[string]string{})
tokenizer, err := v.tokenizer.Clone()
if err != nil {
return err
}
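The runner above now builds a single tokenizer from the input field's tokenizer_params in NewBM25FunctionRunner, and each run() call clones that shared tokenizer instead of constructing a fresh one from empty params, so every concurrent BM25 scorer gets its own analyzer configured by the collection schema.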

View File

@ -3,5 +3,6 @@ package tokenizerapi
//go:generate mockery --name=Tokenizer --with-expecter
type Tokenizer interface {
NewTokenStream(text string) TokenStream
Clone() (Tokenizer, error)
Destroy()
}
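A minimal concurrency sketch against the extended interface (the helper name and batching are hypothetical; token iteration and TokenStream cleanup are omitted): each worker clones the shared tokenizer and destroys its own clone, mirroring the BM25 runner change, so goroutines never share the underlying analyzer.

package example

import (
    "sync"

    "github.com/milvus-io/milvus/internal/util/tokenizerapi"
)

// tokenizeConcurrently hands every worker its own clone of the base tokenizer.
func tokenizeConcurrently(base tokenizerapi.Tokenizer, batches [][]string) error {
    errCh := make(chan error, len(batches))
    var wg sync.WaitGroup
    for _, batch := range batches {
        wg.Add(1)
        go func(texts []string) {
            defer wg.Done()
            clone, err := base.Clone()
            if err != nil {
                errCh <- err
                return
            }
            defer clone.Destroy()
            for _, text := range texts {
                // Each clone owns its analyzer, so tokenizing here is safe.
                _ = clone.NewTokenStream(text)
            }
        }(batch)
    }
    wg.Wait()
    close(errCh)
    for err := range errCh {
        return err
    }
    return nil
}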

View File

@ -778,7 +778,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.
def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
fields = [
gen_int64_field(),

View File

@ -33,7 +33,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_collection_for_full_text_search(self, tokenizer):
"""
target: test create collection with full text search
@ -97,7 +97,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
assert len(res["functions"]) == len(text_fields)
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer):
"""
target: test create collection with full text search twice with same schema
@ -175,7 +175,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("tokenizer", ["unsupported"])
@pytest.mark.xfail(reason="")
@pytest.mark.skip(reason="check not implement may cause panic")
def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer):
"""
target: test create collection with full text search with unsupported tokenizer
@ -249,7 +249,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
expected: create collection failed
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
fields = [
@ -327,7 +327,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
expected: create collection failed
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
fields = [
@ -397,7 +397,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [False, True])
@pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullable):
"""
target: test insert data with full text search
@ -542,7 +542,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_dynamic_field", [True])
@pytest.mark.parametrize("nullable", [False])
@pytest.mark.parametrize("text_lang", ["en"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field):
"""
target: test insert data with full text search and enable dynamic field
@ -692,7 +692,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [True])
@pytest.mark.parametrize("text_lang", ["en"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable):
"""
target: test insert data for full text search with dataframe
@ -831,7 +831,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
assert len(data) == count
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
"""
target: test insert data with full text search with part of empty string
@ -990,7 +990,7 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [True])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable):
"""
target: test insert data with full text search with non varchar data
@ -1089,7 +1089,7 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [False, True])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021")
def test_upsert_for_full_text_search(self, tokenizer, nullable):
"""
@ -1260,7 +1260,7 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021")
def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable):
"""
@ -1402,7 +1402,7 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_delete_for_full_text_search(self, tokenizer):
"""
target: test delete data for full text search
@ -1564,7 +1564,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("b", [0.1])
@pytest.mark.parametrize("k", [1.2])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_index_for_full_text_search_default(
self, tokenizer, index_type, k, b
):
@ -1688,7 +1688,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_with_invalid_index_type(
self, tokenizer, index_type, k, b
):
@ -1796,7 +1796,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_index_with_invalid_metric_type(
self, tokenizer, index_type, metric_type, k, b
):
@ -1903,7 +1903,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
self, tokenizer, index_type, k, b
):
@ -2000,7 +2000,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("b", [-1])
@pytest.mark.parametrize("k", [-1])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_with_invalid_bm25_params(
self, tokenizer, index_type, k, b
):
@ -2121,7 +2121,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("expr", ["text_match", "id_range"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.parametrize("offset", [10, 0])
def test_full_text_search_default(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
@ -2317,7 +2317,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("expr", ["text_match"])
@pytest.mark.parametrize("offset", [10])
@pytest.mark.parametrize("tokenizer", ["jieba"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
def test_full_text_search_with_jieba_tokenizer(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2329,7 +2328,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
expected: full text search successfully and result is correct
"""
tokenizer_params = {
"tokenizer": tokenizer,
"tokenizer": tokenizer,
}
dim = 128
fields = [
@ -2511,7 +2510,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_range_search(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2676,7 +2675,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_search_iterator(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2829,7 +2828,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("invalid_search_data", ["empty_text"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022")
def test_search_for_full_text_search_with_empty_string_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
@ -2959,7 +2958,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_for_full_text_search_with_invalid_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
):
@ -3106,7 +3105,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_hybrid_search_with_full_text_search(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type
):

View File

@ -4441,7 +4441,7 @@ class TestQueryTextMatch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_query_text_match_en_normal(
self, tokenizer, enable_inverted_index, enable_partition_key
):
@ -4724,24 +4724,16 @@ class TestQueryTextMatch(TestcaseBase):
expected: get the correct token, text match successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "standard",
"alpha_num_only": True,
"ascii_folding": True,
"lower_case": True,
"max_token_length": 40,
"split_compound_words": [
"dampf",
"schiff",
"fahrt",
"brot",
"backen",
"automat",
],
"stemmer": "English",
"stop": {
"language": "English",
"words": ["an", "the"],
},
"tokenizer": "standard",
# "lowercase", "asciifolding", "alphanumonly" was system filter
"filter":["lowercase", "asciifolding", "alphanumonly",
{
"type": "stop",
"stop_words": ["in", "of"],
}, {
"type": "stemmer",
"language": "english",
}],
}
dim = 128
fields = [
@ -4852,7 +4844,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: query successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -4966,7 +4958,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: query successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5109,7 +5101,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5254,7 +5246,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
fake_en = Faker("en_US")
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
default_fields = [
@ -5481,7 +5473,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128

View File

@ -13290,7 +13290,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_with_text_match_filter_normal_en(
self, tokenizer, enable_inverted_index, enable_partition_key
):

View File

@ -1881,7 +1881,7 @@ class TestSearchVector(TestBase):
assert len(res) == limit
@pytest.mark.parametrize("tokenizer", ["jieba", "default"])
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
def test_search_vector_with_text_match_filter(self, tokenizer):
"""
Query a vector with a simple payload
@ -2718,7 +2718,7 @@ class TestQueryVector(TestBase):
if "like" in filter_expr:
assert name.startswith(prefix)
@pytest.mark.parametrize("tokenizer", ["jieba", "default"])
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
def test_query_vector_with_text_match_filter(self, tokenizer):
"""
Query a vector with a simple payload