feat: Tokenizer support build with params and clone for concurrency (#37048)

Related issues:
https://github.com/milvus-io/milvus/issues/35853
https://github.com/milvus-io/milvus/issues/36751

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd 2024-11-06 17:48:24 +08:00 committed by GitHub
parent 8714774305
commit d67853fa89
39 changed files with 667 additions and 228 deletions

View File

@ -22,18 +22,9 @@ TokenizerParams
ParseTokenizerParams(const TypeParams& params) {
auto iter = params.find("tokenizer_params");
if (iter == params.end()) {
return {};
return "{}";
}
nlohmann::json j = nlohmann::json::parse(iter->second);
std::map<std::string, std::string> ret;
for (const auto& [k, v] : j.items()) {
try {
ret[k] = v.get<std::string>();
} catch (std::exception& e) {
ret[k] = v.dump();
}
}
return ret;
return iter->second;
}
bool

View File

@ -25,7 +25,7 @@
namespace milvus {
using TypeParams = std::map<std::string, std::string>;
using TokenizerParams = std::map<std::string, std::string>;
using TokenizerParams = std::string;
TokenizerParams
ParseTokenizerParams(const TypeParams& params);

View File

@ -19,10 +19,9 @@
namespace milvus::index {
constexpr const char* TMP_TEXT_LOG_PREFIX = "/tmp/milvus/text-log/";
TextMatchIndex::TextMatchIndex(
int64_t commit_interval_in_ms,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(commit_interval_in_ms),
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Text;
@ -31,10 +30,9 @@ TextMatchIndex::TextMatchIndex(
field_name.c_str(), true, "", tokenizer_name, tokenizer_params);
}
TextMatchIndex::TextMatchIndex(
const std::string& path,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
path_ = path;
@ -47,10 +45,9 @@ TextMatchIndex::TextMatchIndex(
tokenizer_params);
}
TextMatchIndex::TextMatchIndex(
const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params)
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
last_commit_time_(stdclock::now()) {
schema_ = ctx.fieldDataMeta.field_schema;
@ -174,9 +171,8 @@ TextMatchIndex::CreateReader() {
}
void
TextMatchIndex::RegisterTokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params) {
TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
wrapper_->register_tokenizer(tokenizer_name, tokenizer_params);
}

View File

@ -22,20 +22,17 @@ using stdclock = std::chrono::high_resolution_clock;
class TextMatchIndex : public InvertedIndexTantivy<std::string> {
public:
// for growing segment.
explicit TextMatchIndex(
int64_t commit_interval_in_ms,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(int64_t commit_interval_in_ms,
const char* tokenizer_name,
const char* tokenizer_params);
// for sealed segment.
explicit TextMatchIndex(
const std::string& path,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(const std::string& path,
const char* tokenizer_name,
const char* tokenizer_params);
// for building index.
explicit TextMatchIndex(
const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
explicit TextMatchIndex(const storage::FileManagerContext& ctx,
const char* tokenizer_name,
const char* tokenizer_params);
// for loading index
explicit TextMatchIndex(const storage::FileManagerContext& ctx);
@ -67,9 +64,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
CreateReader();
void
RegisterTokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params);
RegisterTokenizer(const char* tokenizer_name, const char* tokenizer_params);
TargetBitmap
MatchQuery(const std::string& query);

View File

@ -284,7 +284,7 @@ BuildTextIndex(CBinarySet* c_binary_set,
auto index = std::make_unique<index::TextMatchIndex>(
fileManagerContext,
"milvus_tokenizer",
field_schema.get_tokenizer_params());
field_schema.get_tokenizer_params().c_str());
index->Build(config);
auto binary =
std::make_unique<knowhere::BinarySet>(index->Upload(config));

View File

@ -1511,13 +1511,13 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
}
{
@ -1567,7 +1567,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@ -1578,7 +1578,7 @@ ChunkedSegmentSealedImpl::LoadTextIndex(
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -859,11 +859,11 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
"cannot create text index on non-string type");
// todo: make this(200) configurable.
auto index = std::make_unique<index::TextMatchIndex>(
200, "milvus_tokenizer", field_meta.get_tokenizer_params());
200, "milvus_tokenizer", field_meta.get_tokenizer_params().c_str());
index->Commit();
index->CreateReader();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -2014,13 +2014,13 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
} else {
// build text index using mmap.
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
"milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
}
{
@ -2069,7 +2069,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index->Reload();
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}
@ -2080,7 +2080,7 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id,
std::unique_lock lck(mutex_);
const auto& field_meta = schema_->operator[](field_id);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_tokenizer_params());
field_meta.get_tokenizer_params().c_str());
text_indexes_[field_id] = std::move(index);
}

View File

@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "segcore/tokenizer_c.h"
#include <memory>
#include "common/FieldMeta.h"
#include "common/protobuf_utils.h"
#include "pb/schema.pb.h"
@ -19,10 +20,9 @@
using Map = std::map<std::string, std::string>;
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer) {
create_tokenizer(const char* params, CTokenizer* tokenizer) {
try {
auto mm = reinterpret_cast<Map*>(m);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(*mm);
auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
*tokenizer = impl.release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
@ -30,6 +30,17 @@ create_tokenizer(CMap m, CTokenizer* tokenizer) {
}
}
CStatus
clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst) {
try {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(*tokenizer);
*rst = impl->Clone().release();
return milvus::SuccessCStatus();
} catch (std::exception& e) {
return milvus::FailureCStatus(&e);
}
}
void
free_tokenizer(CTokenizer tokenizer) {
auto impl = reinterpret_cast<milvus::tantivy::Tokenizer*>(tokenizer);

View File

@ -24,7 +24,10 @@ extern "C" {
typedef void* CTokenizer;
CStatus
create_tokenizer(CMap m, CTokenizer* tokenizer);
create_tokenizer(const char* params, CTokenizer* tokenizer);
CStatus
clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
void
free_tokenizer(CTokenizer tokenizer);

View File

@ -1021,11 +1021,12 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.115"
version = "1.0.128"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
@ -1166,6 +1167,7 @@ dependencies = [
"libc",
"log",
"scopeguard",
"serde_json",
"tantivy",
"tantivy-jieba",
"zstd-sys",

View File

@ -15,6 +15,7 @@ env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"
serde_json = "1.0.128"
[build-dependencies]
cbindgen = "0.26.0"

View File

@ -88,7 +88,9 @@ RustArray tantivy_regex_query(void *ptr, const char *pattern);
RustArray tantivy_match_query(void *ptr, const char *query);
void tantivy_register_tokenizer(void *ptr, const char *tokenizer_name, void *tokenizer_params);
void tantivy_register_tokenizer(void *ptr,
const char *tokenizer_name,
const char *tokenizer_params);
void *tantivy_create_index(const char *field_name,
TantivyDataType data_type,
@ -142,7 +144,7 @@ void tantivy_index_add_multi_keywords(void *ptr,
void *tantivy_create_text_writer(const char *field_name,
const char *path,
const char *tokenizer_name,
void *tokenizer_params,
const char *tokenizer_params,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
@ -157,7 +159,9 @@ bool tantivy_token_stream_advance(void *token_stream);
const char *tantivy_token_stream_get_token(void *token_stream);
void *tantivy_create_tokenizer(void *tokenizer_params);
void *tantivy_create_tokenizer(const char *tokenizer_params);
void *tantivy_clone_tokenizer(void *ptr);
void tantivy_free_tokenizer(void *tokenizer);

View File

@ -0,0 +1,40 @@
use serde_json as json;
#[derive(Debug)]
pub struct TantivyError{
reason: String,
}
impl TantivyError{
fn new(reason:String) -> Self{
TantivyError{reason:reason}
}
pub fn reason(&self) -> String{
return self.reason.clone()
}
}
impl From<&str> for TantivyError{
fn from(value: &str) -> Self {
Self::new(value.to_string())
}
}
impl From<String> for TantivyError{
fn from(value: String) -> Self {
Self::new(value)
}
}
impl From<json::Error> for TantivyError{
fn from(value: json::Error) -> Self {
Self::new(value.to_string())
}
}
impl ToString for TantivyError{
fn to_string(&self) -> String {
return self.reason()
}
}

View File

@ -4,7 +4,7 @@ use tantivy::{
Term,
};
use crate::{index_reader::IndexReaderWrapper, tokenizer::default_tokenizer};
use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer};
impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer,
@ -14,7 +14,7 @@ impl IndexReaderWrapper {
let mut tokenizer = self
.index
.tokenizer_for_field(self.field)
.unwrap_or(default_tokenizer())
.unwrap_or(standard_analyzer(vec![]))
.clone();
let mut token_stream = tokenizer.token_stream(q);
let mut terms: Vec<Term> = Vec::new();

View File

@ -1,8 +1,14 @@
use std::{collections::HashMap, ffi::CStr};
use std::{ffi::CStr};
use libc::{c_char, c_void};
use crate::{array::RustArray, index_reader::IndexReaderWrapper, tokenizer::create_tokenizer};
use crate::{
array::RustArray,
string_c::c_str_to_str,
index_reader::IndexReaderWrapper,
tokenizer::create_tokenizer,
log::init_log,
};
#[no_mangle]
pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustArray {
@ -18,23 +24,22 @@ pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) ->
pub extern "C" fn tantivy_register_tokenizer(
ptr: *mut c_void,
tokenizer_name: *const c_char,
tokenizer_params: *mut c_void,
tokenizer_params: *const c_char,
) {
init_log();
let real = ptr as *mut IndexReaderWrapper;
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name) };
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => unsafe {
Ok(text_analyzer) => unsafe {
(*real).register_tokenizer(
String::from(tokenizer_name_str.to_str().unwrap()),
text_analyzer,
);
},
None => {
panic!("unsupported tokenizer");
}
Err(err) => {
panic!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
},
}
}

View File

@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::ffi::c_char;
use std::ffi::c_void;
use std::ffi::CStr;
@ -6,26 +5,27 @@ use std::ffi::CStr;
use crate::index_writer::IndexWriterWrapper;
use crate::tokenizer::create_tokenizer;
use crate::util::create_binding;
use crate::string_c::c_str_to_str;
use crate::log::init_log;
#[no_mangle]
pub extern "C" fn tantivy_create_text_writer(
field_name: *const c_char,
path: *const c_char,
tokenizer_name: *const c_char,
tokenizer_params: *mut c_void,
tokenizer_params: *const c_char,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> *mut c_void {
init_log();
let field_name_str = unsafe { CStr::from_ptr(field_name).to_str().unwrap() };
let path_str = unsafe { CStr::from_ptr(path).to_str().unwrap() };
let tokenizer_name_str = unsafe { CStr::from_ptr(tokenizer_name).to_str().unwrap() };
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => {
Ok(text_analyzer) => {
let wrapper = IndexWriterWrapper::create_text_writer(
String::from(field_name_str),
String::from(path_str),
@ -37,8 +37,9 @@ pub extern "C" fn tantivy_create_text_writer(
);
create_binding(wrapper)
}
None => {
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
}
},
}
}

View File

@ -15,8 +15,10 @@ mod log;
mod string_c;
mod token_stream_c;
mod tokenizer;
mod tokenizer_filter;
mod tokenizer_c;
mod util;
mod error;
mod util_c;
mod vec_collector;

View File

@ -1,54 +1,254 @@
use lazy_static::lazy_static;
use log::{info, warn};
use log::warn;
use std::collections::HashMap;
use tantivy::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::log::init_log;
use tantivy::tokenizer::*;
use serde_json as json;
lazy_static! {
static ref DEFAULT_TOKENIZER_MANAGER: TokenizerManager = TokenizerManager::default();
use crate::tokenizer_filter::*;
use crate::error::TantivyError;
use crate::util::*;
// default built-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(40));
if stop_words.len() > 0{
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
pub(crate) fn default_tokenizer() -> TextAnalyzer {
DEFAULT_TOKENIZER_MANAGER.get("default").unwrap()
fn standard_builder() -> TextAnalyzerBuilder{
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}
fn jieba_tokenizer() -> TextAnalyzer {
tantivy_jieba::JiebaTokenizer {}.into()
fn whitespace_builder()-> TextAnalyzerBuilder{
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}
pub(crate) fn create_tokenizer(params: &HashMap<String, String>) -> Option<TextAnalyzer> {
init_log();
match params.get("tokenizer") {
Some(tokenizer_name) => match tokenizer_name.as_str() {
"default" => {
Some(default_tokenizer())
}
"jieba" => {
Some(jieba_tokenizer())
}
s => {
warn!("unsupported tokenizer: {}", s);
None
}
},
None => {
Some(default_tokenizer())
fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
match name.as_str() {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
other => {
warn!("unsupported tokenizer: {}", other);
Err(format!("unsupported tokenizer: {}", other).into())
}
}
}
struct AnalyzerBuilder<'a>{
// builder: TextAnalyzerBuilder
filters:HashMap<String, SystemFilter>,
params:&'a json::Map<String, json::Value>
}
impl AnalyzerBuilder<'_>{
fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder{
AnalyzerBuilder{
filters: HashMap::new(),
params:params,
}
}
fn get_tokenizer_name(&self) -> Result<String, TantivyError>{
let tokenizer=self.params.get("tokenizer");
if tokenizer.is_none(){
return Ok("standard".to_string());
}
if !tokenizer.unwrap().is_string(){
return Err(format!("tokenizer name should be string").into());
}
Ok(tokenizer.unwrap().as_str().unwrap().to_string())
}
fn add_custom_filter(&mut self, name: &String, params: &json::Map<String, json::Value>) -> Result<(),TantivyError>{
match SystemFilter::try_from(params){
Ok(filter) => {
self.filters.insert(name.to_string(), filter);
Ok(())
},
Err(e) => {Err(e)},
}
}
fn add_custom_filters(&mut self, params:&json::Map<String, json::Value>) -> Result<(),TantivyError>{
for (name, value) in params{
if !value.is_object(){
continue;
}
self.add_custom_filter(name, value.as_object().unwrap())?;
}
Ok(())
}
fn build_filter(&mut self,mut builder: TextAnalyzerBuilder, params: &json::Value) -> Result<TextAnalyzerBuilder, TantivyError>{
if !params.is_array(){
return Err("filter params should be array".into());
}
let filters = params.as_array().unwrap();
for filter in filters{
if filter.is_string(){
let filter_name = filter.as_str().unwrap();
let custom = self.filters.remove(filter_name);
if let Some(filter) = custom {
builder = filter.transform(builder);
continue;
}
// check whether the filter is a built-in system filter
let system = SystemFilter::from(filter_name);
match system {
SystemFilter::Invalid => {
return Err(format!("build analyzer failed, filter not found :{}", filter_name).into())
}
other => {
builder = other.transform(builder);
},
}
}else if filter.is_object(){
let filter=SystemFilter::try_from(filter.as_object().unwrap())?;
builder = filter.transform(builder);
}
};
Ok(builder)
}
fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder, TantivyError>{
for (key, value) in self.params{
match key.as_str(){
"tokenizer" => {},
"filter" => {
// build with filters if the filter param exists
builder=self.build_filter(builder, value)?;
},
"max_token_length" => {
if !value.is_u64(){
return Err("max token length should be int type".into());
}
builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
}
other => return Err(format!("unknown analyzer option key: {}", other).into()),
}
}
Ok(builder)
}
fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
match type_{
"standard" => {
let value = self.params.get("stop_words");
match value{
Some(value)=>{
let str_list = get_string_list(value, "filter stop_words")?;
Ok(standard_analyzer(str_list))
}
None => Ok(standard_analyzer(vec![]))
}
},
other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
}
}
fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
// build base build-in analyzer
match self.params.get("type"){
Some(type_) =>{
if !type_.is_string(){
return Err(format!("analyzer type shoud be string").into())
}
return self.build_template(type_.as_str().unwrap());
},
None => {}
};
//build custom analyzer
let tokenizer_name = self.get_tokenizer_name()?;
// the jieba analyzer does not support additional filters.
if tokenizer_name == "jieba"{
return Ok(tantivy_jieba::JiebaTokenizer{}.into());
}
let mut builder=get_builder_by_name(&tokenizer_name)?;
// build with option
builder = self.build_option(builder)?;
Ok(builder.build())
}
}
pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer, TantivyError> {
match json::from_str::<json::Value>(&params){
Ok(value) =>{
if value.is_null(){
return Ok(standard_analyzer(vec![]));
}
if !value.is_object(){
return Err("tokenizer params should be a json map".into());
}
let json_params = value.as_object().unwrap();
// create builder
let analyzer_params=json_params.get("analyzer");
if analyzer_params.is_none(){
return Ok(standard_analyzer(vec![]));
}
if !analyzer_params.unwrap().is_object(){
return Err("analyzer params should be a json map".into());
}
let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap());
// build custom filter
let filter_params=json_params.get("filter");
if !filter_params.is_none() && filter_params.unwrap().is_object(){
builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
}
// build analyzer
builder.build()
},
Err(err) => Err(err.into()),
}
}
pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
if params.len()==0{
return Ok(standard_analyzer(vec![]));
}
create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use crate::tokenizer::create_tokenizer;
#[test]
fn test_create_tokenizer() {
let mut params : HashMap<String, String> = HashMap::new();
params.insert("tokenizer".parse().unwrap(), "jieba".parse().unwrap());
let params = r#"{"tokenizer": "standard"}"#;
let tokenizer = create_tokenizer(&params);
assert!(tokenizer.is_some());
let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
}
}
#[test]
fn test_jieba_tokenizer() {
let params = r#"{"tokenizer": "jieba"}"#;
let tokenizer = create_tokenizer(&params.to_string());
assert!(tokenizer.is_ok());
let mut analyzer = tokenizer.unwrap();
let mut stream = analyzer.token_stream("系统安全");
while stream.advance(){
let token = stream.token();
let text = token.text.clone();
print!("test token :{}\n", text.as_str())
}
}
}

View File

@ -1,25 +1,34 @@
use std::collections::HashMap;
use libc::c_void;
use libc::{c_void,c_char};
use tantivy::tokenizer::TextAnalyzer;
use crate::{
string_c::c_str_to_str,
tokenizer::create_tokenizer,
util::{create_binding, free_binding},
log::init_log,
};
#[no_mangle]
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *mut c_void) -> *mut c_void {
let analyzer = unsafe {
let m = tokenizer_params as *const HashMap<String, String>;
create_tokenizer(&(*m))
};
pub extern "C" fn tantivy_create_tokenizer(tokenizer_params: *const c_char) -> *mut c_void {
init_log();
let params = unsafe{c_str_to_str(tokenizer_params).to_string()};
let analyzer = create_tokenizer(&params);
match analyzer {
Some(text_analyzer) => create_binding(text_analyzer),
None => std::ptr::null_mut(),
Ok(text_analyzer) => create_binding(text_analyzer),
Err(err) => {
log::warn!("create tokenizer failed with error: {} param: {}", err.to_string(), params);
std::ptr::null_mut()
},
}
}
#[no_mangle]
pub extern "C" fn tantivy_clone_tokenizer(ptr: *mut c_void) -> *mut c_void {
let analyzer=ptr as *mut TextAnalyzer;
let clone = unsafe {(*analyzer).clone()};
create_binding(clone)
}
#[no_mangle]
pub extern "C" fn tantivy_free_tokenizer(tokenizer: *mut c_void) {
free_binding::<TextAnalyzer>(tokenizer);

View File

@ -0,0 +1,154 @@
use tantivy::tokenizer::*;
use serde_json as json;
use crate::error::TantivyError;
use crate::util::*;
pub(crate) enum SystemFilter{
Invalid,
LowerCase(LowerCaser),
AsciiFolding(AsciiFoldingFilter),
AlphaNumOnly(AlphaNumOnlyFilter),
Length(RemoveLongFilter),
Stop(StopWordFilter),
Decompounder(SplitCompoundWords),
Stemmer(Stemmer)
}
impl SystemFilter{
pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder{
match self{
Self::LowerCase(filter) => builder.filter(filter).dynamic(),
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::Length(filter) => builder.filter(filter).dynamic(),
Self::Stop(filter) => builder.filter(filter).dynamic(),
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
Self::Stemmer(filter) => builder.filter(filter).dynamic(),
Self::Invalid => builder,
}
}
}
// create length filter from params
// {
// "type": "length",
// "max": 10, // length
// }
// TODO support min length
fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter, TantivyError>{
let limit_str = params.get("max");
if limit_str.is_none() || !limit_str.unwrap().is_u64(){
return Err("lenth max param was none or not uint".into())
}
let limit = limit_str.unwrap().as_u64().unwrap() as usize;
Ok(SystemFilter::Length(RemoveLongFilter::limit(limit)))
}
fn get_stop_words_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("stop_words");
if value.is_none(){
return Err("stop filter stop_words can't be empty".into());
}
let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
Ok(SystemFilter::Stop(StopWordFilter::remove(str_list)))
}
fn get_decompounder_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("word_list");
if value.is_none() || !value.unwrap().is_array(){
return Err("decompounder word list should be array".into())
}
let stop_words = value.unwrap().as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err("decompounder word list item should be string".into())
}
};
match SplitCompoundWords::from_dictionary(str_list){
Ok(f) => Ok(SystemFilter::Decompounder(f)),
Err(e) => Err(format!("create decompounder failed: {}", e.to_string()).into())
}
}
fn get_stemmer_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
let value = params.get("language");
if value.is_none() || !value.unwrap().is_string(){
return Err("stemmer language field should be string".into())
}
match value.unwrap().as_str().unwrap().into_language(){
Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
Err(e) => Err(format!("create stemmer failed : {}", e.to_string()).into()),
}
}
trait LanguageParser {
type Error;
fn into_language(self) -> Result<Language, Self::Error>;
}
impl LanguageParser for &str {
type Error = TantivyError;
fn into_language(self) -> Result<Language, Self::Error> {
match self.to_lowercase().as_str() {
"arabig" => Ok(Language::Arabic),
"danish" => Ok(Language::Danish),
"dutch" => Ok(Language::Dutch),
"english" => Ok(Language::English),
"finnish" => Ok(Language::Finnish),
"french" => Ok(Language::French),
"german" => Ok(Language::German),
"greek" => Ok(Language::Greek),
"hungarian" => Ok(Language::Hungarian),
"italian" => Ok(Language::Italian),
"norwegian" => Ok(Language::Norwegian),
"portuguese" => Ok(Language::Portuguese),
"romanian" => Ok(Language::Romanian),
"russian" => Ok(Language::Russian),
"spanish" => Ok(Language::Spanish),
"swedish" => Ok(Language::Swedish),
"tamil" => Ok(Language::Tamil),
"turkish" => Ok(Language::Turkish),
other => Err(format!("unsupport language: {}", other).into()),
}
}
}
impl From<&str> for SystemFilter{
fn from(value: &str) -> Self {
match value{
"lowercase" => Self::LowerCase(LowerCaser),
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
_ => Self::Invalid,
}
}
}
impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
type Error = TantivyError;
fn try_from(params: &json::Map<String, json::Value>) -> Result<Self, Self::Error> {
match params.get(&"type".to_string()){
Some(value) =>{
if !value.is_string(){
return Err("filter type should be string".into());
};
match value.as_str().unwrap(){
"length" => get_length_filter(params),
"stop" => get_stop_words_filter(params),
"decompounder" => get_decompounder_filter(params),
"stemmer" => get_stemmer_filter(params),
other=> Err(format!("unsupport filter type: {}", other).into()),
}
}
None => Err("no type field in filter params".into()),
}
}
}

View File

@ -1,5 +1,7 @@
use std::ffi::c_void;
use std::ops::Bound;
use serde_json as json;
use crate::error::TantivyError;
use tantivy::{directory::MmapDirectory, Index};
@ -28,3 +30,19 @@ pub fn free_binding<T>(ptr: *mut c_void) {
drop(Box::from_raw(real));
}
}
pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>, TantivyError>{
if !value.is_array(){
return Err(format!("{} should be array", label).into())
}
let stop_words = value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words{
match element.as_str(){
Some(word) => str_list.push(word.to_string()),
None => return Err(format!("{} list item should be string", label).into())
}
};
Ok(str_list)
}

View File

@ -14,7 +14,7 @@ namespace milvus::tantivy {
using Map = std::map<std::string, std::string>;
static constexpr const char* DEFAULT_TOKENIZER_NAME = "milvus_tokenizer";
static Map DEFAULT_TOKENIZER_PARAMS = {};
static const char* DEFAULT_TOKENIZER_PARAMS = "{}";
static constexpr uintptr_t DEFAULT_NUM_THREADS = 4;
static constexpr uintptr_t DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES =
DEFAULT_NUM_THREADS * 15 * 1024 * 1024;
@ -101,17 +101,14 @@ struct TantivyIndexWrapper {
bool in_ram,
const char* path,
const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
const std::map<std::string, std::string>&
tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
const char* tokenizer_params = DEFAULT_TOKENIZER_PARAMS,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
RustHashMap m;
m.from(tokenizer_params);
writer_ = tantivy_create_text_writer(field_name,
path,
tokenizer_name,
m.get_pointer(),
tokenizer_params,
num_threads,
overall_memory_budget_in_bytes,
in_ram);
@ -134,14 +131,11 @@ struct TantivyIndexWrapper {
}
void
register_tokenizer(
const char* tokenizer_name,
const std::map<std::string, std::string>& tokenizer_params) {
RustHashMap m;
m.from(tokenizer_params);
register_tokenizer(const char* tokenizer_name,
const char* tokenizer_params) {
if (reader_ != nullptr) {
tantivy_register_tokenizer(
reader_, tokenizer_name, m.get_pointer());
reader_, tokenizer_name, tokenizer_params);
}
}

View File

@ -11,15 +11,17 @@ struct Tokenizer {
public:
NO_COPY_OR_ASSIGN(Tokenizer);
explicit Tokenizer(const std::map<std::string, std::string>& params) {
RustHashMap m;
m.from(params);
ptr_ = tantivy_create_tokenizer(m.get_pointer());
explicit Tokenizer(std::string&& params) {
auto shared_params = std::make_shared<std::string>(std::move(params));
ptr_ = tantivy_create_tokenizer(shared_params->c_str());
if (ptr_ == nullptr) {
throw std::invalid_argument("invalid tokenizer parameters");
}
}
explicit Tokenizer(void* _ptr) : ptr_(_ptr) {
}
~Tokenizer() {
if (ptr_ != nullptr) {
tantivy_free_tokenizer(ptr_);
@ -34,6 +36,12 @@ struct Tokenizer {
return std::make_unique<TokenStream>(token_stream, shared_text);
}
std::unique_ptr<Tokenizer>
Clone() {
auto newptr = tantivy_clone_tokenizer(ptr_);
return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
}
// CreateTokenStreamCopyText will copy the text and then create token stream based on the text.
std::unique_ptr<TokenStream>
CreateTokenStreamCopyText(const std::string& text) {

View File

@ -47,12 +47,10 @@ set_cmap(CMap m, const std::string& key, const std::string& value) {
}
TEST(CTokenizer, Default) {
auto m = create_cmap();
set_cmap(m, "tokenizer", "default");
auto tokenizer_params = R"({"tokenizer": "standard"})";
CTokenizer tokenizer;
{
auto status = create_tokenizer(m, &tokenizer);
auto status = create_tokenizer(tokenizer_params, &tokenizer);
ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
}
@ -71,5 +69,4 @@ TEST(CTokenizer, Default) {
free_token_stream(token_stream);
free_tokenizer(tokenizer);
free_cmap(m);
}

View File

@ -10,9 +10,9 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <string>
#include "common/Schema.h"
#include "segcore/segment_c.h"
#include "segcore/SegmentGrowing.h"
#include "segcore/SegmentGrowingImpl.h"
#include "test_utils/DataGen.h"
@ -80,23 +80,19 @@ TEST(ParseJson, Naive) {
TEST(ParseTokenizerParams, NoTokenizerParams) {
TypeParams params{{"k", "v"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(0, p.size());
ASSERT_EQ("{}", std::string(p));
}
TEST(ParseTokenizerParams, Default) {
TypeParams params{{"tokenizer_params", R"({"tokenizer": "default"})"}};
TypeParams params{{"tokenizer_params", R"({"tokenizer": "standard"})"}};
auto p = ParseTokenizerParams(params);
ASSERT_EQ(1, p.size());
auto iter = p.find("tokenizer");
ASSERT_NE(p.end(), iter);
ASSERT_EQ("default", iter->second);
ASSERT_EQ(params.at("tokenizer_params"), p);
}
TEST(TextMatch, Index) {
using Index = index::TextMatchIndex;
auto index = std::make_unique<Index>(std::numeric_limits<int64_t>::max(),
"milvus_tokenizer",
std::map<std::string, std::string>{});
auto index = std::make_unique<Index>(
std::numeric_limits<int64_t>::max(), "milvus_tokenizer", "{}");
index->CreateReader();
index->AddText("football, basketball, pingpang", 0);
index->AddText("swimming, football", 1);

View File

@ -297,7 +297,6 @@ func (t *queryTask) CanSkipAllocTimestamp() bool {
}
consistencyLevel = collectionInfo.consistencyLevel
}
return consistencyLevel != commonpb.ConsistencyLevel_Strong
}

View File

@ -111,7 +111,6 @@ func (t *searchTask) CanSkipAllocTimestamp() bool {
}
consistencyLevel = collectionInfo.consistencyLevel
}
return consistencyLevel != commonpb.ConsistencyLevel_Strong
}

View File

@ -33,6 +33,15 @@ func (impl *CTokenizer) NewTokenStream(text string) tokenizerapi.TokenStream {
return NewCTokenStream(ptr)
}
func (impl *CTokenizer) Clone() (tokenizerapi.Tokenizer, error) {
var newptr C.CTokenizer
status := C.clone_tokenizer(&impl.ptr, &newptr)
if err := HandleCStatus(&status, "failed to clone tokenizer"); err != nil {
return nil, err
}
return NewCTokenizer(newptr), nil
}
func (impl *CTokenizer) Destroy() {
C.free_tokenizer(impl.ptr)
}

View File

@ -9,16 +9,17 @@ package ctokenizer
import "C"
import (
"unsafe"
"github.com/milvus-io/milvus/internal/util/tokenizerapi"
)
func NewTokenizer(m map[string]string) (tokenizerapi.Tokenizer, error) {
mm := NewCMap()
defer mm.Destroy()
mm.From(m)
func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) {
paramPtr := C.CString(param)
defer C.free(unsafe.Pointer(paramPtr))
var ptr C.CTokenizer
status := C.create_tokenizer(mm.GetPointer(), &ptr)
status := C.create_tokenizer(paramPtr, &ptr)
if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
return nil, err
}
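For illustration, a minimal Go sketch of the new call pattern (the import path, the surrounding main function, and the sample text are assumptions for this example; only code inside the milvus module can actually import these internal packages). Tokenizer params are now a single JSON string rather than a map[string]string, and the filter list below follows the shape exercised in the full-text-search tests later in this diff.

package main

import (
    "log"

    "github.com/milvus-io/milvus/internal/util/ctokenizer"
)

func main() {
    // Params are forwarded as raw JSON; an empty string falls back to the
    // built-in standard analyzer on the Rust side.
    params := `{"tokenizer": "standard", "filter": ["lowercase", {"type": "stop", "stop_words": ["in", "of"]}]}`

    tokenizer, err := ctokenizer.NewTokenizer(params)
    if err != nil {
        log.Fatalf("create tokenizer failed: %v", err)
    }
    defer tokenizer.Destroy()

    // Token iteration over the returned tokenizerapi.TokenStream is omitted here.
    _ = tokenizer.NewTokenStream("Milvus tokenizer with params")
}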

View File

@ -10,7 +10,7 @@ import (
func TestTokenizer(t *testing.T) {
// default tokenizer.
{
m := make(map[string]string)
m := "{\"tokenizer\": \"standard\"}"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()
@ -24,8 +24,7 @@ func TestTokenizer(t *testing.T) {
// jieba tokenizer.
{
m := make(map[string]string)
m["tokenizer"] = "jieba"
m := "{\"tokenizer\": \"jieba\"}"
tokenizer, err := NewTokenizer(m)
assert.NoError(t, err)
defer tokenizer.Destroy()

View File

@ -33,7 +33,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "default"}`},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
},
},
{
@ -41,7 +41,7 @@ func TestValidateTextSchema(t *testing.T) {
DataType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{
{Key: "enable_match", Value: "true"},
{Key: "tokenizer_params", Value: `{"tokenizer": "jieba"}`},
{Key: "tokenizer_params", Value: `{"tokenizer": "standard"}`},
},
},
}

View File

@ -40,6 +40,15 @@ type BM25FunctionRunner struct {
concurrency int
}
func getTokenizerParams(field *schemapb.FieldSchema) string {
for _, param := range field.GetTypeParams() {
if param.Key == "tokenizer_params" {
return param.Value
}
}
return "{}"
}
func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.FunctionSchema) (*BM25FunctionRunner, error) {
if len(schema.GetOutputFieldIds()) != 1 {
return nil, fmt.Errorf("bm25 function should only have one output field, but now %d", len(schema.GetOutputFieldIds()))
@ -49,17 +58,22 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
schema: schema,
concurrency: 8,
}
var params string
for _, field := range coll.GetFields() {
if field.GetFieldID() == schema.GetOutputFieldIds()[0] {
runner.outputField = field
break
}
if field.GetFieldID() == schema.GetInputFieldIds()[0] {
params = getTokenizerParams(field)
}
}
if runner.outputField == nil {
return nil, fmt.Errorf("no output field")
}
tokenizer, err := ctokenizer.NewTokenizer(map[string]string{})
tokenizer, err := ctokenizer.NewTokenizer(params)
if err != nil {
return nil, err
}
@ -69,8 +83,7 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
}
func (v *BM25FunctionRunner) run(data []string, dst []map[uint32]float32) error {
// TODO AOIASD Support single Tokenizer concurrency
tokenizer, err := ctokenizer.NewTokenizer(map[string]string{})
tokenizer, err := v.tokenizer.Clone()
if err != nil {
return err
}
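The runner above now builds a single tokenizer from the input field's tokenizer_params in NewBM25FunctionRunner, and each run() call clones that shared tokenizer instead of constructing a fresh one from empty params, so every concurrent BM25 scorer gets its own analyzer configured by the collection schema.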

View File

@ -3,5 +3,6 @@ package tokenizerapi
//go:generate mockery --name=Tokenizer --with-expecter
type Tokenizer interface {
NewTokenStream(text string) TokenStream
Clone() (Tokenizer, error)
Destroy()
}
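A minimal concurrency sketch against the extended interface (the helper name and batching are hypothetical; token iteration and TokenStream cleanup are omitted): each worker clones the shared tokenizer and destroys its own clone, mirroring the BM25 runner change, so goroutines never share the underlying analyzer.

package example

import (
    "sync"

    "github.com/milvus-io/milvus/internal/util/tokenizerapi"
)

// tokenizeConcurrently hands every worker its own clone of the base tokenizer.
func tokenizeConcurrently(base tokenizerapi.Tokenizer, batches [][]string) error {
    errCh := make(chan error, len(batches))
    var wg sync.WaitGroup
    for _, batch := range batches {
        wg.Add(1)
        go func(texts []string) {
            defer wg.Done()
            clone, err := base.Clone()
            if err != nil {
                errCh <- err
                return
            }
            defer clone.Destroy()
            for _, text := range texts {
                // Each clone owns its analyzer, so tokenizing here is safe.
                _ = clone.NewTokenStream(text)
            }
        }(batch)
    }
    wg.Wait()
    close(errCh)
    for err := range errCh {
        return err
    }
    return nil
}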

View File

@ -778,7 +778,7 @@ def gen_default_collection_schema(description=ct.default_desc, primary_field=ct.
def gen_all_datatype_collection_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
auto_id=False, dim=ct.default_dim, enable_dynamic_field=True, **kwargs):
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
fields = [
gen_int64_field(),

View File

@ -33,7 +33,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_collection_for_full_text_search(self, tokenizer):
"""
target: test create collection with full text search
@ -97,7 +97,7 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
assert len(res["functions"]) == len(text_fields)
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer):
"""
target: test create collection with full text search twice with same schema
@ -175,7 +175,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("tokenizer", ["unsupported"])
@pytest.mark.xfail(reason="")
@pytest.mark.skip(reason="check not implement may cause panic")
def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer):
"""
target: test create collection with full text search with unsupported tokenizer
@ -249,7 +249,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
expected: create collection failed
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
fields = [
@ -327,7 +327,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
expected: create collection failed
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
fields = [
@ -397,7 +397,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [False, True])
@pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_default(self, tokenizer, text_lang, nullable):
"""
target: test insert data with full text search
@ -542,7 +542,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_dynamic_field", [True])
@pytest.mark.parametrize("nullable", [False])
@pytest.mark.parametrize("text_lang", ["en"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field):
"""
target: test insert data with full text search and enable dynamic field
@ -692,7 +692,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [True])
@pytest.mark.parametrize("text_lang", ["en"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable):
"""
target: test insert data for full text search with dataframe
@ -831,7 +831,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
assert len(data) == count
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_for_full_text_search_with_part_of_empty_string(self, tokenizer):
"""
target: test insert data with full text search with part of empty string
@ -990,7 +990,7 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [True])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable):
"""
target: test insert data with full text search with non varchar data
@ -1089,7 +1089,7 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nullable", [False, True])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021")
def test_upsert_for_full_text_search(self, tokenizer, nullable):
"""
@ -1260,7 +1260,7 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37021")
def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable):
"""
@ -1402,7 +1402,7 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
"""
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_delete_for_full_text_search(self, tokenizer):
"""
target: test delete data for full text search
@ -1564,7 +1564,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("b", [0.1])
@pytest.mark.parametrize("k", [1.2])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_index_for_full_text_search_default(
self, tokenizer, index_type, k, b
):
@ -1688,7 +1688,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_with_invalid_index_type(
self, tokenizer, index_type, k, b
):
@ -1796,7 +1796,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_index_with_invalid_metric_type(
self, tokenizer, index_type, metric_type, k, b
):
@ -1903,7 +1903,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("b", [0.5])
@pytest.mark.parametrize("k", [1.5])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
self, tokenizer, index_type, k, b
):
@ -2000,7 +2000,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("b", [-1])
@pytest.mark.parametrize("k", [-1])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_create_full_text_search_with_invalid_bm25_params(
self, tokenizer, index_type, k, b
):
@ -2121,7 +2121,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("expr", ["text_match", "id_range"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.parametrize("offset", [10, 0])
def test_full_text_search_default(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
@ -2317,7 +2317,6 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("expr", ["text_match"])
@pytest.mark.parametrize("offset", [10])
@pytest.mark.parametrize("tokenizer", ["jieba"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
def test_full_text_search_with_jieba_tokenizer(
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2329,7 +2328,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
expected: full text search successfully and result is correct
"""
tokenizer_params = {
"tokenizer": tokenizer,
"tokenizer": tokenizer,
}
dim = 128
fields = [
@ -2511,7 +2510,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_range_search(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2676,7 +2675,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("expr", [None])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_full_text_search_with_search_iterator(
self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
):
@ -2829,7 +2828,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("invalid_search_data", ["empty_text"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022")
def test_search_for_full_text_search_with_empty_string_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
@ -2959,7 +2958,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
@pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_for_full_text_search_with_invalid_search_data(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data
):
@ -3106,7 +3105,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
@pytest.mark.parametrize("enable_partition_key", [True])
@pytest.mark.parametrize("enable_inverted_index", [True])
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_hybrid_search_with_full_text_search(
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type
):

View File

@ -4441,7 +4441,7 @@ class TestQueryTextMatch(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_query_text_match_en_normal(
self, tokenizer, enable_inverted_index, enable_partition_key
):
@ -4724,24 +4724,16 @@ class TestQueryTextMatch(TestcaseBase):
expected: get the correct token, text match successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "standard",
"alpha_num_only": True,
"ascii_folding": True,
"lower_case": True,
"max_token_length": 40,
"split_compound_words": [
"dampf",
"schiff",
"fahrt",
"brot",
"backen",
"automat",
],
"stemmer": "English",
"stop": {
"language": "English",
"words": ["an", "the"],
},
"tokenizer": "standard",
# "lowercase", "asciifolding", "alphanumonly" was system filter
"filter":["lowercase", "asciifolding", "alphanumonly",
{
"type": "stop",
"stop_words": ["in", "of"],
}, {
"type": "stemmer",
"language": "english",
}],
}
dim = 128
fields = [
@ -4852,7 +4844,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: query successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -4966,7 +4958,7 @@ class TestQueryTextMatch(TestcaseBase):
expected: query successfully and result is correct
"""
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5109,7 +5101,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128
@ -5254,7 +5246,7 @@ class TestQueryTextMatch(TestcaseBase):
# 1. initialize with data
fake_en = Faker("en_US")
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
dim = 128
default_fields = [
@ -5481,7 +5473,7 @@ class TestQueryTextMatch(TestcaseBase):
"""
# 1. initialize with data
tokenizer_params = {
"tokenizer": "default",
"tokenizer": "standard",
}
# 1. initialize with data
dim = 128

View File

@ -13290,7 +13290,7 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])
@pytest.mark.parametrize("tokenizer", ["default"])
@pytest.mark.parametrize("tokenizer", ["standard"])
def test_search_with_text_match_filter_normal_en(
self, tokenizer, enable_inverted_index, enable_partition_key
):

View File

@ -1881,7 +1881,7 @@ class TestSearchVector(TestBase):
assert len(res) == limit
@pytest.mark.parametrize("tokenizer", ["jieba", "default"])
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
def test_search_vector_with_text_match_filter(self, tokenizer):
"""
Query a vector with a simple payload
@ -2718,7 +2718,7 @@ class TestQueryVector(TestBase):
if "like" in filter_expr:
assert name.startswith(prefix)
@pytest.mark.parametrize("tokenizer", ["jieba", "default"])
@pytest.mark.parametrize("tokenizer", ["jieba", "standard"])
def test_query_vector_with_text_match_filter(self, tokenizer):
"""
Query a vector with a simple payload