Mirror of https://gitee.com/milvus-io/milvus.git (synced 2024-11-29 18:38:44 +08:00)
feat: Add Chinese and English analyzers and refactor the jieba tokenizer (#37494)
Related issue: https://github.com/milvus-io/milvus/issues/35853
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
Parent: 1304b40552
Commit: 1c5b5e1e3d
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock (generated, vendored): 32 lines changed
@@ -904,14 +904,14 @@ dependencies = [

[[package]]
name = "regex"
version = "1.10.4"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-automata 0.4.6",
 "regex-syntax 0.8.2",
 "regex-automata 0.4.8",
 "regex-syntax 0.8.5",
]

[[package]]
@@ -925,13 +925,13 @@ dependencies = [

[[package]]
name = "regex-automata"
version = "0.4.6"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
dependencies = [
 "aho-corasick",
 "memchr",
 "regex-syntax 0.8.2",
 "regex-syntax 0.8.5",
]

[[package]]
@@ -942,9 +942,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"

[[package]]
name = "regex-syntax"
version = "0.8.2"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"

[[package]]
name = "rust-stemmers"
@@ -1163,13 +1163,14 @@ dependencies = [
 "cbindgen",
 "env_logger",
 "futures",
 "jieba-rs",
 "lazy_static",
 "libc",
 "log",
 "regex",
 "scopeguard",
 "serde_json",
 "tantivy",
 "tantivy-jieba",
 "zstd-sys",
]

@@ -1222,17 +1223,6 @@ dependencies = [
 "utf8-ranges",
]

[[package]]
name = "tantivy-jieba"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44022293c12a8f878e03439b2f11806d3d394130fe33d4e7781cba91abbac0a4"
dependencies = [
 "jieba-rs",
 "lazy_static",
 "tantivy-tokenizer-api",
]

[[package]]
name = "tantivy-query-grammar"
version = "0.21.0"
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml

@@ -13,9 +13,10 @@ scopeguard = "1.2"
zstd-sys = "=2.0.9"
env_logger = "0.11.3"
log = "0.4.21"
tantivy-jieba = "0.10.0"
lazy_static = "1.4.0"
serde_json = "1.0.128"
jieba-rs = "0.6.8"
regex = "1.11.1"

[build-dependencies]
cbindgen = "0.26.0"
internal/core/thirdparty/tantivy/tantivy-binding/src/jieba_tokenizer.rs (new file, vendored): 83 lines added
@@ -0,0 +1,83 @@
use jieba_rs;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use lazy_static::lazy_static;

lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

#[derive(Clone)]
pub enum JiebaMode {
    Exact,
    Search,
}

#[derive(Clone)]
pub struct JiebaTokenizer{
    mode: JiebaMode,
    hmm: bool,
}

pub struct JiebaTokenStream {
    tokens: Vec<Token>,
    index: usize,
}

impl TokenStream for JiebaTokenStream {
    fn advance(&mut self) -> bool {
        if self.index < self.tokens.len() {
            self.index += 1;
            true
        } else {
            false
        }
    }

    fn token(&self) -> &Token {
        &self.tokens[self.index - 1]
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.tokens[self.index - 1]
    }
}

impl JiebaTokenizer {
    pub fn new() -> JiebaTokenizer{
        JiebaTokenizer{mode: JiebaMode::Search, hmm: true}
    }

    fn tokenize(&self, text: &str) -> Vec<Token>{
        let mut indices = text.char_indices().collect::<Vec<_>>();
        indices.push((text.len(), '\0'));
        let ori_tokens = match self.mode{
            JiebaMode::Exact => {
                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm)
            },
            JiebaMode::Search => {
                JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm)
            },
        };

        let mut tokens = Vec::with_capacity(ori_tokens.len());
        for token in ori_tokens {
            tokens.push(Token {
                offset_from: indices[token.start].0,
                offset_to: indices[token.end].0,
                position: token.start,
                text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]),
                position_length: token.end - token.start,
            });
        }
        tokens
    }
}

impl Tokenizer for JiebaTokenizer {
    type TokenStream<'a> = JiebaTokenStream;

    fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
        let tokens = self.tokenize(text);
        JiebaTokenStream { tokens, index: 0 }
    }
}
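For orientation, a minimal usage sketch of the tokenizer above (not part of the commit; the function name and sample text are made up for illustration). It drives the same tantivy Tokenizer/TokenStream API that the unit tests later in this change use:

#[test]
fn jieba_tokenizer_sketch() {
    use tantivy::tokenizer::{TokenStream, Tokenizer};
    use crate::jieba_tokenizer::JiebaTokenizer;

    // Default construction: search mode with HMM enabled.
    let mut tokenizer = JiebaTokenizer::new();
    let mut stream = tokenizer.token_stream("系统安全");
    while stream.advance() {
        let token = stream.token();
        // offset_from / offset_to are byte offsets into the input;
        // position and position_length are counted in characters.
        println!("{} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}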
internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs

@@ -21,6 +21,8 @@ mod util;
mod error;
mod util_c;
mod vec_collector;
mod stop_words;
mod jieba_tokenizer;

pub fn add(left: usize, right: usize) -> usize {
    left + right
internal/core/thirdparty/tantivy/tantivy-binding/src/stop_words.rs (new file, vendored): 5 lines added
@@ -0,0 +1,5 @@
pub const ENGLISH: &[&str] = &[
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
    "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
    "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
];
internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer.rs

@@ -1,18 +1,42 @@
use log::warn;
use std::collections::HashMap;
use tantivy::tokenizer::*;
use tantivy::tokenizer::StopWordFilter;
use serde_json as json;

use crate::stop_words;
use crate::tokenizer_filter::*;
use crate::jieba_tokenizer::JiebaTokenizer;
use crate::error::TantivyError;
use crate::util::*;


// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder()
        .filter(LowerCaser);

    if stop_words.len() > 0{
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
    let builder = jieba_builder().filter(CnCharOnlyFilter);
    if stop_words.len() > 0{
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
    let builder = standard_builder()
        .filter(LowerCaser)
        .filter(RemoveLongFilter::limit(40));
        .filter(Stemmer::new(Language::English))
        .filter(StopWordFilter::remove(stop_words::ENGLISH.iter().map(|&word| word.to_owned())));

    if stop_words.len() > 0{
        return builder.filter(StopWordFilter::remove(stop_words)).build();
@@ -29,10 +53,15 @@ fn whitespace_builder()-> TextAnalyzerBuilder{
    TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

fn jieba_builder() -> TextAnalyzerBuilder{
    TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}

fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
    match name.as_str() {
        "standard" => Ok(standard_builder()),
        "whitespace" => Ok(whitespace_builder()),
        "jieba" => Ok(jieba_builder()),
        other => {
            warn!("unsupported tokenizer: {}", other);
            Err(format!("unsupported tokenizer: {}", other).into())
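With these builders, a custom analyzer can pick the refactored jieba tokenizer by name. A hedged sketch, reusing the parameters of the old test_jieba_tokenizer test; the test name is hypothetical:

#[test]
fn jieba_custom_analyzer_sketch() {
    use crate::tokenizer::create_tokenizer;

    // "jieba" resolves through get_builder_by_name -> jieba_builder() above.
    let params = r#"{"tokenizer": "jieba"}"#;
    let analyzer = create_tokenizer(&params.to_string());
    assert!(analyzer.is_ok());
}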
@@ -92,6 +121,7 @@ impl AnalyzerBuilder<'_>{
        }

        let filters = params.as_array().unwrap();

        for filter in filters{
            if filter.is_string(){
                let filter_name = filter.as_str().unwrap();
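Filters given as plain strings are resolved by name (the names live in tokenizer_filter.rs below). An illustrative configuration exercising that path, assuming the same test context as the existing tests; the test name and filter choice are examples only:

#[test]
fn custom_filter_names_sketch() {
    use crate::tokenizer::create_tokenizer;

    // String entries in "filter" resolve by name through SystemFilter::from(&str)
    // in tokenizer_filter.rs.
    let params = r#"{
        "tokenizer": "standard",
        "filter": ["lowercase", "asciifolding"]
    }"#;
    let analyzer = create_tokenizer(&params.to_string());
    assert!(analyzer.is_ok());
}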
@@ -127,30 +157,34 @@ impl AnalyzerBuilder<'_>{
                // build with filter if filter param exist
                builder=self.build_filter(builder, value)?;
            },
            "max_token_length" => {
                if !value.is_u64(){
                    return Err("max token length should be int type".into());
                }
                builder = builder.filter_dynamic(RemoveLongFilter::limit(value.as_u64().unwrap() as usize));
            }
            other => return Err(format!("unknown analyzer option key: {}", other).into()),
        }
    }
    Ok(builder)
}

fn get_stop_words_option(&self) -> Result<Vec<String>, TantivyError>{
    let value = self.params.get("stop_words");
    match value{
        Some(value)=>{
            let str_list = get_string_list(value, "filter stop_words")?;
            Ok(get_stop_words_list(str_list))
        }
        None => Ok(vec![])
    }
}

fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
    match type_{
        "standard" => {
            let value = self.params.get("stop_words");
            match value{
                Some(value)=>{
                    let str_list = get_string_list(value, "filter stop_words")?;
                    Ok(standard_analyzer(str_list))
                }
                None => Ok(standard_analyzer(vec![]))
            }
            Ok(standard_analyzer(self.get_stop_words_option()?))
        },
        "chinese" => {
            Ok(chinese_analyzer(self.get_stop_words_option()?))
        },
        "english" => {
            Ok(english_analyzer(self.get_stop_words_option()?))
        }
        other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
    }
}
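Together with the options above, a complete built-in analyzer can now be requested with just a type and optional stop words. A hedged sketch (test name and parameters are illustrative); the "_english_" alias is expanded by get_stop_words_list in util.rs:

#[test]
fn english_template_sketch() {
    use crate::tokenizer::create_tokenizer;

    // Built-in English analyzer plus the bundled English stop-word list.
    let params = r#"{
        "type": "english",
        "stop_words": ["_english_"]
    }"#;
    let analyzer = create_tokenizer(&params.to_string());
    assert!(analyzer.is_ok());
}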
@@ -168,13 +202,7 @@ impl AnalyzerBuilder<'_>{
        };

        //build custom analyzer
        let tokenizer_name = self.get_tokenizer_name()?;

        // jieba analyzer can't add filter.
        if tokenizer_name == "jieba"{
            return Ok(tantivy_jieba::JiebaTokenizer{}.into());
        }

        let tokenizer_name = self.get_tokenizer_name()?;
        let mut builder=get_builder_by_name(&tokenizer_name)?;

        // build with option
@@ -227,28 +255,37 @@ pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError>{
#[cfg(test)]
mod tests {
    use crate::tokenizer::create_tokenizer;
    use regex;

    #[test]
    fn test_create_tokenizer() {
        let params = r#"{"tokenizer": "standard"}"#;
    fn test_standard_analyzer() {
        let params = r#"{
            "type": "standard",
            "stop_words": ["_english_"]
        }"#;

        let tokenizer = create_tokenizer(&params.to_string());
        assert!(tokenizer.is_ok());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
    }

    #[test]
    fn test_jieba_tokenizer() {
        let params = r#"{"tokenizer": "jieba"}"#;
    fn test_chinese_analyzer() {
        let params = r#"{
            "type": "chinese"
        }"#;

        let tokenizer = create_tokenizer(&params.to_string());
        assert!(tokenizer.is_ok());
        let mut bining = tokenizer.unwrap();

        let mut stream = bining.token_stream("系统安全");
        let regex = regex::Regex::new("\\p{Han}+").unwrap();

        let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
        while stream.advance(){
            let token = stream.token();
            let text = token.text.clone();
            print!("test token :{}\n", text.as_str())
            print!("test token :{} symbol: {}\n", text.as_str(), regex.is_match(text.as_str()))
        }
    }

}
internal/core/thirdparty/tantivy/tantivy-binding/src/tokenizer_filter.rs

@@ -1,5 +1,6 @@
use tantivy::tokenizer::*;
use serde_json as json;
use regex;

use crate::error::TantivyError;
use crate::util::*;
@@ -9,6 +10,7 @@ pub(crate) enum SystemFilter{
    LowerCase(LowerCaser),
    AsciiFolding(AsciiFoldingFilter),
    AlphaNumOnly(AlphaNumOnlyFilter),
    CnCharOnly(CnCharOnlyFilter),
    Length(RemoveLongFilter),
    Stop(StopWordFilter),
    Decompounder(SplitCompoundWords),
@@ -21,6 +23,7 @@ impl SystemFilter{
            Self::LowerCase(filter) => builder.filter(filter).dynamic(),
            Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
            Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
            Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
            Self::Length(filter) => builder.filter(filter).dynamic(),
            Self::Stop(filter) => builder.filter(filter).dynamic(),
            Self::Decompounder(filter) => builder.filter(filter).dynamic(),
@@ -51,7 +54,7 @@ fn get_stop_words_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
        return Err("stop filter stop_words can't be empty".into());
    }
    let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
    Ok(SystemFilter::Stop(StopWordFilter::remove(str_list)))
    Ok(SystemFilter::Stop(StopWordFilter::remove(get_stop_words_list(str_list))))
}

fn get_decompounder_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
@@ -125,6 +128,7 @@ impl From<&str> for SystemFilter{
            "lowercase" => Self::LowerCase(LowerCaser),
            "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
            "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
            "cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
            _ => Self::Invalid,
        }
    }
@@ -152,3 +156,52 @@ impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
        }
    }
}

pub struct CnCharOnlyFilter;

pub struct CnCharOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnCharOnlyFilter{
    type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
        CnCharOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnCharOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnCharOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnCharOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnCharOnlyFilterStream {
            regex: regex::Regex::new("\\p{Han}+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }

        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
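A sketch of how this filter composes, mirroring chinese_analyzer in tokenizer.rs: advance() skips any token whose text contains no Han characters. The input string is the one used in test_chinese_analyzer; the expected survivors in the comment are an assumption, not something the commit asserts:

#[test]
fn cn_char_only_filter_sketch() {
    use tantivy::tokenizer::TextAnalyzer;
    use crate::jieba_tokenizer::JiebaTokenizer;

    // Chain the refactored jieba tokenizer with the Han-only filter, as
    // chinese_analyzer does through jieba_builder().filter(CnCharOnlyFilter).
    let mut analyzer = TextAnalyzer::builder(JiebaTokenizer::new())
        .filter(CnCharOnlyFilter)
        .build();

    let mut stream = analyzer.token_stream("系统安全;,'';lxyz密码");
    while stream.advance() {
        // Tokens made of Han characters (e.g. 系统, 安全, 密码) pass through;
        // "lxyz" and the punctuation tokens fail the \p{Han}+ check and are dropped.
        println!("{}", stream.token().text);
    }
}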
internal/core/thirdparty/tantivy/tantivy-binding/src/util.rs

@@ -1,10 +1,11 @@
use std::ffi::c_void;
use std::ops::Bound;
use serde_json as json;
use crate::error::TantivyError;

use tantivy::{directory::MmapDirectory, Index};

use crate::stop_words;
use crate::error::TantivyError;

pub fn index_exist(path: &str) -> bool {
    let dir = MmapDirectory::open(path).unwrap();
    Index::exists(&dir).unwrap()
@@ -45,4 +46,23 @@ pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>, TantivyError>{
        }
    };
    Ok(str_list)
}
}

pub(crate) fn get_stop_words_list(str_list:Vec<String>) -> Vec<String>{
    let mut stop_words = Vec::new();
    for str in str_list{
        if str.len()>0 && str.chars().nth(0).unwrap() == '_'{
            match str.as_str(){
                "_english_" =>{
                    for word in stop_words::ENGLISH{
                        stop_words.push(word.to_string());
                    }
                    continue;
                }
                _other => {}
            }
        }
        stop_words.push(str);
    }
    stop_words
}
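The helper keeps ordinary entries as-is and expands the "_english_" alias into the list from stop_words.rs; unrecognized "_..._" entries fall through and are kept verbatim. A small illustration with made-up values (the test name is hypothetical):

#[test]
fn stop_words_alias_sketch() {
    use crate::util::get_stop_words_list;

    // "_english_" expands to the bundled English list; other entries pass through unchanged.
    let words = get_stop_words_list(vec!["_english_".to_string(), "的".to_string()]);
    assert!(words.contains(&"the".to_string()));  // expanded from stop_words::ENGLISH
    assert!(words.contains(&"的".to_string()));   // user-supplied entry kept as-is
}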
@@ -64,7 +64,6 @@ func NewBM25FunctionRunner(coll *schemapb.CollectionSchema, schema *schemapb.Fun
	for _, field := range coll.GetFields() {
		if field.GetFieldID() == schema.GetOutputFieldIds()[0] {
			runner.outputField = field
			break
		}

		if field.GetFieldID() == schema.GetInputFieldIds()[0] {