mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-11-29 18:38:44 +08:00
enhance: Optimize chinese analyzer and support CnAlphaNumFilter (#37727)
Related issue: https://github.com/milvus-io/milvus/issues/35853. Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
parent
0ba868ae64
commit
3b5a0df159
@ -24,7 +24,7 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
|
||||
}
|
||||
|
||||
fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
|
||||
let builder = jieba_builder().filter(CnCharOnlyFilter);
|
||||
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
|
||||
if stop_words.len() > 0{
|
||||
return builder.filter(StopWordFilter::remove(stop_words)).build();
|
||||
}
|
||||
@ -275,17 +275,17 @@ mod tests {
|
||||
}"#;
|
||||
|
||||
let tokenizer = create_tokenizer(&params.to_string());
|
||||
assert!(tokenizer.is_ok());
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
|
||||
let mut bining = tokenizer.unwrap();
|
||||
|
||||
let regex = regex::Regex::new("\\p{Han}+").unwrap();
|
||||
|
||||
let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
|
||||
|
||||
let mut results = Vec::<String>::new();
|
||||
while stream.advance(){
|
||||
let token = stream.token();
|
||||
let text = token.text.clone();
|
||||
print!("test token :{} symbol: {}\n", text.as_str(), regex.is_match(text.as_str()))
|
||||
results.push(token.text.clone());
|
||||
}
|
||||
|
||||
print!("test tokens :{:?}\n", results)
|
||||
}
|
||||
|
||||
}
|
@ -11,6 +11,7 @@ pub(crate) enum SystemFilter{
|
||||
AsciiFolding(AsciiFoldingFilter),
|
||||
AlphaNumOnly(AlphaNumOnlyFilter),
|
||||
CnCharOnly(CnCharOnlyFilter),
|
||||
CnAlphaNumOnly(CnAlphaNumOnlyFilter),
|
||||
Length(RemoveLongFilter),
|
||||
Stop(StopWordFilter),
|
||||
Decompounder(SplitCompoundWords),
|
||||
@ -24,6 +25,7 @@ impl SystemFilter{
|
||||
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
|
||||
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
|
||||
Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
|
||||
Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
|
||||
Self::Length(filter) => builder.filter(filter).dynamic(),
|
||||
Self::Stop(filter) => builder.filter(filter).dynamic(),
|
||||
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
|
||||
@ -129,6 +131,7 @@ impl From<&str> for SystemFilter{
|
||||
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
|
||||
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
|
||||
"cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
|
||||
"cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter),
|
||||
_ => Self::Invalid,
|
||||
}
|
||||
}
|
||||
@ -201,6 +204,54 @@ impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
}
|
||||
|
||||
/// Token filter that drops every token containing no Chinese (Han) character,
/// ASCII letter, or digit — e.g. pure-punctuation tokens emitted by the
/// jieba tokenizer. Tokens with at least one such character pass through
/// unchanged (see the `[\p{Han}a-zA-Z0-9]+` pattern in the wrapper below).
pub struct CnAlphaNumOnlyFilter;
|
||||
|
||||
/// Token stream adapter that skips tokens from `tail` whose text does not
/// match `regex`, i.e. tokens containing no Han character, ASCII letter,
/// or digit.
pub struct CnAlphaNumOnlyFilterStream<T> {
    // Pattern deciding whether a token is kept; a match anywhere in the
    // token text is sufficient (contains-semantics, not full-match).
    regex: regex::Regex,
    // The upstream token stream being filtered.
    tail: T,
}
|
||||
|
||||
impl TokenFilter for CnAlphaNumOnlyFilter{
|
||||
type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;
|
||||
|
||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
|
||||
CnAlphaNumOnlyFilterWrapper(tokenizer)
|
||||
}
|
||||
}
|
||||
/// Tokenizer produced by `CnAlphaNumOnlyFilter::transform`: delegates
/// tokenization to the inner tokenizer `T` and filters the resulting stream.
#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);
|
||||
|
||||
impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
|
||||
type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
CnAlphaNumOnlyFilterStream {
|
||||
regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
|
||||
tail: self.0.token_stream(text),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
|
||||
fn advance(&mut self) -> bool {
|
||||
while self.tail.advance() {
|
||||
if self.regex.is_match(&self.tail.token().text) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user