enhance: optimize self defined rust error (#37975)

Prepare for issue: https://github.com/milvus-io/milvus/issues/37930

Signed-off-by: sunby <sunbingyi1992@gmail.com>
Bingyi Sun 2024-11-28 20:30:36 +08:00 committed by GitHub
parent 84698c072a
commit e6af806a0d
4 changed files with 262 additions and 203 deletions

View File

@@ -1,40 +1,35 @@

Removed (old string-based error type):

use serde_json as json;

#[derive(Debug)]
pub struct TantivyError{
    reason: String,
}

impl TantivyError{
    fn new(reason:String) -> Self{
        TantivyError{reason:reason}
    }

    pub fn reason(&self) -> String{
        return self.reason.clone()
    }
}

impl From<&str> for TantivyError{
    fn from(value: &str) -> Self {
        Self::new(value.to_string())
    }
}

impl From<String> for TantivyError{
    fn from(value: String) -> Self {
        Self::new(value)
    }
}

impl From<json::Error> for TantivyError{
    fn from(value: json::Error) -> Self {
        Self::new(value.to_string())
    }
}

impl ToString for TantivyError{
    fn to_string(&self) -> String {
        return self.reason()
    }
}

Added (new enum error type with a crate-wide Result alias):

use core::fmt;
use serde_json as json;

#[derive(Debug)]
pub enum TantivyBindingError {
    JsonError(serde_json::Error),
    InternalError(String),
}

impl From<serde_json::Error> for TantivyBindingError {
    fn from(value: serde_json::Error) -> Self {
        TantivyBindingError::JsonError(value)
    }
}

impl fmt::Display for TantivyBindingError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            TantivyBindingError::JsonError(e) => write!(f, "JsonError: {}", e),
            TantivyBindingError::InternalError(e) => write!(f, "InternalError: {}", e),
        }
    }
}

impl std::error::Error for TantivyBindingError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            TantivyBindingError::JsonError(e) => Some(e),
            TantivyBindingError::InternalError(_) => None,
        }
    }
}

pub type Result<T> = std::result::Result<T, TantivyBindingError>;
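The old struct carried only a `reason: String` and discarded the original `serde_json::Error`; the new enum keeps the source error and plugs into `std::error::Error`. Below is a minimal sketch of how crate code can lean on the `Result<T>` alias and the `From<serde_json::Error>` impl via the `?` operator — the helper function is hypothetical and not part of this commit:

```rust
use serde_json as json;

use crate::error::{Result, TantivyBindingError};

// Hypothetical helper, for illustration only: parse a params string and insist on a JSON map.
fn parse_params(raw: &str) -> Result<json::Map<String, json::Value>> {
    // `?` converts serde_json::Error into TantivyBindingError::JsonError automatically.
    let value: json::Value = json::from_str(raw)?;
    value.as_object().cloned().ok_or_else(|| {
        TantivyBindingError::InternalError("params should be a json map".to_string())
    })
}
```

Because `source()` is implemented, callers can also walk back to the underlying JSON error instead of matching on a formatted string, which was all the old `reason()` method offered.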

View File

(Post-change contents of the affected hunks are shown below; the pre-change code returned Result<_, TantivyError> and built errors from plain strings via .into().)

@@ -1,113 +1,123 @@
use log::warn;
use serde_json as json;
use std::collections::HashMap;
use tantivy::tokenizer::StopWordFilter;
use tantivy::tokenizer::*;

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::jieba_tokenizer::JiebaTokenizer;
use crate::stop_words;
use crate::tokenizer_filter::*;
use crate::util::*;

// default build-in analyzer
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder().filter(LowerCaser);

    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
    let builder = standard_builder()
        .filter(LowerCaser)
        .filter(Stemmer::new(Language::English))
        .filter(StopWordFilter::remove(
            stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
        ));

    if stop_words.len() > 0 {
        return builder.filter(StopWordFilter::remove(stop_words)).build();
    }

    builder.build()
}

fn standard_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}

fn whitespace_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}

fn jieba_builder() -> TextAnalyzerBuilder {
    TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}

fn get_builder_by_name(name: &String) -> Result<TextAnalyzerBuilder> {
    match name.as_str() {
        "standard" => Ok(standard_builder()),
        "whitespace" => Ok(whitespace_builder()),
        "jieba" => Ok(jieba_builder()),
        other => {
            warn!("unsupported tokenizer: {}", other);
            Err(TantivyBindingError::InternalError(format!(
                "unsupported tokenizer: {}",
                other
            )))
        }
    }
}

struct AnalyzerBuilder<'a> {
    // builder: TextAnalyzerBuilder
    filters: HashMap<String, SystemFilter>,
    params: &'a json::Map<String, json::Value>,
}

impl AnalyzerBuilder<'_> {
    fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder {
        AnalyzerBuilder {
            filters: HashMap::new(),
            params: params,
        }
    }

    fn get_tokenizer_name(&self) -> Result<String> {
        let tokenizer = self.params.get("tokenizer");
        if tokenizer.is_none() {
            return Ok("standard".to_string());
        }
        if !tokenizer.unwrap().is_string() {
            return Err(TantivyBindingError::InternalError(format!(
                "tokenizer name should be string"
            )));
        }

        Ok(tokenizer.unwrap().as_str().unwrap().to_string())
    }

    fn add_custom_filter(
        &mut self,
        name: &String,
        params: &json::Map<String, json::Value>,
    ) -> Result<()> {
        match SystemFilter::try_from(params) {
            Ok(filter) => {
                self.filters.insert(name.to_string(), filter);
                Ok(())
            }
            Err(e) => Err(e),
        }
    }

    fn add_custom_filters(&mut self, params: &json::Map<String, json::Value>) -> Result<()> {
        for (name, value) in params {
            if !value.is_object() {
                continue;
            }
            self.add_custom_filter(name, value.as_object().unwrap())?;

@@ -115,138 +125,155 @@ impl AnalyzerBuilder<'_> {
        }
        Ok(())
    }

    fn build_filter(
        &mut self,
        mut builder: TextAnalyzerBuilder,
        params: &json::Value,
    ) -> Result<TextAnalyzerBuilder> {
        if !params.is_array() {
            return Err(TantivyBindingError::InternalError(
                "filter params should be array".to_string(),
            ));
        }

        let filters = params.as_array().unwrap();
        for filter in filters {
            if filter.is_string() {
                let filter_name = filter.as_str().unwrap();
                let costum = self.filters.remove(filter_name);
                if !costum.is_none() {
                    builder = costum.unwrap().transform(builder);
                    continue;
                }
                // check if filter was system filter
                let system = SystemFilter::from(filter_name);
                match system {
                    SystemFilter::Invalid => {
                        return Err(TantivyBindingError::InternalError(format!(
                            "build analyzer failed, filter not found :{}",
                            filter_name
                        )))
                    }
                    other => {
                        builder = other.transform(builder);
                    }
                }
            } else if filter.is_object() {
                let filter = SystemFilter::try_from(filter.as_object().unwrap())?;
                builder = filter.transform(builder);
            }
        }
        Ok(builder)
    }

    fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder> {
        for (key, value) in self.params {
            match key.as_str() {
                "tokenizer" => {}
                "filter" => {
                    // build with filter if filter param exist
                    builder = self.build_filter(builder, value)?;
                }
                other => {
                    return Err(TantivyBindingError::InternalError(format!(
                        "unknown analyzer option key: {}",
                        other
                    )))
                }
            }
        }
        Ok(builder)
    }

    fn get_stop_words_option(&self) -> Result<Vec<String>> {
        let value = self.params.get("stop_words");
        match value {
            Some(value) => {
                let str_list = get_string_list(value, "filter stop_words")?;
                Ok(get_stop_words_list(str_list))
            }
            None => Ok(vec![]),
        }
    }

    fn build_template(self, type_: &str) -> Result<TextAnalyzer> {
        match type_ {
            "standard" => Ok(standard_analyzer(self.get_stop_words_option()?)),
            "chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)),
            "english" => Ok(english_analyzer(self.get_stop_words_option()?)),
            other_ => Err(TantivyBindingError::InternalError(format!(
                "unknown build-in analyzer type: {}",
                other_
            ))),
        }
    }

    fn build(mut self) -> Result<TextAnalyzer> {
        // build base build-in analyzer
        match self.params.get("type") {
            Some(type_) => {
                if !type_.is_string() {
                    return Err(TantivyBindingError::InternalError(format!(
                        "analyzer type shoud be string"
                    )));
                }
                return self.build_template(type_.as_str().unwrap());
            }
            None => {}
        };

        //build custom analyzer
        let tokenizer_name = self.get_tokenizer_name()?;
        let mut builder = get_builder_by_name(&tokenizer_name)?;

        // build with option
        builder = self.build_option(builder)?;
        Ok(builder.build())
    }
}

pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer> {
    match json::from_str::<json::Value>(&params) {
        Ok(value) => {
            if value.is_null() {
                return Ok(standard_analyzer(vec![]));
            }
            if !value.is_object() {
                return Err(TantivyBindingError::InternalError(
                    "tokenizer params should be a json map".to_string(),
                ));
            }
            let json_params = value.as_object().unwrap();

            // create builder
            let analyzer_params = json_params.get("analyzer");
            if analyzer_params.is_none() {
                return Ok(standard_analyzer(vec![]));
            }
            if !analyzer_params.unwrap().is_object() {
                return Err(TantivyBindingError::InternalError(
                    "analyzer params should be a json map".to_string(),
                ));
            }
            let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap());

            // build custom filter
            let filter_params = json_params.get("filter");
            if !filter_params.is_none() && filter_params.unwrap().is_object() {
                builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
            }

            // build analyzer
            builder.build()
        }
        Err(err) => Err(err.into()),
    }
}

pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer> {
    if params.len() == 0 {
        return Ok(standard_analyzer(vec![]));
    }
    create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))

@@ -265,7 +292,7 @@ mod tests {
        }"#;

        let tokenizer = create_tokenizer(&params.to_string());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
    }

    #[test]

@@ -275,17 +302,16 @@ mod tests {
        }"#;

        let tokenizer = create_tokenizer(&params.to_string());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());

        let mut bining = tokenizer.unwrap();
        let mut stream = bining.token_stream("系统安全;,'';lxyz密码");

        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        print!("test tokens :{:?}\n", results)
    }
}
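With every builder method now returning the crate-wide `Result`, a caller can tell a malformed JSON config apart from an unsupported option by matching on the enum variants. A rough sketch of what that might look like from inside the binding crate — the `crate::tokenizer` module path and the example config are assumptions, not taken from this diff, and `create_tokenizer` is `pub(crate)`, so this only compiles within the crate:

```rust
use crate::error::TantivyBindingError;
use crate::tokenizer::create_tokenizer; // assumed module path

fn print_tokens() {
    // Config shape mirrors the build-in analyzer options handled above.
    let params = r#"{"type": "english", "stop_words": ["_english_"]}"#;
    match create_tokenizer(&params.to_string()) {
        Ok(mut analyzer) => {
            let mut stream = analyzer.token_stream("tantivy binding error handling");
            while stream.advance() {
                println!("token: {}", stream.token().text);
            }
        }
        // Distinct variants replace the old opaque reason() string.
        Err(TantivyBindingError::JsonError(e)) => eprintln!("invalid JSON: {}", e),
        Err(TantivyBindingError::InternalError(msg)) => eprintln!("analyzer error: {}", msg),
    }
}
```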

View File

(Post-change contents of the affected hunks.)

@@ -1,11 +1,12 @@
use regex;
use serde_json as json;
use tantivy::tokenizer::*;

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::util::*;

pub(crate) enum SystemFilter {
    Invalid,
    LowerCase(LowerCaser),
    AsciiFolding(AsciiFoldingFilter),

@@ -15,16 +16,16 @@ pub(crate) enum SystemFilter {
    Length(RemoveLongFilter),
    Stop(StopWordFilter),
    Decompounder(SplitCompoundWords),
    Stemmer(Stemmer),
}

impl SystemFilter {
    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
        match self {
            Self::LowerCase(filter) => builder.filter(filter).dynamic(),
            Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
            Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
            Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
            Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
            Self::Length(filter) => builder.filter(filter).dynamic(),
            Self::Stop(filter) => builder.filter(filter).dynamic(),

@@ -41,65 +42,85 @@ impl SystemFilter {
//     "max": 10, // length
// }
// TODO support min length
fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let limit_str = params.get("max");
    if limit_str.is_none() || !limit_str.unwrap().is_u64() {
        return Err(TantivyBindingError::InternalError(
            "lenth max param was none or not uint".to_string(),
        ));
    }
    let limit = limit_str.unwrap().as_u64().unwrap() as usize;
    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1)))
}

fn get_stop_words_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("stop_words");
    if value.is_none() {
        return Err(TantivyBindingError::InternalError(
            "stop filter stop_words can't be empty".to_string(),
        ));
    }
    let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
    Ok(SystemFilter::Stop(StopWordFilter::remove(
        get_stop_words_list(str_list),
    )))
}

fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("word_list");
    if value.is_none() || !value.unwrap().is_array() {
        return Err(TantivyBindingError::InternalError(
            "decompounder word list should be array".to_string(),
        ));
    }

    let stop_words = value.unwrap().as_array().unwrap();
    let mut str_list = Vec::<String>::new();
    for element in stop_words {
        match element.as_str() {
            Some(word) => str_list.push(word.to_string()),
            None => {
                return Err(TantivyBindingError::InternalError(
                    "decompounder word list item should be string".to_string(),
                ))
            }
        }
    }

    match SplitCompoundWords::from_dictionary(str_list) {
        Ok(f) => Ok(SystemFilter::Decompounder(f)),
        Err(e) => Err(TantivyBindingError::InternalError(format!(
            "create decompounder failed: {}",
            e.to_string()
        ))),
    }
}

fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("language");
    if value.is_none() || !value.unwrap().is_string() {
        return Err(TantivyBindingError::InternalError(
            "stemmer language field should be string".to_string(),
        ));
    }

    match value.unwrap().as_str().unwrap().into_language() {
        Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
        Err(e) => Err(TantivyBindingError::InternalError(format!(
            "create stemmer failed : {}",
            e.to_string()
        ))),
    }
}

trait LanguageParser {
    type Error;
    fn into_language(self) -> Result<Language>;
}

impl LanguageParser for &str {
    type Error = TantivyBindingError;
    fn into_language(self) -> Result<Language> {
        match self.to_lowercase().as_str() {
            "arabig" => Ok(Language::Arabic),
            "danish" => Ok(Language::Danish),

@@ -119,14 +140,17 @@ impl LanguageParser for &str {
            "swedish" => Ok(Language::Swedish),
            "tamil" => Ok(Language::Tamil),
            "turkish" => Ok(Language::Turkish),
            other => Err(TantivyBindingError::InternalError(format!(
                "unsupport language: {}",
                other
            ))),
        }
    }
}

impl From<&str> for SystemFilter {
    fn from(value: &str) -> Self {
        match value {
            "lowercase" => Self::LowerCase(LowerCaser),
            "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
            "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),

@@ -138,24 +162,31 @@ impl From<&str> for SystemFilter {
}

impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
    type Error = TantivyBindingError;

    fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> {
        match params.get(&"type".to_string()) {
            Some(value) => {
                if !value.is_string() {
                    return Err(TantivyBindingError::InternalError(
                        "filter type should be string".to_string(),
                    ));
                };
                match value.as_str().unwrap() {
                    "length" => get_length_filter(params),
                    "stop" => get_stop_words_filter(params),
                    "decompounder" => get_decompounder_filter(params),
                    "stemmer" => get_stemmer_filter(params),
                    other => Err(TantivyBindingError::InternalError(format!(
                        "unsupport filter type: {}",
                        other
                    ))),
                }
            }
            None => Err(TantivyBindingError::InternalError(
                "no type field in filter params".to_string(),
            )),
        }
    }
}

@@ -167,7 +198,7 @@ pub struct CnCharOnlyFilterStream<T> {
    tail: T,
}

impl TokenFilter for CnCharOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {

@@ -216,7 +247,7 @@ pub struct CnAlphaNumOnlyFilterStream<T> {
    tail: T,
}

impl TokenFilter for CnAlphaNumOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {

@@ -255,4 +286,4 @@ impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
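Filter construction follows the same pattern: each factory returns `Result<SystemFilter>` and reports problems as `TantivyBindingError::InternalError` with a descriptive message. Below is a small hedged sketch of feeding a JSON filter definition through the `TryFrom` impl above, the same shape `AnalyzerBuilder::build_filter` consumes; the helper name and literal values are illustrative, and `SystemFilter` is `pub(crate)`, so this lives inside the crate:

```rust
use serde_json as json;

use crate::error::Result;
use crate::tokenizer_filter::SystemFilter;

// Illustrative only: build a length filter from a JSON object description.
fn length_filter_from_json() -> Result<SystemFilter> {
    let raw = r#"{"type": "length", "max": 10}"#;
    // `?` maps the serde_json::Error into TantivyBindingError::JsonError.
    let params: json::Map<String, json::Value> = json::from_str(raw)?;
    SystemFilter::try_from(&params)
}
```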

View File

(Post-change contents of the affected hunks; the section is truncated mid-function in the source.)

@@ -1,10 +1,11 @@
use std::ffi::c_void;
use std::ops::Bound;

use serde_json as json;
use tantivy::{directory::MmapDirectory, Index};

use crate::error::Result;
use crate::error::TantivyBindingError;
use crate::stop_words;

pub fn index_exist(path: &str) -> bool {
    let dir = MmapDirectory::open(path).unwrap();

@@ -32,29 +33,35 @@ pub fn free_binding<T>(ptr: *mut c_void) {
    }
}

pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
    if !value.is_array() {
        return Err(TantivyBindingError::InternalError(
            format!("{} should be array", label).to_string(),
        ));
    }

    let stop_words = value.as_array().unwrap();
    let mut str_list = Vec::<String>::new();
    for element in stop_words {
        match element.as_str() {
            Some(word) => str_list.push(word.to_string()),
            None => {
                return Err(TantivyBindingError::InternalError(
                    format!("{} list item should be string", label).to_string(),
                ))
            }
        }
    }
    Ok(str_list)
}

pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
    let mut stop_words = Vec::new();
    for str in str_list {
        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
            match str.as_str() {
                "_english_" => {
                    for word in stop_words::ENGLISH {
                        stop_words.push(word.to_string());
                    }
                    continue;