enhance: optimize self defined rust error (#37975)

Prepare for issue: https://github.com/milvus-io/milvus/issues/37930

Signed-off-by: sunby <sunbingyi1992@gmail.com>

commit e6af806a0d
parent 84698c072a
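This change replaces the string-based TantivyError with a structured TantivyBindingError enum plus a crate-wide Result<T> alias (see the error module diff below). As a minimal sketch of what call sites gain, assuming only the types introduced by this diff; the tokenizer_params helper itself is hypothetical and not part of the commit:

use serde_json as json;

use crate::error::{Result, TantivyBindingError};

// Hypothetical helper showing the post-refactor call-site style.
fn tokenizer_params(raw: &str) -> Result<json::Map<String, json::Value>> {
    // `?` converts serde_json::Error into TantivyBindingError::JsonError
    // through the new From impl, so the source error is preserved instead
    // of being flattened into a String.
    let value: json::Value = json::from_str(raw)?;
    value.as_object().cloned().ok_or_else(|| {
        TantivyBindingError::InternalError("tokenizer params should be a json map".to_string())
    })
}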
@@ -1,40 +1,35 @@
+use core::fmt;
 use serde_json as json;
 
 #[derive(Debug)]
-pub struct TantivyError{
-    reason: String,
+pub enum TantivyBindingError {
+    JsonError(serde_json::Error),
+    InternalError(String),
 }
 
-impl TantivyError{
-    fn new(reason:String) -> Self{
-        TantivyError{reason:reason}
-    }
-
-    pub fn reason(&self) -> String{
-        return self.reason.clone()
+impl From<serde_json::Error> for TantivyBindingError {
+    fn from(value: serde_json::Error) -> Self {
+        TantivyBindingError::JsonError(value)
     }
 }
 
-impl From<&str> for TantivyError{
-    fn from(value: &str) -> Self {
-        Self::new(value.to_string())
+impl fmt::Display for TantivyBindingError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            TantivyBindingError::JsonError(e) => write!(f, "JsonError: {}", e),
+            TantivyBindingError::InternalError(e) => write!(f, "InternalError: {}", e),
+        }
     }
 }
 
-impl From<String> for TantivyError{
-    fn from(value: String) -> Self {
-        Self::new(value)
+impl std::error::Error for TantivyBindingError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            TantivyBindingError::JsonError(e) => Some(e),
+            TantivyBindingError::InternalError(_) => None,
+        }
     }
 }
 
-impl From<json::Error> for TantivyError{
-    fn from(value: json::Error) -> Self {
-        Self::new(value.to_string())
-    }
-}
-
-impl ToString for TantivyError{
-    fn to_string(&self) -> String {
-        return self.reason()
-    }
-}
+pub type Result<T> = std::result::Result<T, TantivyBindingError>;
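A quick illustration of how the two impls above behave at runtime; the describe function is a hypothetical example, not part of the diff:

fn describe(err: &TantivyBindingError) {
    // Display formats as "JsonError: ..." or "InternalError: ..." per the
    // fmt::Display impl above.
    println!("{}", err);
    // source() yields the wrapped serde_json::Error for JsonError and None
    // for InternalError, enabling error-chain inspection.
    if let Some(src) = std::error::Error::source(err) {
        println!("caused by: {}", src);
    }
}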
@@ -1,113 +1,123 @@
 use log::warn;
-use std::collections::HashMap;
-use tantivy::tokenizer::*;
-use tantivy::tokenizer::StopWordFilter;
 use serde_json as json;
+use std::collections::HashMap;
+use tantivy::tokenizer::StopWordFilter;
+use tantivy::tokenizer::*;
 
+use crate::error::Result;
+use crate::error::TantivyBindingError;
+use crate::jieba_tokenizer::JiebaTokenizer;
 use crate::stop_words;
 use crate::tokenizer_filter::*;
-use crate::jieba_tokenizer::JiebaTokenizer;
-use crate::error::TantivyError;
 use crate::util::*;
 
 // default build-in analyzer
 pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
-    let builder = standard_builder()
-        .filter(LowerCaser);
+    let builder = standard_builder().filter(LowerCaser);
 
-    if stop_words.len() > 0{
+    if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
 
     builder.build()
 }
 
-fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
+fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
     let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
-    if stop_words.len() > 0{
+    if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
 
     builder.build()
 }
 
-fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
+fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
     let builder = standard_builder()
         .filter(LowerCaser)
         .filter(Stemmer::new(Language::English))
-        .filter(StopWordFilter::remove(stop_words::ENGLISH.iter().map(|&word| word.to_owned())));
+        .filter(StopWordFilter::remove(
+            stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
+        ));
 
-    if stop_words.len() > 0{
+    if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
 
     builder.build()
 }
 
-fn standard_builder() -> TextAnalyzerBuilder{
+fn standard_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
 }
 
-fn whitespace_builder()-> TextAnalyzerBuilder{
+fn whitespace_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
 }
 
-fn jieba_builder() -> TextAnalyzerBuilder{
+fn jieba_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
 }
 
-fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
+fn get_builder_by_name(name: &String) -> Result<TextAnalyzerBuilder> {
     match name.as_str() {
         "standard" => Ok(standard_builder()),
         "whitespace" => Ok(whitespace_builder()),
         "jieba" => Ok(jieba_builder()),
         other => {
             warn!("unsupported tokenizer: {}", other);
-            Err(format!("unsupported tokenizer: {}", other).into())
+            Err(TantivyBindingError::InternalError(format!(
+                "unsupported tokenizer: {}",
+                other
+            )))
         }
     }
 }
 
-struct AnalyzerBuilder<'a>{
+struct AnalyzerBuilder<'a> {
     // builder: TextAnalyzerBuilder
-    filters:HashMap<String, SystemFilter>,
-    params:&'a json::Map<String, json::Value>
+    filters: HashMap<String, SystemFilter>,
+    params: &'a json::Map<String, json::Value>,
 }
 
-impl AnalyzerBuilder<'_>{
-    fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder{
-        AnalyzerBuilder{
+impl AnalyzerBuilder<'_> {
+    fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder {
+        AnalyzerBuilder {
             filters: HashMap::new(),
-            params:params,
+            params: params,
         }
     }
 
-    fn get_tokenizer_name(&self) -> Result<String, TantivyError>{
-        let tokenizer=self.params.get("tokenizer");
-        if tokenizer.is_none(){
+    fn get_tokenizer_name(&self) -> Result<String> {
+        let tokenizer = self.params.get("tokenizer");
+        if tokenizer.is_none() {
             return Ok("standard".to_string());
         }
-        if !tokenizer.unwrap().is_string(){
-            return Err(format!("tokenizer name should be string").into());
+        if !tokenizer.unwrap().is_string() {
+            return Err(TantivyBindingError::InternalError(format!(
+                "tokenizer name should be string"
+            )));
         }
 
         Ok(tokenizer.unwrap().as_str().unwrap().to_string())
     }
 
-    fn add_custom_filter(&mut self, name: &String, params: &json::Map<String, json::Value>) -> Result<(),TantivyError>{
-        match SystemFilter::try_from(params){
+    fn add_custom_filter(
+        &mut self,
+        name: &String,
+        params: &json::Map<String, json::Value>,
+    ) -> Result<()> {
+        match SystemFilter::try_from(params) {
             Ok(filter) => {
                 self.filters.insert(name.to_string(), filter);
                 Ok(())
-            },
-            Err(e) => {Err(e)},
+            }
+            Err(e) => Err(e),
         }
     }
 
-    fn add_custom_filters(&mut self, params:&json::Map<String, json::Value>) -> Result<(),TantivyError>{
-        for (name, value) in params{
-            if !value.is_object(){
+    fn add_custom_filters(&mut self, params: &json::Map<String, json::Value>) -> Result<()> {
+        for (name, value) in params {
+            if !value.is_object() {
                 continue;
            }
            self.add_custom_filter(name, value.as_object().unwrap())?;
@@ -115,138 +125,155 @@ impl AnalyzerBuilder<'_>{
         Ok(())
     }
 
-    fn build_filter(&mut self,mut builder: TextAnalyzerBuilder, params: &json::Value) -> Result<TextAnalyzerBuilder, TantivyError>{
-        if !params.is_array(){
-            return Err("filter params should be array".into());
+    fn build_filter(
+        &mut self,
+        mut builder: TextAnalyzerBuilder,
+        params: &json::Value,
+    ) -> Result<TextAnalyzerBuilder> {
+        if !params.is_array() {
+            return Err(TantivyBindingError::InternalError(
+                "filter params should be array".to_string(),
+            ));
         }
 
         let filters = params.as_array().unwrap();
 
-        for filter in filters{
-            if filter.is_string(){
+        for filter in filters {
+            if filter.is_string() {
                 let filter_name = filter.as_str().unwrap();
                 let costum = self.filters.remove(filter_name);
-                if !costum.is_none(){
+                if !costum.is_none() {
                     builder = costum.unwrap().transform(builder);
                     continue;
                 }
 
                 // check if filter was system filter
                 let system = SystemFilter::from(filter_name);
                 match system {
                     SystemFilter::Invalid => {
-                        return Err(format!("build analyzer failed, filter not found :{}", filter_name).into())
+                        return Err(TantivyBindingError::InternalError(format!(
+                            "build analyzer failed, filter not found :{}",
+                            filter_name
+                        )))
                     }
                     other => {
                         builder = other.transform(builder);
-                    },
+                    }
                 }
-            }else if filter.is_object(){
-                let filter=SystemFilter::try_from(filter.as_object().unwrap())?;
+            } else if filter.is_object() {
+                let filter = SystemFilter::try_from(filter.as_object().unwrap())?;
                 builder = filter.transform(builder);
             }
         };
         Ok(builder)
     }
 
-    fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder, TantivyError>{
-        for (key, value) in self.params{
-            match key.as_str(){
-                "tokenizer" => {},
-                "filter" => {
-                    // build with filter if filter param exist
-                    builder=self.build_filter(builder, value)?;
-                },
-                other => return Err(format!("unknown analyzer option key: {}", other).into()),
-            }
-        }
-        Ok(builder)
-    }
-
-    fn get_stop_words_option(&self) -> Result<Vec<String>, TantivyError>{
+    fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder> {
+        for (key, value) in self.params {
+            match key.as_str() {
+                "tokenizer" => {}
+                "filter" => {
+                    // build with filter if filter param exist
+                    builder = self.build_filter(builder, value)?;
+                }
+                other => {
+                    return Err(TantivyBindingError::InternalError(format!(
+                        "unknown analyzer option key: {}",
+                        other
+                    )))
+                }
+            }
+        }
+        Ok(builder)
+    }
+
+    fn get_stop_words_option(&self) -> Result<Vec<String>> {
         let value = self.params.get("stop_words");
-        match value{
-            Some(value)=>{
+        match value {
+            Some(value) => {
                 let str_list = get_string_list(value, "filter stop_words")?;
                 Ok(get_stop_words_list(str_list))
             }
-            None => Ok(vec![])
+            None => Ok(vec![]),
         }
     }
 
-    fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
-        match type_{
-            "standard" => {
-                Ok(standard_analyzer(self.get_stop_words_option()?))
-            },
-            "chinese" => {
-                Ok(chinese_analyzer(self.get_stop_words_option()?))
-            },
-            "english" => {
-                Ok(english_analyzer(self.get_stop_words_option()?))
-            }
-            other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
+    fn build_template(self, type_: &str) -> Result<TextAnalyzer> {
+        match type_ {
+            "standard" => Ok(standard_analyzer(self.get_stop_words_option()?)),
+            "chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)),
+            "english" => Ok(english_analyzer(self.get_stop_words_option()?)),
+            other_ => Err(TantivyBindingError::InternalError(format!(
+                "unknown build-in analyzer type: {}",
+                other_
+            ))),
         }
     }
 
-    fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
+    fn build(mut self) -> Result<TextAnalyzer> {
         // build base build-in analyzer
-        match self.params.get("type"){
-            Some(type_) =>{
-                if !type_.is_string(){
-                    return Err(format!("analyzer type shoud be string").into())
+        match self.params.get("type") {
+            Some(type_) => {
+                if !type_.is_string() {
+                    return Err(TantivyBindingError::InternalError(format!(
+                        "analyzer type shoud be string"
+                    )));
                 }
                 return self.build_template(type_.as_str().unwrap());
-            },
+            }
             None => {}
         };
 
         //build custom analyzer
         let tokenizer_name = self.get_tokenizer_name()?;
-        let mut builder=get_builder_by_name(&tokenizer_name)?;
-
+        let mut builder = get_builder_by_name(&tokenizer_name)?;
 
         // build with option
         builder = self.build_option(builder)?;
         Ok(builder.build())
     }
 }
 
-pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer, TantivyError> {
-    match json::from_str::<json::Value>(&params){
-        Ok(value) =>{
-            if value.is_null(){
+pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer> {
+    match json::from_str::<json::Value>(&params) {
+        Ok(value) => {
+            if value.is_null() {
                 return Ok(standard_analyzer(vec![]));
             }
-            if !value.is_object(){
-                return Err("tokenizer params should be a json map".into());
+            if !value.is_object() {
+                return Err(TantivyBindingError::InternalError(
+                    "tokenizer params should be a json map".to_string(),
+                ));
            }
            let json_params = value.as_object().unwrap();
 
            // create builder
-            let analyzer_params=json_params.get("analyzer");
-            if analyzer_params.is_none(){
+            let analyzer_params = json_params.get("analyzer");
+            if analyzer_params.is_none() {
                return Ok(standard_analyzer(vec![]));
            }
-            if !analyzer_params.unwrap().is_object(){
-                return Err("analyzer params should be a json map".into());
+            if !analyzer_params.unwrap().is_object() {
+                return Err(TantivyBindingError::InternalError(
+                    "analyzer params should be a json map".to_string(),
+                ));
            }
            let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap());
 
            // build custom filter
-            let filter_params=json_params.get("filter");
-            if !filter_params.is_none() && filter_params.unwrap().is_object(){
+            let filter_params = json_params.get("filter");
+            if !filter_params.is_none() && filter_params.unwrap().is_object() {
                builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
            }
 
            // build analyzer
            builder.build()
-        },
+        }
         Err(err) => Err(err.into()),
     }
 }
 
-pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
-    if params.len()==0{
+pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer> {
+    if params.len() == 0 {
         return Ok(standard_analyzer(vec![]));
     }
     create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
@@ -265,7 +292,7 @@ mod tests {
     }"#;
 
         let tokenizer = create_tokenizer(&params.to_string());
-        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
     }
 
     #[test]
@@ -275,17 +302,16 @@ mod tests {
     }"#;
 
         let tokenizer = create_tokenizer(&params.to_string());
-        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
         let mut bining = tokenizer.unwrap();
         let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
-
 
         let mut results = Vec::<String>::new();
-        while stream.advance(){
+        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }
 
        print!("test tokens :{:?}\n", results)
    }
 }
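For reference, a sketch of input that exercises the custom-analyzer path above. The parameter values are illustrative, assembled from the keys the builder actually reads (tokenizer, filter, type, stop_words), and the snippet assumes it runs inside a function returning the crate's Result:

let params = r#"{
    "tokenizer": "jieba",
    "filter": ["lowercase", {"type": "stop", "stop_words": ["_english_"]}]
}"#;
// create_tokenizer wraps this as {"analyzer": ...}; AnalyzerBuilder then
// resolves the jieba tokenizer, applies the lowercase system filter, and
// expands "_english_" into the built-in English stop-word list.
let analyzer = create_tokenizer(&params.to_string())?;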
@@ -1,11 +1,12 @@
-use tantivy::tokenizer::*;
-use serde_json as json;
 use regex;
+use serde_json as json;
+use tantivy::tokenizer::*;
 
-use crate::error::TantivyError;
+use crate::error::Result;
+use crate::error::TantivyBindingError;
 use crate::util::*;
 
-pub(crate) enum SystemFilter{
+pub(crate) enum SystemFilter {
     Invalid,
     LowerCase(LowerCaser),
     AsciiFolding(AsciiFoldingFilter),
@@ -15,16 +16,16 @@ pub(crate) enum SystemFilter{
     Length(RemoveLongFilter),
     Stop(StopWordFilter),
     Decompounder(SplitCompoundWords),
-    Stemmer(Stemmer)
+    Stemmer(Stemmer),
 }
 
-impl SystemFilter{
-    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder{
-        match self{
+impl SystemFilter {
+    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
+        match self {
             Self::LowerCase(filter) => builder.filter(filter).dynamic(),
             Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
             Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
             Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
             Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
             Self::Length(filter) => builder.filter(filter).dynamic(),
             Self::Stop(filter) => builder.filter(filter).dynamic(),
@@ -41,65 +42,85 @@ impl SystemFilter{
 // "max": 10, // length
 // }
 // TODO support min length
-fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter, TantivyError>{
+fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let limit_str = params.get("max");
-    if limit_str.is_none() || !limit_str.unwrap().is_u64(){
-        return Err("lenth max param was none or not uint".into())
+    if limit_str.is_none() || !limit_str.unwrap().is_u64() {
+        return Err(TantivyBindingError::InternalError(
+            "lenth max param was none or not uint".to_string(),
+        ));
     }
     let limit = limit_str.unwrap().as_u64().unwrap() as usize;
-    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit+1)))
+    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1)))
 }
 
-fn get_stop_words_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
+fn get_stop_words_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let value = params.get("stop_words");
-    if value.is_none(){
-        return Err("stop filter stop_words can't be empty".into());
+    if value.is_none() {
+        return Err(TantivyBindingError::InternalError(
+            "stop filter stop_words can't be empty".to_string(),
+        ));
     }
     let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
-    Ok(SystemFilter::Stop(StopWordFilter::remove(get_stop_words_list(str_list))))
+    Ok(SystemFilter::Stop(StopWordFilter::remove(
+        get_stop_words_list(str_list),
+    )))
 }
 
-fn get_decompounder_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
+fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let value = params.get("word_list");
-    if value.is_none() || !value.unwrap().is_array(){
-        return Err("decompounder word list should be array".into())
+    if value.is_none() || !value.unwrap().is_array() {
+        return Err(TantivyBindingError::InternalError(
+            "decompounder word list should be array".to_string(),
+        ));
     }
 
     let stop_words = value.unwrap().as_array().unwrap();
     let mut str_list = Vec::<String>::new();
-    for element in stop_words{
-        match element.as_str(){
+    for element in stop_words {
+        match element.as_str() {
             Some(word) => str_list.push(word.to_string()),
-            None => return Err("decompounder word list item should be string".into())
+            None => {
+                return Err(TantivyBindingError::InternalError(
+                    "decompounder word list item should be string".to_string(),
+                ))
+            }
         }
-    };
+    }
 
-    match SplitCompoundWords::from_dictionary(str_list){
+    match SplitCompoundWords::from_dictionary(str_list) {
         Ok(f) => Ok(SystemFilter::Decompounder(f)),
-        Err(e) => Err(format!("create decompounder failed: {}", e.to_string()).into())
+        Err(e) => Err(TantivyBindingError::InternalError(format!(
+            "create decompounder failed: {}",
+            e.to_string()
+        ))),
     }
 }
 
-fn get_stemmer_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
+fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let value = params.get("language");
-    if value.is_none() || !value.unwrap().is_string(){
-        return Err("stemmer language field should be string".into())
+    if value.is_none() || !value.unwrap().is_string() {
+        return Err(TantivyBindingError::InternalError(
+            "stemmer language field should be string".to_string(),
+        ));
     }
 
-    match value.unwrap().as_str().unwrap().into_language(){
+    match value.unwrap().as_str().unwrap().into_language() {
         Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
-        Err(e) => Err(format!("create stemmer failed : {}", e.to_string()).into()),
+        Err(e) => Err(TantivyBindingError::InternalError(format!(
+            "create stemmer failed : {}",
+            e.to_string()
+        ))),
     }
 }
 
 trait LanguageParser {
     type Error;
-    fn into_language(self) -> Result<Language, Self::Error>;
+    fn into_language(self) -> Result<Language>;
 }
 
 impl LanguageParser for &str {
-    type Error = TantivyError;
-    fn into_language(self) -> Result<Language, Self::Error> {
+    type Error = TantivyBindingError;
+    fn into_language(self) -> Result<Language> {
         match self.to_lowercase().as_str() {
             "arabig" => Ok(Language::Arabic),
             "danish" => Ok(Language::Danish),
@@ -119,14 +140,17 @@ impl LanguageParser for &str {
             "swedish" => Ok(Language::Swedish),
             "tamil" => Ok(Language::Tamil),
             "turkish" => Ok(Language::Turkish),
-            other => Err(format!("unsupport language: {}", other).into()),
+            other => Err(TantivyBindingError::InternalError(format!(
+                "unsupport language: {}",
+                other
+            ))),
         }
     }
 }
 
-impl From<&str> for SystemFilter{
+impl From<&str> for SystemFilter {
     fn from(value: &str) -> Self {
-        match value{
+        match value {
             "lowercase" => Self::LowerCase(LowerCaser),
             "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
             "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
@@ -138,24 +162,31 @@ impl From<&str> for SystemFilter{
 }
 
 impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
-    type Error = TantivyError;
+    type Error = TantivyBindingError;
 
-    fn try_from(params: &json::Map<String, json::Value>) -> Result<Self, Self::Error> {
-        match params.get(&"type".to_string()){
-            Some(value) =>{
-                if !value.is_string(){
-                    return Err("filter type should be string".into());
+    fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> {
+        match params.get(&"type".to_string()) {
+            Some(value) => {
+                if !value.is_string() {
+                    return Err(TantivyBindingError::InternalError(
+                        "filter type should be string".to_string(),
+                    ));
                 };
 
-                match value.as_str().unwrap(){
+                match value.as_str().unwrap() {
                     "length" => get_length_filter(params),
                     "stop" => get_stop_words_filter(params),
                     "decompounder" => get_decompounder_filter(params),
                     "stemmer" => get_stemmer_filter(params),
-                    other=> Err(format!("unsupport filter type: {}", other).into()),
+                    other => Err(TantivyBindingError::InternalError(format!(
+                        "unsupport filter type: {}",
+                        other
+                    ))),
                 }
             }
-            None => Err("no type field in filter params".into()),
+            None => Err(TantivyBindingError::InternalError(
+                "no type field in filter params".to_string(),
+            )),
         }
     }
 }
@@ -167,7 +198,7 @@ pub struct CnCharOnlyFilterStream<T> {
     tail: T,
 }
 
-impl TokenFilter for CnCharOnlyFilter{
+impl TokenFilter for CnCharOnlyFilter {
     type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;
 
     fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
@@ -216,7 +247,7 @@ pub struct CnAlphaNumOnlyFilterStream<T> {
     tail: T,
 }
 
-impl TokenFilter for CnAlphaNumOnlyFilter{
+impl TokenFilter for CnAlphaNumOnlyFilter {
     type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;
 
     fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
@@ -255,4 +286,4 @@ impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
     fn token_mut(&mut self) -> &mut Token {
         self.tail.token_mut()
     }
 }
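As a sketch of the TryFrom dispatch above, assuming it runs inside a function returning the crate's Result; the values are illustrative and mirror the "max": 10 example in the source comment:

let raw = r#"{"type": "length", "max": 10}"#;
let params = json::from_str::<json::Value>(raw)?
    .as_object()
    .unwrap()
    .clone();
// "type" selects get_length_filter, which reads "max" and builds
// SystemFilter::Length(RemoveLongFilter::limit(10 + 1)).
let filter = SystemFilter::try_from(&params)?;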
@@ -1,10 +1,11 @@
+use serde_json as json;
 use std::ffi::c_void;
 use std::ops::Bound;
-use serde_json as json;
 use tantivy::{directory::MmapDirectory, Index};
 
+use crate::error::Result;
+use crate::error::TantivyBindingError;
 use crate::stop_words;
-use crate::error::TantivyError;
 
 pub fn index_exist(path: &str) -> bool {
     let dir = MmapDirectory::open(path).unwrap();
@@ -32,29 +33,35 @@ pub fn free_binding<T>(ptr: *mut c_void) {
     }
 }
 
-pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>, TantivyError>{
-    if !value.is_array(){
-        return Err(format!("{} should be array", label).into())
+pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
+    if !value.is_array() {
+        return Err(TantivyBindingError::InternalError(
+            format!("{} should be array", label).to_string(),
+        ));
     }
 
     let stop_words = value.as_array().unwrap();
     let mut str_list = Vec::<String>::new();
-    for element in stop_words{
-        match element.as_str(){
+    for element in stop_words {
+        match element.as_str() {
             Some(word) => str_list.push(word.to_string()),
-            None => return Err(format!("{} list item should be string", label).into())
+            None => {
+                return Err(TantivyBindingError::InternalError(
+                    format!("{} list item should be string", label).to_string(),
+                ))
+            }
         }
-    };
+    }
     Ok(str_list)
 }
 
-pub(crate) fn get_stop_words_list(str_list:Vec<String>) -> Vec<String>{
+pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
     let mut stop_words = Vec::new();
-    for str in str_list{
-        if str.len()>0 && str.chars().nth(0).unwrap() == '_'{
-            match str.as_str(){
-                "_english_" =>{
-                    for word in stop_words::ENGLISH{
+    for str in str_list {
+        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
+            match str.as_str() {
+                "_english_" => {
+                    for word in stop_words::ENGLISH {
                         stop_words.push(word.to_string());
                     }
                     continue;