enhance: optimize self defined rust error (#37975)
Prepare for issue: https://github.com/milvus-io/milvus/issues/37930

Signed-off-by: sunby <sunbingyi1992@gmail.com>
parent 84698c072a
commit e6af806a0d
@@ -1,40 +1,35 @@
+use core::fmt;
+
 use serde_json as json;
 
 #[derive(Debug)]
-pub struct TantivyError{
-    reason: String,
+pub enum TantivyBindingError {
+    JsonError(serde_json::Error),
+    InternalError(String),
 }
 
-impl TantivyError{
-    fn new(reason:String) -> Self{
-        TantivyError{reason:reason}
-    }
-
-    pub fn reason(&self) -> String{
-        return self.reason.clone()
-    }
-}
-
-impl From<&str> for TantivyError{
-    fn from(value: &str) -> Self {
-        Self::new(value.to_string())
-    }
-}
-
-impl From<String> for TantivyError{
-    fn from(value: String) -> Self {
-        Self::new(value)
-    }
-}
-
-impl From<json::Error> for TantivyError{
-    fn from(value: json::Error) -> Self {
-        Self::new(value.to_string())
-    }
-}
-
-impl ToString for TantivyError{
-    fn to_string(&self) -> String {
-        return self.reason()
-    }
-}
+impl From<serde_json::Error> for TantivyBindingError {
+    fn from(value: serde_json::Error) -> Self {
+        TantivyBindingError::JsonError(value)
+    }
+}
+
+impl fmt::Display for TantivyBindingError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            TantivyBindingError::JsonError(e) => write!(f, "JsonError: {}", e),
+            TantivyBindingError::InternalError(e) => write!(f, "InternalError: {}", e),
+        }
+    }
+}
+
+impl std::error::Error for TantivyBindingError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            TantivyBindingError::JsonError(e) => Some(e),
+            TantivyBindingError::InternalError(_) => None,
+        }
+    }
+}
+
+pub type Result<T> = std::result::Result<T, TantivyBindingError>;
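
A quick illustration of how the new type is meant to be consumed (a sketch, not part of this commit; it assumes the module's existing imports, and parse_params is a made-up helper): the From<serde_json::Error> impl lets the ? operator convert JSON failures automatically, while validation failures are wrapped in InternalError.

fn parse_params(input: &str) -> Result<json::Map<String, json::Value>> {
    // `?` converts serde_json::Error into TantivyBindingError::JsonError
    // through the From impl defined above.
    let value: json::Value = json::from_str(input)?;
    value.as_object().cloned().ok_or_else(|| {
        TantivyBindingError::InternalError("params should be a json map".to_string())
    })
}
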
@@ -1,113 +1,123 @@
 use log::warn;
-use std::collections::HashMap;
-use tantivy::tokenizer::*;
-use tantivy::tokenizer::StopWordFilter;
 use serde_json as json;
+use std::collections::HashMap;
+use tantivy::tokenizer::StopWordFilter;
+use tantivy::tokenizer::*;
 
+use crate::error::Result;
+use crate::error::TantivyBindingError;
+use crate::jieba_tokenizer::JiebaTokenizer;
 use crate::stop_words;
 use crate::tokenizer_filter::*;
-use crate::jieba_tokenizer::JiebaTokenizer;
-use crate::error::TantivyError;
 use crate::util::*;
 
 // default build-in analyzer
 pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
-    let builder = standard_builder()
-        .filter(LowerCaser);
+    let builder = standard_builder().filter(LowerCaser);
 
-    if stop_words.len() > 0{
+    if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
 
     builder.build()
 }
 
-fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
+fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
     let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
-    if stop_words.len() > 0{
+    if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
 
     builder.build()
 }
 
-fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer{
+fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
     let builder = standard_builder()
         .filter(LowerCaser)
         .filter(Stemmer::new(Language::English))
-        .filter(StopWordFilter::remove(stop_words::ENGLISH.iter().map(|&word| word.to_owned())));
+        .filter(StopWordFilter::remove(
+            stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
+        ));
 
-    if stop_words.len() > 0{
+    if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
 
     builder.build()
 }
 
-fn standard_builder() -> TextAnalyzerBuilder{
+fn standard_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
 }
 
-fn whitespace_builder()-> TextAnalyzerBuilder{
+fn whitespace_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
 }
 
-fn jieba_builder() -> TextAnalyzerBuilder{
+fn jieba_builder() -> TextAnalyzerBuilder {
     TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
 }
 
-fn get_builder_by_name(name:&String) -> Result<TextAnalyzerBuilder, TantivyError>{
+fn get_builder_by_name(name: &String) -> Result<TextAnalyzerBuilder> {
     match name.as_str() {
         "standard" => Ok(standard_builder()),
         "whitespace" => Ok(whitespace_builder()),
         "jieba" => Ok(jieba_builder()),
         other => {
             warn!("unsupported tokenizer: {}", other);
-            Err(format!("unsupported tokenizer: {}", other).into())
+            Err(TantivyBindingError::InternalError(format!(
+                "unsupported tokenizer: {}",
+                other
+            )))
         }
     }
 }
 
-struct AnalyzerBuilder<'a>{
+struct AnalyzerBuilder<'a> {
     // builder: TextAnalyzerBuilder
-    filters:HashMap<String, SystemFilter>,
-    params:&'a json::Map<String, json::Value>
+    filters: HashMap<String, SystemFilter>,
+    params: &'a json::Map<String, json::Value>,
 }
 
-impl AnalyzerBuilder<'_>{
-    fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder{
-        AnalyzerBuilder{
+impl AnalyzerBuilder<'_> {
+    fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder {
+        AnalyzerBuilder {
             filters: HashMap::new(),
-            params:params,
+            params: params,
         }
     }
 
-    fn get_tokenizer_name(&self) -> Result<String, TantivyError>{
-        let tokenizer=self.params.get("tokenizer");
-        if tokenizer.is_none(){
+    fn get_tokenizer_name(&self) -> Result<String> {
+        let tokenizer = self.params.get("tokenizer");
+        if tokenizer.is_none() {
             return Ok("standard".to_string());
         }
-        if !tokenizer.unwrap().is_string(){
-            return Err(format!("tokenizer name should be string").into());
+        if !tokenizer.unwrap().is_string() {
+            return Err(TantivyBindingError::InternalError(format!(
+                "tokenizer name should be string"
+            )));
         }
 
         Ok(tokenizer.unwrap().as_str().unwrap().to_string())
     }
 
-    fn add_custom_filter(&mut self, name: &String, params: &json::Map<String, json::Value>) -> Result<(),TantivyError>{
-        match SystemFilter::try_from(params){
+    fn add_custom_filter(
+        &mut self,
+        name: &String,
+        params: &json::Map<String, json::Value>,
+    ) -> Result<()> {
+        match SystemFilter::try_from(params) {
             Ok(filter) => {
                 self.filters.insert(name.to_string(), filter);
                 Ok(())
-            },
-            Err(e) => {Err(e)},
+            }
+            Err(e) => Err(e),
         }
     }
 
-    fn add_custom_filters(&mut self, params:&json::Map<String, json::Value>) -> Result<(),TantivyError>{
-        for (name, value) in params{
-            if !value.is_object(){
+    fn add_custom_filters(&mut self, params: &json::Map<String, json::Value>) -> Result<()> {
+        for (name, value) in params {
+            if !value.is_object() {
                 continue;
             }
             self.add_custom_filter(name, value.as_object().unwrap())?;
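
For context, a sketch of how a caller inside this module might react to the new error from get_builder_by_name (illustrative only, not part of the commit; the fallback policy is made up):

fn pick_builder(name: &String) -> Result<TextAnalyzerBuilder> {
    match get_builder_by_name(name) {
        Ok(builder) => Ok(builder),
        // An unknown name now arrives as a typed InternalError instead of a bare string.
        Err(e) => {
            warn!("{}, falling back to standard", e);
            Ok(standard_builder())
        }
    }
}
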
@@ -115,138 +125,155 @@ impl AnalyzerBuilder<'_>{
         Ok(())
     }
 
-    fn build_filter(&mut self,mut builder: TextAnalyzerBuilder, params: &json::Value) -> Result<TextAnalyzerBuilder, TantivyError>{
-        if !params.is_array(){
-            return Err("filter params should be array".into());
+    fn build_filter(
+        &mut self,
+        mut builder: TextAnalyzerBuilder,
+        params: &json::Value,
+    ) -> Result<TextAnalyzerBuilder> {
+        if !params.is_array() {
+            return Err(TantivyBindingError::InternalError(
+                "filter params should be array".to_string(),
+            ));
         }
 
         let filters = params.as_array().unwrap();
 
-        for filter in filters{
-            if filter.is_string(){
+        for filter in filters {
+            if filter.is_string() {
                 let filter_name = filter.as_str().unwrap();
                 let costum = self.filters.remove(filter_name);
-                if !costum.is_none(){
+                if !costum.is_none() {
                     builder = costum.unwrap().transform(builder);
                     continue;
                 }
 
                 // check if filter was system filter
                 let system = SystemFilter::from(filter_name);
                 match system {
                     SystemFilter::Invalid => {
-                        return Err(format!("build analyzer failed, filter not found :{}", filter_name).into())
+                        return Err(TantivyBindingError::InternalError(format!(
+                            "build analyzer failed, filter not found :{}",
+                            filter_name
+                        )))
                     }
                     other => {
                         builder = other.transform(builder);
-                    },
+                    }
                 }
-            }else if filter.is_object(){
-                let filter=SystemFilter::try_from(filter.as_object().unwrap())?;
+            } else if filter.is_object() {
+                let filter = SystemFilter::try_from(filter.as_object().unwrap())?;
                 builder = filter.transform(builder);
             }
-        };
+        }
         Ok(builder)
     }
 
-    fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder, TantivyError>{
-        for (key, value) in self.params{
-            match key.as_str(){
-                "tokenizer" => {},
+    fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder> {
+        for (key, value) in self.params {
+            match key.as_str() {
+                "tokenizer" => {}
                 "filter" => {
                     // build with filter if filter param exist
-                    builder=self.build_filter(builder, value)?;
-                },
-                other => return Err(format!("unknown analyzer option key: {}", other).into()),
+                    builder = self.build_filter(builder, value)?;
+                }
+                other => {
+                    return Err(TantivyBindingError::InternalError(format!(
+                        "unknown analyzer option key: {}",
+                        other
+                    )))
+                }
             }
         }
         Ok(builder)
     }
 
-    fn get_stop_words_option(&self) -> Result<Vec<String>, TantivyError>{
+    fn get_stop_words_option(&self) -> Result<Vec<String>> {
         let value = self.params.get("stop_words");
-        match value{
-            Some(value)=>{
+        match value {
+            Some(value) => {
                 let str_list = get_string_list(value, "filter stop_words")?;
                 Ok(get_stop_words_list(str_list))
             }
-            None => Ok(vec![])
+            None => Ok(vec![]),
         }
     }
 
-    fn build_template(self, type_: &str)-> Result<TextAnalyzer, TantivyError>{
-        match type_{
-            "standard" => {
-                Ok(standard_analyzer(self.get_stop_words_option()?))
-            },
-            "chinese" => {
-                Ok(chinese_analyzer(self.get_stop_words_option()?))
-            },
-            "english" => {
-                Ok(english_analyzer(self.get_stop_words_option()?))
-            }
-            other_ => Err(format!("unknown build-in analyzer type: {}", other_).into())
+    fn build_template(self, type_: &str) -> Result<TextAnalyzer> {
+        match type_ {
+            "standard" => Ok(standard_analyzer(self.get_stop_words_option()?)),
+            "chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)),
+            "english" => Ok(english_analyzer(self.get_stop_words_option()?)),
+            other_ => Err(TantivyBindingError::InternalError(format!(
+                "unknown build-in analyzer type: {}",
+                other_
+            ))),
         }
     }
 
-    fn build(mut self) -> Result<TextAnalyzer, TantivyError>{
+    fn build(mut self) -> Result<TextAnalyzer> {
         // build base build-in analyzer
-        match self.params.get("type"){
-            Some(type_) =>{
-                if !type_.is_string(){
-                    return Err(format!("analyzer type shoud be string").into())
+        match self.params.get("type") {
+            Some(type_) => {
+                if !type_.is_string() {
+                    return Err(TantivyBindingError::InternalError(format!(
+                        "analyzer type shoud be string"
+                    )));
                 }
                 return self.build_template(type_.as_str().unwrap());
-            },
+            }
             None => {}
         };
 
         //build custom analyzer
         let tokenizer_name = self.get_tokenizer_name()?;
-        let mut builder=get_builder_by_name(&tokenizer_name)?;
+        let mut builder = get_builder_by_name(&tokenizer_name)?;
 
         // build with option
         builder = self.build_option(builder)?;
         Ok(builder.build())
     }
 }
 
-pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer, TantivyError> {
-    match json::from_str::<json::Value>(&params){
-        Ok(value) =>{
-            if value.is_null(){
+pub(crate) fn create_tokenizer_with_filter(params: &String) -> Result<TextAnalyzer> {
+    match json::from_str::<json::Value>(&params) {
+        Ok(value) => {
+            if value.is_null() {
                 return Ok(standard_analyzer(vec![]));
             }
-            if !value.is_object(){
-                return Err("tokenizer params should be a json map".into());
+            if !value.is_object() {
+                return Err(TantivyBindingError::InternalError(
+                    "tokenizer params should be a json map".to_string(),
+                ));
             }
             let json_params = value.as_object().unwrap();
 
             // create builder
-            let analyzer_params=json_params.get("analyzer");
-            if analyzer_params.is_none(){
+            let analyzer_params = json_params.get("analyzer");
+            if analyzer_params.is_none() {
                 return Ok(standard_analyzer(vec![]));
             }
-            if !analyzer_params.unwrap().is_object(){
-                return Err("analyzer params should be a json map".into());
+            if !analyzer_params.unwrap().is_object() {
+                return Err(TantivyBindingError::InternalError(
+                    "analyzer params should be a json map".to_string(),
+                ));
             }
             let mut builder = AnalyzerBuilder::new(analyzer_params.unwrap().as_object().unwrap());
 
             // build custom filter
-            let filter_params=json_params.get("filter");
-            if !filter_params.is_none() && filter_params.unwrap().is_object(){
+            let filter_params = json_params.get("filter");
+            if !filter_params.is_none() && filter_params.unwrap().is_object() {
                 builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
             }
 
             // build analyzer
             builder.build()
-        },
+        }
         Err(err) => Err(err.into()),
     }
 }
 
-pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer, TantivyError> {
-    if params.len()==0{
+pub(crate) fn create_tokenizer(params: &String) -> Result<TextAnalyzer> {
+    if params.len() == 0 {
         return Ok(standard_analyzer(vec![]));
     }
     create_tokenizer_with_filter(&format!("{{\"analyzer\":{}}}", params))
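
To make the params shape concrete, a sketch of what create_tokenizer_with_filter accepts (not from the commit; the custom filter name "my_length" is invented): an "analyzer" object holding the tokenizer and its options, plus an optional top-level "filter" map that defines named custom filters referenced from the analyzer's filter list.

fn build_custom_analyzer() -> Result<TextAnalyzer> {
    let params = r#"{
        "analyzer": {
            "tokenizer": "standard",
            "filter": ["my_length", "lowercase"]
        },
        "filter": {
            "my_length": {"type": "length", "max": 10}
        }
    }"#;
    // Any JSON parse or validation failure now comes back as a TantivyBindingError.
    create_tokenizer_with_filter(&params.to_string())
}
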
@@ -265,7 +292,7 @@ mod tests {
         }"#;
 
         let tokenizer = create_tokenizer(&params.to_string());
-        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
     }
 
     #[test]
@@ -275,17 +302,16 @@ mod tests {
         }"#;
 
         let tokenizer = create_tokenizer(&params.to_string());
-        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap().reason());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
         let mut bining = tokenizer.unwrap();
         let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
 
         let mut results = Vec::<String>::new();
-        while stream.advance(){
+        while stream.advance() {
             let token = stream.token();
             results.push(token.text.clone());
         }
 
         print!("test tokens :{:?}\n", results)
     }
 }
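
The same pattern outside the tests, sketched for illustration (not part of the commit; it assumes the module's existing imports): failures now print through the Display impl instead of the removed reason() accessor.

fn demo() {
    let params = r#"{"type": "chinese"}"#.to_string();
    match create_tokenizer(&params) {
        Ok(mut analyzer) => {
            let mut stream = analyzer.token_stream("系统安全");
            while stream.advance() {
                println!("{}", stream.token().text);
            }
        }
        // TantivyBindingError implements Display, so it can be formatted directly.
        Err(e) => println!("create tokenizer failed: {}", e),
    }
}
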
@@ -1,11 +1,12 @@
-use tantivy::tokenizer::*;
-use serde_json as json;
 use regex;
+use serde_json as json;
+use tantivy::tokenizer::*;
 
-use crate::error::TantivyError;
+use crate::error::Result;
+use crate::error::TantivyBindingError;
 use crate::util::*;
 
-pub(crate) enum SystemFilter{
+pub(crate) enum SystemFilter {
     Invalid,
     LowerCase(LowerCaser),
     AsciiFolding(AsciiFoldingFilter),
@@ -15,16 +16,16 @@ pub(crate) enum SystemFilter{
     Length(RemoveLongFilter),
     Stop(StopWordFilter),
     Decompounder(SplitCompoundWords),
-    Stemmer(Stemmer)
+    Stemmer(Stemmer),
 }
 
-impl SystemFilter{
-    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder{
-        match self{
+impl SystemFilter {
+    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
+        match self {
             Self::LowerCase(filter) => builder.filter(filter).dynamic(),
             Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
             Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
             Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
             Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
             Self::Length(filter) => builder.filter(filter).dynamic(),
             Self::Stop(filter) => builder.filter(filter).dynamic(),
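
As a sketch of what transform does (illustrative, not from the commit): each variant wraps a tantivy filter and threads it onto a dynamic TextAnalyzerBuilder.

fn lowercase_only_analyzer() -> TextAnalyzer {
    let builder = TextAnalyzer::builder(SimpleTokenizer::default()).dynamic();
    // "lowercase" resolves through the From<&str> impl further down in this diff.
    SystemFilter::from("lowercase").transform(builder).build()
}
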
@@ -41,65 +42,85 @@ impl SystemFilter{
 //      "max": 10, // length
 // }
 // TODO support min length
-fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter, TantivyError>{
+fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let limit_str = params.get("max");
-    if limit_str.is_none() || !limit_str.unwrap().is_u64(){
-        return Err("lenth max param was none or not uint".into())
+    if limit_str.is_none() || !limit_str.unwrap().is_u64() {
+        return Err(TantivyBindingError::InternalError(
+            "lenth max param was none or not uint".to_string(),
+        ));
     }
     let limit = limit_str.unwrap().as_u64().unwrap() as usize;
-    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit+1)))
+    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1)))
 }
 
-fn get_stop_words_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
+fn get_stop_words_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let value = params.get("stop_words");
-    if value.is_none(){
-        return Err("stop filter stop_words can't be empty".into());
+    if value.is_none() {
+        return Err(TantivyBindingError::InternalError(
+            "stop filter stop_words can't be empty".to_string(),
+        ));
     }
     let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
-    Ok(SystemFilter::Stop(StopWordFilter::remove(get_stop_words_list(str_list))))
+    Ok(SystemFilter::Stop(StopWordFilter::remove(
+        get_stop_words_list(str_list),
+    )))
 }
 
-fn get_decompounder_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
+fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let value = params.get("word_list");
-    if value.is_none() || !value.unwrap().is_array(){
-        return Err("decompounder word list should be array".into())
+    if value.is_none() || !value.unwrap().is_array() {
+        return Err(TantivyBindingError::InternalError(
+            "decompounder word list should be array".to_string(),
+        ));
     }
 
     let stop_words = value.unwrap().as_array().unwrap();
     let mut str_list = Vec::<String>::new();
-    for element in stop_words{
-        match element.as_str(){
+    for element in stop_words {
+        match element.as_str() {
             Some(word) => str_list.push(word.to_string()),
-            None => return Err("decompounder word list item should be string".into())
+            None => {
+                return Err(TantivyBindingError::InternalError(
+                    "decompounder word list item should be string".to_string(),
+                ))
+            }
         }
-    };
+    }
 
-    match SplitCompoundWords::from_dictionary(str_list){
+    match SplitCompoundWords::from_dictionary(str_list) {
         Ok(f) => Ok(SystemFilter::Decompounder(f)),
-        Err(e) => Err(format!("create decompounder failed: {}", e.to_string()).into())
+        Err(e) => Err(TantivyBindingError::InternalError(format!(
+            "create decompounder failed: {}",
+            e.to_string()
+        ))),
     }
 }
 
-fn get_stemmer_filter(params: &json::Map<String, json::Value>)-> Result<SystemFilter, TantivyError>{
+fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
     let value = params.get("language");
-    if value.is_none() || !value.unwrap().is_string(){
-        return Err("stemmer language field should be string".into())
+    if value.is_none() || !value.unwrap().is_string() {
+        return Err(TantivyBindingError::InternalError(
+            "stemmer language field should be string".to_string(),
+        ));
     }
 
-    match value.unwrap().as_str().unwrap().into_language(){
+    match value.unwrap().as_str().unwrap().into_language() {
         Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
-        Err(e) => Err(format!("create stemmer failed : {}", e.to_string()).into()),
+        Err(e) => Err(TantivyBindingError::InternalError(format!(
+            "create stemmer failed : {}",
+            e.to_string()
+        ))),
     }
 }
 
 trait LanguageParser {
     type Error;
-    fn into_language(self) -> Result<Language, Self::Error>;
+    fn into_language(self) -> Result<Language>;
 }
 
 impl LanguageParser for &str {
-    type Error = TantivyError;
-    fn into_language(self) -> Result<Language, Self::Error> {
+    type Error = TantivyBindingError;
+    fn into_language(self) -> Result<Language> {
         match self.to_lowercase().as_str() {
             "arabig" => Ok(Language::Arabic),
             "danish" => Ok(Language::Danish),
@@ -119,14 +140,17 @@ impl LanguageParser for &str {
             "swedish" => Ok(Language::Swedish),
             "tamil" => Ok(Language::Tamil),
             "turkish" => Ok(Language::Turkish),
-            other => Err(format!("unsupport language: {}", other).into()),
+            other => Err(TantivyBindingError::InternalError(format!(
+                "unsupport language: {}",
+                other
+            ))),
         }
     }
 }
 
-impl From<&str> for SystemFilter{
+impl From<&str> for SystemFilter {
     fn from(value: &str) -> Self {
-        match value{
+        match value {
             "lowercase" => Self::LowerCase(LowerCaser),
             "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
             "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
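
A sketch of the JSON shapes these getters expect (concrete values are illustrative only, not part of the commit; assumes the module's existing imports). A malformed spec now surfaces as TantivyBindingError::InternalError rather than a stringly-typed error.

fn example_filter_specs() -> Result<()> {
    // "stop_words" must be an array of strings; "_english_" expands to the built-in list.
    let stop: json::Value = json::from_str(r#"{"type": "stop", "stop_words": ["_english_"]}"#)?;
    // "language" must be a string that the LanguageParser above recognizes.
    let stem: json::Value = json::from_str(r#"{"type": "stemmer", "language": "danish"}"#)?;
    get_stop_words_filter(stop.as_object().unwrap())?;
    get_stemmer_filter(stem.as_object().unwrap())?;
    Ok(())
}
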
@@ -138,24 +162,31 @@ impl From<&str> for SystemFilter{
 }
 
 impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
-    type Error = TantivyError;
+    type Error = TantivyBindingError;
 
-    fn try_from(params: &json::Map<String, json::Value>) -> Result<Self, Self::Error> {
-        match params.get(&"type".to_string()){
-            Some(value) =>{
-                if !value.is_string(){
-                    return Err("filter type should be string".into());
+    fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> {
+        match params.get(&"type".to_string()) {
+            Some(value) => {
+                if !value.is_string() {
+                    return Err(TantivyBindingError::InternalError(
+                        "filter type should be string".to_string(),
+                    ));
                 };
 
-                match value.as_str().unwrap(){
+                match value.as_str().unwrap() {
                     "length" => get_length_filter(params),
                     "stop" => get_stop_words_filter(params),
                     "decompounder" => get_decompounder_filter(params),
                     "stemmer" => get_stemmer_filter(params),
-                    other=> Err(format!("unsupport filter type: {}", other).into()),
+                    other => Err(TantivyBindingError::InternalError(format!(
+                        "unsupport filter type: {}",
+                        other
+                    ))),
                 }
             }
-            None => Err("no type field in filter params".into()),
+            None => Err(TantivyBindingError::InternalError(
+                "no type field in filter params".to_string(),
+            )),
         }
     }
 }
@@ -167,7 +198,7 @@ pub struct CnCharOnlyFilterStream<T> {
     tail: T,
 }
 
-impl TokenFilter for CnCharOnlyFilter{
+impl TokenFilter for CnCharOnlyFilter {
     type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;
 
     fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
@@ -216,7 +247,7 @@ pub struct CnAlphaNumOnlyFilterStream<T> {
     tail: T,
 }
 
-impl TokenFilter for CnAlphaNumOnlyFilter{
+impl TokenFilter for CnAlphaNumOnlyFilter {
     type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;
 
     fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
@@ -255,4 +286,4 @@ impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
     fn token_mut(&mut self) -> &mut Token {
         self.tail.token_mut()
     }
 }
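
And the dispatch entry point, sketched for illustration (not part of the commit; assumes the module's existing imports): TryFrom reads the "type" field and routes to the getters above, with the crate Result alias carrying the typed error.

fn length_filter_from_spec() -> Result<SystemFilter> {
    let spec: json::Value = json::from_str(r#"{"type": "length", "max": 10}"#)?;
    SystemFilter::try_from(spec.as_object().unwrap())
}
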
@@ -1,10 +1,11 @@
+use serde_json as json;
 use std::ffi::c_void;
 use std::ops::Bound;
-use serde_json as json;
 
 use tantivy::{directory::MmapDirectory, Index};
 
+use crate::error::Result;
+use crate::error::TantivyBindingError;
 use crate::stop_words;
-use crate::error::TantivyError;
 
 pub fn index_exist(path: &str) -> bool {
     let dir = MmapDirectory::open(path).unwrap();
@@ -32,29 +33,35 @@ pub fn free_binding<T>(ptr: *mut c_void) {
     }
 }
 
-pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>, TantivyError>{
-    if !value.is_array(){
-        return Err(format!("{} should be array", label).into())
+pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
+    if !value.is_array() {
+        return Err(TantivyBindingError::InternalError(
+            format!("{} should be array", label).to_string(),
+        ));
     }
 
     let stop_words = value.as_array().unwrap();
     let mut str_list = Vec::<String>::new();
-    for element in stop_words{
-        match element.as_str(){
+    for element in stop_words {
+        match element.as_str() {
             Some(word) => str_list.push(word.to_string()),
-            None => return Err(format!("{} list item should be string", label).into())
+            None => {
+                return Err(TantivyBindingError::InternalError(
+                    format!("{} list item should be string", label).to_string(),
+                ))
+            }
         }
-    };
+    }
     Ok(str_list)
 }
 
-pub(crate) fn get_stop_words_list(str_list:Vec<String>) -> Vec<String>{
+pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
     let mut stop_words = Vec::new();
-    for str in str_list{
-        if str.len()>0 && str.chars().nth(0).unwrap() == '_'{
-            match str.as_str(){
-                "_english_" =>{
-                    for word in stop_words::ENGLISH{
+    for str in str_list {
+        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
+            match str.as_str() {
+                "_english_" => {
+                    for word in stop_words::ENGLISH {
                         stop_words.push(word.to_string());
                     }
                     continue;
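
A sketch of this helper pair in use (illustrative, not part of the commit; assumes the module's existing imports): get_string_list validates a JSON array of strings, and get_stop_words_list expands the "_english_" placeholder into the built-in English stop-word set.

fn english_stop_words() -> Result<Vec<String>> {
    // JSON errors convert automatically via From<serde_json::Error>.
    let value: json::Value = json::from_str(r#"["_english_"]"#)?;
    let raw = get_string_list(&value, "stop_words")?;
    Ok(get_stop_words_list(raw))
}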