diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs index 966e51dff2..6f7ee645f5 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/analyzer.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use tantivy::tokenizer::*; use super::{build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer}; +use crate::analyzer::filter::{get_stop_words_list, get_string_list}; use crate::error::Result; use crate::error::TantivyBindingError; @@ -36,32 +37,6 @@ impl AnalyzerBuilder<'_> { ))) } - fn add_custom_filter( - &mut self, - name: &String, - params: &json::Map, - ) -> Result<()> { - match SystemFilter::try_from(params) { - Ok(filter) => { - self.filters.insert(name.to_string(), filter); - Ok(()) - } - Err(e) => Err(e), - } - } - - // not used now - // support add custom filter with filter name - fn add_custom_filters(&mut self, params: &json::Map) -> Result<()> { - for (name, value) in params { - if !value.is_object() { - continue; - } - self.add_custom_filter(name, value.as_object().unwrap())?; - } - Ok(()) - } - fn build_filter( &mut self, mut builder: TextAnalyzerBuilder, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/decompounder_dict.txt b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/decompounder_dict.txt new file mode 100644 index 0000000000..7f78a5a397 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/decompounder_dict.txt @@ -0,0 +1,2 @@ +bank +note \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/stop_words_dict.txt b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/stop_words_dict.txt new file mode 100644 index 0000000000..9a35d9b5f8 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/stop_words_dict.txt @@ -0,0 +1,7 @@ +this +a +an +the +is +in +of \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/synonyms_dict.txt b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/synonyms_dict.txt new file mode 100644 index 0000000000..25ca19d16d --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/synonyms_dict.txt @@ -0,0 +1,2 @@ +distance, range, span, length +interval => gap \ No newline at end of file diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/cn_char_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/cn_char_filter.rs new file mode 100644 index 0000000000..d557b2bbcd --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/cn_char_filter.rs @@ -0,0 +1,98 @@ +use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; + +pub struct CnCharOnlyFilter; + +pub struct CnCharOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnCharOnlyFilter { + type Tokenizer = CnCharOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnCharOnlyFilterWrapper { + CnCharOnlyFilterWrapper(tokenizer) + } +} + +#[derive(Clone)] +pub struct CnCharOnlyFilterWrapper(T); + +impl Tokenizer for CnCharOnlyFilterWrapper { + type TokenStream<'a> = CnCharOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnCharOnlyFilterStream { + regex: regex::Regex::new("\\p{Han}+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for CnCharOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} + +pub struct CnAlphaNumOnlyFilter; + +pub struct CnAlphaNumOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnAlphaNumOnlyFilter { + type Tokenizer = CnAlphaNumOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper { + CnAlphaNumOnlyFilterWrapper(tokenizer) + } +} +#[derive(Clone)] +pub struct CnAlphaNumOnlyFilterWrapper(T); + +impl Tokenizer for CnAlphaNumOnlyFilterWrapper { + type TokenStream<'a> = CnAlphaNumOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnAlphaNumOnlyFilterStream { + regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for CnAlphaNumOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/decompounder_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/decompounder_filter.rs new file mode 100644 index 0000000000..af521c7295 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/decompounder_filter.rs @@ -0,0 +1,97 @@ +use super::filter::FilterBuilder; +use super::util::read_line_file; +use crate::error::{Result, TantivyBindingError}; +use serde_json as json; +use tantivy::tokenizer::SplitCompoundWords; + +const WORD_LIST_KEY: &str = "word_list"; +const WORD_LIST_FILE_KEY: &str = "word_list_file"; + +impl FilterBuilder for SplitCompoundWords { + fn from_json(params: &json::Map) -> Result { + let mut dict = Vec::::new(); + if let Some(value) = params.get(WORD_LIST_KEY) { + if !value.is_array() { + return Err(TantivyBindingError::InternalError( + "decompounder word list should be array".to_string(), + )); + } + let words = value.as_array().unwrap(); + for element in words { + if let Some(word) = element.as_str() { + dict.push(word.to_string()); + } else { + return Err(TantivyBindingError::InternalError( + "decompounder word list item should be string".to_string(), + )); + } + } + } + + if let Some(file_params) = params.get(WORD_LIST_FILE_KEY) { + read_line_file(&mut dict, file_params, "decompounder word list file")?; + } + + if dict.is_empty() { + return Err(TantivyBindingError::InternalError( + "decompounder word list is empty".to_string(), + )); + } + + SplitCompoundWords::from_dictionary(dict).map_err(|e| { + TantivyBindingError::InternalError(format!( + "create decompounder failed: {}", + e.to_string() + )) + }) + } +} + +#[cfg(test)] +mod tests { + use super::SplitCompoundWords; + use crate::analyzer::filter::FilterBuilder; + use crate::analyzer::tokenizers::standard_builder; + use crate::log::init_log; + use serde_json as json; + use std::collections::HashSet; + use std::path::Path; + + #[test] + fn test_decompounder_filter_with_file() { + init_log(); + let file_dir = Path::new(file!()).parent().unwrap(); + let decompounder_path = file_dir.join("../data/test/decompounder_dict.txt"); + let decompounder_path_str = decompounder_path.to_string_lossy().to_string(); + let params = format!( + r#"{{ + "type": "decompounder", + "word_list_file": {{ + "type": "local", + "path": "{decompounder_path_str}" + }} + }}"# + ); + let json_params = json::from_str::(¶ms).unwrap(); + // let filter = SplitCompoundWords::from_dictionary(vec!["bank", "note"]); + let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap()); + assert!(filter.is_ok(), "error: {}", filter.err().unwrap()); + let builder = standard_builder().filter(filter.unwrap()); + let mut analyzer = builder.build(); + let mut stream = analyzer.token_stream("banknote"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + assert_eq!( + results + .iter() + .map(|s| s.as_str()) + .collect::>(), + HashSet::from(["bank", "note"]) + ); + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/filter.rs index 75f2328817..2dd9b2f5ec 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/filter.rs @@ -1,8 +1,9 @@ use serde_json as json; use tantivy::tokenizer::*; -use super::util::*; -use super::{RegexFilter, RemovePunctFilter, SynonymFilter}; +use super::{ + CnAlphaNumOnlyFilter, CnCharOnlyFilter, RegexFilter, RemovePunctFilter, SynonymFilter, +}; use crate::error::{Result, TantivyBindingError}; pub(crate) enum SystemFilter { @@ -21,6 +22,12 @@ pub(crate) enum SystemFilter { Synonym(SynonymFilter), } +pub(crate) trait FilterBuilder { + fn from_json(params: &json::Map) -> Result + where + Self: Sized; +} + impl SystemFilter { pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder { match self { @@ -58,19 +65,6 @@ fn get_length_filter(params: &json::Map) -> Result) -> Result { - let value = params.get("stop_words"); - if value.is_none() { - return Err(TantivyBindingError::InternalError( - "stop filter stop_words can't be empty".to_string(), - )); - } - let str_list = get_string_list(value.unwrap(), "stop_words filter")?; - Ok(SystemFilter::Stop(StopWordFilter::remove( - get_stop_words_list(str_list), - ))) -} - fn get_decompounder_filter(params: &json::Map) -> Result { let value = params.get("word_list"); if value.is_none() || !value.unwrap().is_array() { @@ -82,13 +76,12 @@ fn get_decompounder_filter(params: &json::Map) -> Result::new(); for element in stop_words { - match element.as_str() { - Some(word) => str_list.push(word.to_string()), - _ => { - return Err(TantivyBindingError::InternalError( - "decompounder word list item should be string".to_string(), - )) - } + if let Some(word) = element.as_str() { + str_list.push(word.to_string()); + } else { + return Err(TantivyBindingError::InternalError( + "decompounder word list item should be string".to_string(), + )); } } @@ -101,57 +94,7 @@ fn get_decompounder_filter(params: &json::Map) -> Result) -> Result { - let value = params.get("language"); - if value.is_none() || !value.unwrap().is_string() { - return Err(TantivyBindingError::InternalError( - "stemmer language field should be string".to_string(), - )); - } - - match value.unwrap().as_str().unwrap().into_language() { - Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))), - Err(e) => Err(TantivyBindingError::InternalError(format!( - "create stemmer failed : {}", - e.to_string() - ))), - } -} - -trait LanguageParser { - fn into_language(self) -> Result; -} - -impl LanguageParser for &str { - fn into_language(self) -> Result { - match self.to_lowercase().as_str() { - "arabic" => Ok(Language::Arabic), - "arabig" => Ok(Language::Arabic), // typo - "danish" => Ok(Language::Danish), - "dutch" => Ok(Language::Dutch), - "english" => Ok(Language::English), - "finnish" => Ok(Language::Finnish), - "french" => Ok(Language::French), - "german" => Ok(Language::German), - "greek" => Ok(Language::Greek), - "hungarian" => Ok(Language::Hungarian), - "italian" => Ok(Language::Italian), - "norwegian" => Ok(Language::Norwegian), - "portuguese" => Ok(Language::Portuguese), - "romanian" => Ok(Language::Romanian), - "russian" => Ok(Language::Russian), - "spanish" => Ok(Language::Spanish), - "swedish" => Ok(Language::Swedish), - "tamil" => Ok(Language::Tamil), - "turkish" => Ok(Language::Turkish), - other => Err(TantivyBindingError::InternalError(format!( - "unsupport language: {}", - other - ))), - } - } -} - +// fetch build-in filter from string impl From<&str> for SystemFilter { fn from(value: &str) -> Self { match value { @@ -180,9 +123,11 @@ impl TryFrom<&json::Map> for SystemFilter { match value.as_str().unwrap() { "length" => get_length_filter(params), - "stop" => get_stop_words_filter(params), - "decompounder" => get_decompounder_filter(params), - "stemmer" => get_stemmer_filter(params), + "stop" => StopWordFilter::from_json(params).map(|f| SystemFilter::Stop(f)), + "decompounder" => { + SplitCompoundWords::from_json(params).map(|f| SystemFilter::Decompounder(f)) + } + "stemmer" => Stemmer::from_json(params).map(|f| SystemFilter::Stemmer(f)), "regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)), "synonym" => SynonymFilter::from_json(params).map(|f| SystemFilter::Synonym(f)), other => Err(TantivyBindingError::InternalError(format!( @@ -197,100 +142,3 @@ impl TryFrom<&json::Map> for SystemFilter { } } } - -pub struct CnCharOnlyFilter; - -pub struct CnCharOnlyFilterStream { - regex: regex::Regex, - tail: T, -} - -impl TokenFilter for CnCharOnlyFilter { - type Tokenizer = CnCharOnlyFilterWrapper; - - fn transform(self, tokenizer: T) -> CnCharOnlyFilterWrapper { - CnCharOnlyFilterWrapper(tokenizer) - } -} - -#[derive(Clone)] -pub struct CnCharOnlyFilterWrapper(T); - -impl Tokenizer for CnCharOnlyFilterWrapper { - type TokenStream<'a> = CnCharOnlyFilterStream>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - CnCharOnlyFilterStream { - regex: regex::Regex::new("\\p{Han}+").unwrap(), - tail: self.0.token_stream(text), - } - } -} - -impl TokenStream for CnCharOnlyFilterStream { - fn advance(&mut self) -> bool { - while self.tail.advance() { - if self.regex.is_match(&self.tail.token().text) { - return true; - } - } - - false - } - - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } -} - -pub struct CnAlphaNumOnlyFilter; - -pub struct CnAlphaNumOnlyFilterStream { - regex: regex::Regex, - tail: T, -} - -impl TokenFilter for CnAlphaNumOnlyFilter { - type Tokenizer = CnAlphaNumOnlyFilterWrapper; - - fn transform(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper { - CnAlphaNumOnlyFilterWrapper(tokenizer) - } -} -#[derive(Clone)] -pub struct CnAlphaNumOnlyFilterWrapper(T); - -impl Tokenizer for CnAlphaNumOnlyFilterWrapper { - type TokenStream<'a> = CnAlphaNumOnlyFilterStream>; - - fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { - CnAlphaNumOnlyFilterStream { - regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(), - tail: self.0.token_stream(text), - } - } -} - -impl TokenStream for CnAlphaNumOnlyFilterStream { - fn advance(&mut self) -> bool { - while self.tail.advance() { - if self.regex.is_match(&self.tail.token().text) { - return true; - } - } - - false - } - - fn token(&self) -> &Token { - self.tail.token() - } - - fn token_mut(&mut self) -> &mut Token { - self.tail.token_mut() - } -} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs index 8103535e61..624725e4f0 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs @@ -1,14 +1,19 @@ +mod cn_char_filter; +mod decompounder_filter; mod filter; mod regex_filter; mod remove_punct_filter; +mod stemmer_filter; +mod stop_word_filter; +pub mod stop_words; mod synonym_filter; mod util; -pub mod stop_words; - +pub(crate) use cn_char_filter::{CnAlphaNumOnlyFilter, CnCharOnlyFilter}; use regex_filter::RegexFilter; use remove_punct_filter::RemovePunctFilter; use synonym_filter::SynonymFilter; pub(crate) use filter::*; +pub(crate) use stop_word_filter::get_stop_words_list; pub(crate) use util::*; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stemmer_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stemmer_filter.rs new file mode 100644 index 0000000000..fced37a4a7 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stemmer_filter.rs @@ -0,0 +1,57 @@ +use super::filter::FilterBuilder; +use crate::error::{Result, TantivyBindingError}; +use serde_json as json; +use tantivy::tokenizer::{Language, Stemmer}; + +impl FilterBuilder for Stemmer { + fn from_json(params: &json::Map) -> Result { + let value = params.get("language"); + if value.is_none() || !value.unwrap().is_string() { + return Err(TantivyBindingError::InternalError( + "stemmer language field should be string".to_string(), + )); + } + + match value.unwrap().as_str().unwrap().into_language() { + Ok(language) => Ok(Stemmer::new(language)), + Err(e) => Err(TantivyBindingError::InternalError(format!( + "create stemmer failed : {}", + e.to_string() + ))), + } + } +} + +trait StemmerLanguageParser { + fn into_language(self) -> Result; +} + +impl StemmerLanguageParser for &str { + fn into_language(self) -> Result { + match self.to_lowercase().as_str() { + "arabic" => Ok(Language::Arabic), + "arabig" => Ok(Language::Arabic), // typo + "danish" => Ok(Language::Danish), + "dutch" => Ok(Language::Dutch), + "english" => Ok(Language::English), + "finnish" => Ok(Language::Finnish), + "french" => Ok(Language::French), + "german" => Ok(Language::German), + "greek" => Ok(Language::Greek), + "hungarian" => Ok(Language::Hungarian), + "italian" => Ok(Language::Italian), + "norwegian" => Ok(Language::Norwegian), + "portuguese" => Ok(Language::Portuguese), + "romanian" => Ok(Language::Romanian), + "russian" => Ok(Language::Russian), + "spanish" => Ok(Language::Spanish), + "swedish" => Ok(Language::Swedish), + "tamil" => Ok(Language::Tamil), + "turkish" => Ok(Language::Turkish), + other => Err(TantivyBindingError::InternalError(format!( + "unsupport language: {}", + other + ))), + } + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stop_word_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stop_word_filter.rs new file mode 100644 index 0000000000..ce6eba56dd --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stop_word_filter.rs @@ -0,0 +1,94 @@ +use super::filter::FilterBuilder; +use super::stop_words::fetch_language_stop_words; +use super::util::*; +use crate::error::{Result, TantivyBindingError}; +use serde_json as json; +use tantivy::tokenizer::StopWordFilter; + +const STOP_WORDS_LIST_KEY: &str = "stop_words"; +const STOP_WORDS_FILE_KEY: &str = "stop_words_file"; + +pub(crate) fn get_stop_words_list(str_list: Vec) -> Vec { + let mut stop_words = Vec::new(); + for str in str_list { + if str.len() > 0 && str.chars().nth(0).unwrap() == '_' { + match fetch_language_stop_words(str.as_str()) { + Some(words) => { + for word in words { + stop_words.push(word.to_string()); + } + continue; + } + None => {} + } + } + stop_words.push(str); + } + stop_words +} + +impl FilterBuilder for StopWordFilter { + fn from_json(params: &json::Map) -> Result { + let mut dict = Vec::::new(); + if let Some(value) = params.get(STOP_WORDS_LIST_KEY) { + dict = get_stop_words_list(get_string_list(value, "stop_words")?); + } + + if let Some(file_params) = params.get(STOP_WORDS_FILE_KEY) { + read_line_file(&mut dict, file_params, "stop words dict file")?; + } + + Ok(StopWordFilter::remove(dict)) + } +} + +#[cfg(test)] +mod tests { + use super::StopWordFilter; + use crate::analyzer::filter::FilterBuilder; + use crate::analyzer::tokenizers::standard_builder; + use crate::log::init_log; + use serde_json as json; + use std::collections::HashSet; + use std::path::Path; + + #[test] + fn test_stop_words_filter_with_file() { + init_log(); + let file_dir = Path::new(file!()).parent().unwrap(); + let stop_words_path = file_dir.join("../data/test/stop_words_dict.txt"); + let stop_words_path_str = stop_words_path.to_string_lossy().to_string(); + let params = format!( + r#"{{ + "type": "stop_words", + "stop_words_file": {{ + "type": "local", + "path": "{stop_words_path_str}" + }} + }}"# + ); + + let json_params = json::from_str::(¶ms).unwrap(); + let filter = StopWordFilter::from_json(json_params.as_object().unwrap()); + assert!(filter.is_ok(), "error: {}", filter.err().unwrap()); + + let builder = standard_builder().filter(filter.unwrap()); + let mut analyzer = builder.build(); + let mut stream = analyzer + .token_stream("this is a simple test of the stop words filter in an indexing system"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + assert_eq!( + results + .iter() + .map(|s| s.as_str()) + .collect::>(), + HashSet::from(["simple", "test", "stop", "words", "filter", "indexing", "system"]) + ); + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/synonym_filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/synonym_filter.rs index bc50bd5f09..7db17dde5f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/synonym_filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/synonym_filter.rs @@ -1,6 +1,8 @@ +use crate::analyzer::options::get_resource_path; use crate::error::{Result, TantivyBindingError}; use serde_json as json; use std::collections::{HashMap, HashSet}; +use std::io::BufRead; use std::sync::Arc; use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; @@ -197,6 +199,23 @@ impl SynonymDict { } } +fn read_synonyms_file(builder: &mut SynonymDictBuilder, params: &json::Value) -> Result<()> { + let path = get_resource_path(params, "synonyms dict file")?; + let file = std::fs::File::open(path)?; + let reader = std::io::BufReader::new(file); + for line in reader.lines() { + if let Ok(row_data) = line { + builder.add_row(&row_data)?; + } else { + return Err(TantivyBindingError::InternalError(format!( + "read synonyms dict file failed, error: {}", + line.unwrap_err().to_string() + ))); + } + } + Ok(()) +} + #[derive(Clone)] pub struct SynonymFilter { dict: Arc, @@ -226,6 +245,10 @@ impl SynonymFilter { })?; } + if let Some(file_params) = params.get("synonyms_file") { + read_synonyms_file(&mut builder, file_params)?; + } + Ok(SynonymFilter { dict: Arc::new(builder.build()), }) @@ -331,6 +354,7 @@ mod tests { use crate::log::init_log; use serde_json as json; use std::collections::HashSet; + use std::path::Path; #[test] fn test_synonym_filter() { @@ -361,4 +385,41 @@ mod tests { HashSet::from(["\\test", "translate", "=>", "synonym"]) ); } + + #[test] + fn test_synonym_filter_with_file() { + init_log(); + let file_dir = Path::new(file!()).parent().unwrap(); + let synonyms_path = file_dir.join("../data/test/synonyms_dict.txt"); + let synonyms_path_str = synonyms_path.to_string_lossy().to_string(); + let params = format!( + r#"{{ + "type": "synonym", + "synonyms_file": {{ + "type": "local", + "path": "{synonyms_path_str}" + }} + }}"# + ); + let json_params = json::from_str::(¶ms).unwrap(); + let filter = SynonymFilter::from_json(json_params.as_object().unwrap()); + assert!(filter.is_ok(), "error: {}", filter.err().unwrap()); + let builder = standard_builder().filter(filter.unwrap()); + let mut analyzer = builder.build(); + let mut stream = analyzer.token_stream("distance interval"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + assert_eq!( + results + .iter() + .map(|s| s.as_str()) + .collect::>(), + HashSet::from(["distance", "range", "span", "length", "interval", "gap"]) + ); + } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/util.rs index 640b2487a5..5058865116 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/util.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/util.rs @@ -1,7 +1,7 @@ -use serde_json as json; - -use super::stop_words; +use crate::analyzer::options::get_resource_path; use crate::error::{Result, TantivyBindingError}; +use serde_json as json; +use std::io::BufRead; pub fn get_string_list(value: &json::Value, label: &str) -> Result> { if !value.is_array() { @@ -25,21 +25,24 @@ pub fn get_string_list(value: &json::Value, label: &str) -> Result> Ok(str_list) } -pub(crate) fn get_stop_words_list(str_list: Vec) -> Vec { - let mut stop_words = Vec::new(); - for str in str_list { - if str.len() > 0 && str.chars().nth(0).unwrap() == '_' { - match stop_words::fetch_language_stop_words(str.as_str()) { - Some(words) => { - for word in words { - stop_words.push(word.to_string()); - } - continue; - } - None => {} - } +pub(crate) fn read_line_file( + dict: &mut Vec, + params: &json::Value, + key: &str, +) -> Result<()> { + let path = get_resource_path(params, key)?; + let file = std::fs::File::open(path)?; + let reader = std::io::BufReader::new(file); + for line in reader.lines() { + if let Ok(row_data) = line { + dict.push(row_data); + } else { + return Err(TantivyBindingError::InternalError(format!( + "read {} file failed, error: {}", + key, + line.unwrap_err().to_string() + ))); } - stop_words.push(str); } - stop_words + Ok(()) } diff --git a/tests/python_client/milvus_client/test_milvus_client_analyzer.py b/tests/python_client/milvus_client/test_milvus_client_analyzer.py index e25c5d7075..b589caccc2 100644 --- a/tests/python_client/milvus_client/test_milvus_client_analyzer.py +++ b/tests/python_client/milvus_client/test_milvus_client_analyzer.py @@ -416,11 +416,9 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base): @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("invalid_filter_params", [ - {"tokenizer": "standard", "filter": [{"type": "stop"}]}, {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": "not_a_list"}]}, {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": [123, 456]}]}, {"tokenizer": "standard", "filter": [{"type": "invalid_filter_type"}]}, - {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": None}]}, ]) def test_analyzer_with_invalid_filter(self, invalid_filter_params): """