feat: support using a user's file as a dictionary for analyzer filters (#46145)

Related issue: https://github.com/milvus-io/milvus/issues/43687

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
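
For orientation, the affected filters now accept a file-based dictionary next to the existing inline lists. A minimal sketch of analyzer params combining the two, assuming the `{"type": "local", "path": ...}` resource form used by the tests added in this commit (the paths themselves are placeholders, not taken from the commit):

// Sketch only: analyzer params mixing inline lists with user-file dictionaries.
// The "*_file" keys and the {"type": "local", "path": ...} resource form come
// from the tests added below; the concrete paths are placeholders.
const ANALYZER_PARAMS: &str = r#"{
    "tokenizer": "standard",
    "filter": [
        {"type": "stop", "stop_words": ["of", "to"], "stop_words_file": {"type": "local", "path": "/data/user_stop_words.txt"}},
        {"type": "synonym", "synonyms_file": {"type": "local", "path": "/data/user_synonyms.txt"}},
        {"type": "decompounder", "word_list_file": {"type": "local", "path": "/data/user_word_list.txt"}}
    ]
}"#;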
aoiasd 2025-12-16 11:45:16 +08:00 committed by GitHub
parent bb2a08ed71
commit df80f54151
13 changed files with 468 additions and 221 deletions

View File

@ -3,6 +3,7 @@ use std::collections::HashMap;
use tantivy::tokenizer::*;
use super::{build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer};
+use crate::analyzer::filter::{get_stop_words_list, get_string_list};
use crate::error::Result;
use crate::error::TantivyBindingError;
@ -36,32 +37,6 @@ impl AnalyzerBuilder<'_> {
        )))
    }

-    fn add_custom_filter(
-        &mut self,
-        name: &String,
-        params: &json::Map<String, json::Value>,
-    ) -> Result<()> {
-        match SystemFilter::try_from(params) {
-            Ok(filter) => {
-                self.filters.insert(name.to_string(), filter);
-                Ok(())
-            }
-            Err(e) => Err(e),
-        }
-    }
-
-    // not used now
-    // support add custom filter with filter name
-    fn add_custom_filters(&mut self, params: &json::Map<String, json::Value>) -> Result<()> {
-        for (name, value) in params {
-            if !value.is_object() {
-                continue;
-            }
-            self.add_custom_filter(name, value.as_object().unwrap())?;
-        }
-        Ok(())
-    }
-
    fn build_filter(
        &mut self,
        mut builder: TextAnalyzerBuilder,

View File

@ -0,0 +1,7 @@
this
a
an
the
is
in
of

View File

@ -0,0 +1,2 @@
distance, range, span, length
interval => gap

View File

@ -0,0 +1,98 @@
use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

pub struct CnCharOnlyFilter;

pub struct CnCharOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnCharOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
        CnCharOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnCharOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnCharOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnCharOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnCharOnlyFilterStream {
            regex: regex::Regex::new("\\p{Han}+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

pub struct CnAlphaNumOnlyFilter;

pub struct CnAlphaNumOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnAlphaNumOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
        CnAlphaNumOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnAlphaNumOnlyFilterStream {
            regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

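The two filters above drop whole tokens by regex: CnCharOnlyFilter keeps a token only if it contains at least one Han character, while CnAlphaNumOnlyFilter also accepts ASCII letters and digits. A minimal usage sketch in the style of the other filter tests in this commit; the sample text and the rough CJK range check are illustrative assumptions, not part of the commit:

#[cfg(test)]
mod cn_char_filter_sketch {
    use super::CnCharOnlyFilter;
    use crate::analyzer::tokenizers::standard_builder;

    #[test]
    fn drops_tokens_without_han_chars() {
        // Chain the filter onto the standard tokenizer, as the built-in analyzers do.
        let mut analyzer = standard_builder().filter(CnCharOnlyFilter).build();
        let mut stream = analyzer.token_stream("milvus 向量 数据库 2024");
        let mut results = Vec::<String>::new();
        while stream.advance() {
            results.push(stream.token().text.clone());
        }
        // "milvus" and "2024" contain no Han characters, so only the Chinese tokens
        // should survive (checked here against the basic CJK Unified Ideographs block).
        assert!(!results.is_empty());
        assert!(results
            .iter()
            .all(|t| t.chars().any(|c| ('\u{4e00}'..='\u{9fff}').contains(&c))));
    }
}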
View File

@ -0,0 +1,97 @@
use super::filter::FilterBuilder;
use super::util::read_line_file;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use tantivy::tokenizer::SplitCompoundWords;

const WORD_LIST_KEY: &str = "word_list";
const WORD_LIST_FILE_KEY: &str = "word_list_file";

impl FilterBuilder for SplitCompoundWords {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
        let mut dict = Vec::<String>::new();
        if let Some(value) = params.get(WORD_LIST_KEY) {
            if !value.is_array() {
                return Err(TantivyBindingError::InternalError(
                    "decompounder word list should be array".to_string(),
                ));
            }
            let words = value.as_array().unwrap();
            for element in words {
                if let Some(word) = element.as_str() {
                    dict.push(word.to_string());
                } else {
                    return Err(TantivyBindingError::InternalError(
                        "decompounder word list item should be string".to_string(),
                    ));
                }
            }
        }

        if let Some(file_params) = params.get(WORD_LIST_FILE_KEY) {
            read_line_file(&mut dict, file_params, "decompounder word list file")?;
        }

        if dict.is_empty() {
            return Err(TantivyBindingError::InternalError(
                "decompounder word list is empty".to_string(),
            ));
        }

        SplitCompoundWords::from_dictionary(dict).map_err(|e| {
            TantivyBindingError::InternalError(format!(
                "create decompounder failed: {}",
                e.to_string()
            ))
        })
    }
}

#[cfg(test)]
mod tests {
    use super::SplitCompoundWords;
    use crate::analyzer::filter::FilterBuilder;
    use crate::analyzer::tokenizers::standard_builder;
    use crate::log::init_log;
    use serde_json as json;
    use std::collections::HashSet;
    use std::path::Path;

    #[test]
    fn test_decompounder_filter_with_file() {
        init_log();
        let file_dir = Path::new(file!()).parent().unwrap();
        let decompounder_path = file_dir.join("../data/test/decompounder_dict.txt");
        let decompounder_path_str = decompounder_path.to_string_lossy().to_string();
        let params = format!(
            r#"{{
                "type": "decompounder",
                "word_list_file": {{
                    "type": "local",
                    "path": "{decompounder_path_str}"
                }}
            }}"#
        );

        let json_params = json::from_str::<json::Value>(&params).unwrap();
        // let filter = SplitCompoundWords::from_dictionary(vec!["bank", "note"]);
        let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap());
        assert!(filter.is_ok(), "error: {}", filter.err().unwrap());

        let builder = standard_builder().filter(filter.unwrap());
        let mut analyzer = builder.build();
        let mut stream = analyzer.token_stream("banknote");
        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        assert_eq!(
            results
                .iter()
                .map(|s| s.as_str())
                .collect::<HashSet<&str>>(),
            HashSet::from(["bank", "note"])
        );
    }
}

View File

@ -1,8 +1,9 @@
use serde_json as json;
use tantivy::tokenizer::*;

-use super::util::*;
-use super::{RegexFilter, RemovePunctFilter, SynonymFilter};
+use super::{
+    CnAlphaNumOnlyFilter, CnCharOnlyFilter, RegexFilter, RemovePunctFilter, SynonymFilter,
+};
use crate::error::{Result, TantivyBindingError};
pub(crate) enum SystemFilter { pub(crate) enum SystemFilter {
@ -21,6 +22,12 @@ pub(crate) enum SystemFilter {
    Synonym(SynonymFilter),
}

+pub(crate) trait FilterBuilder {
+    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self>
+    where
+        Self: Sized;
+}

impl SystemFilter {
    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
        match self {
@ -58,19 +65,6 @@ fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFi
    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1)))
}

-fn get_stop_words_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
-    let value = params.get("stop_words");
-    if value.is_none() {
-        return Err(TantivyBindingError::InternalError(
-            "stop filter stop_words can't be empty".to_string(),
-        ));
-    }
-    let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
-    Ok(SystemFilter::Stop(StopWordFilter::remove(
-        get_stop_words_list(str_list),
-    )))
-}

fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("word_list");
    if value.is_none() || !value.unwrap().is_array() {
@ -82,13 +76,12 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<Sy
    let stop_words = value.unwrap().as_array().unwrap();
    let mut str_list = Vec::<String>::new();
    for element in stop_words {
-        match element.as_str() {
-            Some(word) => str_list.push(word.to_string()),
-            _ => {
-                return Err(TantivyBindingError::InternalError(
-                    "decompounder word list item should be string".to_string(),
-                ))
-            }
-        }
+        if let Some(word) = element.as_str() {
+            str_list.push(word.to_string());
+        } else {
+            return Err(TantivyBindingError::InternalError(
+                "decompounder word list item should be string".to_string(),
+            ));
+        }
    }
@ -101,57 +94,7 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<Sy
    }
}

-fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
-    let value = params.get("language");
-    if value.is_none() || !value.unwrap().is_string() {
-        return Err(TantivyBindingError::InternalError(
-            "stemmer language field should be string".to_string(),
-        ));
-    }
-    match value.unwrap().as_str().unwrap().into_language() {
-        Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
-        Err(e) => Err(TantivyBindingError::InternalError(format!(
-            "create stemmer failed : {}",
-            e.to_string()
-        ))),
-    }
-}
-
-trait LanguageParser {
-    fn into_language(self) -> Result<Language>;
-}
-
-impl LanguageParser for &str {
-    fn into_language(self) -> Result<Language> {
-        match self.to_lowercase().as_str() {
-            "arabic" => Ok(Language::Arabic),
-            "arabig" => Ok(Language::Arabic), // typo
-            "danish" => Ok(Language::Danish),
-            "dutch" => Ok(Language::Dutch),
-            "english" => Ok(Language::English),
-            "finnish" => Ok(Language::Finnish),
-            "french" => Ok(Language::French),
-            "german" => Ok(Language::German),
-            "greek" => Ok(Language::Greek),
-            "hungarian" => Ok(Language::Hungarian),
-            "italian" => Ok(Language::Italian),
-            "norwegian" => Ok(Language::Norwegian),
-            "portuguese" => Ok(Language::Portuguese),
-            "romanian" => Ok(Language::Romanian),
-            "russian" => Ok(Language::Russian),
-            "spanish" => Ok(Language::Spanish),
-            "swedish" => Ok(Language::Swedish),
-            "tamil" => Ok(Language::Tamil),
-            "turkish" => Ok(Language::Turkish),
-            other => Err(TantivyBindingError::InternalError(format!(
-                "unsupport language: {}",
-                other
-            ))),
-        }
-    }
-}
+// fetch build-in filter from string
impl From<&str> for SystemFilter {
    fn from(value: &str) -> Self {
        match value {
@ -180,9 +123,11 @@ impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
        match value.as_str().unwrap() {
            "length" => get_length_filter(params),
-            "stop" => get_stop_words_filter(params),
-            "decompounder" => get_decompounder_filter(params),
-            "stemmer" => get_stemmer_filter(params),
+            "stop" => StopWordFilter::from_json(params).map(|f| SystemFilter::Stop(f)),
+            "decompounder" => {
+                SplitCompoundWords::from_json(params).map(|f| SystemFilter::Decompounder(f))
+            }
+            "stemmer" => Stemmer::from_json(params).map(|f| SystemFilter::Stemmer(f)),
            "regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
            "synonym" => SynonymFilter::from_json(params).map(|f| SystemFilter::Synonym(f)),
            other => Err(TantivyBindingError::InternalError(format!(
@ -197,100 +142,3 @@ impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
        }
    }
}

-pub struct CnCharOnlyFilter;
-pub struct CnCharOnlyFilterStream<T> {
-    regex: regex::Regex,
-    tail: T,
-}
-impl TokenFilter for CnCharOnlyFilter {
-    type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
-        CnCharOnlyFilterWrapper(tokenizer)
-    }
-}
-#[derive(Clone)]
-pub struct CnCharOnlyFilterWrapper<T>(T);
-impl<T: Tokenizer> Tokenizer for CnCharOnlyFilterWrapper<T> {
-    type TokenStream<'a> = CnCharOnlyFilterStream<T::TokenStream<'a>>;
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        CnCharOnlyFilterStream {
-            regex: regex::Regex::new("\\p{Han}+").unwrap(),
-            tail: self.0.token_stream(text),
-        }
-    }
-}
-impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
-    fn advance(&mut self) -> bool {
-        while self.tail.advance() {
-            if self.regex.is_match(&self.tail.token().text) {
-                return true;
-            }
-        }
-        false
-    }
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-}
-pub struct CnAlphaNumOnlyFilter;
-pub struct CnAlphaNumOnlyFilterStream<T> {
-    regex: regex::Regex,
-    tail: T,
-}
-impl TokenFilter for CnAlphaNumOnlyFilter {
-    type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;
-    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
-        CnAlphaNumOnlyFilterWrapper(tokenizer)
-    }
-}
-#[derive(Clone)]
-pub struct CnAlphaNumOnlyFilterWrapper<T>(T);
-impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
-    type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;
-    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
-        CnAlphaNumOnlyFilterStream {
-            regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
-            tail: self.0.token_stream(text),
-        }
-    }
-}
-impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
-    fn advance(&mut self) -> bool {
-        while self.tail.advance() {
-            if self.regex.is_match(&self.tail.token().text) {
-                return true;
-            }
-        }
-        false
-    }
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-}

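The TryFrom dispatch above is what the analyzer builder runs for each object in the `filter` array. A small sketch of driving it directly, assuming crate-internal access and that the TryFrom error type is TantivyBindingError as elsewhere in these bindings (the params object is illustrative):

use serde_json as json;
use crate::analyzer::filter::SystemFilter;
use crate::error::Result;

// Sketch: build one SystemFilter from a single filter object in the params.
fn build_stop_filter() -> Result<SystemFilter> {
    // "type" selects the TryFrom branch; "stop" now routes to StopWordFilter::from_json.
    let value: json::Value =
        json::from_str(r#"{"type": "stop", "stop_words": ["of", "the"]}"#).unwrap();
    SystemFilter::try_from(value.as_object().unwrap())
    // The caller would then apply it with `filter.transform(builder)`.
}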
View File

@ -1,14 +1,19 @@
+mod cn_char_filter;
+mod decompounder_filter;
mod filter;
mod regex_filter;
mod remove_punct_filter;
+mod stemmer_filter;
+mod stop_word_filter;
+pub mod stop_words;
mod synonym_filter;
mod util;

-pub mod stop_words;
+pub(crate) use cn_char_filter::{CnAlphaNumOnlyFilter, CnCharOnlyFilter};
use regex_filter::RegexFilter;
use remove_punct_filter::RemovePunctFilter;
use synonym_filter::SynonymFilter;

pub(crate) use filter::*;
+pub(crate) use stop_word_filter::get_stop_words_list;
pub(crate) use util::*;

View File

@ -0,0 +1,57 @@
use super::filter::FilterBuilder;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use tantivy::tokenizer::{Language, Stemmer};

impl FilterBuilder for Stemmer {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
        let value = params.get("language");
        if value.is_none() || !value.unwrap().is_string() {
            return Err(TantivyBindingError::InternalError(
                "stemmer language field should be string".to_string(),
            ));
        }

        match value.unwrap().as_str().unwrap().into_language() {
            Ok(language) => Ok(Stemmer::new(language)),
            Err(e) => Err(TantivyBindingError::InternalError(format!(
                "create stemmer failed : {}",
                e.to_string()
            ))),
        }
    }
}

trait StemmerLanguageParser {
    fn into_language(self) -> Result<Language>;
}

impl StemmerLanguageParser for &str {
    fn into_language(self) -> Result<Language> {
        match self.to_lowercase().as_str() {
            "arabic" => Ok(Language::Arabic),
            "arabig" => Ok(Language::Arabic), // typo
            "danish" => Ok(Language::Danish),
            "dutch" => Ok(Language::Dutch),
            "english" => Ok(Language::English),
            "finnish" => Ok(Language::Finnish),
            "french" => Ok(Language::French),
            "german" => Ok(Language::German),
            "greek" => Ok(Language::Greek),
            "hungarian" => Ok(Language::Hungarian),
            "italian" => Ok(Language::Italian),
            "norwegian" => Ok(Language::Norwegian),
            "portuguese" => Ok(Language::Portuguese),
            "romanian" => Ok(Language::Romanian),
            "russian" => Ok(Language::Russian),
            "spanish" => Ok(Language::Spanish),
            "swedish" => Ok(Language::Swedish),
            "tamil" => Ok(Language::Tamil),
            "turkish" => Ok(Language::Turkish),
            other => Err(TantivyBindingError::InternalError(format!(
                "unsupport language: {}",
                other
            ))),
        }
    }
}

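Behaviour is unchanged by the move; from_json still reads only the `language` field. A short sketch of building the filter through the new FilterBuilder trait (the helper function is illustrative, and it assumes TantivyBindingError implements Debug so expect() compiles):

use serde_json as json;
use tantivy::tokenizer::Stemmer;
use crate::analyzer::filter::FilterBuilder;

// Sketch: construct an English stemmer the same way the "stemmer" branch of
// SystemFilter::try_from now does.
fn english_stemmer() -> Stemmer {
    let params = json::from_str::<json::Value>(r#"{"type": "stemmer", "language": "english"}"#)
        .expect("valid json");
    Stemmer::from_json(params.as_object().unwrap()).expect("english is a supported language")
}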
View File

@ -0,0 +1,94 @@
use super::filter::FilterBuilder;
use super::stop_words::fetch_language_stop_words;
use super::util::*;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use tantivy::tokenizer::StopWordFilter;

const STOP_WORDS_LIST_KEY: &str = "stop_words";
const STOP_WORDS_FILE_KEY: &str = "stop_words_file";

pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
    let mut stop_words = Vec::new();
    for str in str_list {
        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
            match fetch_language_stop_words(str.as_str()) {
                Some(words) => {
                    for word in words {
                        stop_words.push(word.to_string());
                    }
                    continue;
                }
                None => {}
            }
        }
        stop_words.push(str);
    }
    stop_words
}

impl FilterBuilder for StopWordFilter {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
        let mut dict = Vec::<String>::new();
        if let Some(value) = params.get(STOP_WORDS_LIST_KEY) {
            dict = get_stop_words_list(get_string_list(value, "stop_words")?);
        }

        if let Some(file_params) = params.get(STOP_WORDS_FILE_KEY) {
            read_line_file(&mut dict, file_params, "stop words dict file")?;
        }

        Ok(StopWordFilter::remove(dict))
    }
}

#[cfg(test)]
mod tests {
    use super::StopWordFilter;
    use crate::analyzer::filter::FilterBuilder;
    use crate::analyzer::tokenizers::standard_builder;
    use crate::log::init_log;
    use serde_json as json;
    use std::collections::HashSet;
    use std::path::Path;

    #[test]
    fn test_stop_words_filter_with_file() {
        init_log();
        let file_dir = Path::new(file!()).parent().unwrap();
        let stop_words_path = file_dir.join("../data/test/stop_words_dict.txt");
        let stop_words_path_str = stop_words_path.to_string_lossy().to_string();
        let params = format!(
            r#"{{
                "type": "stop_words",
                "stop_words_file": {{
                    "type": "local",
                    "path": "{stop_words_path_str}"
                }}
            }}"#
        );

        let json_params = json::from_str::<json::Value>(&params).unwrap();
        let filter = StopWordFilter::from_json(json_params.as_object().unwrap());
        assert!(filter.is_ok(), "error: {}", filter.err().unwrap());

        let builder = standard_builder().filter(filter.unwrap());
        let mut analyzer = builder.build();
        let mut stream = analyzer
            .token_stream("this is a simple test of the stop words filter in an indexing system");
        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        assert_eq!(
            results
                .iter()
                .map(|s| s.as_str())
                .collect::<HashSet<&str>>(),
            HashSet::from(["simple", "test", "stop", "words", "filter", "indexing", "system"])
        );
    }
}

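The builder above lets `stop_words` and `stop_words_file` be combined: the inline list (expanded through get_stop_words_list, where a leading underscore selects a built-in language list) seeds the dictionary, and the file then appends one word per line. A sketch under the assumption that `_english` is a recognised built-in list name; the file path is a placeholder:

use serde_json as json;
use tantivy::tokenizer::StopWordFilter;
use crate::analyzer::filter::FilterBuilder;

// Sketch: built-in English stop words plus a user-provided file of extras.
fn combined_stop_filter() -> StopWordFilter {
    let params = json::from_str::<json::Value>(
        r#"{
            "type": "stop",
            "stop_words": ["_english", "milvus"],
            "stop_words_file": {"type": "local", "path": "/data/extra_stop_words.txt"}
        }"#,
    )
    .unwrap();
    StopWordFilter::from_json(params.as_object().unwrap()).expect("build stop word filter")
}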
View File

@ -1,6 +1,8 @@
+use crate::analyzer::options::get_resource_path;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use std::collections::{HashMap, HashSet};
+use std::io::BufRead;
use std::sync::Arc;
use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
@ -197,6 +199,23 @@ impl SynonymDict {
    }
}

+fn read_synonyms_file(builder: &mut SynonymDictBuilder, params: &json::Value) -> Result<()> {
+    let path = get_resource_path(params, "synonyms dict file")?;
+    let file = std::fs::File::open(path)?;
+    let reader = std::io::BufReader::new(file);
+    for line in reader.lines() {
+        if let Ok(row_data) = line {
+            builder.add_row(&row_data)?;
+        } else {
+            return Err(TantivyBindingError::InternalError(format!(
+                "read synonyms dict file failed, error: {}",
+                line.unwrap_err().to_string()
+            )));
+        }
+    }
+    Ok(())
+}

#[derive(Clone)]
pub struct SynonymFilter {
    dict: Arc<SynonymDict>,
@ -226,6 +245,10 @@ impl SynonymFilter {
            })?;
        }

+        if let Some(file_params) = params.get("synonyms_file") {
+            read_synonyms_file(&mut builder, file_params)?;
+        }

        Ok(SynonymFilter {
            dict: Arc::new(builder.build()),
        })
@ -331,6 +354,7 @@ mod tests {
    use crate::log::init_log;
    use serde_json as json;
    use std::collections::HashSet;
+    use std::path::Path;

    #[test]
    fn test_synonym_filter() {
@ -361,4 +385,41 @@ mod tests {
HashSet::from(["\\test", "translate", "=>", "synonym"]) HashSet::from(["\\test", "translate", "=>", "synonym"])
); );
} }
#[test]
fn test_synonym_filter_with_file() {
init_log();
let file_dir = Path::new(file!()).parent().unwrap();
let synonyms_path = file_dir.join("../data/test/synonyms_dict.txt");
let synonyms_path_str = synonyms_path.to_string_lossy().to_string();
let params = format!(
r#"{{
"type": "synonym",
"synonyms_file": {{
"type": "local",
"path": "{synonyms_path_str}"
}}
}}"#
);
let json_params = json::from_str::<json::Value>(&params).unwrap();
let filter = SynonymFilter::from_json(json_params.as_object().unwrap());
assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
let builder = standard_builder().filter(filter.unwrap());
let mut analyzer = builder.build();
let mut stream = analyzer.token_stream("distance interval");
let mut results = Vec::<String>::new();
while stream.advance() {
let token = stream.token();
results.push(token.text.clone());
}
assert_eq!(
results
.iter()
.map(|s| s.as_str())
.collect::<HashSet<&str>>(),
HashSet::from(["distance", "range", "span", "length", "interval", "gap"])
);
}
} }

View File

@ -1,7 +1,7 @@
-use serde_json as json;
-use super::stop_words;
+use crate::analyzer::options::get_resource_path;
use crate::error::{Result, TantivyBindingError};
+use serde_json as json;
+use std::io::BufRead;

pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
    if !value.is_array() {
@ -25,21 +25,24 @@ pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>>
    Ok(str_list)
}

-pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
-    let mut stop_words = Vec::new();
-    for str in str_list {
-        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
-            match stop_words::fetch_language_stop_words(str.as_str()) {
-                Some(words) => {
-                    for word in words {
-                        stop_words.push(word.to_string());
-                    }
-                    continue;
-                }
-                None => {}
-            }
-        }
-        stop_words.push(str);
-    }
-    stop_words
-}
+pub(crate) fn read_line_file(
+    dict: &mut Vec<String>,
+    params: &json::Value,
+    key: &str,
+) -> Result<()> {
+    let path = get_resource_path(params, key)?;
+    let file = std::fs::File::open(path)?;
+    let reader = std::io::BufReader::new(file);
+    for line in reader.lines() {
+        if let Ok(row_data) = line {
+            dict.push(row_data);
+        } else {
+            return Err(TantivyBindingError::InternalError(format!(
+                "read {} file failed, error: {}",
+                key,
+                line.unwrap_err().to_string()
+            )));
+        }
+    }
+    Ok(())
+}

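read_line_file is the shared piece the stop-word and decompounder builders use: it resolves the `{"type": "local", "path": ...}` resource via get_resource_path and appends each line of the file to an existing list. A small sketch of reusing it from another crate-internal builder; the helper function and the label string are hypothetical:

use serde_json as json;
use crate::analyzer::filter::read_line_file; // re-exported through `pub(crate) use util::*;`
use crate::error::Result;

// Hypothetical helper: start from inline entries, then append lines from a user file.
fn load_user_dict(inline: &[&str], file_params: &json::Value) -> Result<Vec<String>> {
    let mut dict: Vec<String> = inline.iter().map(|s| s.to_string()).collect();
    // file_params is the same {"type": "local", "path": ...} object the filters accept.
    read_line_file(&mut dict, file_params, "user dict file")?;
    Ok(dict)
}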
View File

@ -416,11 +416,9 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_filter_params", [
-        {"tokenizer": "standard", "filter": [{"type": "stop"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": "not_a_list"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": [123, 456]}]},
        {"tokenizer": "standard", "filter": [{"type": "invalid_filter_type"}]},
-        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": None}]},
    ])
    def test_analyzer_with_invalid_filter(self, invalid_filter_params):
        """