feat: support using a user's file as the dictionary for an analyzer filter (#46145)
This change lets the stop, decompounder, and synonym filters load their dictionaries from a user-provided file (via the new stop_words_file, word_list_file, and synonyms_file params) in addition to inline lists.
Related issue: https://github.com/milvus-io/milvus/issues/43687
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
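For context, a minimal sketch of analyzer params using the new file-based key, in the JSON-in-Rust style the tests below use (the path is illustrative; the key shapes are taken from this diff):

let params = r#"{
    "tokenizer": "standard",
    "filter": [{
        "type": "stop",
        "stop_words": ["_english"],
        "stop_words_file": {"type": "local", "path": "/path/to/stop_words_dict.txt"}
    }]
}"#;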
parent bb2a08ed71
commit df80f54151
@@ -3,6 +3,7 @@ use std::collections::HashMap;
use tantivy::tokenizer::*;

use super::{build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer};
use crate::analyzer::filter::{get_stop_words_list, get_string_list};
use crate::error::Result;
use crate::error::TantivyBindingError;
@@ -36,32 +37,6 @@ impl AnalyzerBuilder<'_> {
        )))
    }

    fn add_custom_filter(
        &mut self,
        name: &String,
        params: &json::Map<String, json::Value>,
    ) -> Result<()> {
        match SystemFilter::try_from(params) {
            Ok(filter) => {
                self.filters.insert(name.to_string(), filter);
                Ok(())
            }
            Err(e) => Err(e),
        }
    }

    // not used now
    // support add custom filter with filter name
    fn add_custom_filters(&mut self, params: &json::Map<String, json::Value>) -> Result<()> {
        for (name, value) in params {
            if !value.is_object() {
                continue;
            }
            self.add_custom_filter(name, value.as_object().unwrap())?;
        }
        Ok(())
    }

    fn build_filter(
        &mut self,
        mut builder: TextAnalyzerBuilder,
2 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/decompounder_dict.txt vendored Normal file
@@ -0,0 +1,2 @@
bank
note
7 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/stop_words_dict.txt vendored Normal file
@@ -0,0 +1,7 @@
this
a
an
the
is
in
of
2 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/data/test/synonyms_dict.txt vendored Normal file
@@ -0,0 +1,2 @@
distance, range, span, length
interval => gap
98 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/cn_char_filter.rs vendored Normal file
@@ -0,0 +1,98 @@
use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

pub struct CnCharOnlyFilter;

pub struct CnCharOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnCharOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
        CnCharOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnCharOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnCharOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnCharOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnCharOnlyFilterStream {
            regex: regex::Regex::new("\\p{Han}+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }

        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

pub struct CnAlphaNumOnlyFilter;

pub struct CnAlphaNumOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnAlphaNumOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
        CnAlphaNumOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnAlphaNumOnlyFilterStream {
            regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }

        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
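These filters drop non-matching tokens rather than transforming them. A minimal usage sketch, assuming the crate-internal standard_builder() helper that the tests in this diff use (the input text is illustrative):

let mut analyzer = standard_builder().filter(CnCharOnlyFilter).build();
let mut stream = analyzer.token_stream("Milvus 向量 database 检索");
while stream.advance() {
    // Only tokens containing at least one Han character survive,
    // here "向量" and "检索"; CnAlphaNumOnlyFilter would keep all four.
    println!("{}", stream.token().text);
}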
97 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/decompounder_filter.rs vendored Normal file
@@ -0,0 +1,97 @@
use super::filter::FilterBuilder;
use super::util::read_line_file;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use tantivy::tokenizer::SplitCompoundWords;

const WORD_LIST_KEY: &str = "word_list";
const WORD_LIST_FILE_KEY: &str = "word_list_file";

impl FilterBuilder for SplitCompoundWords {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
        let mut dict = Vec::<String>::new();
        if let Some(value) = params.get(WORD_LIST_KEY) {
            if !value.is_array() {
                return Err(TantivyBindingError::InternalError(
                    "decompounder word list should be array".to_string(),
                ));
            }
            let words = value.as_array().unwrap();
            for element in words {
                if let Some(word) = element.as_str() {
                    dict.push(word.to_string());
                } else {
                    return Err(TantivyBindingError::InternalError(
                        "decompounder word list item should be string".to_string(),
                    ));
                }
            }
        }

        if let Some(file_params) = params.get(WORD_LIST_FILE_KEY) {
            read_line_file(&mut dict, file_params, "decompounder word list file")?;
        }

        if dict.is_empty() {
            return Err(TantivyBindingError::InternalError(
                "decompounder word list is empty".to_string(),
            ));
        }

        SplitCompoundWords::from_dictionary(dict).map_err(|e| {
            TantivyBindingError::InternalError(format!(
                "create decompounder failed: {}",
                e.to_string()
            ))
        })
    }
}

#[cfg(test)]
mod tests {
    use super::SplitCompoundWords;
    use crate::analyzer::filter::FilterBuilder;
    use crate::analyzer::tokenizers::standard_builder;
    use crate::log::init_log;
    use serde_json as json;
    use std::collections::HashSet;
    use std::path::Path;

    #[test]
    fn test_decompounder_filter_with_file() {
        init_log();
        let file_dir = Path::new(file!()).parent().unwrap();
        let decompounder_path = file_dir.join("../data/test/decompounder_dict.txt");
        let decompounder_path_str = decompounder_path.to_string_lossy().to_string();
        let params = format!(
            r#"{{
                "type": "decompounder",
                "word_list_file": {{
                    "type": "local",
                    "path": "{decompounder_path_str}"
                }}
            }}"#
        );
        let json_params = json::from_str::<json::Value>(&params).unwrap();
        // let filter = SplitCompoundWords::from_dictionary(vec!["bank", "note"]);
        let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap());
        assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
        let builder = standard_builder().filter(filter.unwrap());
        let mut analyzer = builder.build();
        let mut stream = analyzer.token_stream("banknote");

        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        assert_eq!(
            results
                .iter()
                .map(|s| s.as_str())
                .collect::<HashSet<&str>>(),
            HashSet::from(["bank", "note"])
        );
    }
}
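An inline word list works the same way as the file in the test above: from_json reads word_list first, then merges word_list_file, and errors only if both leave the dictionary empty. A minimal sketch:

let params = r#"{"type": "decompounder", "word_list": ["bank", "note"]}"#;
let json_params = json::from_str::<json::Value>(params).unwrap();
// Builds the same filter the file-based test constructs above.
let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap()).unwrap();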
@@ -1,8 +1,9 @@
use serde_json as json;
use tantivy::tokenizer::*;

use super::util::*;
use super::{RegexFilter, RemovePunctFilter, SynonymFilter};
use super::{
    CnAlphaNumOnlyFilter, CnCharOnlyFilter, RegexFilter, RemovePunctFilter, SynonymFilter,
};
use crate::error::{Result, TantivyBindingError};

pub(crate) enum SystemFilter {
@@ -21,6 +22,12 @@ pub(crate) enum SystemFilter {
    Synonym(SynonymFilter),
}

pub(crate) trait FilterBuilder {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self>
    where
        Self: Sized;
}

impl SystemFilter {
    pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
        match self {
@@ -58,19 +65,6 @@ fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1)))
}

fn get_stop_words_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("stop_words");
    if value.is_none() {
        return Err(TantivyBindingError::InternalError(
            "stop filter stop_words can't be empty".to_string(),
        ));
    }
    let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
    Ok(SystemFilter::Stop(StopWordFilter::remove(
        get_stop_words_list(str_list),
    )))
}

fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("word_list");
    if value.is_none() || !value.unwrap().is_array() {
@@ -82,13 +76,12 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let stop_words = value.unwrap().as_array().unwrap();
    let mut str_list = Vec::<String>::new();
    for element in stop_words {
        match element.as_str() {
            Some(word) => str_list.push(word.to_string()),
            _ => {
                return Err(TantivyBindingError::InternalError(
                    "decompounder word list item should be string".to_string(),
                ))
            }
        }
        if let Some(word) = element.as_str() {
            str_list.push(word.to_string());
        } else {
            return Err(TantivyBindingError::InternalError(
                "decompounder word list item should be string".to_string(),
            ));
        }
    }
@@ -101,57 +94,7 @@ fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    }
}

fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
    let value = params.get("language");
    if value.is_none() || !value.unwrap().is_string() {
        return Err(TantivyBindingError::InternalError(
            "stemmer language field should be string".to_string(),
        ));
    }

    match value.unwrap().as_str().unwrap().into_language() {
        Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
        Err(e) => Err(TantivyBindingError::InternalError(format!(
            "create stemmer failed : {}",
            e.to_string()
        ))),
    }
}

trait LanguageParser {
    fn into_language(self) -> Result<Language>;
}

impl LanguageParser for &str {
    fn into_language(self) -> Result<Language> {
        match self.to_lowercase().as_str() {
            "arabic" => Ok(Language::Arabic),
            "arabig" => Ok(Language::Arabic), // typo
            "danish" => Ok(Language::Danish),
            "dutch" => Ok(Language::Dutch),
            "english" => Ok(Language::English),
            "finnish" => Ok(Language::Finnish),
            "french" => Ok(Language::French),
            "german" => Ok(Language::German),
            "greek" => Ok(Language::Greek),
            "hungarian" => Ok(Language::Hungarian),
            "italian" => Ok(Language::Italian),
            "norwegian" => Ok(Language::Norwegian),
            "portuguese" => Ok(Language::Portuguese),
            "romanian" => Ok(Language::Romanian),
            "russian" => Ok(Language::Russian),
            "spanish" => Ok(Language::Spanish),
            "swedish" => Ok(Language::Swedish),
            "tamil" => Ok(Language::Tamil),
            "turkish" => Ok(Language::Turkish),
            other => Err(TantivyBindingError::InternalError(format!(
                "unsupport language: {}",
                other
            ))),
        }
    }
}

// fetch build-in filter from string
impl From<&str> for SystemFilter {
    fn from(value: &str) -> Self {
        match value {
@@ -180,9 +123,11 @@ impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {

        match value.as_str().unwrap() {
            "length" => get_length_filter(params),
            "stop" => get_stop_words_filter(params),
            "decompounder" => get_decompounder_filter(params),
            "stemmer" => get_stemmer_filter(params),
            "stop" => StopWordFilter::from_json(params).map(|f| SystemFilter::Stop(f)),
            "decompounder" => {
                SplitCompoundWords::from_json(params).map(|f| SystemFilter::Decompounder(f))
            }
            "stemmer" => Stemmer::from_json(params).map(|f| SystemFilter::Stemmer(f)),
            "regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
            "synonym" => SynonymFilter::from_json(params).map(|f| SystemFilter::Synonym(f)),
            other => Err(TantivyBindingError::InternalError(format!(
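With the match arms above, every filter type now flows through the same FilterBuilder entry point. A sketch of the dispatch (the params are illustrative):

let params = r#"{"type": "stop", "stop_words": ["_english"]}"#;
let json_params = json::from_str::<json::Value>(params).unwrap();
// try_from reads the "type" field and delegates to StopWordFilter::from_json here.
let filter = SystemFilter::try_from(json_params.as_object().unwrap()).unwrap();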
@@ -197,100 +142,3 @@ impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
        }
    }
}

pub struct CnCharOnlyFilter;

pub struct CnCharOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnCharOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
        CnCharOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnCharOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnCharOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnCharOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnCharOnlyFilterStream {
            regex: regex::Regex::new("\\p{Han}+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }

        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

pub struct CnAlphaNumOnlyFilter;

pub struct CnAlphaNumOnlyFilterStream<T> {
    regex: regex::Regex,
    tail: T,
}

impl TokenFilter for CnAlphaNumOnlyFilter {
    type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;

    fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
        CnAlphaNumOnlyFilterWrapper(tokenizer)
    }
}

#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);

impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
    type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CnAlphaNumOnlyFilterStream {
            regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
            tail: self.0.token_stream(text),
        }
    }
}

impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.regex.is_match(&self.tail.token().text) {
                return true;
            }
        }

        false
    }

    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}
@@ -1,14 +1,19 @@
mod cn_char_filter;
mod decompounder_filter;
mod filter;
mod regex_filter;
mod remove_punct_filter;
mod stemmer_filter;
mod stop_word_filter;
pub mod stop_words;
mod synonym_filter;
mod util;

pub mod stop_words;

pub(crate) use cn_char_filter::{CnAlphaNumOnlyFilter, CnCharOnlyFilter};
use regex_filter::RegexFilter;
use remove_punct_filter::RemovePunctFilter;
use synonym_filter::SynonymFilter;

pub(crate) use filter::*;
pub(crate) use stop_word_filter::get_stop_words_list;
pub(crate) use util::*;
57 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stemmer_filter.rs vendored Normal file
@@ -0,0 +1,57 @@
use super::filter::FilterBuilder;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use tantivy::tokenizer::{Language, Stemmer};

impl FilterBuilder for Stemmer {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
        let value = params.get("language");
        if value.is_none() || !value.unwrap().is_string() {
            return Err(TantivyBindingError::InternalError(
                "stemmer language field should be string".to_string(),
            ));
        }

        match value.unwrap().as_str().unwrap().into_language() {
            Ok(language) => Ok(Stemmer::new(language)),
            Err(e) => Err(TantivyBindingError::InternalError(format!(
                "create stemmer failed : {}",
                e.to_string()
            ))),
        }
    }
}

trait StemmerLanguageParser {
    fn into_language(self) -> Result<Language>;
}

impl StemmerLanguageParser for &str {
    fn into_language(self) -> Result<Language> {
        match self.to_lowercase().as_str() {
            "arabic" => Ok(Language::Arabic),
            "arabig" => Ok(Language::Arabic), // typo
            "danish" => Ok(Language::Danish),
            "dutch" => Ok(Language::Dutch),
            "english" => Ok(Language::English),
            "finnish" => Ok(Language::Finnish),
            "french" => Ok(Language::French),
            "german" => Ok(Language::German),
            "greek" => Ok(Language::Greek),
            "hungarian" => Ok(Language::Hungarian),
            "italian" => Ok(Language::Italian),
            "norwegian" => Ok(Language::Norwegian),
            "portuguese" => Ok(Language::Portuguese),
            "romanian" => Ok(Language::Romanian),
            "russian" => Ok(Language::Russian),
            "spanish" => Ok(Language::Spanish),
            "swedish" => Ok(Language::Swedish),
            "tamil" => Ok(Language::Tamil),
            "turkish" => Ok(Language::Turkish),
            other => Err(TantivyBindingError::InternalError(format!(
                "unsupport language: {}",
                other
            ))),
        }
    }
}
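Language names are lower-cased before matching, so mixed case is accepted; unknown names surface the "unsupport language" error. A sketch:

let params = json::from_str::<json::Value>(r#"{"type": "stemmer", "language": "English"}"#).unwrap();
assert!(Stemmer::from_json(params.as_object().unwrap()).is_ok());

// "klingon" is an illustrative unsupported name.
let bad = json::from_str::<json::Value>(r#"{"type": "stemmer", "language": "klingon"}"#).unwrap();
assert!(Stemmer::from_json(bad.as_object().unwrap()).is_err());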
94 internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stop_word_filter.rs vendored Normal file
@@ -0,0 +1,94 @@
use super::filter::FilterBuilder;
use super::stop_words::fetch_language_stop_words;
use super::util::*;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use tantivy::tokenizer::StopWordFilter;

const STOP_WORDS_LIST_KEY: &str = "stop_words";
const STOP_WORDS_FILE_KEY: &str = "stop_words_file";

pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
    let mut stop_words = Vec::new();
    for str in str_list {
        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
            match fetch_language_stop_words(str.as_str()) {
                Some(words) => {
                    for word in words {
                        stop_words.push(word.to_string());
                    }
                    continue;
                }
                None => {}
            }
        }
        stop_words.push(str);
    }
    stop_words
}

impl FilterBuilder for StopWordFilter {
    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
        let mut dict = Vec::<String>::new();
        if let Some(value) = params.get(STOP_WORDS_LIST_KEY) {
            dict = get_stop_words_list(get_string_list(value, "stop_words")?);
        }

        if let Some(file_params) = params.get(STOP_WORDS_FILE_KEY) {
            read_line_file(&mut dict, file_params, "stop words dict file")?;
        }

        Ok(StopWordFilter::remove(dict))
    }
}

#[cfg(test)]
mod tests {
    use super::StopWordFilter;
    use crate::analyzer::filter::FilterBuilder;
    use crate::analyzer::tokenizers::standard_builder;
    use crate::log::init_log;
    use serde_json as json;
    use std::collections::HashSet;
    use std::path::Path;

    #[test]
    fn test_stop_words_filter_with_file() {
        init_log();
        let file_dir = Path::new(file!()).parent().unwrap();
        let stop_words_path = file_dir.join("../data/test/stop_words_dict.txt");
        let stop_words_path_str = stop_words_path.to_string_lossy().to_string();
        let params = format!(
            r#"{{
                "type": "stop_words",
                "stop_words_file": {{
                    "type": "local",
                    "path": "{stop_words_path_str}"
                }}
            }}"#
        );

        let json_params = json::from_str::<json::Value>(&params).unwrap();
        let filter = StopWordFilter::from_json(json_params.as_object().unwrap());
        assert!(filter.is_ok(), "error: {}", filter.err().unwrap());

        let builder = standard_builder().filter(filter.unwrap());
        let mut analyzer = builder.build();
        let mut stream = analyzer
            .token_stream("this is a simple test of the stop words filter in an indexing system");

        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        assert_eq!(
            results
                .iter()
                .map(|s| s.as_str())
                .collect::<HashSet<&str>>(),
            HashSet::from(["simple", "test", "stop", "words", "filter", "indexing", "system"])
        );
    }
}
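Note that, unlike the decompounder, from_json here accepts either key alone or both together: an inline list (which still understands built-in "_language" entries) is merged with the file contents. A params sketch with both sources (the file path is illustrative):

let params = r#"{
    "type": "stop",
    "stop_words": ["_english", "milvus"],
    "stop_words_file": {"type": "local", "path": "/path/to/extra_stop_words.txt"}
}"#;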
@@ -1,6 +1,8 @@
use crate::analyzer::options::get_resource_path;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use std::collections::{HashMap, HashSet};
use std::io::BufRead;
use std::sync::Arc;
use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
@@ -197,6 +199,23 @@ impl SynonymDict {
    }
}

fn read_synonyms_file(builder: &mut SynonymDictBuilder, params: &json::Value) -> Result<()> {
    let path = get_resource_path(params, "synonyms dict file")?;
    let file = std::fs::File::open(path)?;
    let reader = std::io::BufReader::new(file);
    for line in reader.lines() {
        if let Ok(row_data) = line {
            builder.add_row(&row_data)?;
        } else {
            return Err(TantivyBindingError::InternalError(format!(
                "read synonyms dict file failed, error: {}",
                line.unwrap_err().to_string()
            )));
        }
    }
    Ok(())
}

#[derive(Clone)]
pub struct SynonymFilter {
    dict: Arc<SynonymDict>,
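The row format consumed by add_row matches the test dictionary added earlier in this diff: a comma-separated row declares mutual synonyms, and "=>" declares a one-way mapping. A sketch against a SynonymDictBuilder (builder construction elided):

builder.add_row("distance, range, span, length")?; // mutual synonyms
builder.add_row("interval => gap")?;               // one-way: "interval" also yields "gap"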
@@ -226,6 +245,10 @@ impl SynonymFilter {
            })?;
        }

        if let Some(file_params) = params.get("synonyms_file") {
            read_synonyms_file(&mut builder, file_params)?;
        }

        Ok(SynonymFilter {
            dict: Arc::new(builder.build()),
        })
@@ -331,6 +354,7 @@ mod tests {
    use crate::log::init_log;
    use serde_json as json;
    use std::collections::HashSet;
    use std::path::Path;

    #[test]
    fn test_synonym_filter() {
@@ -361,4 +385,41 @@ mod tests {
            HashSet::from(["\\test", "translate", "=>", "synonym"])
        );
    }

    #[test]
    fn test_synonym_filter_with_file() {
        init_log();
        let file_dir = Path::new(file!()).parent().unwrap();
        let synonyms_path = file_dir.join("../data/test/synonyms_dict.txt");
        let synonyms_path_str = synonyms_path.to_string_lossy().to_string();
        let params = format!(
            r#"{{
                "type": "synonym",
                "synonyms_file": {{
                    "type": "local",
                    "path": "{synonyms_path_str}"
                }}
            }}"#
        );
        let json_params = json::from_str::<json::Value>(&params).unwrap();
        let filter = SynonymFilter::from_json(json_params.as_object().unwrap());
        assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
        let builder = standard_builder().filter(filter.unwrap());
        let mut analyzer = builder.build();
        let mut stream = analyzer.token_stream("distance interval");

        let mut results = Vec::<String>::new();
        while stream.advance() {
            let token = stream.token();
            results.push(token.text.clone());
        }

        assert_eq!(
            results
                .iter()
                .map(|s| s.as_str())
                .collect::<HashSet<&str>>(),
            HashSet::from(["distance", "range", "span", "length", "interval", "gap"])
        );
    }
}
@@ -1,7 +1,7 @@
use serde_json as json;

use super::stop_words;
use crate::analyzer::options::get_resource_path;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use std::io::BufRead;

pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
    if !value.is_array() {
@@ -25,21 +25,24 @@ pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
    Ok(str_list)
}

pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
    let mut stop_words = Vec::new();
    for str in str_list {
        if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
            match stop_words::fetch_language_stop_words(str.as_str()) {
                Some(words) => {
                    for word in words {
                        stop_words.push(word.to_string());
                    }
                    continue;
                }
                None => {}
            }
        }
        stop_words.push(str);
    }
    stop_words
}

pub(crate) fn read_line_file(
    dict: &mut Vec<String>,
    params: &json::Value,
    key: &str,
) -> Result<()> {
    let path = get_resource_path(params, key)?;
    let file = std::fs::File::open(path)?;
    let reader = std::io::BufReader::new(file);
    for line in reader.lines() {
        if let Ok(row_data) = line {
            dict.push(row_data);
        } else {
            return Err(TantivyBindingError::InternalError(format!(
                "read {} file failed, error: {}",
                key,
                line.unwrap_err().to_string()
            )));
        }
    }
    Ok(())
}
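Both the stop-words and decompounder builders feed this helper: each line of the resource file becomes one dictionary entry, appended to whatever the caller already collected. A usage sketch (the path is illustrative):

let mut dict = vec!["inline_word".to_string()];
let file_params =
    json::from_str::<json::Value>(r#"{"type": "local", "path": "/path/to/dict.txt"}"#).unwrap();
// Appends one entry per line of the file to `dict`.
read_line_file(&mut dict, &file_params, "stop words dict file")?;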
@@ -416,11 +416,9 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("invalid_filter_params", [
        {"tokenizer": "standard", "filter": [{"type": "stop"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": "not_a_list"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": [123, 456]}]},
        {"tokenizer": "standard", "filter": [{"type": "invalid_filter_type"}]},
        {"tokenizer": "standard", "filter": [{"type": "stop", "stop_words": None}]},
    ])
    def test_analyzer_with_invalid_filter(self, invalid_filter_params):
        """
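On the binding side, these invalid cases fail inside the FilterBuilder implementations; for instance, the "not_a_list" case above is rejected by get_string_list. A sketch of that error path in the Rust bindings:

let params = json::from_str::<json::Value>(r#"{"type": "stop", "stop_words": "not_a_list"}"#).unwrap();
// get_string_list requires an array, so construction fails.
assert!(SystemFilter::try_from(params.as_object().unwrap()).is_err());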