feat: support more languages for built-in stop words and add remove-punct, regex filter (#41412)
Related issue: https://github.com/milvus-io/milvus/issues/41213
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
parent 91d40fa558, commit a16bd6263b
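For orientation, a minimal usage sketch (not part of this diff) combining the new filters with the existing stop filter, assuming it runs inside the tantivy-binding crate. It mirrors the JSON used in the tests added below; the "stop_words" parameter name for the stop filter is an assumption, and only "_english_" is shown since the new stop-word table diff is suppressed further down.

// Sketch only: analyzer params exercising the new "removepunct" and "regex"
// filters plus a built-in stop-word list. create_analyzer is re-exported from
// src/analyzer/mod.rs (see the mod.rs hunk below).
use crate::analyzer::create_analyzer;

fn demo() {
    let params = r#"{
        "tokenizer": "standard",
        "filter": [
            "removepunct",
            { "type": "regex", "expr": "^(?!test)" },
            { "type": "stop", "stop_words": ["_english_"] }
        ]
    }"#;

    // Tokens containing punctuation/whitespace are dropped, tokens starting
    // with "test" are rejected by the negative-lookahead regex, and English
    // stop words are removed. ("stop_words" key is an assumed parameter name.)
    let mut analyzer = create_analyzer(&params.to_string()).expect("valid analyzer params");
    let mut stream = analyzer.token_stream("test, milvus and tantivy");
    while stream.advance() {
        println!("token: {}", stream.token().text);
    }
}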
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock (generated, 29 lines changed)
@@ -211,6 +211,21 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "bit-set"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
+dependencies = [
+ "bit-vec",
+]
+
+[[package]]
+name = "bit-vec"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -857,6 +872,17 @@ dependencies = [
  "pin-project-lite",
 ]
 
+[[package]]
+name = "fancy-regex"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298"
+dependencies = [
+ "bit-set",
+ "regex-automata",
+ "regex-syntax 0.8.5",
+]
+
 [[package]]
 name = "fastdivide"
 version = "0.4.2"
@@ -3853,6 +3879,7 @@ dependencies = [
  "criterion",
  "either",
  "env_logger",
+ "fancy-regex",
  "futures",
  "icu_segmenter",
  "jieba-rs",
@@ -4568,7 +4595,7 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
 dependencies = [
- "windows-sys 0.48.0",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -31,6 +31,7 @@ either = "1.13.0"
 icu_segmenter = "2.0.0-beta2"
 whatlang = "0.16.4"
 lingua = "1.7.1"
+fancy-regex = "0.14.0"
 
 [dev-dependencies]
 rand = "0.3"
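Why fancy-regex rather than the regex crate already used by the remove-punct filter: it supports look-around. A quick sketch (assuming both crates are in scope) showing that the pattern used in the regex-filter test below only compiles under fancy_regex:

fn lookaround_support() {
    // The plain regex crate rejects look-around; fancy_regex accepts it.
    assert!(regex::Regex::new("^(?!test)").is_err());
    assert!(fancy_regex::Regex::new("^(?!test)").is_ok());
}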
@@ -2,9 +2,7 @@ use serde_json as json;
 use std::collections::HashMap;
 use tantivy::tokenizer::*;
 
-use crate::analyzer::{
-    build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer, util::*,
-};
+use super::{build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer};
 use crate::error::Result;
 use crate::error::TantivyBindingError;
 
@@ -1,8 +1,8 @@
 use tantivy::tokenizer::*;
 
-use crate::analyzer::tokenizers::*;
-use crate::analyzer::filter::*;
-use crate::analyzer::stop_words;
+use super::tokenizers::*;
+use super::filter::*;
+use super::filter::stop_words;
 
 // default build-in analyzer
 pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
@@ -1,8 +1,9 @@
 use serde_json as json;
 use tantivy::tokenizer::*;
 
-use crate::error::{Result,TantivyBindingError};
-use crate::analyzer::util::*;
+use super::util::*;
+use super::{RegexFilter, RemovePunctFilter};
+use crate::error::{Result, TantivyBindingError};
 
 pub(crate) enum SystemFilter {
     Invalid,
@@ -12,9 +13,11 @@ pub(crate) enum SystemFilter {
     CnCharOnly(CnCharOnlyFilter),
     CnAlphaNumOnly(CnAlphaNumOnlyFilter),
     Length(RemoveLongFilter),
+    RemovePunct(RemovePunctFilter),
     Stop(StopWordFilter),
     Decompounder(SplitCompoundWords),
     Stemmer(Stemmer),
+    Regex(RegexFilter),
 }
 
 impl SystemFilter {
@@ -29,6 +32,8 @@ impl SystemFilter {
             Self::Stop(filter) => builder.filter(filter).dynamic(),
             Self::Decompounder(filter) => builder.filter(filter).dynamic(),
             Self::Stemmer(filter) => builder.filter(filter).dynamic(),
+            Self::RemovePunct(filter) => builder.filter(filter).dynamic(),
+            Self::Regex(filter) => builder.filter(filter).dynamic(),
             Self::Invalid => builder,
         }
     }
@@ -152,6 +157,7 @@ impl From<&str> for SystemFilter {
            "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
            "cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
            "cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter),
+           "removepunct" => Self::RemovePunct(RemovePunctFilter),
            _ => Self::Invalid,
        }
    }
@@ -174,6 +180,7 @@ impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
            "stop" => get_stop_words_filter(params),
            "decompounder" => get_decompounder_filter(params),
            "stemmer" => get_stemmer_filter(params),
+           "regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
            other => Err(TantivyBindingError::InternalError(format!(
                "unsupport filter type: {}",
                other
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/mod.rs (new file, 11 lines)
@@ -0,0 +1,11 @@
+mod filter;
+mod remove_punct_filter;
+mod regex_filter;
+pub(crate) mod stop_words;
+mod util;
+
+use regex_filter::RegexFilter;
+use remove_punct_filter::RemovePunctFilter;
+
+pub(crate) use filter::*;
+pub(crate) use util::*;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/regex_filter.rs (new file, 128 lines)
@@ -0,0 +1,128 @@
+use crate::error::{Result, TantivyBindingError};
+use serde_json as json;
+use fancy_regex as regex;
+use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
+
+#[derive(Clone)]
+pub struct RegexFilter {
+    regex: regex::Regex,
+}
+
+impl RegexFilter {
+    /// Creates a `RegexFilter` given a regex expression
+    pub fn new(expr: &str) -> Result<RegexFilter> {
+        regex::Regex::new(expr).map_or_else(
+            |e| {
+                Err(TantivyBindingError::InvalidArgument(format!(
+                    "regex expression invalid, expr:{}, err: {}",
+                    expr, e
+                )))
+            },
+            |regex| Ok(RegexFilter { regex }),
+        )
+    }
+
+    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<RegexFilter> {
+        params.get("expr").map_or(
+            Err(TantivyBindingError::InternalError(format!(
+                "must set expr for regex filter"
+            ))),
+            |value| {
+                value.as_str().map_or(
+                    Err(TantivyBindingError::InternalError(format!(
+                        "expr must be string"
+                    ))),
+                    |expr| RegexFilter::new(expr),
+                )
+            },
+        )
+    }
+}
+
+impl TokenFilter for RegexFilter {
+    type Tokenizer<T: Tokenizer> = RegexFilterWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> RegexFilterWrapper<T> {
+        RegexFilterWrapper {
+            regex: self.regex,
+            inner: tokenizer,
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct RegexFilterWrapper<T> {
+    regex: regex::Regex,
+    inner: T,
+}
+
+impl<T: Tokenizer> Tokenizer for RegexFilterWrapper<T> {
+    type TokenStream<'a> = RegexFilterStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        RegexFilterStream {
+            regex: self.regex.clone(),
+            tail: self.inner.token_stream(text),
+        }
+    }
+}
+
+pub struct RegexFilterStream<T> {
+    regex: regex::Regex,
+    tail: T,
+}
+
+impl<T> RegexFilterStream<T> {
+    fn predicate(&self, token: &Token) -> bool {
+        self.regex.is_match(&token.text).map_or(true, |b| b)
+    }
+}
+
+impl<T: TokenStream> TokenStream for RegexFilterStream<T> {
+    fn advance(&mut self) -> bool {
+        while self.tail.advance() {
+            if self.predicate(self.tail.token()) {
+                return true;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::analyzer::analyzer::create_analyzer;
+
+    #[test]
+    fn test_regex_filter() {
+        let params = r#"{
+            "tokenizer": "standard",
+            "filter": [{
+                "type": "regex",
+                "expr": "^(?!test)"
+            }]
+        }"#;
+
+        let tokenizer = create_analyzer(&params.to_string());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut bining = tokenizer.unwrap();
+        let mut stream = bining.token_stream("test milvus");
+
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+
+        print!("test tokens :{:?}\n", results)
+    }
+}
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/remove_punct_filter.rs (new file, 80 lines)
@@ -0,0 +1,80 @@
+use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
+
+pub struct RemovePunctFilter;
+
+pub struct RemovePunctFilterStream<T> {
+    regex: regex::Regex,
+    tail: T,
+}
+
+impl TokenFilter for RemovePunctFilter {
+    type Tokenizer<T: Tokenizer> = RemovePunctFilterWrapper<T>;
+
+    fn transform<T: Tokenizer>(self, tokenizer: T) -> RemovePunctFilterWrapper<T> {
+        RemovePunctFilterWrapper(tokenizer)
+    }
+}
+#[derive(Clone)]
+pub struct RemovePunctFilterWrapper<T>(T);
+
+impl<T: Tokenizer> Tokenizer for RemovePunctFilterWrapper<T> {
+    type TokenStream<'a> = RemovePunctFilterStream<T::TokenStream<'a>>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        RemovePunctFilterStream {
+            regex: regex::Regex::new(r"[\p{Punct}\s]+").unwrap(),
+            tail: self.0.token_stream(text),
+        }
+    }
+}
+
+impl<T: TokenStream> TokenStream for RemovePunctFilterStream<T> {
+    fn advance(&mut self) -> bool {
+        while self.tail.advance() {
+            if !self.regex.is_match(&self.tail.token().text) {
+                return true;
+            }
+        }
+
+        false
+    }
+
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::analyzer::analyzer::create_analyzer;
+
+    #[test]
+    #[cfg(feature = "lindera-ipadic")]
+    fn test_remove_punct_filter() {
+        let params = r#"{
+            "tokenizer": {
+                "type": "lindera",
+                "dict_kind": "ipadic"
+            },
+            "filter": ["removepunct"]
+        }"#;
+
+        let tokenizer = create_analyzer(&params.to_string());
+        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut bining = tokenizer.unwrap();
+        let mut stream = bining.token_stream("ミルヴァスの日本語テスト、句読点テスト");
+
+        let mut results = Vec::<String>::new();
+        while stream.advance() {
+            let token = stream.token();
+            results.push(token.text.clone());
+        }
+
+        print!("test tokens :{:?}\n", results)
+    }
+}
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter/stop_words.rs (new file, 1921 lines)
File diff suppressed because it is too large.
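The new stop-word tables themselves are not shown. Based on how the util.rs hunk below consumes the helper, it presumably has roughly this shape; the exact signature and the set of supported language keys beyond "_english_" are assumptions.

// Assumed shape only: inferred from the match on
// stop_words::fetch_language_stop_words(str.as_str()) below; the real tables
// live in the suppressed stop_words.rs.
pub(crate) fn fetch_language_stop_words(language: &str) -> Option<&'static [&'static str]> {
    match language {
        "_english_" => Some(ENGLISH), // plus the additional built-in languages added by this commit
        _ => None,
    }
}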
@@ -1,7 +1,7 @@
 use serde_json as json;
 
-use crate::error::{Result,TantivyBindingError};
-use crate::analyzer::stop_words;
+use super::stop_words;
+use crate::error::{Result, TantivyBindingError};
 
 pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
     if !value.is_array() {
@@ -29,14 +29,14 @@ pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
     let mut stop_words = Vec::new();
     for str in str_list {
         if str.len() > 0 && str.chars().nth(0).unwrap() == '_' {
-            match str.as_str() {
-                "_english_" => {
-                    for word in stop_words::ENGLISH {
+            match stop_words::fetch_language_stop_words(str.as_str()) {
+                Some(words) => {
+                    for word in words {
                         stop_words.push(word.to_string());
                     }
                     continue;
                 }
-                _other => {}
+                None => {}
             }
         }
         stop_words.push(str);
@@ -1,10 +1,8 @@
 mod analyzer;
-mod stop_words;
 mod build_in_analyzer;
 mod filter;
-mod util;
 
 pub mod tokenizers;
 pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
 
 pub(crate) use self::build_in_analyzer::standard_analyzer;
@@ -1,5 +0,0 @@
-pub const ENGLISH: &[&str] = &[
-    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
-    "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
-    "their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
-];