enhance: support using lindera tag filters (#40416)

relate: https://github.com/milvus-io/milvus/issues/39659
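
This adds an optional "filter" field to the lindera tokenizer params. Each entry is either a filter kind string or an object carrying a "kind" plus per-filter params. A params sketch matching the new test below:

    {
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [{
            "kind": "japanese_stop_tags",
            "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
        }]
    }

Supported filter kinds: japanese_compound_word, japanese_keep_tags, japanese_stop_tags, korean_keep_tags, korean_stop_tags.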

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
aoiasd 2025-04-21 15:56:36 +08:00 committed by GitHub
parent c5428c12eb
commit f166843c5e


@@ -1,4 +1,5 @@
use core::result::Result::Err;
use std::collections::HashSet;
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
@@ -7,6 +8,13 @@ use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
use lindera::token_filter::japanese_keep_tags::JapaneseKeepTagsTokenFilter;
use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
use lindera::token_filter::BoxTokenFilter as LTokenFilter;
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
@@ -15,6 +23,9 @@ pub struct LinderaTokenStream<'a> {
pub token: &'a mut Token,
}
const DICTKINDKEY: &str = "dict_kind";
const FILTERKEY: &str = "filter";
impl<'a> TokenStream for LinderaTokenStream<'a> {
fn advance(&mut self) -> bool {
if self.tokens.is_empty() {
@@ -47,17 +58,25 @@ pub struct LinderaTokenizer {
impl LinderaTokenizer {
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` from JSON parameters.
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
let kind = fetch_lindera_kind(params)?;
let dictionary = load_dictionary_from_kind(kind.clone()).map_err(|_| {
TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer with invalid dict_kind"
))
})?;
let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
// append lindera filter
let filters = fetch_lindera_token_filters(&kind, params)?;
for filter in filters {
tokenizer.append_token_filter(filter)
}
Ok(tokenizer)
}
/// Create a new `LinderaTokenizer`.
@@ -68,6 +87,10 @@ impl LinderaTokenizer {
token: Default::default(),
}
}
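/// Append a lindera token filter to the end of the tokenizer's filter pipeline.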
pub fn append_token_filter(&mut self, filter: LTokenFilter) {
self.tokenizer.append_token_filter(filter);
}
}
impl Tokenizer for LinderaTokenizer {
@@ -103,26 +126,209 @@ impl DictionaryKindParser for &str {
}
fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
match params.get("dict_kind") {
Some(val) => {
if !val.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict kind should be string"
)));
}
val.as_str().unwrap().into_dict_kind()
}
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict_kind must be set"
)))
}
params
.get(DICTKINDKEY)
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!("lindera tokenizer dict_kind must be set"))
})?
.as_str()
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict kind should be string"
))
})?
.into_dict_kind()
}
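/// Fetch the required `tags` array from filter params into a `HashSet<String>`.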
fn fetch_lindera_tags_from_params(
params: &json::Map<String, json::Value>,
) -> Result<HashSet<String>> {
params
.get("tags")
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese stop tag filter tags must be set"
))
})?
.as_array()
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese stop tags filter tags must be array"
))
})?
.iter()
.map(|v| {
v.as_str()
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese stop tags filter tags must be string"
))
})
.map(|s| s.to_string())
})
.collect::<Result<HashSet<String>>>()
}
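/// Build a japanese_compound_word filter; `tags` is required and `new_tag` is optional.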
fn fetch_japanese_compound_word_token_filter(
kind: &DictionaryKind,
params: Option<&json::Map<String, json::Value>>,
) -> Result<LTokenFilter> {
let filter_param = params.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese compound word filter must use with params"
))
})?;
let tags: HashSet<String> = fetch_lindera_tags_from_params(filter_param)?;
let new_tag: Option<String> = filter_param
.get("new_tag")
.map(|v| {
v.as_str()
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese compound word filter new_tag must be string"
))
})
.map(|s| s.to_string())
})
.transpose()?;
Ok(JapaneseCompoundWordTokenFilter::new(kind.clone(), tags, new_tag).into())
}
fn fetch_japanese_keep_tags_token_filter(
params: Option<&json::Map<String, json::Value>>,
) -> Result<LTokenFilter> {
Ok(
JapaneseKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese keep tags filter must use with params"
))
},
)?)?)
.into(),
)
}
fn fetch_japanese_stop_tags_token_filter(
params: Option<&json::Map<String, json::Value>>,
) -> Result<LTokenFilter> {
Ok(
JapaneseStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
|| {
TantivyBindingError::InvalidArgument(format!(
"lindera japanese stop tags filter must use with params"
))
},
)?)?)
.into(),
)
}
fn fetch_korean_keep_tags_token_filter(
params: Option<&json::Map<String, json::Value>>,
) -> Result<LTokenFilter> {
Ok(
KoreanKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
|| {
TantivyBindingError::InvalidArgument(format!(
"lindera korean keep tags filter must use with params"
))
},
)?)?)
.into(),
)
}
fn fetch_korean_stop_tags_token_filter(
params: Option<&json::Map<String, json::Value>>,
) -> Result<LTokenFilter> {
Ok(
KoreanStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
|| {
TantivyBindingError::InvalidArgument(format!(
"lindera korean stop tags filter must use with params"
))
},
)?)?)
.into(),
)
}
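/// A `filter` entry is either a plain kind string or an object whose `kind` field names the filter and whose remaining fields are its params.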
fn fetch_lindera_token_filter_params(
params: &json::Value,
) -> Result<(&str, Option<&json::Map<String, json::Value>>)> {
if params.is_string() {
return Ok((params.as_str().unwrap(), None));
}
let kind = params
.as_object()
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer filter params must be object"
))
})?
.get("kind")
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!("lindera tokenizer filter must have type"))
})?
.as_str()
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer filter type should be string"
))
})?;
Ok((kind, Some(params.as_object().unwrap())))
}
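/// Dispatch on the filter kind name and build the matching lindera token filter.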
fn fetch_lindera_token_filter(
type_name: &str,
kind: &DictionaryKind,
params: Option<&json::Map<String, json::Value>>,
) -> Result<LTokenFilter> {
match type_name {
"japanese_compound_word" => fetch_japanese_compound_word_token_filter(kind, params),
"japanese_keep_tags" => fetch_japanese_keep_tags_token_filter(params),
"japanese_stop_tags" => fetch_japanese_stop_tags_token_filter(params),
"korean_keep_tags" => fetch_korean_keep_tags_token_filter(params),
"korean_stop_tags" => fetch_korean_stop_tags_token_filter(params),
_ => Err(TantivyBindingError::InvalidArgument(format!(
"unknown lindera filter kind: {}",
type_name
))),
}
}
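/// Collect all token filters declared under the `filter` key, if any.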
fn fetch_lindera_token_filters(
kind: &DictionaryKind,
params: &json::Map<String, json::Value>,
) -> Result<Vec<LTokenFilter>> {
let mut result: Vec<LTokenFilter> = vec![];
if let Some(v) = params.get(FILTERKEY) {
let filter_list = v.as_array().ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!("lindera filters should be an array"))
})?;
for filter_params in filter_list {
let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
let filter = fetch_lindera_token_filter(name, kind, params)?;
result.push(filter);
}
}
Ok(result)
}
#[cfg(test)]
mod tests {
use serde_json as json;
use tantivy::tokenizer::Tokenizer;
use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;
@@ -130,13 +336,27 @@ mod tests {
fn test_lindera_tokenizer() {
let params = r#"{
"type": "lindera",
"dict_kind": "ipadic"
"dict_kind": "ipadic",
"filter": [{
"kind": "japanese_stop_tags",
"tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
}]
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
let mut binding = tokenizer.unwrap();
let stream =
binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
let mut results = Vec::<String>::new();
for token in stream.tokens {
results.push(token.text.to_string());
}
print!("test tokens :{:?}\n", results)
}
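// A second test sketching the japanese_compound_word path; the POS tag
// "名詞,数" and new_tag "複合語" are illustrative values, not part of the
// original change.
#[test]
fn test_lindera_tokenizer_with_compound_word_filter() {
    let params = r#"{
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [{
            "kind": "japanese_compound_word",
            "tags": ["名詞,数"],
            "new_tag": "複合語"
        }]
    }"#;
    let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
    assert!(json_param.is_ok());
    let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
    assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}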
#[test]