enhance: support using lindera tag filters (#40416)
Related: https://github.com/milvus-io/milvus/issues/39659
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
commit f166843c5e
parent c5428c12eb
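In short, the change teaches `LinderaTokenizer::from_json` to read an optional "filter" array from the tokenizer params and append the configured lindera token filters after segmentation. A minimal caller-side sketch, mirroring the updated test at the end of this diff (the param shape and `from_json` come from the diff itself; the wrapper function is illustrative only):

use serde_json as json;

use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

// Build a lindera tokenizer that drops tokens tagged as conjunctions
// and particles ("接続詞", "助詞") via the japanese_stop_tags filter.
fn build_filtered_tokenizer() -> LinderaTokenizer {
    let params = r#"{
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [{
            "kind": "japanese_stop_tags",
            "tags": ["接続詞", "助詞"]
        }]
    }"#;
    let map = json::from_str::<json::Map<String, json::Value>>(params)
        .expect("params are valid json");
    LinderaTokenizer::from_json(&map).expect("params describe a valid tokenizer")
}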
@@ -1,4 +1,5 @@
 use core::result::Result::Err;
+use std::collections::HashSet;

 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 use lindera::mode::Mode;
@@ -7,6 +8,13 @@ use lindera::token::Token as LToken;
 use lindera::tokenizer::Tokenizer as LTokenizer;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

+use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
+use lindera::token_filter::japanese_keep_tags::JapaneseKeepTagsTokenFilter;
+use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
+use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
+use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
+use lindera::token_filter::BoxTokenFilter as LTokenFilter;
+
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;

@@ -15,6 +23,9 @@ pub struct LinderaTokenStream<'a> {
     pub token: &'a mut Token,
 }

+const DICTKINDKEY: &str = "dict_kind";
+const FILTERKEY: &str = "filter";
+
 impl<'a> TokenStream for LinderaTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if self.tokens.is_empty() {
@@ -47,17 +58,25 @@ pub struct LinderaTokenizer {

 impl LinderaTokenizer {
     /// Create a new `LinderaTokenizer`.
-    /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
+    /// This function will create a new `LinderaTokenizer` with json parameters.
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
         let kind = fetch_lindera_kind(params)?;
-        let dictionary = load_dictionary_from_kind(kind);
-        if dictionary.is_err() {
-            return Err(TantivyBindingError::InvalidArgument(format!(
-                "lindera tokenizer with invalid dict_kind"
-            )));
-        }
-        let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
-        Ok(LinderaTokenizer::from_segmenter(segmenter))
+        let dictionary = load_dictionary_from_kind(kind.clone()).map_err(|_| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer with invalid dict_kind"
+            ))
+        })?;
+
+        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
+
+        // append lindera filter
+        let filters = fetch_lindera_token_filters(&kind, params)?;
+        for filter in filters {
+            tokenizer.append_token_filter(filter)
+        }
+
+        Ok(tokenizer)
     }

     /// Create a new `LinderaTokenizer`.
@@ -68,6 +87,10 @@ impl LinderaTokenizer {
             token: Default::default(),
         }
     }
+
+    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
+        self.tokenizer.append_token_filter(filter);
+    }
 }

 impl Tokenizer for LinderaTokenizer {
@@ -103,26 +126,209 @@ impl DictionaryKindParser for &str {
 }

 fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
-    match params.get("dict_kind") {
-        Some(val) => {
-            if !val.is_string() {
-                return Err(TantivyBindingError::InvalidArgument(format!(
-                    "lindera tokenizer dict kind should be string"
-                )));
-            }
-            val.as_str().unwrap().into_dict_kind()
-        }
-        _ => {
-            return Err(TantivyBindingError::InvalidArgument(format!(
-                "lindera tokenizer dict_kind must be set"
-            )))
-        }
-    }
+    params
+        .get(DICTKINDKEY)
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!("lindera tokenizer dict_kind must be set"))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer dict kind should be string"
+            ))
+        })?
+        .into_dict_kind()
 }
+
+fn fetch_lindera_tags_from_params(
+    params: &json::Map<String, json::Value>,
+) -> Result<HashSet<String>> {
+    params
+        .get("tags")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tag filter tags must be set"
+            ))
+        })?
+        .as_array()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tags filter tags must be array"
+            ))
+        })?
+        .iter()
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese stop tags filter tags must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .collect::<Result<HashSet<String>>>()
+}
+
+fn fetch_japanese_compound_word_token_filter(
+    kind: &DictionaryKind,
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    let filter_param = params.ok_or_else(|| {
+        TantivyBindingError::InvalidArgument(format!(
+            "lindera japanese compound word filter must use with params"
+        ))
+    })?;
+
+    let tags: HashSet<String> = fetch_lindera_tags_from_params(filter_param)?;
+
+    let new_tag: Option<String> = filter_param
+        .get("new_tag")
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese compound word filter new_tag must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .transpose()?;
+
+    Ok(JapaneseCompoundWordTokenFilter::new(kind.clone(), tags, new_tag).into())
+}
+
+fn fetch_japanese_keep_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        JapaneseKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera japanese keep tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_japanese_stop_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        JapaneseStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera japanese stop tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_korean_keep_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        KoreanKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera korean keep tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_korean_stop_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        KoreanStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera korean stop tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_lindera_token_filter_params(
+    params: &json::Value,
+) -> Result<(&str, Option<&json::Map<String, json::Value>>)> {
+    if params.is_string() {
+        return Ok((params.as_str().unwrap(), None));
+    }
+
+    let kind = params
+        .as_object()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter params must be object"
+            ))
+        })?
+        .get("kind")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!("lindera tokenizer filter must have type"))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter type should be string"
+            ))
+        })?;
+
+    Ok((kind, Some(params.as_object().unwrap())))
+}
+
+fn fetch_lindera_token_filter(
+    type_name: &str,
+    kind: &DictionaryKind,
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    match type_name {
+        "japanese_compound_word" => fetch_japanese_compound_word_token_filter(kind, params),
+        "japanese_keep_tags" => fetch_japanese_keep_tags_token_filter(params),
+        "japanese_stop_tags" => fetch_japanese_stop_tags_token_filter(params),
+        "korean_keep_tags" => fetch_korean_keep_tags_token_filter(params),
+        "korean_stop_tags" => fetch_korean_stop_tags_token_filter(params),
+        _ => Err(TantivyBindingError::InvalidArgument(format!(
+            "unknown lindera filter type"
+        ))),
+    }
+}
+
+fn fetch_lindera_token_filters(
+    kind: &DictionaryKind,
+    params: &json::Map<String, json::Value>,
+) -> Result<Vec<LTokenFilter>> {
+    let mut result: Vec<LTokenFilter> = vec![];
+
+    match params.get(FILTERKEY) {
+        Some(v) => {
+            let filter_list = v.as_array().ok_or_else(|| {
+                TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
+            })?;
+
+            for filter_params in filter_list {
+                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
+                let filter = fetch_lindera_token_filter(name, kind, params)?;
+                result.push(filter);
+            }
+        }
+        _ => {}
+    }
+
+    Ok(result)
+}

 #[cfg(test)]
 mod tests {
     use serde_json as json;
+    use tantivy::tokenizer::Tokenizer;

     use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

@@ -130,13 +336,27 @@ mod tests {
     fn test_lindera_tokenizer() {
         let params = r#"{
             "type": "lindera",
-            "dict_kind": "ipadic"
+            "dict_kind": "ipadic",
+            "filter": [{
+                "kind": "japanese_stop_tags",
+                "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
+            }]
         }"#;
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
         assert!(json_param.is_ok());

         let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut binding = tokenizer.unwrap();
+        let stream =
+            binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
+        let mut results = Vec::<String>::new();
+        for token in stream.tokens {
+            results.push(token.text.to_string());
+        }
+
+        print!("test tokens :{:?}\n", results)
     }

     #[test]
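The diff also makes `append_token_filter` public, so a filter can be attached without round-tripping through JSON. A sketch under the same assumptions the diff itself relies on (`JapaneseStopTagsTokenFilter::new` takes the tag set and converts into the boxed `LTokenFilter` via `.into()`, exactly as `fetch_japanese_stop_tags_token_filter` does above; the wrapper function is illustrative only):

use std::collections::HashSet;

use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;

use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

// Attach a stop-tags filter directly; equivalent to configuring
// {"kind": "japanese_stop_tags", "tags": [...]} in the JSON params.
fn drop_particles(tokenizer: &mut LinderaTokenizer) {
    let tags: HashSet<String> = ["接続詞", "助詞"].iter().map(|s| s.to_string()).collect();
    tokenizer.append_token_filter(JapaneseStopTagsTokenFilter::new(tags).into());
}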