diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
index 2f7a26ff5e..4eabe3bf1e 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
@@ -1,4 +1,5 @@
 use core::result::Result::Err;
+use std::collections::HashSet;
 
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 use lindera::mode::Mode;
@@ -7,6 +8,13 @@
 use lindera::token::Token as LToken;
 use lindera::tokenizer::Tokenizer as LTokenizer;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
+use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
+use lindera::token_filter::japanese_keep_tags::JapaneseKeepTagsTokenFilter;
+use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
+use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
+use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
+use lindera::token_filter::BoxTokenFilter as LTokenFilter;
+
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 
@@ -15,6 +23,9 @@ pub struct LinderaTokenStream<'a> {
     pub token: &'a mut Token,
 }
 
+const DICTKINDKEY: &str = "dict_kind";
+const FILTERKEY: &str = "filter";
+
 impl<'a> TokenStream for LinderaTokenStream<'a> {
     fn advance(&mut self) -> bool {
         if self.tokens.is_empty() {
@@ -47,17 +58,25 @@ pub struct LinderaTokenizer {
 
 impl LinderaTokenizer {
     /// Create a new `LinderaTokenizer`.
-    /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
+    /// This function will create a new `LinderaTokenizer` with json parameters.
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
         let kind = fetch_lindera_kind(params)?;
-        let dictionary = load_dictionary_from_kind(kind);
-        if dictionary.is_err() {
-            return Err(TantivyBindingError::InvalidArgument(format!(
+        let dictionary = load_dictionary_from_kind(kind.clone()).map_err(|_| {
+            TantivyBindingError::InvalidArgument(format!(
                 "lindera tokenizer with invalid dict_kind"
-            )));
+            ))
+        })?;
+
+        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
+
+        // append lindera filter
+        let filters = fetch_lindera_token_filters(&kind, params)?;
+        for filter in filters {
+            tokenizer.append_token_filter(filter)
         }
-        let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
-        Ok(LinderaTokenizer::from_segmenter(segmenter))
+
+        Ok(tokenizer)
     }
 
     /// Create a new `LinderaTokenizer`.
@@ -68,6 +87,10 @@ impl LinderaTokenizer {
             token: Default::default(),
         }
     }
+
+    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
+        self.tokenizer.append_token_filter(filter);
+    }
 }
 
 impl Tokenizer for LinderaTokenizer {
@@ -103,26 +126,209 @@ impl DictionaryKindParser for &str {
 }
 
 fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
-    match params.get("dict_kind") {
-        Some(val) => {
-            if !val.is_string() {
-                return Err(TantivyBindingError::InvalidArgument(format!(
-                    "lindera tokenizer dict kind should be string"
-                )));
-            }
-            val.as_str().unwrap().into_dict_kind()
-        }
-        _ => {
-            return Err(TantivyBindingError::InvalidArgument(format!(
-                "lindera tokenizer dict_kind must be set"
-            )))
-        }
-    }
+    params
+        .get(DICTKINDKEY)
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!("lindera tokenizer dict_kind must be set"))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer dict kind should be string"
+            ))
+        })?
+        .into_dict_kind()
+}
+
+fn fetch_lindera_tags_from_params(
+    params: &json::Map<String, json::Value>,
+) -> Result<HashSet<String>> {
+    params
+        .get("tags")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tag filter tags must be set"
+            ))
+        })?
+        .as_array()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera japanese stop tags filter tags must be array"
+            ))
+        })?
+        .iter()
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese stop tags filter tags must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .collect::<Result<HashSet<String>>>()
+}
+
+fn fetch_japanese_compound_word_token_filter(
+    kind: &DictionaryKind,
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    let filter_param = params.ok_or_else(|| {
+        TantivyBindingError::InvalidArgument(format!(
+            "lindera japanese compound word filter must use with params"
+        ))
+    })?;
+
+    let tags: HashSet<String> = fetch_lindera_tags_from_params(filter_param)?;
+
+    let new_tag: Option<String> = filter_param
+        .get("new_tag")
+        .map(|v| {
+            v.as_str()
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "lindera japanese compound word filter new_tag must be string"
+                    ))
+                })
+                .map(|s| s.to_string())
+        })
+        .transpose()?;
+    Ok(JapaneseCompoundWordTokenFilter::new(kind.clone(), tags, new_tag).into())
+}
+
+fn fetch_japanese_keep_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        JapaneseKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera japanese keep tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_japanese_stop_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        JapaneseStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera japanese stop tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_korean_keep_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        KoreanKeepTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera korean keep tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_korean_stop_tags_token_filter(
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    Ok(
+        KoreanStopTagsTokenFilter::new(fetch_lindera_tags_from_params(params.ok_or_else(
+            || {
+                TantivyBindingError::InvalidArgument(format!(
+                    "lindera korean stop tags filter must use with params"
+                ))
+            },
+        )?)?)
+        .into(),
+    )
+}
+
+fn fetch_lindera_token_filter_params(
+    params: &json::Value,
+) -> Result<(&str, Option<&json::Map<String, json::Value>>)> {
+    if params.is_string() {
+        return Ok((params.as_str().unwrap(), None));
+    }
+
+    let kind = params
+        .as_object()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter params must be object"
+            ))
+        })?
+        .get("kind")
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!("lindera tokenizer filter must have type"))
+        })?
+        .as_str()
+        .ok_or_else(|| {
+            TantivyBindingError::InvalidArgument(format!(
+                "lindera tokenizer filter type should be string"
+            ))
+        })?;
+
+    Ok((kind, Some(params.as_object().unwrap())))
+}
+
+fn fetch_lindera_token_filter(
+    type_name: &str,
+    kind: &DictionaryKind,
+    params: Option<&json::Map<String, json::Value>>,
+) -> Result<LTokenFilter> {
+    match type_name {
+        "japanese_compound_word" => fetch_japanese_compound_word_token_filter(kind, params),
+        "japanese_keep_tags" => fetch_japanese_keep_tags_token_filter(params),
+        "japanese_stop_tags" => fetch_japanese_stop_tags_token_filter(params),
+        "korean_keep_tags" => fetch_korean_keep_tags_token_filter(params),
+        "korean_stop_tags" => fetch_korean_stop_tags_token_filter(params),
+        _ => Err(TantivyBindingError::InvalidArgument(format!(
+            "unknown lindera filter type"
+        ))),
+    }
+}
+
+fn fetch_lindera_token_filters(
+    kind: &DictionaryKind,
+    params: &json::Map<String, json::Value>,
+) -> Result<Vec<LTokenFilter>> {
+    let mut result: Vec<LTokenFilter> = vec![];
+
+    match params.get(FILTERKEY) {
+        Some(v) => {
+            let filter_list = v.as_array().ok_or_else(|| {
+                TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
+            })?;
+
+            for filter_params in filter_list {
+                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
+                let filter = fetch_lindera_token_filter(name, kind, params)?;
+                result.push(filter);
+            }
+        }
+        _ => {}
+    }
+
+    Ok(result)
 }
 
 #[cfg(test)]
 mod tests {
     use serde_json as json;
+    use tantivy::tokenizer::Tokenizer;
 
     use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;
 
@@ -130,13 +336,27 @@ mod tests {
     fn test_lindera_tokenizer() {
         let params = r#"{
             "type": "lindera",
-            "dict_kind": "ipadic"
+            "dict_kind": "ipadic",
+            "filter": [{
+                "kind": "japanese_stop_tags",
+                "tags": ["接続詞", "助詞", "助詞,格助詞", "助詞,連体化"]
+            }]
         }"#;
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
        assert!(json_param.is_ok());

        let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
        assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
+
+        let mut binding = tokenizer.unwrap();
+        let stream =
+            binding.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
+        let mut results = Vec::<String>::new();
+        for token in stream.tokens {
+            results.push(token.text.to_string());
+        }
+
+        print!("test tokens :{:?}\n", results)
    }

    #[test]
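For reviewers, a minimal companion sketch (not part of the patch) showing the new `filter` parameter end to end. The `ipadic` dict kind, the object form of a `filter` entry, and the `japanese_keep_tags` filter kind all come from the diff above; the helper name `keep_nouns_example`, the POS tag `"名詞"` (noun), and the sample sentence are illustrative assumptions.

```rust
use serde_json as json;
use tantivy::tokenizer::Tokenizer;

use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;

// Hypothetical helper, not part of the patch.
fn keep_nouns_example() {
    // Each `filter` entry is an object: `kind` selects the filter, and the
    // remaining keys (here `tags`) are handed to that filter's fetch_*
    // constructor. "名詞" is an assumed IPADIC noun tag.
    let params = r#"{
        "type": "lindera",
        "dict_kind": "ipadic",
        "filter": [{
            "kind": "japanese_keep_tags",
            "tags": ["名詞"]
        }]
    }"#;

    let map = json::from_str::<json::Map<String, json::Value>>(params).unwrap();
    let mut tokenizer = LinderaTokenizer::from_json(&map).unwrap();

    // Only tokens whose part-of-speech matches a kept tag survive filtering.
    let stream = tokenizer.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
    for token in stream.tokens {
        println!("{}", token.text);
    }
}
```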
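One detail worth flagging in review: `fetch_lindera_token_filter_params` also accepts a bare string as a `filter` entry and returns `params = None` for it, but every one of the five filter kinds added here rejects a missing params object with a "... must use with params" error, so in practice each `filter` entry must take the object form shown above.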