From cc7652327da220bcb40b0e36ceea9838a745f384 Mon Sep 17 00:00:00 2001
From: aoiasd <45024769+aoiasd@users.noreply.github.com>
Date: Tue, 6 Jan 2026 21:19:25 +0800
Subject: [PATCH] enhance: optimize jieba and lindera analyzer clone (#46719)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

relate: https://github.com/milvus-io/milvus/issues/46718

## Enhancement: Optimize Jieba and Lindera Analyzer Clone

**Core Invariant**: `JiebaTokenizer` and `LinderaTokenizer` must be efficiently cloneable, without lifetime constraints, to support analyzer composition in multi-language detection chains.

**What Logic Was Improved**:

- **JiebaTokenizer**: Replaced `Cow<'a, Jieba>` with `Arc<Jieba>` and removed the `<'a>` lifetime parameter. The global `JIEBA` instance is now wrapped in an `Arc`, enabling `#[derive(Clone)]` on the struct. This eliminates lifetime management complexity while keeping zero-copy sharing via atomic reference counting (see the first sketch below).
- **LinderaTokenizer**: Introduced a public `LinderaSegmenter` struct encapsulating the dictionary and mode state, and implemented an explicit `Clone` that duplicates the segmenter (cloning its `Arc`-wrapped dictionary), applies `box_clone()` to each boxed token filter, and clones the token buffer (see the second sketch below). Previously, `Clone` was either unavailable or handled the boxed trait objects incompletely.

**Why Previous Implementation Was Limiting**:

- The `Cow::Borrowed` pattern for `JiebaTokenizer` created explicit lifetime dependencies that prevented a straightforward `#[derive(Clone)]`. Switching to `Arc` removes the borrow-checker constraints while providing the same reference semantics for immutable shared state.
- `LinderaTokenizer`'s token filters are boxed trait objects, for which `Clone` cannot be auto-derived. A manual `Clone` implementation that calls `box_clone()` correctly handles polymorphic filter duplication.

**No Data Loss or Behavior Regression**:

- `Arc` cloning is semantically equivalent to `Cow::Borrowed` for read-only access; both efficiently share the underlying `Jieba` instance and `Dictionary` without data duplication.
- The explicit `Clone` preserves all tokenizer state: the segmenter (with its shared `Arc` dictionary), all token filters (via individual `box_clone` calls), and the token buffer used during tokenization.
- Token stream behavior is unchanged: segmentation and filter application order remain identical.
- New benchmarks (`bench_jieba_tokenizer_clone`, `bench_lindera_tokenizer_clone`) measure and validate clone performance for both tokenizers.
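For illustration, here is a minimal, self-contained sketch of the `Arc` sharing pattern described above. `Dict` and `Tok` are hypothetical stand-ins for `jieba_rs::Jieba` and `JiebaTokenizer`; this is not the actual Milvus code:

```rust
use std::sync::Arc;

// Hypothetical stand-in for `jieba_rs::Jieba`: large and immutable after construction.
struct Dict {
    words: Vec<String>,
}

// No `<'a>` lifetime parameter needed: the shared dictionary is reference-counted,
// so `#[derive(Clone)]` works and cloning never copies the dictionary itself.
#[derive(Clone)]
struct Tok {
    dict: Arc<Dict>,
}

fn main() {
    let shared = Arc::new(Dict { words: vec!["例".to_string()] });
    let a = Tok { dict: Arc::clone(&shared) };
    let b = a.clone(); // O(1): bumps the refcount, no dictionary copy
    assert!(Arc::ptr_eq(&a.dict, &b.dict));
    assert_eq!(Arc::strong_count(&shared), 3); // shared + a + b
    assert_eq!(b.dict.words.len(), 1);
}
```

Because the dictionary is immutable after construction, `Arc` gives the same read-only sharing that `Cow::Borrowed` provided, without the lifetime parameter.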
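Similarly, a minimal sketch of the manual `Clone` via `box_clone()` for boxed trait objects. `Filter`, `Lowercase`, and `Pipeline` are hypothetical stand-ins for lindera's token-filter machinery (the real `BoxTokenFilter` exposes a `box_clone()` used the same way in this patch):

```rust
// Hypothetical stand-in for a token-filter trait with object-safe cloning.
trait Filter {
    fn apply(&self, tokens: &mut Vec<String>);
    fn box_clone(&self) -> Box<dyn Filter>;
}

#[derive(Clone)]
struct Lowercase;

impl Filter for Lowercase {
    fn apply(&self, tokens: &mut Vec<String>) {
        for t in tokens.iter_mut() {
            *t = t.to_lowercase();
        }
    }
    // Each concrete filter knows how to clone itself behind the box.
    fn box_clone(&self) -> Box<dyn Filter> {
        Box::new(self.clone())
    }
}

struct Pipeline {
    filters: Vec<Box<dyn Filter>>,
}

// `#[derive(Clone)]` fails here because `Box<dyn Filter>` is not `Clone`;
// the manual impl duplicates each filter through its `box_clone()` method.
impl Clone for Pipeline {
    fn clone(&self) -> Self {
        Self {
            filters: self.filters.iter().map(|f| f.box_clone()).collect(),
        }
    }
}

fn main() {
    let p = Pipeline { filters: vec![Box::new(Lowercase)] };
    let q = p.clone(); // deep-clones the boxed filters
    let mut tokens = vec!["TOKYO".to_string()];
    for f in &q.filters {
        f.apply(&mut tokens);
    }
    assert_eq!(tokens[0], "tokyo");
}
```

The pipeline's `Clone` stays polymorphic without requiring `Clone` on the trait object itself, which is exactly why `LinderaTokenizer` needs a hand-written implementation.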
---------

Signed-off-by: aoiasd
---
 .../tantivy-binding/benches/analyzer_bench.rs |  44 +++-
 .../analyzer/tokenizers/jieba_tokenizer.rs    |  21 +-
 .../tokenizers/lang_ident_tokenizer.rs        |   9 +-
 .../analyzer/tokenizers/lindera_tokenizer.rs  | 189 ++++++++++++++----
 4 files changed, 206 insertions(+), 57 deletions(-)

diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/benches/analyzer_bench.rs b/internal/core/thirdparty/tantivy/tantivy-binding/benches/analyzer_bench.rs
index ae914abd7b..058c9a3704 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/benches/analyzer_bench.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/benches/analyzer_bench.rs
@@ -8,6 +8,10 @@ fn test_analyzer(tokenizer: &mut TextAnalyzer) {
     tokenizer.token_stream(text);
 }
 
+fn clone_analyzer(tokenizer: &mut TextAnalyzer) {
+    let _ = tokenizer.clone();
+}
+
 fn bench_lindua_language_identifier_tokenizer(c: &mut Criterion) {
     let params = r#"
     {
@@ -57,7 +61,7 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
         }
       },
       "mapping": {
-        "Chinese": "jieba",
+        "Mandarin": "jieba",
         "English": "en"
       },
       "identifier": "whatlang"
@@ -72,9 +76,45 @@
     });
 }
 
+fn bench_jieba_tokenizer_clone(c: &mut Criterion) {
+    let params = r#"
+    {
+        "tokenizer": {
+            "type": "jieba",
+            "dict":["_extend_default_"]
+        }
+    }
+    "#;
+    let mut analyzer = create_analyzer(params, "");
+    assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
+
+    c.bench_function("test", |b| {
+        b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
+    });
+}
+
+fn bench_lindera_tokenizer_clone(c: &mut Criterion) {
+    let params = r#"
+    {
+        "tokenizer": {
+            "type": "lindera",
+            "dict_kind": "ipadic"
+        }
+    }
+    "#;
+    let mut analyzer = create_analyzer(params, "");
+    assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
+
+    c.bench_function("test", |b| {
+        b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
+    });
+}
+
 criterion_group!(
     benches,
     bench_lindua_language_identifier_tokenizer,
-    bench_whatlang_language_identifier_tokenizer
+    bench_whatlang_language_identifier_tokenizer,
+    bench_jieba_tokenizer_clone,
+    bench_lindera_tokenizer_clone
 );
 criterion_main!(benches);
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs
index e8a01d842a..4c355de421 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs
@@ -1,18 +1,17 @@
 use core::{option::Option::Some, result::Result::Ok};
 use jieba_rs;
 use lazy_static::lazy_static;
-use log::warn;
 use serde_json as json;
 use std::fs;
 use std::io::BufReader;
-use std::{borrow::Cow, path::PathBuf};
+use std::{path::PathBuf, sync::Arc};
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
 use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
 use crate::error::{Result, TantivyBindingError};
 
 lazy_static! {
-    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
+    static ref JIEBA: Arc<jieba_rs::Jieba> = Arc::new(jieba_rs::Jieba::new());
 }
 
 static EXTEND_DEFAULT_DICT: &str = include_str!("../data/jieba/dict.txt.big");
 
@@ -25,10 +24,10 @@ pub enum JiebaMode {
 }
 
 #[derive(Clone)]
-pub struct JiebaTokenizer<'a> {
+pub struct JiebaTokenizer {
     mode: JiebaMode,
     hmm: bool,
-    tokenizer: Cow<'a, jieba_rs::Jieba>,
+    tokenizer: Arc<jieba_rs::Jieba>,
 }
 
 pub struct JiebaTokenStream {
@@ -149,19 +148,19 @@ fn get_jieba_hmm(params: &json::Map<String, json::Value>) -> Result<bool> {
     }
 }
 
-impl<'a> JiebaTokenizer<'a> {
-    pub fn new() -> JiebaTokenizer<'a> {
+impl JiebaTokenizer {
+    pub fn new() -> JiebaTokenizer {
         JiebaTokenizer {
             mode: JiebaMode::Search,
             hmm: true,
-            tokenizer: Cow::Borrowed(&JIEBA),
+            tokenizer: JIEBA.clone(),
         }
     }
 
     pub fn from_json(
         params: &json::Map<String, json::Value>,
         helper: &mut FileResourcePathHelper,
-    ) -> Result<JiebaTokenizer<'a>> {
+    ) -> Result<JiebaTokenizer> {
         let (words, system_dict, user_dict) = get_jieba_dict(params, helper)?;
 
         let mut tokenizer =
@@ -203,7 +202,7 @@ impl<'a> JiebaTokenizer<'a> {
         Ok(JiebaTokenizer {
             mode: mode,
             hmm: hmm,
-            tokenizer: Cow::Owned(tokenizer),
+            tokenizer: Arc::new(tokenizer),
         })
     }
 
@@ -235,7 +234,7 @@ impl<'a> JiebaTokenizer<'a> {
     }
 }
 
-impl Tokenizer for JiebaTokenizer<'static> {
+impl Tokenizer for JiebaTokenizer {
     type TokenStream<'a> = JiebaTokenStream;
 
     fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs
index 41397efedc..419cdfa8d2 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lang_ident_tokenizer.rs
@@ -249,7 +249,6 @@ impl<'a> LangIdentTokenizer<'a> {
     fn tokenize<'b>(&'b mut self, text: &'b str) -> BoxTokenStream<'b> {
         let language: String = self.identifier.0.detect(text);
         let analyzer = self.get_by_language(language.as_str());
-
         analyzer.token_stream(text)
     }
 }
@@ -287,7 +286,7 @@ mod tests {
         let mut analyzer = LangIdentTokenizer::new(BoxIdentifier::default());
         let result = || -> Result<()> {
             analyzer.add("default", create_analyzer(standard_params, "")?);
-            analyzer.add("cmn", create_analyzer(jieba_params, "")?);
+            analyzer.add("Mandarin", create_analyzer(jieba_params, "")?);
             Ok(())
         }();
 
@@ -304,7 +303,7 @@ mod tests {
             "default": {
                 "tokenizer": "standard"
             },
-            "cmn": {
+            "Mandarin": {
                 "tokenizer": "jieba"
             }
         }
@@ -339,8 +338,8 @@
             }
           },
           "mapping": {
-            "cmn": "jieba",
-            "eng": "en"
+            "Mandarin": "jieba",
+            "English": "en"
           }
         }"#;
 
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
index f0110720c9..01db259131 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
@@ -1,11 +1,10 @@
 use core::result::Result::Err;
 use std::collections::HashSet;
+use std::{borrow::Cow, sync::Arc};
 
 use lindera::dictionary::DictionaryKind;
 use lindera::mode::Mode;
-use lindera::segmenter::Segmenter;
 use lindera::token::Token as LToken;
-use lindera::tokenizer::Tokenizer as LTokenizer;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
 use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@@ -15,10 +14,104 @@ use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
 use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
 use lindera::token_filter::BoxTokenFilter as LTokenFilter;
 
+use lindera::dictionary::{Dictionary, UserDictionary};
+use lindera_dictionary::viterbi::Lattice;
+
 use crate::analyzer::dict::lindera::load_dictionary_from_kind;
 use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 
+/// Segmenter
+#[derive(Clone)]
+pub struct LinderaSegmenter {
+    /// The segmentation mode to be used by the segmenter.
+    /// This determines how the text will be split into segments.
+    pub mode: Mode,
+
+    /// The dictionary used for segmenting text. This dictionary contains the necessary
+    /// data structures and algorithms to perform morphological analysis and tokenization.
+    pub dictionary: Arc<Dictionary>,
+
+    /// An optional user-defined dictionary that can be used to customize the segmentation process.
+    /// If provided, this dictionary will be used in addition to the default dictionary to improve
+    /// the accuracy of segmentation for specific words or phrases.
+    pub user_dictionary: Option<Arc<UserDictionary>>,
+}
+
+impl LinderaSegmenter {
+    /// Creates a new instance with the specified mode, dictionary, and optional user dictionary.
+    pub fn new(
+        mode: Mode,
+        dictionary: Dictionary,
+        user_dictionary: Option<UserDictionary>,
+    ) -> Self {
+        Self {
+            mode,
+            dictionary: Arc::new(dictionary),
+            user_dictionary: user_dictionary.map(|d| Arc::new(d)),
+        }
+    }
+
+    pub fn segment<'a>(&'a self, text: Cow<'a, str>) -> Result<Vec<LToken<'a>>> {
+        let mut tokens: Vec<LToken> = Vec::new();
+        let mut lattice = Lattice::default();
+
+        let mut position = 0_usize;
+        let mut byte_position = 0_usize;
+
+        // Split text into sentences using Japanese punctuation.
+        for sentence in text.split_inclusive(&['。', '、', '\n', '\t']) {
+            if sentence.is_empty() {
+                continue;
+            }
+
+            lattice.set_text(
+                &self.dictionary.prefix_dictionary,
+                &self.user_dictionary.as_ref().map(|d| &d.dict),
+                &self.dictionary.character_definition,
+                &self.dictionary.unknown_dictionary,
+                sentence,
+                &self.mode,
+            );
+            lattice.calculate_path_costs(&self.dictionary.connection_cost_matrix, &self.mode);
+
+            let offsets = lattice.tokens_offset();
+
+            for i in 0..offsets.len() {
+                let (byte_start, word_id) = offsets[i];
+                let byte_end = if i == offsets.len() - 1 {
+                    sentence.len()
+                } else {
+                    let (next_start, _word_id) = offsets[i + 1];
+                    next_start
+                };
+
+                // retrieve token from its sentence byte positions
+                let surface = &sentence[byte_start..byte_end];
+
+                // compute the token's absolute byte positions
+                let token_start = byte_position;
+                byte_position += surface.len();
+                let token_end = byte_position;
+
+                // Use Cow::Owned to ensure the token data can be returned safely
+                tokens.push(LToken::new(
+                    Cow::Owned(surface.to_string()), // Clone the string here
+                    token_start,
+                    token_end,
+                    position,
+                    word_id,
+                    &self.dictionary,
+                    self.user_dictionary.as_deref(),
+                ));
+
+                position += 1;
+            }
+        }
+
+        Ok(tokens)
+    }
+}
 
 pub struct LinderaTokenStream<'a> {
     pub tokens: Vec<LToken<'a>>,
@@ -52,12 +145,27 @@ impl<'a> TokenStream for LinderaTokenStream<'a> {
     }
 }
 
-#[derive(Clone)]
 pub struct LinderaTokenizer {
-    tokenizer: LTokenizer,
+    segmenter: LinderaSegmenter,
+    lindera_filters: Vec<LTokenFilter>,
     token: Token,
 }
 
+impl Clone for LinderaTokenizer {
+    fn clone(&self) -> Self {
+        let mut token_filters: Vec<LTokenFilter> = Vec::new();
+        for token_filter in self.lindera_filters.iter() {
+            token_filters.push(token_filter.box_clone());
+        }
+
+        Self {
+            segmenter: self.segmenter.clone(),
+            lindera_filters: token_filters,
+            token: self.token.clone(),
+        }
+    }
+}
+
 impl LinderaTokenizer {
     /// Create a new `LinderaTokenizer`.
     /// This function will create a new `LinderaTokenizer` with json parameters.
@@ -70,29 +178,45 @@ impl LinderaTokenizer {
         let dictionary = load_dictionary_from_kind(&kind, build_dir, download_urls)?;
 
-        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+        let segmenter = LinderaSegmenter::new(Mode::Normal, dictionary, None);
         let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
 
         // append lindera filter
-        let filters = fetch_lindera_token_filters(&kind, params)?;
-        for filter in filters {
-            tokenizer.append_token_filter(filter)
-        }
-
+        tokenizer.append_token_filter(&kind, params)?;
         Ok(tokenizer)
     }
 
     /// Create a new `LinderaTokenizer`.
     /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
-    pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
+    pub fn from_segmenter(segmenter: LinderaSegmenter) -> LinderaTokenizer {
         LinderaTokenizer {
-            tokenizer: LTokenizer::new(segmenter),
+            segmenter: segmenter,
+            lindera_filters: vec![],
             token: Default::default(),
         }
     }
 
-    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
-        self.tokenizer.append_token_filter(filter);
+    pub fn append_token_filter(
+        &mut self,
+        kind: &DictionaryKind,
+        params: &json::Map<String, json::Value>,
+    ) -> Result<()> {
+        match params.get(FILTER_KEY) {
+            Some(v) => {
+                let filter_list = v.as_array().ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
+                })?;
+
+                for filter_params in filter_list {
+                    let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
+                    let filter = fetch_lindera_token_filter(name, kind, params)?;
+                    self.lindera_filters.push(filter);
+                }
+            }
+            _ => {}
+        }
+
+        Ok(())
     }
 }
 
@@ -101,8 +225,19 @@ impl Tokenizer for LinderaTokenizer {
 
     fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
         self.token.reset();
+        // Segment a text.
+        let mut tokens = self
+            .segmenter
+            .segment(Cow::<'a, str>::Borrowed(text))
+            .unwrap();
+
+        // Apply token filters to the tokens if they are not empty.
+        for token_filter in &self.lindera_filters {
+            token_filter.apply(&mut tokens).unwrap();
+        }
+
         LinderaTokenStream {
-            tokens: self.tokenizer.tokenize(text).unwrap(),
+            tokens: tokens,
             token: &mut self.token,
         }
     }
@@ -312,30 +447,6 @@ fn fetch_lindera_token_filter(
     }
 }
 
-fn fetch_lindera_token_filters(
-    kind: &DictionaryKind,
-    params: &json::Map<String, json::Value>,
-) -> Result<Vec<LTokenFilter>> {
-    let mut result: Vec<LTokenFilter> = vec![];
-
-    match params.get(FILTER_KEY) {
-        Some(v) => {
-            let filter_list = v.as_array().ok_or_else(|| {
-                TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
-            })?;
-
-            for filter_params in filter_list {
-                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
-                let filter = fetch_lindera_token_filter(name, kind, params)?;
-                result.push(filter);
-            }
-        }
-        _ => {}
-    }
-
-    Ok(result)
-}
-
 #[cfg(test)]
 mod tests {
     use super::LinderaTokenizer;