enhance: optimize jieba and lindera analyzer clone (#46719)
relate: https://github.com/milvus-io/milvus/issues/46718

## Enhancement: Optimize Jieba and Lindera Analyzer Clone

**Core Invariant**: `JiebaTokenizer` and `LinderaTokenizer` must be efficiently cloneable without lifetime constraints to support analyzer composition in multi-language detection chains.

**What Logic Was Improved**:

- **JiebaTokenizer**: Replaced `Cow<'a, Jieba>` with `Arc<jieba_rs::Jieba>` and removed the `<'a>` lifetime parameter. The global JIEBA instance is now wrapped in an `Arc`, enabling `#[derive(Clone)]` on the struct. This eliminates lifetime-management complexity while keeping zero-copy sharing via atomic reference counting.
- **LinderaTokenizer**: Introduced a public `LinderaSegmenter` struct encapsulating the dictionary and mode state, and implemented an explicit `Clone` that duplicates the segmenter (cloning the Arc-wrapped dictionary), applies `box_clone()` to each boxed token filter, and clones the token buffer. Previously, `Clone` was either unavailable or did not handle the boxed trait objects correctly.

**Why the Previous Implementation Was Limiting**:

- The `Cow::Borrowed` pattern for `JiebaTokenizer` created explicit lifetime dependencies that prevented a straightforward `#[derive(Clone)]`. Switching to `Arc` removes the borrow-checker constraints while providing the same reference semantics for immutable shared state.
- `LinderaTokenizer`'s token filters are boxed trait objects, for which `Clone` cannot be auto-derived. The manual `Clone` implementation with `box_clone()` calls correctly handles polymorphic filter duplication.

**No Data Loss or Behavior Regression**:

- Arc cloning is semantically equivalent to `Cow::Borrowed` for read-only access; both share the underlying Jieba instance and Dictionary without duplicating data.
- The explicit `Clone` preserves all tokenizer state: the segmenter (with its shared Arc dictionary), all token filters (via individual `box_clone` calls), and the token buffer used during tokenization.
- Token-stream behavior is unchanged: segmentation and filter application order remain identical.
- New benchmarks (`bench_jieba_tokenizer_clone`, `bench_lindera_tokenizer_clone`) measure and validate clone performance for both tokenizers.

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
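The two clone strategies described above can be sketched outside the Milvus code base. The snippet below is a minimal, self-contained analogue, not the actual `jieba_rs` or `lindera` API: `BigDictionary`, `JiebaLikeTokenizer`, `LinderaLikeTokenizer`, and the `TokenFilter` trait are hypothetical stand-ins. It shows why an `Arc`-wrapped immutable resource makes `#[derive(Clone)]` cheap, and why a struct holding boxed trait objects needs a manual `Clone` that routes through a `box_clone` hook.

```rust
use std::sync::Arc;

// Hypothetical stand-in for an expensive, immutable resource
// such as a loaded jieba or lindera dictionary.
struct BigDictionary {
    _entries: Vec<String>,
}

// Pattern 1: keep the shared resource behind an Arc so that
// #[derive(Clone)] only bumps a reference count.
#[derive(Clone)]
struct JiebaLikeTokenizer {
    dictionary: Arc<BigDictionary>,
}

// Pattern 2: boxed trait objects cannot be cloned automatically,
// so the trait exposes an explicit box_clone hook.
trait TokenFilter {
    fn apply(&self, tokens: &mut Vec<String>);
    fn box_clone(&self) -> Box<dyn TokenFilter>;
}

#[derive(Clone)]
struct LowercaseFilter;

impl TokenFilter for LowercaseFilter {
    fn apply(&self, tokens: &mut Vec<String>) {
        for t in tokens.iter_mut() {
            *t = t.to_lowercase();
        }
    }
    fn box_clone(&self) -> Box<dyn TokenFilter> {
        Box::new(self.clone())
    }
}

struct LinderaLikeTokenizer {
    dictionary: Arc<BigDictionary>,
    filters: Vec<Box<dyn TokenFilter>>,
}

impl Clone for LinderaLikeTokenizer {
    fn clone(&self) -> Self {
        Self {
            // Cheap: shares the dictionary instead of copying it.
            dictionary: Arc::clone(&self.dictionary),
            // Each boxed filter is duplicated through its box_clone hook.
            filters: self.filters.iter().map(|f| f.box_clone()).collect(),
        }
    }
}

fn main() {
    let dict = Arc::new(BigDictionary { _entries: vec!["milvus".into()] });

    let jieba_like = JiebaLikeTokenizer { dictionary: Arc::clone(&dict) };
    let _copy = jieba_like.clone(); // only a refcount bump

    let lindera_like = LinderaLikeTokenizer {
        dictionary: dict,
        filters: vec![Box::new(LowercaseFilter)],
    };
    let cloned = lindera_like.clone();
    assert!(Arc::strong_count(&cloned.dictionary) >= 2);
}
```

This mirrors the shape of the change in the diff below: the Jieba side only needs the Arc swap to make `#[derive(Clone)]` valid, while the Lindera side needs the explicit `Clone` impl because of its boxed filters.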
parent c7b5c23ff6
commit cc7652327d
@@ -8,6 +8,10 @@ fn test_analyzer(tokenizer: &mut TextAnalyzer) {
    tokenizer.token_stream(text);
}

fn clone_analyzer(tokenizer: &mut TextAnalyzer) {
    let _ = tokenizer.clone();
}

fn bench_lindua_language_identifier_tokenizer(c: &mut Criterion) {
    let params = r#"
    {
@@ -57,7 +61,7 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
            }
        },
        "mapping": {
            "Chinese": "jieba",
            "Mandarin": "jieba",
            "English": "en"
        },
        "identifier": "whatlang"
@@ -72,9 +76,45 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
    });
}

fn bench_jieba_tokenizer_clone(c: &mut Criterion) {
    let params = r#"
    {
        "tokenizer": {
            "type": "jieba",
            "dict":["_extend_default_"]
        }
    }
    "#;
    let mut analyzer = create_analyzer(params, "");
    assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());

    c.bench_function("test", |b| {
        b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
    });
}

fn bench_lindera_tokenizer_clone(c: &mut Criterion) {
    let params = r#"
    {
        "tokenizer": {
            "type": "lindera",
            "dict_kind": "ipadic"
        }
    }
    "#;
    let mut analyzer = create_analyzer(params, "");
    assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());

    c.bench_function("test", |b| {
        b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
    });
}

criterion_group!(
    benches,
    bench_lindua_language_identifier_tokenizer,
    bench_whatlang_language_identifier_tokenizer
    bench_whatlang_language_identifier_tokenizer,
    bench_jieba_tokenizer_clone,
    bench_lindera_tokenizer_clone
);
criterion_main!(benches);
@@ -1,18 +1,17 @@
use core::{option::Option::Some, result::Result::Ok};
use jieba_rs;
use lazy_static::lazy_static;
use log::warn;
use serde_json as json;
use std::fs;
use std::io::BufReader;
use std::{borrow::Cow, path::PathBuf};
use std::{path::PathBuf, sync::Arc};
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
use crate::error::{Result, TantivyBindingError};

lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
    static ref JIEBA: Arc<jieba_rs::Jieba> = Arc::new(jieba_rs::Jieba::new());
}

static EXTEND_DEFAULT_DICT: &str = include_str!("../data/jieba/dict.txt.big");
@@ -25,10 +24,10 @@ pub enum JiebaMode {
}

#[derive(Clone)]
pub struct JiebaTokenizer<'a> {
pub struct JiebaTokenizer {
    mode: JiebaMode,
    hmm: bool,
    tokenizer: Cow<'a, jieba_rs::Jieba>,
    tokenizer: Arc<jieba_rs::Jieba>,
}

pub struct JiebaTokenStream {
@@ -149,19 +148,19 @@ fn get_jieba_hmm(params: &json::Map<String, json::Value>) -> Result<bool> {
    }
}

impl<'a> JiebaTokenizer<'a> {
    pub fn new() -> JiebaTokenizer<'a> {
impl JiebaTokenizer {
    pub fn new() -> JiebaTokenizer {
        JiebaTokenizer {
            mode: JiebaMode::Search,
            hmm: true,
            tokenizer: Cow::Borrowed(&JIEBA),
            tokenizer: JIEBA.clone(),
        }
    }

    pub fn from_json(
        params: &json::Map<String, json::Value>,
        helper: &mut FileResourcePathHelper,
    ) -> Result<JiebaTokenizer<'a>> {
    ) -> Result<JiebaTokenizer> {
        let (words, system_dict, user_dict) = get_jieba_dict(params, helper)?;

        let mut tokenizer =
@@ -203,7 +202,7 @@ impl<'a> JiebaTokenizer<'a> {
        Ok(JiebaTokenizer {
            mode: mode,
            hmm: hmm,
            tokenizer: Cow::Owned(tokenizer),
            tokenizer: Arc::new(tokenizer),
        })
    }

@@ -235,7 +234,7 @@ impl<'a> JiebaTokenizer<'a> {
    }
}

impl Tokenizer for JiebaTokenizer<'static> {
impl Tokenizer for JiebaTokenizer {
    type TokenStream<'a> = JiebaTokenStream;

    fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
@@ -249,7 +249,6 @@ impl<'a> LangIdentTokenizer<'a> {
    fn tokenize<'b>(&'b mut self, text: &'b str) -> BoxTokenStream<'b> {
        let language: String = self.identifier.0.detect(text);
        let analyzer = self.get_by_language(language.as_str());

        analyzer.token_stream(text)
    }
}
@@ -287,7 +286,7 @@ mod tests {
        let mut analyzer = LangIdentTokenizer::new(BoxIdentifier::default());
        let result = || -> Result<()> {
            analyzer.add("default", create_analyzer(standard_params, "")?);
            analyzer.add("cmn", create_analyzer(jieba_params, "")?);
            analyzer.add("Mandarin", create_analyzer(jieba_params, "")?);
            Ok(())
        }();

@@ -304,7 +303,7 @@ mod tests {
            "default": {
                "tokenizer": "standard"
            },
            "cmn": {
            "Mandarin": {
                "tokenizer": "jieba"
            }
        }
@@ -339,8 +338,8 @@ mod tests {
                }
            },
            "mapping": {
                "cmn": "jieba",
                "eng": "en"
                "Mandarin": "jieba",
                "English": "en"
            }
        }"#;
@@ -1,11 +1,10 @@
use core::result::Result::Err;
use std::collections::HashSet;
use std::{borrow::Cow, sync::Arc};

use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@@ -15,10 +14,104 @@ use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
use lindera::token_filter::BoxTokenFilter as LTokenFilter;

use lindera::dictionary::{Dictionary, UserDictionary};
use lindera_dictionary::viterbi::Lattice;

use crate::analyzer::dict::lindera::load_dictionary_from_kind;
use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
/// Segmenter
#[derive(Clone)]
pub struct LinderaSegmenter {
    /// The segmentation mode to be used by the segmenter.
    /// This determines how the text will be split into segments.
    pub mode: Mode,

    /// The dictionary used for segmenting text. This dictionary contains the necessary
    /// data structures and algorithms to perform morphological analysis and tokenization.
    pub dictionary: Arc<Dictionary>,

    /// An optional user-defined dictionary that can be used to customize the segmentation process.
    /// If provided, this dictionary will be used in addition to the default dictionary to improve
    /// the accuracy of segmentation for specific words or phrases.
    pub user_dictionary: Option<Arc<UserDictionary>>,
}

impl LinderaSegmenter {
    /// Creates a new instance with the specified mode, dictionary, and optional user dictionary.
    pub fn new(
        mode: Mode,
        dictionary: Dictionary,
        user_dictionary: Option<UserDictionary>,
    ) -> Self {
        Self {
            mode,
            dictionary: Arc::new(dictionary),
            user_dictionary: user_dictionary.map(|d| Arc::new(d)),
        }
    }

    pub fn segment<'a>(&'a self, text: Cow<'a, str>) -> Result<Vec<LToken<'a>>> {
        let mut tokens: Vec<LToken> = Vec::new();
        let mut lattice = Lattice::default();

        let mut position = 0_usize;
        let mut byte_position = 0_usize;

        // Split text into sentences using Japanese punctuation.
        for sentence in text.split_inclusive(&['。', '、', '\n', '\t']) {
            if sentence.is_empty() {
                continue;
            }

            lattice.set_text(
                &self.dictionary.prefix_dictionary,
                &self.user_dictionary.as_ref().map(|d| &d.dict),
                &self.dictionary.character_definition,
                &self.dictionary.unknown_dictionary,
                sentence,
                &self.mode,
            );
            lattice.calculate_path_costs(&self.dictionary.connection_cost_matrix, &self.mode);

            let offsets = lattice.tokens_offset();

            for i in 0..offsets.len() {
                let (byte_start, word_id) = offsets[i];
                let byte_end = if i == offsets.len() - 1 {
                    sentence.len()
                } else {
                    let (next_start, _word_id) = offsets[i + 1];
                    next_start
                };

                // retrieve token from its sentence byte positions
                let surface = &sentence[byte_start..byte_end];

                // compute the token's absolute byte positions
                let token_start = byte_position;
                byte_position += surface.len();
                let token_end = byte_position;

                // Use Cow::Owned to ensure the token data can be returned safely
                tokens.push(LToken::new(
                    Cow::Owned(surface.to_string()), // Clone the string here
                    token_start,
                    token_end,
                    position,
                    word_id,
                    &self.dictionary,
                    self.user_dictionary.as_deref(),
                ));

                position += 1;
            }
        }

        Ok(tokens)
    }
}

pub struct LinderaTokenStream<'a> {
    pub tokens: Vec<LToken<'a>>,
@@ -52,12 +145,27 @@ impl<'a> TokenStream for LinderaTokenStream<'a> {
    }
}

#[derive(Clone)]
pub struct LinderaTokenizer {
    tokenizer: LTokenizer,
    segmenter: LinderaSegmenter,
    lindera_filters: Vec<LTokenFilter>,
    token: Token,
}

impl Clone for LinderaTokenizer {
    fn clone(&self) -> Self {
        let mut token_filters: Vec<LTokenFilter> = Vec::new();
        for token_filter in self.lindera_filters.iter() {
            token_filters.push(token_filter.box_clone());
        }

        Self {
            segmenter: self.segmenter.clone(),
            lindera_filters: token_filters,
            token: self.token.clone(),
        }
    }
}

impl LinderaTokenizer {
    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with json parameters.
@@ -70,29 +178,45 @@ impl LinderaTokenizer {

        let dictionary = load_dictionary_from_kind(&kind, build_dir, download_urls)?;

        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
        let segmenter = LinderaSegmenter::new(Mode::Normal, dictionary, None);
        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);

        // append lindera filter
        let filters = fetch_lindera_token_filters(&kind, params)?;
        for filter in filters {
            tokenizer.append_token_filter(filter)
        }

        tokenizer.append_token_filter(&kind, params)?;
        Ok(tokenizer)
    }

    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
    pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
    pub fn from_segmenter(segmenter: LinderaSegmenter) -> LinderaTokenizer {
        LinderaTokenizer {
            tokenizer: LTokenizer::new(segmenter),
            segmenter: segmenter,
            lindera_filters: vec![],
            token: Default::default(),
        }
    }

    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
        self.tokenizer.append_token_filter(filter);
    pub fn append_token_filter(
        &mut self,
        kind: &DictionaryKind,
        params: &json::Map<String, json::Value>,
    ) -> Result<()> {
        match params.get(FILTER_KEY) {
            Some(v) => {
                let filter_list = v.as_array().ok_or_else(|| {
                    TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
                })?;

                for filter_params in filter_list {
                    let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
                    let filter = fetch_lindera_token_filter(name, kind, params)?;
                    self.lindera_filters.push(filter);
                }
            }
            _ => {}
        }

        Ok(())
    }
}
@@ -101,8 +225,19 @@ impl Tokenizer for LinderaTokenizer {

    fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
        self.token.reset();
        // Segment a text.
        let mut tokens = self
            .segmenter
            .segment(Cow::<'a, str>::Borrowed(text))
            .unwrap();

        // Apply token filters to the tokens if they are not empty.
        for token_filter in &self.lindera_filters {
            token_filter.apply(&mut tokens).unwrap();
        }

        LinderaTokenStream {
            tokens: self.tokenizer.tokenize(text).unwrap(),
            tokens: tokens,
            token: &mut self.token,
        }
    }
@@ -312,30 +447,6 @@ fn fetch_lindera_token_filter(
    }
}

fn fetch_lindera_token_filters(
    kind: &DictionaryKind,
    params: &json::Map<String, json::Value>,
) -> Result<Vec<LTokenFilter>> {
    let mut result: Vec<LTokenFilter> = vec![];

    match params.get(FILTER_KEY) {
        Some(v) => {
            let filter_list = v.as_array().ok_or_else(|| {
                TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
            })?;

            for filter_params in filter_list {
                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
                let filter = fetch_lindera_token_filter(name, kind, params)?;
                result.push(filter);
            }
        }
        _ => {}
    }

    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::LinderaTokenizer;