enhance: optimize jieba and lindera analyzer clone (#46719)

relate: https://github.com/milvus-io/milvus/issues/46718

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Enhancement: Optimize Jieba and Lindera Analyzer Clone

**Core Invariant**: JiebaTokenizer and LinderaTokenizer must be
efficiently cloneable without lifetime constraints to support analyzer
composition in multi-language detection chains.

**What Logic Was Improved**:
- **JiebaTokenizer**: Replaced `Cow<'a, Jieba>` with
`Arc<jieba_rs::Jieba>` and removed the `<'a>` lifetime parameter. The
global JIEBA instance is now wrapped in an `Arc`, so the
`#[derive(Clone)]` struct no longer needs a lifetime parameter. This
removes lifetime-management complexity while keeping zero-copy sharing
via atomic reference counting (see the sketch after this list).
- **LinderaTokenizer**: Introduced a public `LinderaSegmenter` struct
encapsulating the dictionary and mode state, and implemented an
explicit `Clone` that duplicates the segmenter (cloning the Arc-wrapped
dictionary), applies `box_clone()` to each boxed token filter, and
clones the token buffer. Previously, `Clone` was either unavailable or
handled the boxed trait objects incompletely.
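
A minimal sketch of the resulting Jieba pattern, with the field set
trimmed for brevity (the full struct appears in the diff below): an
`Arc<jieba_rs::Jieba>` field keeps `#[derive(Clone)]` working without a
lifetime parameter, and each clone only bumps a reference count.

```rust
use std::sync::Arc;

use jieba_rs::Jieba;
use lazy_static::lazy_static;

lazy_static! {
    // The shared global instance lives behind an Arc, so tokenizer clones
    // share it instead of copying the dictionary.
    static ref JIEBA: Arc<Jieba> = Arc::new(Jieba::new());
}

#[derive(Clone)] // works: Arc<Jieba> is Clone and carries no lifetime
pub struct JiebaTokenizer {
    hmm: bool,
    tokenizer: Arc<Jieba>, // previously Cow<'a, Jieba>, which forced a <'a> parameter
}

impl JiebaTokenizer {
    pub fn new() -> Self {
        Self {
            hmm: true,
            tokenizer: JIEBA.clone(), // cheap Arc clone, no dictionary copy
        }
    }
}
```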

**Why Previous Implementation Was Limiting**:
- The `Cow::Borrowed` pattern for JiebaTokenizer created explicit
lifetime dependencies that prevented straightforward `#[derive(Clone)]`.
Switching to Arc eliminates borrow checker constraints while providing
the same reference semantics for immutable shared state.
- LinderaTokenizer's token filters are boxed trait objects, for which
`Clone` cannot be auto-derived. A manual `Clone` implementation that
calls `box_clone()` on each filter handles polymorphic filter
duplication correctly (sketched after this list).
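
The manual `Clone` pattern for trait-object filters, as a minimal
self-contained sketch (the trait here is an illustrative stand-in, not
lindera's `BoxTokenFilter`, which already exposes the `box_clone()`
used in the diff below):

```rust
// Stand-in trait for a boxed, polymorphic token filter.
trait TokenFilter {
    fn apply(&self, tokens: &mut Vec<String>);
    // Trait objects cannot be cloned through Clone directly, so each
    // implementation hands back a boxed copy of itself.
    fn box_clone(&self) -> Box<dyn TokenFilter>;
}

#[derive(Clone)]
struct LowercaseFilter;

impl TokenFilter for LowercaseFilter {
    fn apply(&self, tokens: &mut Vec<String>) {
        for t in tokens.iter_mut() {
            *t = t.to_lowercase();
        }
    }
    fn box_clone(&self) -> Box<dyn TokenFilter> {
        Box::new(self.clone())
    }
}

struct FilteringTokenizer {
    filters: Vec<Box<dyn TokenFilter>>,
}

impl Clone for FilteringTokenizer {
    // #[derive(Clone)] is not possible here: Box<dyn TokenFilter> is not
    // Clone, so each filter is duplicated explicitly via box_clone().
    fn clone(&self) -> Self {
        Self {
            filters: self.filters.iter().map(|f| f.box_clone()).collect(),
        }
    }
}
```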

**No Data Loss or Behavior Regression**:
- Arc cloning is semantically equivalent to `Cow::Borrowed` for
read-only access; both efficiently share the underlying Jieba instance
and Dictionary without duplicating data (see the short snippet after
this list).
- The explicit Clone preserves all tokenizer state: segmenter (with
shared Arc dictionary), all token filters (via individual box_clone),
and the token buffer used during tokenization.
- Token stream behavior is unchanged: segmentation and
filter-application order remain identical.
- New benchmarks (`bench_jieba_tokenizer_clone`,
`bench_lindera_tokenizer_clone`) measure and validate clone performance
for both tokenizers.
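
To make the sharing claim concrete, a small standard-library-only
snippet (illustrative, not Milvus code) showing that cloning an `Arc`
adds a handle to the same allocation rather than copying the payload:

```rust
use std::sync::Arc;

fn main() {
    // Stand-in for a large immutable dictionary such as the global Jieba instance.
    let dict: Arc<Vec<String>> = Arc::new(vec!["词典".to_string(); 100_000]);

    // Cloning the Arc copies a pointer and bumps an atomic reference count;
    // the vector itself is not duplicated.
    let shared = Arc::clone(&dict);

    assert!(Arc::ptr_eq(&dict, &shared)); // same allocation
    assert_eq!(Arc::strong_count(&dict), 2); // two handles, one payload
}
```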

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
4 changed files with 206 additions and 57 deletions


@@ -8,6 +8,10 @@ fn test_analyzer(tokenizer: &mut TextAnalyzer) {
tokenizer.token_stream(text);
}
fn clone_analyzer(tokenizer: &mut TextAnalyzer) {
let _ = tokenizer.clone();
}
fn bench_lindua_language_identifier_tokenizer(c: &mut Criterion) {
let params = r#"
{
@@ -57,7 +61,7 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
}
},
"mapping": {
"Chinese": "jieba",
"Mandarin": "jieba",
"English": "en"
},
"identifier": "whatlang"
@@ -72,9 +76,45 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
});
}
fn bench_jieba_tokenizer_clone(c: &mut Criterion) {
let params = r#"
{
"tokenizer": {
"type": "jieba",
"dict":["_extend_default_"]
}
}
"#;
let mut analyzer = create_analyzer(params, "");
assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
c.bench_function("test", |b| {
b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
});
}
fn bench_lindera_tokenizer_clone(c: &mut Criterion) {
let params = r#"
{
"tokenizer": {
"type": "lindera",
"dict_kind": "ipadic"
}
}
"#;
let mut analyzer = create_analyzer(params, "");
assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
c.bench_function("test", |b| {
b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
});
}
criterion_group!(
benches,
bench_lindua_language_identifier_tokenizer,
bench_whatlang_language_identifier_tokenizer
bench_whatlang_language_identifier_tokenizer,
bench_jieba_tokenizer_clone,
bench_lindera_tokenizer_clone
);
criterion_main!(benches);


@@ -1,18 +1,17 @@
use core::{option::Option::Some, result::Result::Ok};
use jieba_rs;
use lazy_static::lazy_static;
use log::warn;
use serde_json as json;
use std::fs;
use std::io::BufReader;
use std::{borrow::Cow, path::PathBuf};
use std::{path::PathBuf, sync::Arc};
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
use crate::error::{Result, TantivyBindingError};
lazy_static! {
static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
static ref JIEBA: Arc<jieba_rs::Jieba> = Arc::new(jieba_rs::Jieba::new());
}
static EXTEND_DEFAULT_DICT: &str = include_str!("../data/jieba/dict.txt.big");
@@ -25,10 +24,10 @@ pub enum JiebaMode {
}
#[derive(Clone)]
pub struct JiebaTokenizer<'a> {
pub struct JiebaTokenizer {
mode: JiebaMode,
hmm: bool,
tokenizer: Cow<'a, jieba_rs::Jieba>,
tokenizer: Arc<jieba_rs::Jieba>,
}
pub struct JiebaTokenStream {
@@ -149,19 +148,19 @@ fn get_jieba_hmm(params: &json::Map<String, json::Value>) -> Result<bool> {
}
}
impl<'a> JiebaTokenizer<'a> {
pub fn new() -> JiebaTokenizer<'a> {
impl JiebaTokenizer {
pub fn new() -> JiebaTokenizer {
JiebaTokenizer {
mode: JiebaMode::Search,
hmm: true,
tokenizer: Cow::Borrowed(&JIEBA),
tokenizer: JIEBA.clone(),
}
}
pub fn from_json(
params: &json::Map<String, json::Value>,
helper: &mut FileResourcePathHelper,
) -> Result<JiebaTokenizer<'a>> {
) -> Result<JiebaTokenizer> {
let (words, system_dict, user_dict) = get_jieba_dict(params, helper)?;
let mut tokenizer =
@@ -203,7 +202,7 @@ impl<'a> JiebaTokenizer<'a> {
Ok(JiebaTokenizer {
mode: mode,
hmm: hmm,
tokenizer: Cow::Owned(tokenizer),
tokenizer: Arc::new(tokenizer),
})
}
@@ -235,7 +234,7 @@ impl<'a> JiebaTokenizer<'a> {
}
}
impl Tokenizer for JiebaTokenizer<'static> {
impl Tokenizer for JiebaTokenizer {
type TokenStream<'a> = JiebaTokenStream;
fn token_stream(&mut self, text: &str) -> JiebaTokenStream {


@@ -249,7 +249,6 @@ impl<'a> LangIdentTokenizer<'a> {
fn tokenize<'b>(&'b mut self, text: &'b str) -> BoxTokenStream<'b> {
let language: String = self.identifier.0.detect(text);
let analyzer = self.get_by_language(language.as_str());
analyzer.token_stream(text)
}
}
@@ -287,7 +286,7 @@ mod tests {
let mut analyzer = LangIdentTokenizer::new(BoxIdentifier::default());
let result = || -> Result<()> {
analyzer.add("default", create_analyzer(standard_params, "")?);
analyzer.add("cmn", create_analyzer(jieba_params, "")?);
analyzer.add("Mandarin", create_analyzer(jieba_params, "")?);
Ok(())
}();
@@ -304,7 +303,7 @@ mod tests {
"default": {
"tokenizer": "standard"
},
"cmn": {
"Mandarin": {
"tokenizer": "jieba"
}
}
@@ -339,8 +338,8 @@ mod tests {
}
},
"mapping": {
"cmn": "jieba",
"eng": "en"
"Mandarin": "jieba",
"English": "en"
}
}"#;


@@ -1,11 +1,10 @@
use core::result::Result::Err;
use std::collections::HashSet;
use std::{borrow::Cow, sync::Arc};
use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@@ -15,10 +14,104 @@ use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
use lindera::token_filter::BoxTokenFilter as LTokenFilter;
use lindera::dictionary::{Dictionary, UserDictionary};
use lindera_dictionary::viterbi::Lattice;
use crate::analyzer::dict::lindera::load_dictionary_from_kind;
use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
/// Segmenter
#[derive(Clone)]
pub struct LinderaSegmenter {
/// The segmentation mode to be used by the segmenter.
/// This determines how the text will be split into segments.
pub mode: Mode,
/// The dictionary used for segmenting text. This dictionary contains the necessary
/// data structures and algorithms to perform morphological analysis and tokenization.
pub dictionary: Arc<Dictionary>,
/// An optional user-defined dictionary that can be used to customize the segmentation process.
/// If provided, this dictionary will be used in addition to the default dictionary to improve
/// the accuracy of segmentation for specific words or phrases.
pub user_dictionary: Option<Arc<UserDictionary>>,
}
impl LinderaSegmenter {
/// Creates a new instance with the specified mode, dictionary, and optional user dictionary.
pub fn new(
mode: Mode,
dictionary: Dictionary,
user_dictionary: Option<UserDictionary>,
) -> Self {
Self {
mode,
dictionary: Arc::new(dictionary),
user_dictionary: user_dictionary.map(|d| Arc::new(d)),
}
}
pub fn segment<'a>(&'a self, text: Cow<'a, str>) -> Result<Vec<LToken<'a>>> {
let mut tokens: Vec<LToken> = Vec::new();
let mut lattice = Lattice::default();
let mut position = 0_usize;
let mut byte_position = 0_usize;
// Split text into sentences using Japanese punctuation.
for sentence in text.split_inclusive(&['。', '、', '\n', '\t']) {
if sentence.is_empty() {
continue;
}
lattice.set_text(
&self.dictionary.prefix_dictionary,
&self.user_dictionary.as_ref().map(|d| &d.dict),
&self.dictionary.character_definition,
&self.dictionary.unknown_dictionary,
sentence,
&self.mode,
);
lattice.calculate_path_costs(&self.dictionary.connection_cost_matrix, &self.mode);
let offsets = lattice.tokens_offset();
for i in 0..offsets.len() {
let (byte_start, word_id) = offsets[i];
let byte_end = if i == offsets.len() - 1 {
sentence.len()
} else {
let (next_start, _word_id) = offsets[i + 1];
next_start
};
// retrieve token from its sentence byte positions
let surface = &sentence[byte_start..byte_end];
// compute the token's absolute byte positions
let token_start = byte_position;
byte_position += surface.len();
let token_end = byte_position;
// Use Cow::Owned to ensure the token data can be returned safely
tokens.push(LToken::new(
Cow::Owned(surface.to_string()), // Clone the string here
token_start,
token_end,
position,
word_id,
&self.dictionary,
self.user_dictionary.as_deref(),
));
position += 1;
}
}
Ok(tokens)
}
}
pub struct LinderaTokenStream<'a> {
pub tokens: Vec<LToken<'a>>,
@@ -52,12 +145,27 @@ impl<'a> TokenStream for LinderaTokenStream<'a> {
}
}
#[derive(Clone)]
pub struct LinderaTokenizer {
tokenizer: LTokenizer,
segmenter: LinderaSegmenter,
lindera_filters: Vec<LTokenFilter>,
token: Token,
}
impl Clone for LinderaTokenizer {
fn clone(&self) -> Self {
let mut token_filters: Vec<LTokenFilter> = Vec::new();
for token_filter in self.lindera_filters.iter() {
token_filters.push(token_filter.box_clone());
}
Self {
segmenter: self.segmenter.clone(),
lindera_filters: token_filters,
token: self.token.clone(),
}
}
}
impl LinderaTokenizer {
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` with json parameters.
@@ -70,29 +178,45 @@ impl LinderaTokenizer {
let dictionary = load_dictionary_from_kind(&kind, build_dir, download_urls)?;
let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
let segmenter = LinderaSegmenter::new(Mode::Normal, dictionary, None);
let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
// append lindera filter
let filters = fetch_lindera_token_filters(&kind, params)?;
for filter in filters {
tokenizer.append_token_filter(filter)
}
tokenizer.append_token_filter(&kind, params)?;
Ok(tokenizer)
}
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
pub fn from_segmenter(segmenter: LinderaSegmenter) -> LinderaTokenizer {
LinderaTokenizer {
tokenizer: LTokenizer::new(segmenter),
segmenter: segmenter,
lindera_filters: vec![],
token: Default::default(),
}
}
pub fn append_token_filter(&mut self, filter: LTokenFilter) {
self.tokenizer.append_token_filter(filter);
pub fn append_token_filter(
&mut self,
kind: &DictionaryKind,
params: &json::Map<String, json::Value>,
) -> Result<()> {
match params.get(FILTER_KEY) {
Some(v) => {
let filter_list = v.as_array().ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
})?;
for filter_params in filter_list {
let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
let filter = fetch_lindera_token_filter(name, kind, params)?;
self.lindera_filters.push(filter);
}
}
_ => {}
}
Ok(())
}
}
@@ -101,8 +225,19 @@ impl Tokenizer for LinderaTokenizer {
fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
self.token.reset();
// Segment a text.
let mut tokens = self
.segmenter
.segment(Cow::<'a, str>::Borrowed(text))
.unwrap();
// Apply token filters to the tokens if they are not empty.
for token_filter in &self.lindera_filters {
token_filter.apply(&mut tokens).unwrap();
}
LinderaTokenStream {
tokens: self.tokenizer.tokenize(text).unwrap(),
tokens: tokens,
token: &mut self.token,
}
}
@@ -312,30 +447,6 @@ fn fetch_lindera_token_filter(
}
}
fn fetch_lindera_token_filters(
kind: &DictionaryKind,
params: &json::Map<String, json::Value>,
) -> Result<Vec<LTokenFilter>> {
let mut result: Vec<LTokenFilter> = vec![];
match params.get(FILTER_KEY) {
Some(v) => {
let filter_list = v.as_array().ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
})?;
for filter_params in filter_list {
let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
let filter = fetch_lindera_token_filter(name, kind, params)?;
result.push(filter);
}
}
_ => {}
}
Ok(result)
}
#[cfg(test)]
mod tests {
use super::LinderaTokenizer;