enhance: optimize jieba and lindera analyzer clone (#46719)
relate: https://github.com/milvus-io/milvus/issues/46718

## Enhancement: Optimize Jieba and Lindera Analyzer Clone

**Core Invariant**: `JiebaTokenizer` and `LinderaTokenizer` must be efficiently cloneable without lifetime constraints to support analyzer composition in multi-language detection chains.

**What Logic Was Improved**:

- **JiebaTokenizer**: Replaced `Cow<'a, Jieba>` with `Arc<jieba_rs::Jieba>` and removed the `<'a>` lifetime parameter. The global JIEBA instance is now wrapped in an `Arc`, enabling `#[derive(Clone)]` on the struct. This eliminates lifetime-management complexity while keeping zero-copy sharing via atomic reference counting.
- **LinderaTokenizer**: Introduced a public `LinderaSegmenter` struct encapsulating the dictionary and mode state, and implemented an explicit `Clone` that duplicates the segmenter (cloning the Arc-wrapped dictionary), applies `box_clone()` to each boxed token filter, and clones the token buffer. Previously, `Clone` was either unavailable or did not handle the boxed trait objects correctly.

**Why the Previous Implementation Was Limiting**:

- The `Cow::Borrowed` pattern for `JiebaTokenizer` created explicit lifetime dependencies that prevented a straightforward `#[derive(Clone)]`. Switching to `Arc` removes the borrow-checker constraints while providing the same reference semantics for immutable shared state.
- `LinderaTokenizer`'s token filters are boxed trait objects, for which `Clone` cannot be auto-derived. The manual `Clone` implementation with `box_clone()` calls correctly handles polymorphic filter duplication.

**No Data Loss or Behavior Regression**:

- Arc cloning is semantically equivalent to `Cow::Borrowed` for read-only access; both share the underlying Jieba instance and Dictionary without duplicating data.
- The explicit `Clone` preserves all tokenizer state: the segmenter (with its shared Arc dictionary), all token filters (via individual `box_clone` calls), and the token buffer used during tokenization.
- Token-stream behavior is unchanged: segmentation and filter application order remain identical.
- New benchmarks (`bench_jieba_tokenizer_clone`, `bench_lindera_tokenizer_clone`) measure and validate clone performance for both tokenizers.

---------

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
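The two clone strategies described above can be sketched outside the Milvus code base. The snippet below is a minimal, self-contained analogue, not the actual `jieba_rs` or `lindera` API: `BigDictionary`, `JiebaLikeTokenizer`, `LinderaLikeTokenizer`, and the `TokenFilter` trait are hypothetical stand-ins. It shows why an `Arc`-wrapped immutable resource makes `#[derive(Clone)]` cheap, and why a struct holding boxed trait objects needs a manual `Clone` that routes through a `box_clone` hook.

```rust
use std::sync::Arc;

// Hypothetical stand-in for an expensive, immutable resource
// such as a loaded jieba or lindera dictionary.
struct BigDictionary {
    _entries: Vec<String>,
}

// Pattern 1: keep the shared resource behind an Arc so that
// #[derive(Clone)] only bumps a reference count.
#[derive(Clone)]
struct JiebaLikeTokenizer {
    dictionary: Arc<BigDictionary>,
}

// Pattern 2: boxed trait objects cannot be cloned automatically,
// so the trait exposes an explicit box_clone hook.
trait TokenFilter {
    fn apply(&self, tokens: &mut Vec<String>);
    fn box_clone(&self) -> Box<dyn TokenFilter>;
}

#[derive(Clone)]
struct LowercaseFilter;

impl TokenFilter for LowercaseFilter {
    fn apply(&self, tokens: &mut Vec<String>) {
        for t in tokens.iter_mut() {
            *t = t.to_lowercase();
        }
    }
    fn box_clone(&self) -> Box<dyn TokenFilter> {
        Box::new(self.clone())
    }
}

struct LinderaLikeTokenizer {
    dictionary: Arc<BigDictionary>,
    filters: Vec<Box<dyn TokenFilter>>,
}

impl Clone for LinderaLikeTokenizer {
    fn clone(&self) -> Self {
        Self {
            // Cheap: shares the dictionary instead of copying it.
            dictionary: Arc::clone(&self.dictionary),
            // Each boxed filter is duplicated through its box_clone hook.
            filters: self.filters.iter().map(|f| f.box_clone()).collect(),
        }
    }
}

fn main() {
    let dict = Arc::new(BigDictionary { _entries: vec!["milvus".into()] });

    let jieba_like = JiebaLikeTokenizer { dictionary: Arc::clone(&dict) };
    let _copy = jieba_like.clone(); // only a refcount bump

    let lindera_like = LinderaLikeTokenizer {
        dictionary: dict,
        filters: vec![Box::new(LowercaseFilter)],
    };
    let cloned = lindera_like.clone();
    assert!(Arc::strong_count(&cloned.dictionary) >= 2);
}
```

This mirrors the shape of the change in the diff below: the Jieba side only needs the Arc swap to make `#[derive(Clone)]` valid, while the Lindera side needs the explicit `Clone` impl because of its boxed filters.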
parent c7b5c23ff6
commit cc7652327d
@@ -8,6 +8,10 @@ fn test_analyzer(tokenizer: &mut TextAnalyzer) {
    tokenizer.token_stream(text);
}

fn clone_analyzer(tokenizer: &mut TextAnalyzer) {
    let _ = tokenizer.clone();
}

fn bench_lindua_language_identifier_tokenizer(c: &mut Criterion) {
    let params = r#"
    {
@@ -57,7 +61,7 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
            }
        },
        "mapping": {
            "Chinese": "jieba",
            "Mandarin": "jieba",
            "English": "en"
        },
        "identifier": "whatlang"
@@ -72,9 +76,45 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
    });
}

fn bench_jieba_tokenizer_clone(c: &mut Criterion) {
    let params = r#"
    {
        "tokenizer": {
            "type": "jieba",
            "dict":["_extend_default_"]
        }
    }
    "#;
    let mut analyzer = create_analyzer(params, "");
    assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());

    c.bench_function("test", |b| {
        b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
    });
}

fn bench_lindera_tokenizer_clone(c: &mut Criterion) {
    let params = r#"
    {
        "tokenizer": {
            "type": "lindera",
            "dict_kind": "ipadic"
        }
    }
    "#;
    let mut analyzer = create_analyzer(params, "");
    assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());

    c.bench_function("test", |b| {
        b.iter(|| clone_analyzer(black_box(&mut analyzer.as_mut().unwrap())))
    });
}

criterion_group!(
    benches,
    bench_lindua_language_identifier_tokenizer,
    bench_whatlang_language_identifier_tokenizer
    bench_whatlang_language_identifier_tokenizer,
    bench_jieba_tokenizer_clone,
    bench_lindera_tokenizer_clone
);
criterion_main!(benches);
@@ -1,18 +1,17 @@
use core::{option::Option::Some, result::Result::Ok};
use jieba_rs;
use lazy_static::lazy_static;
use log::warn;
use serde_json as json;
use std::fs;
use std::io::BufReader;
use std::{borrow::Cow, path::PathBuf};
use std::{path::PathBuf, sync::Arc};
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
use crate::error::{Result, TantivyBindingError};

lazy_static! {
    static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
    static ref JIEBA: Arc<jieba_rs::Jieba> = Arc::new(jieba_rs::Jieba::new());
}

static EXTEND_DEFAULT_DICT: &str = include_str!("../data/jieba/dict.txt.big");
@@ -25,10 +24,10 @@ pub enum JiebaMode {
}

#[derive(Clone)]
pub struct JiebaTokenizer<'a> {
pub struct JiebaTokenizer {
    mode: JiebaMode,
    hmm: bool,
    tokenizer: Cow<'a, jieba_rs::Jieba>,
    tokenizer: Arc<jieba_rs::Jieba>,
}

pub struct JiebaTokenStream {
@@ -149,19 +148,19 @@ fn get_jieba_hmm(params: &json::Map<String, json::Value>) -> Result<bool> {
    }
}

impl<'a> JiebaTokenizer<'a> {
    pub fn new() -> JiebaTokenizer<'a> {
impl JiebaTokenizer {
    pub fn new() -> JiebaTokenizer {
        JiebaTokenizer {
            mode: JiebaMode::Search,
            hmm: true,
            tokenizer: Cow::Borrowed(&JIEBA),
            tokenizer: JIEBA.clone(),
        }
    }

    pub fn from_json(
        params: &json::Map<String, json::Value>,
        helper: &mut FileResourcePathHelper,
    ) -> Result<JiebaTokenizer<'a>> {
    ) -> Result<JiebaTokenizer> {
        let (words, system_dict, user_dict) = get_jieba_dict(params, helper)?;

        let mut tokenizer =
@@ -203,7 +202,7 @@ impl<'a> JiebaTokenizer<'a> {
        Ok(JiebaTokenizer {
            mode: mode,
            hmm: hmm,
            tokenizer: Cow::Owned(tokenizer),
            tokenizer: Arc::new(tokenizer),
        })
    }

@@ -235,7 +234,7 @@ impl<'a> JiebaTokenizer<'a> {
    }
}

impl Tokenizer for JiebaTokenizer<'static> {
impl Tokenizer for JiebaTokenizer {
    type TokenStream<'a> = JiebaTokenStream;

    fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
@@ -249,7 +249,6 @@ impl<'a> LangIdentTokenizer<'a> {
    fn tokenize<'b>(&'b mut self, text: &'b str) -> BoxTokenStream<'b> {
        let language: String = self.identifier.0.detect(text);
        let analyzer = self.get_by_language(language.as_str());

        analyzer.token_stream(text)
    }
}
@@ -287,7 +286,7 @@ mod tests {
        let mut analyzer = LangIdentTokenizer::new(BoxIdentifier::default());
        let result = || -> Result<()> {
            analyzer.add("default", create_analyzer(standard_params, "")?);
            analyzer.add("cmn", create_analyzer(jieba_params, "")?);
            analyzer.add("Mandarin", create_analyzer(jieba_params, "")?);
            Ok(())
        }();

@@ -304,7 +303,7 @@ mod tests {
            "default": {
                "tokenizer": "standard"
            },
            "cmn": {
            "Mandarin": {
                "tokenizer": "jieba"
            }
        }
@@ -339,8 +338,8 @@ mod tests {
                }
            },
            "mapping": {
                "cmn": "jieba",
                "eng": "en"
                "Mandarin": "jieba",
                "English": "en"
            }
        }"#;
@@ -1,11 +1,10 @@
use core::result::Result::Err;
use std::collections::HashSet;
use std::{borrow::Cow, sync::Arc};

use lindera::dictionary::DictionaryKind;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@@ -15,10 +14,104 @@ use lindera::token_filter::korean_keep_tags::KoreanKeepTagsTokenFilter;
use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
use lindera::token_filter::BoxTokenFilter as LTokenFilter;

use lindera::dictionary::{Dictionary, UserDictionary};
use lindera_dictionary::viterbi::Lattice;

use crate::analyzer::dict::lindera::load_dictionary_from_kind;
use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
/// Segmenter
#[derive(Clone)]
pub struct LinderaSegmenter {
    /// The segmentation mode to be used by the segmenter.
    /// This determines how the text will be split into segments.
    pub mode: Mode,

    /// The dictionary used for segmenting text. This dictionary contains the necessary
    /// data structures and algorithms to perform morphological analysis and tokenization.
    pub dictionary: Arc<Dictionary>,

    /// An optional user-defined dictionary that can be used to customize the segmentation process.
    /// If provided, this dictionary will be used in addition to the default dictionary to improve
    /// the accuracy of segmentation for specific words or phrases.
    pub user_dictionary: Option<Arc<UserDictionary>>,
}

impl LinderaSegmenter {
    /// Creates a new instance with the specified mode, dictionary, and optional user dictionary.
    pub fn new(
        mode: Mode,
        dictionary: Dictionary,
        user_dictionary: Option<UserDictionary>,
    ) -> Self {
        Self {
            mode,
            dictionary: Arc::new(dictionary),
            user_dictionary: user_dictionary.map(|d| Arc::new(d)),
        }
    }

    pub fn segment<'a>(&'a self, text: Cow<'a, str>) -> Result<Vec<LToken<'a>>> {
        let mut tokens: Vec<LToken> = Vec::new();
        let mut lattice = Lattice::default();

        let mut position = 0_usize;
        let mut byte_position = 0_usize;

        // Split text into sentences using Japanese punctuation.
        for sentence in text.split_inclusive(&['。', '、', '\n', '\t']) {
            if sentence.is_empty() {
                continue;
            }

            lattice.set_text(
                &self.dictionary.prefix_dictionary,
                &self.user_dictionary.as_ref().map(|d| &d.dict),
                &self.dictionary.character_definition,
                &self.dictionary.unknown_dictionary,
                sentence,
                &self.mode,
            );
            lattice.calculate_path_costs(&self.dictionary.connection_cost_matrix, &self.mode);

            let offsets = lattice.tokens_offset();

            for i in 0..offsets.len() {
                let (byte_start, word_id) = offsets[i];
                let byte_end = if i == offsets.len() - 1 {
                    sentence.len()
                } else {
                    let (next_start, _word_id) = offsets[i + 1];
                    next_start
                };

                // retrieve token from its sentence byte positions
                let surface = &sentence[byte_start..byte_end];

                // compute the token's absolute byte positions
                let token_start = byte_position;
                byte_position += surface.len();
                let token_end = byte_position;

                // Use Cow::Owned to ensure the token data can be returned safely
                tokens.push(LToken::new(
                    Cow::Owned(surface.to_string()), // Clone the string here
                    token_start,
                    token_end,
                    position,
                    word_id,
                    &self.dictionary,
                    self.user_dictionary.as_deref(),
                ));

                position += 1;
            }
        }

        Ok(tokens)
    }
}

pub struct LinderaTokenStream<'a> {
    pub tokens: Vec<LToken<'a>>,
@@ -52,12 +145,27 @@ impl<'a> TokenStream for LinderaTokenStream<'a> {
    }
}

#[derive(Clone)]
pub struct LinderaTokenizer {
    tokenizer: LTokenizer,
    segmenter: LinderaSegmenter,
    lindera_filters: Vec<LTokenFilter>,
    token: Token,
}

impl Clone for LinderaTokenizer {
    fn clone(&self) -> Self {
        let mut token_filters: Vec<LTokenFilter> = Vec::new();
        for token_filter in self.lindera_filters.iter() {
            token_filters.push(token_filter.box_clone());
        }

        Self {
            segmenter: self.segmenter.clone(),
            lindera_filters: token_filters,
            token: self.token.clone(),
        }
    }
}

impl LinderaTokenizer {
    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with json parameters.
@@ -70,29 +178,45 @@ impl LinderaTokenizer {

        let dictionary = load_dictionary_from_kind(&kind, build_dir, download_urls)?;

        let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
        let segmenter = LinderaSegmenter::new(Mode::Normal, dictionary, None);
        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);

        // append lindera filter
        let filters = fetch_lindera_token_filters(&kind, params)?;
        for filter in filters {
            tokenizer.append_token_filter(filter)
        }

        tokenizer.append_token_filter(&kind, params)?;
        Ok(tokenizer)
    }

    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
    pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
    pub fn from_segmenter(segmenter: LinderaSegmenter) -> LinderaTokenizer {
        LinderaTokenizer {
            tokenizer: LTokenizer::new(segmenter),
            segmenter: segmenter,
            lindera_filters: vec![],
            token: Default::default(),
        }
    }

    pub fn append_token_filter(&mut self, filter: LTokenFilter) {
        self.tokenizer.append_token_filter(filter);
    pub fn append_token_filter(
        &mut self,
        kind: &DictionaryKind,
        params: &json::Map<String, json::Value>,
    ) -> Result<()> {
        match params.get(FILTER_KEY) {
            Some(v) => {
                let filter_list = v.as_array().ok_or_else(|| {
                    TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
                })?;

                for filter_params in filter_list {
                    let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
                    let filter = fetch_lindera_token_filter(name, kind, params)?;
                    self.lindera_filters.push(filter);
                }
            }
            _ => {}
        }

        Ok(())
    }
}
@@ -101,8 +225,19 @@ impl Tokenizer for LinderaTokenizer {

    fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
        self.token.reset();
        // Segment a text.
        let mut tokens = self
            .segmenter
            .segment(Cow::<'a, str>::Borrowed(text))
            .unwrap();

        // Apply token filters to the tokens if they are not empty.
        for token_filter in &self.lindera_filters {
            token_filter.apply(&mut tokens).unwrap();
        }

        LinderaTokenStream {
            tokens: self.tokenizer.tokenize(text).unwrap(),
            tokens: tokens,
            token: &mut self.token,
        }
    }
@@ -312,30 +447,6 @@ fn fetch_lindera_token_filter(
    }
}

fn fetch_lindera_token_filters(
    kind: &DictionaryKind,
    params: &json::Map<String, json::Value>,
) -> Result<Vec<LTokenFilter>> {
    let mut result: Vec<LTokenFilter> = vec![];

    match params.get(FILTER_KEY) {
        Some(v) => {
            let filter_list = v.as_array().ok_or_else(|| {
                TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
            })?;

            for filter_params in filter_list {
                let (name, params) = fetch_lindera_token_filter_params(filter_params)?;
                let filter = fetch_lindera_token_filter(name, kind, params)?;
                result.push(filter);
            }
        }
        _ => {}
    }

    Ok(result)
}

#[cfg(test)]
mod tests {
    use super::LinderaTokenizer;