feat: support file params in analyzer and set jieba dict file (#45206)

relate: https://github.com/milvus-io/milvus/issues/43687
Support user-provided files via file params in analyzer params.
A file param can refer to a local file or a remote file resource.
Support file params for the jieba extra dict.

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
aoiasd 2025-11-25 11:33:06 +08:00 committed by GitHub
parent 5e6fdf3ba7
commit 322caafe18
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 156 additions and 23 deletions

1
.gitignore vendored
View File

@ -47,6 +47,7 @@ proxy/suvlim/*
proxy-go/proxy-go
# Compiled source
target/
bin/
lib/
*.a

View File

@ -2,10 +2,10 @@ mod analyzer;
mod build_in_analyzer;
mod dict;
mod filter;
mod runtime_option;
mod options;
pub mod tokenizers;
pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
pub use self::runtime_option::set_options;
pub use self::options::set_options;
pub(crate) use self::build_in_analyzer::standard_analyzer;

View File

@ -0,0 +1,8 @@
mod runtime_option;
mod util;
pub use self::runtime_option::{get_lindera_download_url, get_options, set_options};
pub use self::util::get_resource_path;
pub use self::runtime_option::DEFAULT_DICT_PATH_KEY;

View File

@ -2,6 +2,7 @@ use crate::error::{Result, TantivyBindingError};
use once_cell::sync::Lazy;
use serde_json as json;
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));
@ -26,8 +27,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
GLOBAL_OPTIONS.get_lindera_download_urls(kind)
}
pub fn get_resource_id(name: &str) -> Option<i64> {
GLOBAL_OPTIONS.get_resource_id(name)
pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result<PathBuf> {
GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name)
}
// analyzer options
@ -57,9 +58,28 @@ impl RuntimeOption {
r.lindera_download_urls.get(kind).map(|v| v.clone())
}
fn get_resource_id(&self, name: &str) -> Option<i64> {
fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result<PathBuf> {
let r = self.inner.read().unwrap();
r.resource_map.get(name).cloned()
let resource_id =
r.resource_map
.get(resource_name)
.ok_or(TantivyBindingError::InternalError(format!(
"file resource: {} not found in local resource list",
resource_name
)))?;
let base = r
.params
.get(RESOURCE_PATH_KEY)
.ok_or(TantivyBindingError::InternalError(
"local_resource_path config not init success".to_string(),
))?
.as_str()
.ok_or("local_resource_path must set as string")?;
return Ok(PathBuf::new()
.join(base)
.join(resource_id.to_string())
.join(file_name));
}
}

View File

@ -0,0 +1,83 @@
use serde_json as json;
use std::path::{Path, PathBuf};
use super::runtime_option::get_resource_file_path;
use crate::error::{Result, TantivyBindingError};
pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf> {
if !v.is_object() {
return Err(TantivyBindingError::InvalidArgument(format!(
"file config of {} must be object",
resource_key
)));
}
let params = v.as_object().unwrap();
let file_type = params.get("type").ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!("file type of {} must be set", resource_key))
})?;
if !file_type.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"file type of {} must be string",
resource_key
)));
}
match file_type.as_str().unwrap() {
"local" => {
let path = params.get("path").ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"file path of local file `{}` must be set",
resource_key
))
})?;
if !path.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"file path of local file `{}` must be string",
resource_key
)));
}
let path_str = path.as_str().unwrap();
Ok(Path::new(path_str).to_path_buf())
}
"remote" => {
let resource_name = params
.get("resource_name")
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"resource name of remote file `{}` must be set",
resource_key
))
})?
.as_str()
.ok_or(TantivyBindingError::InvalidArgument(format!(
"remote file resource name of remote file `{}` must be string",
resource_key
)))?;
let file_name = params
.get("file_name")
.ok_or_else(|| {
TantivyBindingError::InvalidArgument(format!(
"file name of remote file `{}` must be set",
resource_key
))
})?
.as_str()
.ok_or(TantivyBindingError::InvalidArgument(format!(
"remote file resource name of {} must be string",
resource_key
)))?;
self::get_resource_file_path(resource_name, file_name)
}
other => Err(TantivyBindingError::InvalidArgument(format!(
"unsupported file type {} of {}",
other, resource_key
))),
}
}

View File

@ -2,10 +2,12 @@ use core::{option::Option::Some, result::Result::Ok};
use jieba_rs;
use lazy_static::lazy_static;
use serde_json as json;
use std::borrow::Cow;
use std::fs;
use std::io::BufReader;
use std::{borrow::Cow, path::PathBuf};
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use crate::analyzer::options;
use crate::error::{Result, TantivyBindingError};
lazy_static! {
@ -54,16 +56,19 @@ impl TokenStream for JiebaTokenStream {
fn get_jieba_dict(
params: &json::Map<String, json::Value>,
) -> Result<(Vec<String>, Option<String>)> {
) -> Result<(Vec<String>, Option<String>, Option<PathBuf>)> {
let mut words = Vec::<String>::new();
let mut user_dict = None;
// use default dict as default system dict
let mut system_dict = Some("_default_".to_string());
match params.get("dict") {
Some(value) => {
system_dict = None;
if !value.is_array() {
return Err(TantivyBindingError::InvalidArgument(format!(
"jieba tokenizer dict must be array"
)));
}
let mut dict = Vec::<String>::new();
let mut system_dict = None;
for word in value.as_array().unwrap() {
if !word.is_string() {
@ -82,18 +87,27 @@ fn get_jieba_dict(
if text == "_default_" || text == "_extend_default_" {
if system_dict.is_some() {
return Err(TantivyBindingError::InvalidArgument(format!(
"jieba tokenizer dict can only set one default dict"
"jieba tokenizer dict can only set one system dict"
)));
}
system_dict = Some(text)
} else {
dict.push(text);
words.push(text);
}
}
Ok((dict, system_dict))
}
_ => Ok((vec![], Some("_default_".to_string()))),
}
_ => {}
};
match params.get("extra_dict_file") {
Some(v) => {
let path = options::get_resource_path(v, "jieba extra dict file")?;
user_dict = Some(path)
}
_ => {}
};
Ok((words, system_dict, user_dict))
}
fn get_jieba_mode(params: &json::Map<String, json::Value>) -> Result<JiebaMode> {
@ -143,7 +157,7 @@ impl<'a> JiebaTokenizer<'a> {
}
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
let (dict, system_dict) = get_jieba_dict(params)?;
let (words, system_dict, user_dict) = get_jieba_dict(params)?;
let mut tokenizer =
system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {
@ -163,10 +177,21 @@ impl<'a> JiebaTokenizer<'a> {
))),
})?;
for word in dict {
for word in words {
tokenizer.add_word(word.as_str(), None, None);
}
if user_dict.is_some() {
let file = fs::File::open(user_dict.unwrap())?;
let mut reader = BufReader::new(file);
tokenizer.load_dict(&mut reader).map_err(|e| {
TantivyBindingError::InvalidArgument(format!(
"jieba tokenizer load dict file failed with error: {:?}",
e
))
})?;
}
let mode = get_jieba_mode(params)?;
let hmm = get_jieba_hmm(params)?;

View File

@ -6,7 +6,6 @@ use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use log::warn;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@ -17,9 +16,7 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
use lindera::token_filter::BoxTokenFilter as LTokenFilter;
use crate::analyzer::dict::lindera::load_dictionary_from_kind;
use crate::analyzer::runtime_option::{
get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY,
};
use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
use crate::error::{Result, TantivyBindingError};
use serde_json as json;

View File

@ -1,4 +1,3 @@
use core::slice;
use std::ffi::{c_char, c_void, CStr};
use crate::{
@ -18,7 +17,7 @@ macro_rules! convert_to_rust_slice {
match $arr {
// there is a UB in slice::from_raw_parts if the pointer is null
x if x.is_null() => &[],
_ => slice::from_raw_parts($arr, $len),
_ => ::core::slice::from_raw_parts($arr, $len),
}
};
}