mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 09:08:43 +08:00
feat: support file params in analyzer and set jieba dict file (#45206)
relate: https://github.com/milvus-io/milvus/issues/43687 Support user-provided files via file params in analyzer params. Either a local file or a remote file resource can be used. Support file params for the jieba extra dict. Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
parent
5e6fdf3ba7
commit
322caafe18
1
.gitignore
vendored
1
.gitignore
vendored
@ -47,6 +47,7 @@ proxy/suvlim/*
|
||||
proxy-go/proxy-go
|
||||
|
||||
# Compiled source
|
||||
target/
|
||||
bin/
|
||||
lib/
|
||||
*.a
|
||||
|
||||
@ -2,10 +2,10 @@ mod analyzer;
|
||||
mod build_in_analyzer;
|
||||
mod dict;
|
||||
mod filter;
|
||||
mod runtime_option;
|
||||
mod options;
|
||||
|
||||
pub mod tokenizers;
|
||||
pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
|
||||
pub use self::runtime_option::set_options;
|
||||
pub use self::options::set_options;
|
||||
|
||||
pub(crate) use self::build_in_analyzer::standard_analyzer;
|
||||
|
||||
8
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/mod.rs
vendored
Normal file
8
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/mod.rs
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
mod runtime_option;
|
||||
mod util;
|
||||
|
||||
pub use self::runtime_option::{get_lindera_download_url, get_options, set_options};
|
||||
|
||||
pub use self::util::get_resource_path;
|
||||
|
||||
pub use self::runtime_option::DEFAULT_DICT_PATH_KEY;
|
||||
@ -2,6 +2,7 @@ use crate::error::{Result, TantivyBindingError};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json as json;
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));
|
||||
@ -26,8 +27,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
|
||||
GLOBAL_OPTIONS.get_lindera_download_urls(kind)
|
||||
}
|
||||
|
||||
pub fn get_resource_id(name: &str) -> Option<i64> {
|
||||
GLOBAL_OPTIONS.get_resource_id(name)
|
||||
pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result<PathBuf> {
|
||||
GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name)
|
||||
}
|
||||
|
||||
// analyzer options
|
||||
@ -57,9 +58,28 @@ impl RuntimeOption {
|
||||
r.lindera_download_urls.get(kind).map(|v| v.clone())
|
||||
}
|
||||
|
||||
fn get_resource_id(&self, name: &str) -> Option<i64> {
|
||||
fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result<PathBuf> {
|
||||
let r = self.inner.read().unwrap();
|
||||
r.resource_map.get(name).cloned()
|
||||
let resource_id =
|
||||
r.resource_map
|
||||
.get(resource_name)
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource: {} not found in local resource list",
|
||||
resource_name
|
||||
)))?;
|
||||
let base = r
|
||||
.params
|
||||
.get(RESOURCE_PATH_KEY)
|
||||
.ok_or(TantivyBindingError::InternalError(
|
||||
"local_resource_path config not init success".to_string(),
|
||||
))?
|
||||
.as_str()
|
||||
.ok_or("local_resource_path must set as string")?;
|
||||
|
||||
return Ok(PathBuf::new()
|
||||
.join(base)
|
||||
.join(resource_id.to_string())
|
||||
.join(file_name));
|
||||
}
|
||||
}
|
||||
|
||||
83
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/util.rs
vendored
Normal file
83
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/util.rs
vendored
Normal file
@ -0,0 +1,83 @@
|
||||
use serde_json as json;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use super::runtime_option::get_resource_file_path;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
|
||||
pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf> {
|
||||
if !v.is_object() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"file config of {} must be object",
|
||||
resource_key
|
||||
)));
|
||||
}
|
||||
|
||||
let params = v.as_object().unwrap();
|
||||
|
||||
let file_type = params.get("type").ok_or_else(|| {
|
||||
TantivyBindingError::InvalidArgument(format!("file type of {} must be set", resource_key))
|
||||
})?;
|
||||
|
||||
if !file_type.is_string() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"file type of {} must be string",
|
||||
resource_key
|
||||
)));
|
||||
}
|
||||
|
||||
match file_type.as_str().unwrap() {
|
||||
"local" => {
|
||||
let path = params.get("path").ok_or_else(|| {
|
||||
TantivyBindingError::InvalidArgument(format!(
|
||||
"file path of local file `{}` must be set",
|
||||
resource_key
|
||||
))
|
||||
})?;
|
||||
|
||||
if !path.is_string() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"file path of local file `{}` must be string",
|
||||
resource_key
|
||||
)));
|
||||
}
|
||||
|
||||
let path_str = path.as_str().unwrap();
|
||||
Ok(Path::new(path_str).to_path_buf())
|
||||
}
|
||||
"remote" => {
|
||||
let resource_name = params
|
||||
.get("resource_name")
|
||||
.ok_or_else(|| {
|
||||
TantivyBindingError::InvalidArgument(format!(
|
||||
"resource name of remote file `{}` must be set",
|
||||
resource_key
|
||||
))
|
||||
})?
|
||||
.as_str()
|
||||
.ok_or(TantivyBindingError::InvalidArgument(format!(
|
||||
"remote file resource name of remote file `{}` must be string",
|
||||
resource_key
|
||||
)))?;
|
||||
|
||||
let file_name = params
|
||||
.get("file_name")
|
||||
.ok_or_else(|| {
|
||||
TantivyBindingError::InvalidArgument(format!(
|
||||
"file name of remote file `{}` must be set",
|
||||
resource_key
|
||||
))
|
||||
})?
|
||||
.as_str()
|
||||
.ok_or(TantivyBindingError::InvalidArgument(format!(
|
||||
"remote file resource name of {} must be string",
|
||||
resource_key
|
||||
)))?;
|
||||
|
||||
self::get_resource_file_path(resource_name, file_name)
|
||||
}
|
||||
other => Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"unsupported file type {} of {}",
|
||||
other, resource_key
|
||||
))),
|
||||
}
|
||||
}
|
||||
@ -2,10 +2,12 @@ use core::{option::Option::Some, result::Result::Ok};
|
||||
use jieba_rs;
|
||||
use lazy_static::lazy_static;
|
||||
use serde_json as json;
|
||||
use std::borrow::Cow;
|
||||
use std::fs;
|
||||
use std::io::BufReader;
|
||||
use std::{borrow::Cow, path::PathBuf};
|
||||
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
|
||||
|
||||
use crate::analyzer::options;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
|
||||
lazy_static! {
|
||||
@ -54,16 +56,19 @@ impl TokenStream for JiebaTokenStream {
|
||||
|
||||
fn get_jieba_dict(
|
||||
params: &json::Map<String, json::Value>,
|
||||
) -> Result<(Vec<String>, Option<String>)> {
|
||||
) -> Result<(Vec<String>, Option<String>, Option<PathBuf>)> {
|
||||
let mut words = Vec::<String>::new();
|
||||
let mut user_dict = None;
|
||||
// use default dict as default system dict
|
||||
let mut system_dict = Some("_default_".to_string());
|
||||
match params.get("dict") {
|
||||
Some(value) => {
|
||||
system_dict = None;
|
||||
if !value.is_array() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"jieba tokenizer dict must be array"
|
||||
)));
|
||||
}
|
||||
let mut dict = Vec::<String>::new();
|
||||
let mut system_dict = None;
|
||||
|
||||
for word in value.as_array().unwrap() {
|
||||
if !word.is_string() {
|
||||
@ -82,18 +87,27 @@ fn get_jieba_dict(
|
||||
if text == "_default_" || text == "_extend_default_" {
|
||||
if system_dict.is_some() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"jieba tokenizer dict can only set one default dict"
|
||||
"jieba tokenizer dict can only set one system dict"
|
||||
)));
|
||||
}
|
||||
system_dict = Some(text)
|
||||
} else {
|
||||
dict.push(text);
|
||||
words.push(text);
|
||||
}
|
||||
}
|
||||
Ok((dict, system_dict))
|
||||
}
|
||||
_ => Ok((vec![], Some("_default_".to_string()))),
|
||||
_ => {}
|
||||
};
|
||||
|
||||
match params.get("extra_dict_file") {
|
||||
Some(v) => {
|
||||
let path = options::get_resource_path(v, "jieba extra dict file")?;
|
||||
user_dict = Some(path)
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
|
||||
Ok((words, system_dict, user_dict))
|
||||
}
|
||||
|
||||
fn get_jieba_mode(params: &json::Map<String, json::Value>) -> Result<JiebaMode> {
|
||||
@ -143,7 +157,7 @@ impl<'a> JiebaTokenizer<'a> {
|
||||
}
|
||||
|
||||
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
|
||||
let (dict, system_dict) = get_jieba_dict(params)?;
|
||||
let (words, system_dict, user_dict) = get_jieba_dict(params)?;
|
||||
|
||||
let mut tokenizer =
|
||||
system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {
|
||||
@ -163,10 +177,21 @@ impl<'a> JiebaTokenizer<'a> {
|
||||
))),
|
||||
})?;
|
||||
|
||||
for word in dict {
|
||||
for word in words {
|
||||
tokenizer.add_word(word.as_str(), None, None);
|
||||
}
|
||||
|
||||
if user_dict.is_some() {
|
||||
let file = fs::File::open(user_dict.unwrap())?;
|
||||
let mut reader = BufReader::new(file);
|
||||
tokenizer.load_dict(&mut reader).map_err(|e| {
|
||||
TantivyBindingError::InvalidArgument(format!(
|
||||
"jieba tokenizer load dict file failed with error: {:?}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
}
|
||||
|
||||
let mode = get_jieba_mode(params)?;
|
||||
let hmm = get_jieba_hmm(params)?;
|
||||
|
||||
|
||||
@ -6,7 +6,6 @@ use lindera::mode::Mode;
|
||||
use lindera::segmenter::Segmenter;
|
||||
use lindera::token::Token as LToken;
|
||||
use lindera::tokenizer::Tokenizer as LTokenizer;
|
||||
use log::warn;
|
||||
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
|
||||
|
||||
use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
|
||||
@ -17,9 +16,7 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
|
||||
use lindera::token_filter::BoxTokenFilter as LTokenFilter;
|
||||
|
||||
use crate::analyzer::dict::lindera::load_dictionary_from_kind;
|
||||
use crate::analyzer::runtime_option::{
|
||||
get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY,
|
||||
};
|
||||
use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
use core::slice;
|
||||
use std::ffi::{c_char, c_void, CStr};
|
||||
|
||||
use crate::{
|
||||
@ -18,7 +17,7 @@ macro_rules! convert_to_rust_slice {
|
||||
match $arr {
|
||||
// there is a UB in slice::from_raw_parts if the pointer is null
|
||||
x if x.is_null() => &[],
|
||||
_ => slice::from_raw_parts($arr, $len),
|
||||
_ => ::core::slice::from_raw_parts($arr, $len),
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user