Mirror of https://gitee.com/milvus-io/milvus.git, synced 2025-12-06 09:08:43 +08:00
feat: support file params in analyzer and set jieba dict file (#45206)
Related: https://github.com/milvus-io/milvus/issues/43687
Support user-provided files via file params in analyzer params; both local files and remote file resources can be used. Also support file params for the jieba extern dict.
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
parent 5e6fdf3ba7
commit 322caafe18
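For orientation before the diffs: an analyzer-params payload exercising the new file params might look like the sketch below. This is a hypothetical illustration in Rust; the nested keys (dict, extra_dict_file, type, path, resource_name, file_name) are taken from the code added in this commit, while the surrounding tokenizer layout and all sample values are assumptions.

use serde_json::json;

fn main() {
    // Hypothetical jieba analyzer params using the new file params.
    // A "local" file is read from a path on the node; a "remote" file is
    // resolved through the registered file resource list.
    let analyzer_params = json!({
        "tokenizer": {
            "type": "jieba",
            "dict": ["_default_", "自定义词"],
            "extra_dict_file": {
                "type": "remote",
                "resource_name": "my_dict_resource", // invented name
                "file_name": "user_dict.txt"         // invented name
            }
        }
    });
    println!("{analyzer_params}");
}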
.gitignore (vendored): 1 addition

@@ -47,6 +47,7 @@ proxy/suvlim/*
 proxy-go/proxy-go
 
 # Compiled source
+target/
 bin/
 lib/
 *.a
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs

@@ -2,10 +2,10 @@ mod analyzer;
 mod build_in_analyzer;
 mod dict;
 mod filter;
-mod runtime_option;
+mod options;
 
 pub mod tokenizers;
 pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
-pub use self::runtime_option::set_options;
+pub use self::options::set_options;
 
 pub(crate) use self::build_in_analyzer::standard_analyzer;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/mod.rs (vendored, new file): 8 additions

@@ -0,0 +1,8 @@
+mod runtime_option;
+mod util;
+
+pub use self::runtime_option::{get_lindera_download_url, get_options, set_options};
+
+pub use self::util::get_resource_path;
+
+pub use self::runtime_option::DEFAULT_DICT_PATH_KEY;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/runtime_option.rs

@@ -2,6 +2,7 @@ use crate::error::{Result, TantivyBindingError};
 use once_cell::sync::Lazy;
 use serde_json as json;
 use std::collections::HashMap;
+use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
 
 static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));

@@ -26,8 +27,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
     GLOBAL_OPTIONS.get_lindera_download_urls(kind)
 }
 
-pub fn get_resource_id(name: &str) -> Option<i64> {
-    GLOBAL_OPTIONS.get_resource_id(name)
+pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result<PathBuf> {
+    GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name)
 }
 
 // analyzer options

@@ -57,9 +58,28 @@ impl RuntimeOption {
         r.lindera_download_urls.get(kind).map(|v| v.clone())
     }
 
-    fn get_resource_id(&self, name: &str) -> Option<i64> {
+    fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result<PathBuf> {
         let r = self.inner.read().unwrap();
-        r.resource_map.get(name).cloned()
+        let resource_id =
+            r.resource_map
+                .get(resource_name)
+                .ok_or(TantivyBindingError::InternalError(format!(
+                    "file resource: {} not found in local resource list",
+                    resource_name
+                )))?;
+        let base = r
+            .params
+            .get(RESOURCE_PATH_KEY)
+            .ok_or(TantivyBindingError::InternalError(
+                "local_resource_path config not init success".to_string(),
+            ))?
+            .as_str()
+            .ok_or("local_resource_path must set as string")?;
+
+        return Ok(PathBuf::new()
+            .join(base)
+            .join(resource_id.to_string())
+            .join(file_name));
     }
 }
 
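In effect, get_resource_file_path resolves a registered resource to <local_resource_path>/<resource_id>/<file_name>. A hypothetical worked example (all values invented): if local_resource_path is /var/lib/milvus/resources and the resource my_dict_resource is registered under id 42, then get_resource_file_path("my_dict_resource", "user_dict.txt") returns /var/lib/milvus/resources/42/user_dict.txt.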
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/util.rs (vendored, new file): 83 additions

@@ -0,0 +1,83 @@
+use serde_json as json;
+use std::path::{Path, PathBuf};
+
+use super::runtime_option::get_resource_file_path;
+use crate::error::{Result, TantivyBindingError};
+
+pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf> {
+    if !v.is_object() {
+        return Err(TantivyBindingError::InvalidArgument(format!(
+            "file config of {} must be object",
+            resource_key
+        )));
+    }
+
+    let params = v.as_object().unwrap();
+
+    let file_type = params.get("type").ok_or_else(|| {
+        TantivyBindingError::InvalidArgument(format!("file type of {} must be set", resource_key))
+    })?;
+
+    if !file_type.is_string() {
+        return Err(TantivyBindingError::InvalidArgument(format!(
+            "file type of {} must be string",
+            resource_key
+        )));
+    }
+
+    match file_type.as_str().unwrap() {
+        "local" => {
+            let path = params.get("path").ok_or_else(|| {
+                TantivyBindingError::InvalidArgument(format!(
+                    "file path of local file `{}` must be set",
+                    resource_key
+                ))
+            })?;
+
+            if !path.is_string() {
+                return Err(TantivyBindingError::InvalidArgument(format!(
+                    "file path of local file `{}` must be string",
+                    resource_key
+                )));
+            }
+
+            let path_str = path.as_str().unwrap();
+            Ok(Path::new(path_str).to_path_buf())
+        }
+        "remote" => {
+            let resource_name = params
+                .get("resource_name")
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "resource name of remote file `{}` must be set",
+                        resource_key
+                    ))
+                })?
+                .as_str()
+                .ok_or(TantivyBindingError::InvalidArgument(format!(
+                    "remote file resource name of remote file `{}` must be string",
+                    resource_key
+                )))?;
+
+            let file_name = params
+                .get("file_name")
+                .ok_or_else(|| {
+                    TantivyBindingError::InvalidArgument(format!(
+                        "file name of remote file `{}` must be set",
+                        resource_key
+                    ))
+                })?
+                .as_str()
+                .ok_or(TantivyBindingError::InvalidArgument(format!(
+                    "remote file resource name of {} must be string",
+                    resource_key
+                )))?;
+
+            self::get_resource_file_path(resource_name, file_name)
+        }
+        other => Err(TantivyBindingError::InvalidArgument(format!(
+            "unsupported file type {} of {}",
+            other, resource_key
+        ))),
+    }
+}
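The two JSON shapes accepted by get_resource_path can be summarized with a short usage sketch. This is an illustration, not code from the commit; the key names come from the diff above and the sample values are invented.

use serde_json::json;

fn main() {
    // "local": resolves directly to the user-supplied path.
    let local = json!({ "type": "local", "path": "/tmp/user_dict.txt" });
    // "remote": resolved via the registered resource list; the final path is
    // <local_resource_path>/<resource_id>/<file_name> (see runtime_option.rs).
    let remote = json!({
        "type": "remote",
        "resource_name": "my_dict_resource",
        "file_name": "user_dict.txt"
    });
    // get_resource_path(&local, "jieba extra dict file")  -> Ok("/tmp/user_dict.txt")
    // get_resource_path(&remote, "jieba extra dict file") -> Ok(<base>/<id>/user_dict.txt)
    println!("{local}\n{remote}");
}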
@@ -2,10 +2,12 @@ use core::{option::Option::Some, result::Result::Ok};
 use jieba_rs;
 use lazy_static::lazy_static;
 use serde_json as json;
-use std::borrow::Cow;
+use std::fs;
 use std::io::BufReader;
+use std::{borrow::Cow, path::PathBuf};
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
+use crate::analyzer::options;
 use crate::error::{Result, TantivyBindingError};
 
 lazy_static! {

@@ -54,16 +56,19 @@ impl TokenStream for JiebaTokenStream {
 
 fn get_jieba_dict(
     params: &json::Map<String, json::Value>,
-) -> Result<(Vec<String>, Option<String>)> {
+) -> Result<(Vec<String>, Option<String>, Option<PathBuf>)> {
+    let mut words = Vec::<String>::new();
+    let mut user_dict = None;
+    // use default dict as default system dict
+    let mut system_dict = Some("_default_".to_string());
     match params.get("dict") {
         Some(value) => {
+            system_dict = None;
             if !value.is_array() {
                 return Err(TantivyBindingError::InvalidArgument(format!(
                     "jieba tokenizer dict must be array"
                 )));
             }
-            let mut dict = Vec::<String>::new();
-            let mut system_dict = None;
 
             for word in value.as_array().unwrap() {
                 if !word.is_string() {

@@ -82,18 +87,27 @@ fn get_jieba_dict(
                 if text == "_default_" || text == "_extend_default_" {
                     if system_dict.is_some() {
                         return Err(TantivyBindingError::InvalidArgument(format!(
-                            "jieba tokenizer dict can only set one default dict"
+                            "jieba tokenizer dict can only set one system dict"
                         )));
                     }
                     system_dict = Some(text)
                 } else {
-                    dict.push(text);
+                    words.push(text);
                 }
             }
-            Ok((dict, system_dict))
         }
-        _ => Ok((vec![], Some("_default_".to_string()))),
-    }
+        _ => {}
+    };
+
+    match params.get("extra_dict_file") {
+        Some(v) => {
+            let path = options::get_resource_path(v, "jieba extra dict file")?;
+            user_dict = Some(path)
+        }
+        _ => {}
+    };
+
+    Ok((words, system_dict, user_dict))
 }
 
 fn get_jieba_mode(params: &json::Map<String, json::Value>) -> Result<JiebaMode> {

@@ -143,7 +157,7 @@ impl<'a> JiebaTokenizer<'a> {
     }
 
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
-        let (dict, system_dict) = get_jieba_dict(params)?;
+        let (words, system_dict, user_dict) = get_jieba_dict(params)?;
 
         let mut tokenizer =
             system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {

@@ -163,10 +177,21 @@ impl<'a> JiebaTokenizer<'a> {
             ))),
         })?;
 
-        for word in dict {
+        for word in words {
             tokenizer.add_word(word.as_str(), None, None);
         }
 
+        if user_dict.is_some() {
+            let file = fs::File::open(user_dict.unwrap())?;
+            let mut reader = BufReader::new(file);
+            tokenizer.load_dict(&mut reader).map_err(|e| {
+                TantivyBindingError::InvalidArgument(format!(
+                    "jieba tokenizer load dict file failed with error: {:?}",
+                    e
+                ))
+            })?;
+        }
+
         let mode = get_jieba_mode(params)?;
         let hmm = get_jieba_hmm(params)?;
 
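Putting the jieba changes together: get_jieba_dict now returns the inline words, an optional system dict name, and an optional user dict path, and from_json applies them in that order (add_word for each inline word, then load_dict on the opened user dict file). A hedged sketch of the inputs and the resulting triple, with invented values:

use serde_json::json;

fn main() {
    // Tokenizer params as get_jieba_dict would receive them.
    let params = json!({
        "dict": ["_extend_default_", "自定义词"],
        "extra_dict_file": { "type": "local", "path": "/tmp/user_dict.txt" }
    });
    // get_jieba_dict(params.as_object().unwrap()) would yield:
    //   words       = ["自定义词"]               -> tokenizer.add_word(...)
    //   system_dict = Some("_extend_default_")   -> selects the base dictionary
    //   user_dict   = Some("/tmp/user_dict.txt") -> tokenizer.load_dict(...)
    println!("{params}");
}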
@@ -6,7 +6,6 @@ use lindera::mode::Mode;
 use lindera::segmenter::Segmenter;
 use lindera::token::Token as LToken;
 use lindera::tokenizer::Tokenizer as LTokenizer;
-use log::warn;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
 use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;

@@ -17,9 +16,7 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
 use lindera::token_filter::BoxTokenFilter as LTokenFilter;
 
 use crate::analyzer::dict::lindera::load_dictionary_from_kind;
-use crate::analyzer::runtime_option::{
-    get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY,
-};
+use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY};
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 
@@ -1,4 +1,3 @@
-use core::slice;
 use std::ffi::{c_char, c_void, CStr};
 
 use crate::{

@@ -18,7 +17,7 @@ macro_rules! convert_to_rust_slice {
         match $arr {
             // there is a UB in slice::from_raw_parts if the pointer is null
             x if x.is_null() => &[],
-            _ => slice::from_raw_parts($arr, $len),
+            _ => ::core::slice::from_raw_parts($arr, $len),
         }
     };
 }
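A note on the last hunk: convert_to_rust_slice! is a macro, so its body is expanded at every call site; the fully qualified ::core::slice::from_raw_parts resolves there regardless of what the caller imports, which is why the local use core::slice; can be dropped. A minimal self-contained sketch of the same pattern (macro and values invented for illustration):

// The absolute path resolves at every expansion site, independent of the
// caller's imports.
macro_rules! to_rust_slice {
    ($ptr:expr, $len:expr) => {
        match $ptr {
            // slice::from_raw_parts is UB on a null pointer, so guard it.
            p if p.is_null() => &[],
            _ => unsafe { ::core::slice::from_raw_parts($ptr, $len) },
        }
    };
}

fn main() {
    let v = [1u32, 2, 3];
    let s: &[u32] = to_rust_slice!(v.as_ptr(), v.len());
    assert_eq!(s, &[1, 2, 3]);
    let empty: &[u32] = to_rust_slice!(core::ptr::null::<u32>(), 0);
    assert!(empty.is_empty());
}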
|
|||||||
Loading…
x
Reference in New Issue
Block a user