diff --git a/.gitignore b/.gitignore index 7a59f15b32..887fcf1e21 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,7 @@ proxy/suvlim/* proxy-go/proxy-go # Compiled source +target/ bin/ lib/ *.a diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs index 93e8afec07..5c1bc14007 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/mod.rs @@ -2,10 +2,10 @@ mod analyzer; mod build_in_analyzer; mod dict; mod filter; -mod runtime_option; +mod options; pub mod tokenizers; pub use self::analyzer::{create_analyzer, create_analyzer_by_json}; -pub use self::runtime_option::set_options; +pub use self::options::set_options; pub(crate) use self::build_in_analyzer::standard_analyzer; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/mod.rs new file mode 100644 index 0000000000..3fb36a8f45 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/mod.rs @@ -0,0 +1,8 @@ +mod runtime_option; +mod util; + +pub use self::runtime_option::{get_lindera_download_url, get_options, set_options}; + +pub use self::util::get_resource_path; + +pub use self::runtime_option::DEFAULT_DICT_PATH_KEY; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/runtime_option.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/runtime_option.rs similarity index 81% rename from internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/runtime_option.rs rename to internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/runtime_option.rs index 2848373fb6..b7af1863d1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/runtime_option.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/runtime_option.rs @@ -2,6 +2,7 @@ use crate::error::{Result, TantivyBindingError}; use once_cell::sync::Lazy; use serde_json as json; use std::collections::HashMap; +use std::path::PathBuf; use std::sync::{Arc, RwLock}; static GLOBAL_OPTIONS: Lazy> = Lazy::new(|| Arc::new(RuntimeOption::new())); @@ -26,8 +27,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option> { GLOBAL_OPTIONS.get_lindera_download_urls(kind) } -pub fn get_resource_id(name: &str) -> Option { - GLOBAL_OPTIONS.get_resource_id(name) +pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result { + GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name) } // analyzer options @@ -57,9 +58,28 @@ impl RuntimeOption { r.lindera_download_urls.get(kind).map(|v| v.clone()) } - fn get_resource_id(&self, name: &str) -> Option { + fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result { let r = self.inner.read().unwrap(); - r.resource_map.get(name).cloned() + let resource_id = + r.resource_map + .get(resource_name) + .ok_or(TantivyBindingError::InternalError(format!( + "file resource: {} not found in local resource list", + resource_name + )))?; + let base = r + .params + .get(RESOURCE_PATH_KEY) + .ok_or(TantivyBindingError::InternalError( + "local_resource_path config not init success".to_string(), + ))? + .as_str() + .ok_or("local_resource_path must set as string")?; + + return Ok(PathBuf::new() + .join(base) + .join(resource_id.to_string()) + .join(file_name)); } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/util.rs new file mode 100644 index 0000000000..e5f4f13477 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/util.rs @@ -0,0 +1,83 @@ +use serde_json as json; +use std::path::{Path, PathBuf}; + +use super::runtime_option::get_resource_file_path; +use crate::error::{Result, TantivyBindingError}; + +pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result { + if !v.is_object() { + return Err(TantivyBindingError::InvalidArgument(format!( + "file config of {} must be object", + resource_key + ))); + } + + let params = v.as_object().unwrap(); + + let file_type = params.get("type").ok_or_else(|| { + TantivyBindingError::InvalidArgument(format!("file type of {} must be set", resource_key)) + })?; + + if !file_type.is_string() { + return Err(TantivyBindingError::InvalidArgument(format!( + "file type of {} must be string", + resource_key + ))); + } + + match file_type.as_str().unwrap() { + "local" => { + let path = params.get("path").ok_or_else(|| { + TantivyBindingError::InvalidArgument(format!( + "file path of local file `{}` must be set", + resource_key + )) + })?; + + if !path.is_string() { + return Err(TantivyBindingError::InvalidArgument(format!( + "file path of local file `{}` must be string", + resource_key + ))); + } + + let path_str = path.as_str().unwrap(); + Ok(Path::new(path_str).to_path_buf()) + } + "remote" => { + let resource_name = params + .get("resource_name") + .ok_or_else(|| { + TantivyBindingError::InvalidArgument(format!( + "resource name of remote file `{}` must be set", + resource_key + )) + })? + .as_str() + .ok_or(TantivyBindingError::InvalidArgument(format!( + "remote file resource name of remote file `{}` must be string", + resource_key + )))?; + + let file_name = params + .get("file_name") + .ok_or_else(|| { + TantivyBindingError::InvalidArgument(format!( + "file name of remote file `{}` must be set", + resource_key + )) + })? + .as_str() + .ok_or(TantivyBindingError::InvalidArgument(format!( + "remote file resource name of {} must be string", + resource_key + )))?; + + self::get_resource_file_path(resource_name, file_name) + } + other => Err(TantivyBindingError::InvalidArgument(format!( + "unsupported file type {} of {}", + other, resource_key + ))), + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs index 96e48913e0..0615965261 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs @@ -2,10 +2,12 @@ use core::{option::Option::Some, result::Result::Ok}; use jieba_rs; use lazy_static::lazy_static; use serde_json as json; -use std::borrow::Cow; +use std::fs; use std::io::BufReader; +use std::{borrow::Cow, path::PathBuf}; use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; +use crate::analyzer::options; use crate::error::{Result, TantivyBindingError}; lazy_static! { @@ -54,16 +56,19 @@ impl TokenStream for JiebaTokenStream { fn get_jieba_dict( params: &json::Map, -) -> Result<(Vec, Option)> { +) -> Result<(Vec, Option, Option)> { + let mut words = Vec::::new(); + let mut user_dict = None; + // use default dict as default system dict + let mut system_dict = Some("_default_".to_string()); match params.get("dict") { Some(value) => { + system_dict = None; if !value.is_array() { return Err(TantivyBindingError::InvalidArgument(format!( "jieba tokenizer dict must be array" ))); } - let mut dict = Vec::::new(); - let mut system_dict = None; for word in value.as_array().unwrap() { if !word.is_string() { @@ -82,18 +87,27 @@ fn get_jieba_dict( if text == "_default_" || text == "_extend_default_" { if system_dict.is_some() { return Err(TantivyBindingError::InvalidArgument(format!( - "jieba tokenizer dict can only set one default dict" + "jieba tokenizer dict can only set one system dict" ))); } system_dict = Some(text) } else { - dict.push(text); + words.push(text); } } - Ok((dict, system_dict)) } - _ => Ok((vec![], Some("_default_".to_string()))), - } + _ => {} + }; + + match params.get("extra_dict_file") { + Some(v) => { + let path = options::get_resource_path(v, "jieba extra dict file")?; + user_dict = Some(path) + } + _ => {} + }; + + Ok((words, system_dict, user_dict)) } fn get_jieba_mode(params: &json::Map) -> Result { @@ -143,7 +157,7 @@ impl<'a> JiebaTokenizer<'a> { } pub fn from_json(params: &json::Map) -> Result> { - let (dict, system_dict) = get_jieba_dict(params)?; + let (words, system_dict, user_dict) = get_jieba_dict(params)?; let mut tokenizer = system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() { @@ -163,10 +177,21 @@ impl<'a> JiebaTokenizer<'a> { ))), })?; - for word in dict { + for word in words { tokenizer.add_word(word.as_str(), None, None); } + if user_dict.is_some() { + let file = fs::File::open(user_dict.unwrap())?; + let mut reader = BufReader::new(file); + tokenizer.load_dict(&mut reader).map_err(|e| { + TantivyBindingError::InvalidArgument(format!( + "jieba tokenizer load dict file failed with error: {:?}", + e + )) + })?; + } + let mode = get_jieba_mode(params)?; let hmm = get_jieba_hmm(params)?; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs index 4f4f72233a..f0110720c9 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs @@ -6,7 +6,6 @@ use lindera::mode::Mode; use lindera::segmenter::Segmenter; use lindera::token::Token as LToken; use lindera::tokenizer::Tokenizer as LTokenizer; -use log::warn; use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter; @@ -17,9 +16,7 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter; use lindera::token_filter::BoxTokenFilter as LTokenFilter; use crate::analyzer::dict::lindera::load_dictionary_from_kind; -use crate::analyzer::runtime_option::{ - get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY, -}; +use crate::analyzer::options::{get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY}; use crate::error::{Result, TantivyBindingError}; use serde_json as json; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs index fe855e0ff1..4af781af27 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs @@ -1,4 +1,3 @@ -use core::slice; use std::ffi::{c_char, c_void, CStr}; use crate::{ @@ -18,7 +17,7 @@ macro_rules! convert_to_rust_slice { match $arr { // there is a UB in slice::from_raw_parts if the pointer is null x if x.is_null() => &[], - _ => slice::from_raw_parts($arr, $len), + _ => ::core::slice::from_raw_parts($arr, $len), } }; }