feat: set related resource ids in collection schema (#46423)
Support creating an analyzer with file resource info, and return the used file resource IDs when validating an analyzer. Save the related resource IDs in the collection schema. Related issue: https://github.com/milvus-io/milvus/issues/43687

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
- Core invariant: analyzer file-resource resolution is deterministic and traceable by threading a FileResourcePathHelper (collecting used resource IDs in a HashSet) through all tokenizer/analyzer construction and validation paths; validate_analyzer(params, extra_info) returns the collected Vec<i64>, which is propagated through the C/Rust/Go layers to callers (CValidateResult → RustResult::from_vec_i64 → Go []int64 → querypb.ValidateAnalyzerResponse.ResourceIds → CollectionSchema.FileResourceIds).
- Logic removed/simplified: ad-hoc, scattered resource-path lookups and per-filter file helpers (e.g. read_synonyms_file and other inline file-reading logic) were consolidated into ResourceInfo + FileResourcePathHelper and a centralized get_resource_path(helper, ...) API; filter/tokenizer builder APIs now accept &mut FileResourcePathHelper, so all file path resolution and ID collection share the same lookup and bookkeeping logic (redundant duplicated lookups removed).
- Why no data loss or behavior regression: the changes are additive and default-preserving. Existing call sites pass extra_info = "", so analyzer creation/validation behavior and error paths remain unchanged; the new Collection.FileResourceIds is populated from resp.ResourceIds in validateSchema and round-tripped through marshal/unmarshal (model.Collection ↔ schemapb.CollectionSchema), so schema persistence uses the new list without overwriting other schema fields; the proto change adds a repeated field (resource_ids), which is wire-compatible (older clients ignore the extra field). Concrete code paths: analyzer creation still uses create_analyzer (now with extra_info ""), tokenizer validation still returns errors as before but now also returns IDs via CValidateResult/RustResult, and rootcoord.validateSchema assigns resp.ResourceIds → schema.FileResourceIds.
- New capability added: end-to-end discovery, return, and persistence of the file resource IDs used by analyzers. Validate flows now return resource IDs and the system stores them in the collection schema (this affects the tantivy analyzer binding, the canalyzer C bindings, the internal/util analyzer APIs, the querynode ValidateAnalyzer response, and the rootcoord create_collection flow).
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
parent 512884524b
commit 55feb7ded8
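To make the flow above concrete, here is a minimal, self-contained Rust sketch of the bookkeeping pattern the release notes describe. The names mirror the ones touched by this change (ResourceInfo, FileResourcePathHelper, a validate flow that returns the collected IDs), but the bodies, the resource names ("synonyms", "stopwords"), and the path layout are simplified illustrations, not the binding's actual implementation.

```rust
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;

// Maps a resource name (as referenced in analyzer params) to its resource id.
// In the real binding this also knows the local storage layout; here it is
// just an in-memory map for illustration.
struct ResourceInfo {
    resource_map: HashMap<String, i64>,
}

// Resolves resource files to paths while remembering every resource id it
// handed out, so a validate pass can report which resources an analyzer uses.
struct FileResourcePathHelper {
    info: ResourceInfo,
    ids: HashSet<i64>,
}

impl FileResourcePathHelper {
    fn new(info: ResourceInfo) -> Self {
        Self { info, ids: HashSet::new() }
    }

    fn get_resource_file_path(
        &mut self,
        resource_name: &str,
        file_name: &str,
    ) -> Result<PathBuf, String> {
        let id = *self
            .info
            .resource_map
            .get(resource_name)
            .ok_or_else(|| format!("file resource {} not found", resource_name))?;
        self.ids.insert(id);
        // Simplified layout: <resource_id>/<file_name>; the real helper also
        // prepends a configured base path and an optional storage name.
        Ok(PathBuf::from(id.to_string()).join(file_name))
    }

    // Consume the helper and return the collected ids (order unspecified).
    fn get_resource_ids(self) -> Vec<i64> {
        self.ids.into_iter().collect()
    }
}

// Stand-in for the real validate flow: build the analyzer exactly like the
// create path would (threading the helper through every filter/tokenizer
// builder), then hand back whatever IDs the helper recorded. Here the
// "build" step is faked with two hard-coded lookups.
fn validate_analyzer_sketch(helper: &mut FileResourcePathHelper) -> Result<(), String> {
    helper.get_resource_file_path("synonyms", "synonyms.txt")?;
    helper.get_resource_file_path("stopwords", "stop.txt")?;
    Ok(())
}

fn main() {
    let info = ResourceInfo {
        resource_map: HashMap::from([
            ("synonyms".to_string(), 7),
            ("stopwords".to_string(), 9),
        ]),
    };
    let mut helper = FileResourcePathHelper::new(info);
    validate_analyzer_sketch(&mut helper).expect("analyzer params should validate");

    // The deduplicated ID list is what the C/Go layers would carry back and,
    // per the release notes above, store in CollectionSchema.FileResourceIds.
    let mut ids = helper.get_resource_ids();
    ids.sort();
    println!("used resource ids: {:?}", ids); // [7, 9]
}
```

Collecting into a HashSet is what keeps the returned list deduplicated even when several filters reference the same resource file; in the real binding, validate_analyzer returns this list so the Go layer can persist it.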
go.mod (2 changed lines)
@@ -21,7 +21,7 @@ require (
 github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
 github.com/klauspost/compress v1.18.0
 github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f
 github.com/minio/minio-go/v7 v7.0.73
 github.com/panjf2000/ants/v2 v2.11.3 // indirect
 github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81 // indirect
go.sum (4 changed lines)
@@ -799,8 +799,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
 github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece h1:s0TFMZBxADKSzIr7LW/TE3L/WgCuo7QOfzkYX92Xog0=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f h1:YQ61KOySWPEXv8ePkr0Cu5q5iVHN11IIUSTWIiALCE8=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
 github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
 github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
 github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
@@ -45,7 +45,7 @@ TEST(CTokenizer, Default) {
     auto analyzer_params = R"({"tokenizer": "standard"})";
     CTokenizer tokenizer;
     {
-        auto status = create_tokenizer(analyzer_params, &tokenizer);
+        auto status = create_tokenizer(analyzer_params, "", &tokenizer);
         ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
     }
 
@@ -30,9 +30,12 @@ set_tokenizer_option(const char* params) {
 }
 
 CStatus
-create_tokenizer(const char* params, CTokenizer* tokenizer) {
+create_tokenizer(const char* params,
+                 const char* extra_info,
+                 CTokenizer* tokenizer) {
     try {
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
+        auto impl =
+            std::make_unique<milvus::tantivy::Tokenizer>(params, extra_info);
         *tokenizer = impl.release();
         return milvus::SuccessCStatus();
     } catch (std::exception& e) {
@@ -63,13 +66,14 @@ create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
     return impl->CreateTokenStream(std::string(text, text_len)).release();
 }
 
-CStatus
-validate_tokenizer(const char* params) {
+CValidateResult
+validate_tokenizer(const char* params, const char* extra_info) {
     try {
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
-        return milvus::SuccessCStatus();
+        auto [ids, count] =
+            milvus::tantivy::validate_analyzer(params, extra_info);
+        return CValidateResult{ids, count, milvus::SuccessCStatus()};
     } catch (std::exception& e) {
-        return milvus::FailureCStatus(&e);
+        return CValidateResult{nullptr, 0, milvus::FailureCStatus(&e)};
     }
 }
 
@@ -27,7 +27,9 @@ CStatus
 set_tokenizer_option(const char* params);
 
 CStatus
-create_tokenizer(const char* params, CTokenizer* tokenizer);
+create_tokenizer(const char* params,
+                 const char* extra_info,
+                 CTokenizer* tokenizer);
 
 CStatus
 clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
@@ -35,8 +37,14 @@ clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
 void
 free_tokenizer(CTokenizer tokenizer);
 
-CStatus
-validate_tokenizer(const char* params);
+typedef struct CValidateResult {
+    int64_t* resource_ids;
+    uint64_t resource_ids_count;
+    CStatus status;
+} CValidateResult;
+
+CValidateResult
+validate_tokenizer(const char* params, const char* extra_info);
 
 CTokenStream
 create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len);
 
@ -32,7 +32,7 @@ fn bench_lindua_language_identifier_tokenizer(c: &mut Criterion) {
|
||||
}
|
||||
}
|
||||
"#;
|
||||
let mut analyzer = create_analyzer(params);
|
||||
let mut analyzer = create_analyzer(params, "");
|
||||
assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
|
||||
|
||||
c.bench_function("test", |b| {
|
||||
@ -64,7 +64,7 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
|
||||
}
|
||||
}
|
||||
"#;
|
||||
let mut analyzer = create_analyzer(params);
|
||||
let mut analyzer = create_analyzer(params, "");
|
||||
assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
|
||||
|
||||
c.bench_function("test", |b| {
|
||||
|
||||
@@ -497,7 +497,9 @@ const char *tantivy_token_stream_get_token(void *token_stream);
 
 TantivyToken tantivy_token_stream_get_detailed_token(void *token_stream);
 
-RustResult tantivy_create_analyzer(const char *analyzer_params);
+RustResult tantivy_create_analyzer(const char *analyzer_params, const char *extra_info);
+
+RustResult tantivy_validate_analyzer(const char *analyzer_params, const char *extra_info);
 
 void *tantivy_clone_analyzer(void *ptr);
 
@ -1,40 +1,30 @@
|
||||
use log::warn;
|
||||
use serde_json as json;
|
||||
use std::collections::HashMap;
|
||||
use tantivy::tokenizer::*;
|
||||
|
||||
use super::options::{get_global_file_resource_helper, FileResourcePathHelper};
|
||||
use super::{build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer};
|
||||
use crate::analyzer::filter::{get_stop_words_list, get_string_list};
|
||||
use crate::analyzer::filter::{create_filter, get_stop_words_list, get_string_list};
|
||||
use crate::error::Result;
|
||||
use crate::error::TantivyBindingError;
|
||||
|
||||
struct AnalyzerBuilder<'a> {
|
||||
filters: HashMap<String, SystemFilter>,
|
||||
helper: &'a mut FileResourcePathHelper,
|
||||
params: &'a json::Map<String, json::Value>,
|
||||
}
|
||||
|
||||
impl AnalyzerBuilder<'_> {
|
||||
fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder {
|
||||
AnalyzerBuilder {
|
||||
impl<'a> AnalyzerBuilder<'a> {
|
||||
fn new(
|
||||
params: &'a json::Map<String, json::Value>,
|
||||
helper: &'a mut FileResourcePathHelper,
|
||||
) -> Result<AnalyzerBuilder<'a>> {
|
||||
Ok(AnalyzerBuilder {
|
||||
filters: HashMap::new(),
|
||||
params: params,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_tokenizer_params(&self) -> Result<&json::Value> {
|
||||
let tokenizer = self.params.get("tokenizer");
|
||||
if tokenizer.is_none() {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"tokenizer name or type must be set"
|
||||
)));
|
||||
}
|
||||
let value = tokenizer.unwrap();
|
||||
if value.is_object() || value.is_string() {
|
||||
return Ok(tokenizer.unwrap());
|
||||
}
|
||||
|
||||
Err(TantivyBindingError::InternalError(format!(
|
||||
"tokenizer name should be string or dict"
|
||||
)))
|
||||
helper: helper,
|
||||
})
|
||||
}
|
||||
|
||||
fn build_filter(
|
||||
@ -73,7 +63,7 @@ impl AnalyzerBuilder<'_> {
|
||||
}
|
||||
}
|
||||
} else if filter.is_object() {
|
||||
let filter = SystemFilter::try_from(filter.as_object().unwrap())?;
|
||||
let filter = create_filter(filter.as_object().unwrap(), &mut self.helper)?;
|
||||
builder = filter.transform(builder);
|
||||
}
|
||||
}
|
||||
@ -110,10 +100,13 @@ impl AnalyzerBuilder<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
fn build_template(self, type_: &str) -> Result<TextAnalyzer> {
|
||||
fn build_template(mut self, type_: &str) -> Result<TextAnalyzer> {
|
||||
match type_ {
|
||||
"standard" => Ok(standard_analyzer(self.get_stop_words_option()?)),
|
||||
"chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)),
|
||||
"chinese" => Ok(chinese_analyzer(
|
||||
self.get_stop_words_option()?,
|
||||
&mut self.helper,
|
||||
)),
|
||||
"english" => Ok(english_analyzer(self.get_stop_words_option()?)),
|
||||
other_ => Err(TantivyBindingError::InternalError(format!(
|
||||
"unknown build-in analyzer type: {}",
|
||||
@ -128,7 +121,7 @@ impl AnalyzerBuilder<'_> {
|
||||
Some(type_) => {
|
||||
if !type_.is_string() {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"analyzer type shoud be string"
|
||||
"analyzer type should be string"
|
||||
)));
|
||||
}
|
||||
return self.build_template(type_.as_str().unwrap());
|
||||
@ -137,8 +130,25 @@ impl AnalyzerBuilder<'_> {
|
||||
};
|
||||
|
||||
//build custom analyzer
|
||||
let tokenizer_params = self.get_tokenizer_params()?;
|
||||
let mut builder = get_builder_with_tokenizer(&tokenizer_params, create_analyzer_by_json)?;
|
||||
let tokenizer_params = self.params.get("tokenizer");
|
||||
if tokenizer_params.is_none() {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"tokenizer name or type must be set"
|
||||
)));
|
||||
}
|
||||
|
||||
let value = tokenizer_params.unwrap();
|
||||
if !value.is_object() && !value.is_string() {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"tokenizer name should be string or dict"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut builder = get_builder_with_tokenizer(
|
||||
tokenizer_params.unwrap(),
|
||||
&mut self.helper,
|
||||
create_analyzer_by_json,
|
||||
)?;
|
||||
|
||||
// build and check other options
|
||||
builder = self.build_option(builder)?;
|
||||
@ -148,30 +158,50 @@ impl AnalyzerBuilder<'_> {
|
||||
|
||||
pub fn create_analyzer_by_json(
|
||||
analyzer_params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<TextAnalyzer> {
|
||||
if analyzer_params.is_empty() {
|
||||
return Ok(standard_analyzer(vec![]));
|
||||
}
|
||||
|
||||
let builder = AnalyzerBuilder::new(analyzer_params);
|
||||
let builder = AnalyzerBuilder::new(analyzer_params, helper)?;
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn create_analyzer(params: &str) -> Result<TextAnalyzer> {
|
||||
pub fn create_helper(extra_info: &str) -> Result<FileResourcePathHelper> {
|
||||
if extra_info.is_empty() {
|
||||
Ok(get_global_file_resource_helper())
|
||||
} else {
|
||||
Ok(FileResourcePathHelper::from_json(
|
||||
&json::from_str::<json::Value>(&extra_info)
|
||||
.map_err(|e| TantivyBindingError::JsonError(e))?,
|
||||
)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_analyzer(params: &str, extra_info: &str) -> Result<TextAnalyzer> {
|
||||
if params.len() == 0 {
|
||||
return Ok(standard_analyzer(vec![]));
|
||||
}
|
||||
|
||||
let json_params =
|
||||
json::from_str::<json::Value>(¶ms).map_err(|e| TantivyBindingError::JsonError(e))?;
|
||||
let json_params = &json::from_str::<json::Map<String, json::Value>>(¶ms)
|
||||
.map_err(|e| TantivyBindingError::JsonError(e))?;
|
||||
|
||||
create_analyzer_by_json(
|
||||
json_params
|
||||
.as_object()
|
||||
.ok_or(TantivyBindingError::InternalError(
|
||||
"params should be a json map".to_string(),
|
||||
))?,
|
||||
)
|
||||
let mut helper = create_helper(extra_info)?;
|
||||
create_analyzer_by_json(json_params, &mut helper)
|
||||
}
|
||||
|
||||
pub fn validate_analyzer(params: &str, extra_info: &str) -> Result<Vec<i64>> {
|
||||
if params.len() == 0 {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
let json_params = &json::from_str::<json::Map<String, json::Value>>(¶ms)
|
||||
.map_err(|e| TantivyBindingError::JsonError(e))?;
|
||||
|
||||
let mut helper = create_helper(extra_info)?;
|
||||
create_analyzer_by_json(json_params, &mut helper)?;
|
||||
Ok(helper.get_resource_ids())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -185,7 +215,7 @@ mod tests {
|
||||
"stop_words": ["_english_"]
|
||||
}"#;
|
||||
|
||||
let tokenizer = create_analyzer(¶ms.to_string());
|
||||
let tokenizer = create_analyzer(¶ms.to_string(), "");
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
}
|
||||
|
||||
@ -195,7 +225,7 @@ mod tests {
|
||||
"type": "chinese"
|
||||
}"#;
|
||||
|
||||
let tokenizer = create_analyzer(¶ms.to_string());
|
||||
let tokenizer = create_analyzer(¶ms.to_string(), "");
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
let mut bining = tokenizer.unwrap();
|
||||
let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
|
||||
@ -219,7 +249,7 @@ mod tests {
|
||||
}
|
||||
}"#;
|
||||
|
||||
let tokenizer = create_analyzer(¶ms.to_string());
|
||||
let tokenizer = create_analyzer(¶ms.to_string(), "");
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
|
||||
let mut bining = tokenizer.unwrap();
|
||||
|
||||
@ -2,6 +2,7 @@ use tantivy::tokenizer::*;
|
||||
|
||||
use super::filter::stop_words;
|
||||
use super::filter::*;
|
||||
use super::options::FileResourcePathHelper;
|
||||
use super::tokenizers::*;
|
||||
|
||||
// default build-in analyzer
|
||||
@ -15,8 +16,13 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
|
||||
let builder = jieba_builder(None).unwrap().filter(CnAlphaNumOnlyFilter);
|
||||
pub fn chinese_analyzer(
|
||||
stop_words: Vec<String>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> TextAnalyzer {
|
||||
let builder = jieba_builder(None, helper)
|
||||
.unwrap()
|
||||
.filter(CnAlphaNumOnlyFilter);
|
||||
if stop_words.len() > 0 {
|
||||
return builder.filter(StopWordFilter::remove(stop_words)).build();
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
use super::filter::FilterBuilder;
|
||||
use super::util::read_line_file;
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
use tantivy::tokenizer::SplitCompoundWords;
|
||||
@ -8,7 +9,10 @@ const WORD_LIST_KEY: &str = "word_list";
|
||||
const WORD_LIST_FILE_KEY: &str = "word_list_file";
|
||||
|
||||
impl FilterBuilder for SplitCompoundWords {
|
||||
fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
|
||||
fn from_json(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<Self> {
|
||||
let mut dict = Vec::<String>::new();
|
||||
if let Some(value) = params.get(WORD_LIST_KEY) {
|
||||
if !value.is_array() {
|
||||
@ -29,7 +33,12 @@ impl FilterBuilder for SplitCompoundWords {
|
||||
}
|
||||
|
||||
if let Some(file_params) = params.get(WORD_LIST_FILE_KEY) {
|
||||
read_line_file(&mut dict, file_params, "decompounder word list file")?;
|
||||
read_line_file(
|
||||
helper,
|
||||
&mut dict,
|
||||
file_params,
|
||||
"decompounder word list file",
|
||||
)?;
|
||||
}
|
||||
|
||||
if dict.is_empty() {
|
||||
@ -49,13 +58,17 @@ impl FilterBuilder for SplitCompoundWords {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SplitCompoundWords;
|
||||
use crate::analyzer::filter::FilterBuilder;
|
||||
use crate::analyzer::tokenizers::standard_builder;
|
||||
use crate::log::init_log;
|
||||
use serde_json as json;
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde_json as json;
|
||||
|
||||
use super::SplitCompoundWords;
|
||||
use crate::analyzer::filter::FilterBuilder;
|
||||
use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
|
||||
use crate::analyzer::tokenizers::standard_builder;
|
||||
use crate::log::init_log;
|
||||
|
||||
#[test]
|
||||
fn test_decompounder_filter_with_file() {
|
||||
@ -74,7 +87,8 @@ mod tests {
|
||||
);
|
||||
let json_params = json::from_str::<json::Value>(¶ms).unwrap();
|
||||
// let filter = SplitCompoundWords::from_dictionary(vec!["bank", "note"]);
|
||||
let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap(), &mut helper);
|
||||
assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
|
||||
let builder = standard_builder().filter(filter.unwrap());
|
||||
let mut analyzer = builder.build();
|
||||
|
||||
@ -4,6 +4,7 @@ use tantivy::tokenizer::*;
|
||||
use super::{
|
||||
CnAlphaNumOnlyFilter, CnCharOnlyFilter, RegexFilter, RemovePunctFilter, SynonymFilter,
|
||||
};
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
|
||||
pub(crate) enum SystemFilter {
|
||||
@ -23,7 +24,10 @@ pub(crate) enum SystemFilter {
|
||||
}
|
||||
|
||||
pub(crate) trait FilterBuilder {
|
||||
fn from_json(params: &json::Map<String, json::Value>) -> Result<Self>
|
||||
fn from_json(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<Self>
|
||||
where
|
||||
Self: Sized;
|
||||
}
|
||||
@ -109,36 +113,36 @@ impl From<&str> for SystemFilter {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
|
||||
type Error = TantivyBindingError;
|
||||
pub fn create_filter(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<SystemFilter> {
|
||||
match params.get(&"type".to_string()) {
|
||||
Some(value) => {
|
||||
if !value.is_string() {
|
||||
return Err(TantivyBindingError::InternalError(
|
||||
"filter type should be string".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> {
|
||||
match params.get(&"type".to_string()) {
|
||||
Some(value) => {
|
||||
if !value.is_string() {
|
||||
return Err(TantivyBindingError::InternalError(
|
||||
"filter type should be string".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
match value.as_str().unwrap() {
|
||||
"length" => get_length_filter(params),
|
||||
"stop" => StopWordFilter::from_json(params).map(|f| SystemFilter::Stop(f)),
|
||||
"decompounder" => {
|
||||
SplitCompoundWords::from_json(params).map(|f| SystemFilter::Decompounder(f))
|
||||
}
|
||||
"stemmer" => Stemmer::from_json(params).map(|f| SystemFilter::Stemmer(f)),
|
||||
"regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
|
||||
"synonym" => SynonymFilter::from_json(params).map(|f| SystemFilter::Synonym(f)),
|
||||
other => Err(TantivyBindingError::InternalError(format!(
|
||||
"unsupport filter type: {}",
|
||||
other
|
||||
))),
|
||||
match value.as_str().unwrap() {
|
||||
"length" => get_length_filter(params),
|
||||
"stop" => StopWordFilter::from_json(params, helper).map(|f| SystemFilter::Stop(f)),
|
||||
"decompounder" => SplitCompoundWords::from_json(params, helper)
|
||||
.map(|f| SystemFilter::Decompounder(f)),
|
||||
"stemmer" => Stemmer::from_json(params, helper).map(|f| SystemFilter::Stemmer(f)),
|
||||
"regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
|
||||
"synonym" => {
|
||||
SynonymFilter::from_json(params, helper).map(|f| SystemFilter::Synonym(f))
|
||||
}
|
||||
other => Err(TantivyBindingError::InternalError(format!(
|
||||
"unsupport filter type: {}",
|
||||
other
|
||||
))),
|
||||
}
|
||||
None => Err(TantivyBindingError::InternalError(
|
||||
"no type field in filter params".to_string(),
|
||||
)),
|
||||
}
|
||||
None => Err(TantivyBindingError::InternalError(
|
||||
"no type field in filter params".to_string(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@ -116,7 +116,7 @@ mod tests {
|
||||
}]
|
||||
}"#;
|
||||
|
||||
let tokenizer = create_analyzer(¶ms.to_string());
|
||||
let tokenizer = create_analyzer(¶ms.to_string(), "");
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
|
||||
let mut bining = tokenizer.unwrap();
|
||||
|
||||
@ -59,7 +59,7 @@ mod tests {
|
||||
"filter": ["removepunct"]
|
||||
}"#;
|
||||
|
||||
let tokenizer = create_analyzer(¶ms.to_string());
|
||||
let tokenizer = create_analyzer(¶ms.to_string(), "");
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
|
||||
let mut bining = tokenizer.unwrap();
|
||||
|
||||
@ -1,10 +1,14 @@
|
||||
use super::filter::FilterBuilder;
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
use tantivy::tokenizer::{Language, Stemmer};
|
||||
|
||||
impl FilterBuilder for Stemmer {
|
||||
fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
|
||||
fn from_json(
|
||||
params: &json::Map<String, json::Value>,
|
||||
_: &mut FileResourcePathHelper,
|
||||
) -> Result<Self> {
|
||||
let value = params.get("language");
|
||||
if value.is_none() || !value.unwrap().is_string() {
|
||||
return Err(TantivyBindingError::InternalError(
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
use super::filter::FilterBuilder;
|
||||
use super::stop_words::fetch_language_stop_words;
|
||||
use super::util::*;
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
use tantivy::tokenizer::StopWordFilter;
|
||||
@ -28,14 +29,17 @@ pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
|
||||
}
|
||||
|
||||
impl FilterBuilder for StopWordFilter {
|
||||
fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
|
||||
fn from_json(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<Self> {
|
||||
let mut dict = Vec::<String>::new();
|
||||
if let Some(value) = params.get(STOP_WORDS_LIST_KEY) {
|
||||
dict = get_stop_words_list(get_string_list(value, "stop_words")?);
|
||||
}
|
||||
|
||||
if let Some(file_params) = params.get(STOP_WORDS_FILE_KEY) {
|
||||
read_line_file(&mut dict, file_params, "stop words dict file")?;
|
||||
read_line_file(helper, &mut dict, file_params, "stop words dict file")?;
|
||||
}
|
||||
|
||||
Ok(StopWordFilter::remove(dict))
|
||||
@ -46,11 +50,13 @@ impl FilterBuilder for StopWordFilter {
|
||||
mod tests {
|
||||
use super::StopWordFilter;
|
||||
use crate::analyzer::filter::FilterBuilder;
|
||||
use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
|
||||
use crate::analyzer::tokenizers::standard_builder;
|
||||
use crate::log::init_log;
|
||||
use serde_json as json;
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[test]
|
||||
fn test_stop_words_filter_with_file() {
|
||||
@ -69,7 +75,8 @@ mod tests {
|
||||
);
|
||||
|
||||
let json_params = json::from_str::<json::Value>(¶ms).unwrap();
|
||||
let filter = StopWordFilter::from_json(json_params.as_object().unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let filter = StopWordFilter::from_json(json_params.as_object().unwrap(), &mut helper);
|
||||
assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
|
||||
|
||||
let builder = standard_builder().filter(filter.unwrap());
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
use crate::analyzer::options::get_resource_path;
|
||||
use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
@ -199,30 +199,16 @@ impl SynonymDict {
|
||||
}
|
||||
}
|
||||
|
||||
fn read_synonyms_file(builder: &mut SynonymDictBuilder, params: &json::Value) -> Result<()> {
|
||||
let path = get_resource_path(params, "synonyms dict file")?;
|
||||
let file = std::fs::File::open(path)?;
|
||||
let reader = std::io::BufReader::new(file);
|
||||
for line in reader.lines() {
|
||||
if let Ok(row_data) = line {
|
||||
builder.add_row(&row_data)?;
|
||||
} else {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"read synonyms dict file failed, error: {}",
|
||||
line.unwrap_err().to_string()
|
||||
)));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct SynonymFilter {
|
||||
dict: Arc<SynonymDict>,
|
||||
}
|
||||
|
||||
impl SynonymFilter {
|
||||
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<SynonymFilter> {
|
||||
pub fn from_json(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<SynonymFilter> {
|
||||
let expand = params.get("expand").map_or(Ok(true), |v| {
|
||||
v.as_bool().ok_or(TantivyBindingError::InvalidArgument(
|
||||
"create synonym filter failed, `expand` must be bool".to_string(),
|
||||
@ -246,7 +232,19 @@ impl SynonymFilter {
|
||||
}
|
||||
|
||||
if let Some(file_params) = params.get("synonyms_file") {
|
||||
read_synonyms_file(&mut builder, file_params)?;
|
||||
let path = get_resource_path(helper, file_params, "synonyms dict file")?;
|
||||
let file = std::fs::File::open(path)?;
|
||||
let reader = std::io::BufReader::new(file);
|
||||
for line in reader.lines() {
|
||||
if let Ok(row_data) = line {
|
||||
builder.add_row(&row_data)?;
|
||||
} else {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"read synonyms dict file failed, error: {}",
|
||||
line.unwrap_err().to_string()
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(SynonymFilter {
|
||||
@ -350,11 +348,14 @@ impl<T: TokenStream> TokenStream for SynonymFilterStream<T> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SynonymFilter;
|
||||
use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
|
||||
use crate::analyzer::tokenizers::standard_builder;
|
||||
use crate::log::init_log;
|
||||
|
||||
use serde_json as json;
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
#[test]
|
||||
fn test_synonym_filter() {
|
||||
@ -365,7 +366,8 @@ mod tests {
|
||||
"synonyms": ["trans => translate, \\=>", "\\\\test, test, tests"]
|
||||
}"#;
|
||||
let json_params = json::from_str::<json::Value>(¶ms).unwrap();
|
||||
let filter = SynonymFilter::from_json(json_params.as_object().unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let filter = SynonymFilter::from_json(json_params.as_object().unwrap(), &mut helper);
|
||||
assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
|
||||
let builder = standard_builder().filter(filter.unwrap());
|
||||
let mut analyzer = builder.build();
|
||||
@ -402,7 +404,8 @@ mod tests {
|
||||
}}"#
|
||||
);
|
||||
let json_params = json::from_str::<json::Value>(¶ms).unwrap();
|
||||
let filter = SynonymFilter::from_json(json_params.as_object().unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let filter = SynonymFilter::from_json(json_params.as_object().unwrap(), &mut helper);
|
||||
assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
|
||||
let builder = standard_builder().filter(filter.unwrap());
|
||||
let mut analyzer = builder.build();
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
use crate::analyzer::options::get_resource_path;
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
use std::io::BufRead;
|
||||
@ -26,11 +27,12 @@ pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>>
|
||||
}
|
||||
|
||||
pub(crate) fn read_line_file(
|
||||
helper: &mut FileResourcePathHelper,
|
||||
dict: &mut Vec<String>,
|
||||
params: &json::Value,
|
||||
key: &str,
|
||||
) -> Result<()> {
|
||||
let path = get_resource_path(params, key)?;
|
||||
let path = get_resource_path(helper, params, key)?;
|
||||
let file = std::fs::File::open(path)?;
|
||||
let reader = std::io::BufReader::new(file);
|
||||
for line in reader.lines() {
|
||||
|
||||
@ -2,10 +2,10 @@ mod analyzer;
|
||||
mod build_in_analyzer;
|
||||
mod dict;
|
||||
mod filter;
|
||||
mod options;
|
||||
|
||||
pub mod options;
|
||||
pub mod tokenizers;
|
||||
pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
|
||||
pub use self::analyzer::{create_analyzer, create_analyzer_by_json, validate_analyzer};
|
||||
pub use self::options::set_options;
|
||||
|
||||
pub(crate) use self::build_in_analyzer::standard_analyzer;
|
||||
|
||||
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/common.rs (new file, 8 lines)
@@ -0,0 +1,8 @@
+// cache key
+pub(crate) static LINDERA_DOWNLOAD_KEY: &str = "lindera_download_urls";
+pub(crate) static RESOURCE_MAP_KEY: &str = "resource_map";
+
+// normal key
+pub static DEFAULT_DICT_PATH_KEY: &str = "default_dict_path";
+pub static RESOURCE_PATH_KEY: &str = "resource_path";
+pub static RESOURCE_STORAGE_NAME_KEY: &str = "storage_name";
@ -1,8 +1,13 @@
|
||||
mod common;
|
||||
mod resource_info;
|
||||
mod runtime_option;
|
||||
mod util;
|
||||
|
||||
pub use self::runtime_option::{get_lindera_download_url, get_options, set_options};
|
||||
pub use self::resource_info::{FileResourcePathHelper, ResourceInfo};
|
||||
pub use self::runtime_option::{
|
||||
get_global_file_resource_helper, get_lindera_download_url, get_options, set_options,
|
||||
};
|
||||
|
||||
pub use self::util::get_resource_path;
|
||||
|
||||
pub use self::runtime_option::DEFAULT_DICT_PATH_KEY;
|
||||
pub use self::common::{DEFAULT_DICT_PATH_KEY, RESOURCE_PATH_KEY};
|
||||
|
||||
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/resource_info.rs (new file, 190 lines)
@ -0,0 +1,190 @@
|
||||
// resource options
|
||||
use super::common::*;
|
||||
use super::runtime_option::get_options;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use serde_json as json;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct ResourceInfo {
|
||||
storage_name: Option<String>,
|
||||
resource_map: HashMap<String, i64>,
|
||||
}
|
||||
|
||||
impl ResourceInfo {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
storage_name: None,
|
||||
resource_map: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn debug(&self) -> String {
|
||||
format!(
|
||||
"storage_name: {:?}, resource_map: {:?}",
|
||||
self.storage_name, self.resource_map
|
||||
)
|
||||
}
|
||||
|
||||
pub fn from_global_json(value: &json::Value) -> Result<Self> {
|
||||
let mut resource_map = HashMap::new();
|
||||
let kv = value
|
||||
.as_object()
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource map should be a json map, but got: {}",
|
||||
json::to_string(value).unwrap()
|
||||
)))?;
|
||||
for (key, value) in kv {
|
||||
let url = value
|
||||
.as_i64()
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource id should be integer, but got: {}",
|
||||
json::to_string(value).unwrap()
|
||||
)))?;
|
||||
resource_map.insert(key.to_string(), url);
|
||||
}
|
||||
Ok(Self {
|
||||
storage_name: None,
|
||||
resource_map,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn from_json(value: &json::Value) -> Result<Self> {
|
||||
let mut resource_map = HashMap::new();
|
||||
let m = value
|
||||
.as_object()
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"extra info should be a json map, but got: {}",
|
||||
json::to_string(value).unwrap()
|
||||
)))?;
|
||||
|
||||
if let Some(v) = m.get(RESOURCE_MAP_KEY) {
|
||||
let kv = v
|
||||
.as_object()
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource map should be a json map, but got: {}",
|
||||
json::to_string(v).unwrap()
|
||||
)))?;
|
||||
for (key, value) in kv {
|
||||
let url = value
|
||||
.as_i64()
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource id should be integer, but got: {}",
|
||||
json::to_string(value).unwrap()
|
||||
)))?;
|
||||
resource_map.insert(key.to_string(), url);
|
||||
}
|
||||
}
|
||||
|
||||
let mut storage_name = None;
|
||||
if let Some(v) = m.get(RESOURCE_STORAGE_NAME_KEY) {
|
||||
let name = v
|
||||
.as_str()
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"storage_name must set as string, but got: {}",
|
||||
json::to_string(v).unwrap()
|
||||
)))?
|
||||
.to_string();
|
||||
storage_name = Some(name)
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
storage_name,
|
||||
resource_map,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FileResourcePathBuilder for ResourceInfo {
|
||||
fn get_resource_file_path(
|
||||
&self,
|
||||
resource_name: &str,
|
||||
file_name: &str,
|
||||
) -> Result<(i64, PathBuf)> {
|
||||
let resource_id =
|
||||
self.resource_map
|
||||
.get(resource_name)
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource: {} not found in local resource list",
|
||||
resource_name
|
||||
)))?;
|
||||
|
||||
let base_value =
|
||||
get_options(RESOURCE_PATH_KEY).ok_or(TantivyBindingError::InternalError(
|
||||
"local_resource_path config not init success".to_string(),
|
||||
))?;
|
||||
|
||||
let base = base_value
|
||||
.as_str()
|
||||
.ok_or(TantivyBindingError::InternalError(
|
||||
"local_resource_path must set as string".to_string(),
|
||||
))?;
|
||||
|
||||
if let Some(storage_name) = &self.storage_name {
|
||||
return Ok((
|
||||
resource_id.clone(),
|
||||
PathBuf::new()
|
||||
.join(base)
|
||||
.join(storage_name)
|
||||
.join(resource_id.to_string())
|
||||
.join(file_name),
|
||||
));
|
||||
} else {
|
||||
return Ok((
|
||||
resource_id.clone(),
|
||||
PathBuf::new()
|
||||
.join(base)
|
||||
.join(resource_id.to_string())
|
||||
.join(file_name),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub trait FileResourcePathBuilder {
|
||||
fn get_resource_file_path(
|
||||
&self,
|
||||
resource_name: &str,
|
||||
file_name: &str,
|
||||
) -> Result<(i64, PathBuf)>;
|
||||
}
|
||||
|
||||
pub struct FileResourcePathHelper {
|
||||
builder: Arc<dyn FileResourcePathBuilder>,
|
||||
ids: HashSet<i64>,
|
||||
}
|
||||
|
||||
impl FileResourcePathHelper {
|
||||
pub fn new(builder: Arc<dyn FileResourcePathBuilder>) -> Self {
|
||||
Self {
|
||||
builder,
|
||||
ids: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_json(value: &json::Value) -> Result<Self> {
|
||||
let info = ResourceInfo::from_json(value)?;
|
||||
let builder: Arc<dyn FileResourcePathBuilder> = Arc::new(info);
|
||||
Ok(Self {
|
||||
builder,
|
||||
ids: HashSet::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_resource_file_path(
|
||||
&mut self,
|
||||
resource_name: &str,
|
||||
file_name: &str,
|
||||
) -> Result<PathBuf> {
|
||||
let (resource_id, path) = self
|
||||
.builder
|
||||
.get_resource_file_path(resource_name, file_name)?;
|
||||
self.ids.insert(resource_id);
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
pub fn get_resource_ids(self) -> Vec<i64> {
|
||||
self.ids.into_iter().collect()
|
||||
}
|
||||
}
|
||||
@ -1,3 +1,5 @@
|
||||
use super::common::*;
|
||||
use super::resource_info::{FileResourcePathBuilder, FileResourcePathHelper, ResourceInfo};
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde_json as json;
|
||||
@ -7,14 +9,6 @@ use std::sync::{Arc, RwLock};
|
||||
|
||||
static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));
|
||||
|
||||
// cache key
|
||||
static LINDERA_DOWNLOAD_KEY: &str = "lindera_download_urls";
|
||||
static RESOURCE_MAP_KEY: &str = "resource_map";
|
||||
|
||||
// normal key
|
||||
pub static DEFAULT_DICT_PATH_KEY: &str = "default_dict_path";
|
||||
pub static RESOURCE_PATH_KEY: &str = "resource_path";
|
||||
|
||||
pub fn set_options(params: &String) -> Result<()> {
|
||||
GLOBAL_OPTIONS.set_json(params)
|
||||
}
|
||||
@ -27,8 +21,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
|
||||
GLOBAL_OPTIONS.get_lindera_download_urls(kind)
|
||||
}
|
||||
|
||||
pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result<PathBuf> {
|
||||
GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name)
|
||||
pub fn get_global_file_resource_helper() -> FileResourcePathHelper {
|
||||
FileResourcePathHelper::new(GLOBAL_OPTIONS.clone())
|
||||
}
|
||||
|
||||
// analyzer options
|
||||
@ -57,35 +51,25 @@ impl RuntimeOption {
|
||||
let r = self.inner.read().unwrap();
|
||||
r.lindera_download_urls.get(kind).map(|v| v.clone())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result<PathBuf> {
|
||||
// file resource
|
||||
impl FileResourcePathBuilder for RuntimeOption {
|
||||
fn get_resource_file_path(
|
||||
&self,
|
||||
resource_name: &str,
|
||||
file_name: &str,
|
||||
) -> Result<(i64, PathBuf)> {
|
||||
let r = self.inner.read().unwrap();
|
||||
let resource_id =
|
||||
r.resource_map
|
||||
.get(resource_name)
|
||||
.ok_or(TantivyBindingError::InternalError(format!(
|
||||
"file resource: {} not found in local resource list",
|
||||
resource_name
|
||||
)))?;
|
||||
let base = r
|
||||
.params
|
||||
.get(RESOURCE_PATH_KEY)
|
||||
.ok_or(TantivyBindingError::InternalError(
|
||||
"local_resource_path config not init success".to_string(),
|
||||
))?
|
||||
.as_str()
|
||||
.ok_or("local_resource_path must set as string")?;
|
||||
|
||||
return Ok(PathBuf::new()
|
||||
.join(base)
|
||||
.join(resource_id.to_string())
|
||||
.join(file_name));
|
||||
return r
|
||||
.resource_info
|
||||
.get_resource_file_path(resource_name, file_name);
|
||||
}
|
||||
}
|
||||
|
||||
struct RuntimeOptionInner {
|
||||
params: HashMap<String, json::Value>,
|
||||
resource_map: HashMap<String, i64>, // resource name -> resource id
|
||||
resource_info: ResourceInfo, // resource name -> resource id
|
||||
lindera_download_urls: HashMap<String, Vec<String>>, // dict name -> url
|
||||
}
|
||||
|
||||
@ -93,7 +77,7 @@ impl RuntimeOptionInner {
|
||||
fn new() -> Self {
|
||||
RuntimeOptionInner {
|
||||
params: HashMap::new(),
|
||||
resource_map: HashMap::new(),
|
||||
resource_info: ResourceInfo::new(),
|
||||
lindera_download_urls: HashMap::new(),
|
||||
}
|
||||
}
|
||||
@ -124,7 +108,7 @@ impl RuntimeOptionInner {
|
||||
|
||||
for (key, value) in m {
|
||||
let array = value.as_array().ok_or(TantivyBindingError::InternalError(
|
||||
"lindera download urls shoud be list".to_string(),
|
||||
"lindera download urls should be list".to_string(),
|
||||
))?;
|
||||
|
||||
if !array.iter().all(|v| v.is_string()) {
|
||||
@ -143,18 +127,7 @@ impl RuntimeOptionInner {
|
||||
}
|
||||
|
||||
if key == RESOURCE_MAP_KEY {
|
||||
self.resource_map = HashMap::new();
|
||||
|
||||
let m = value.as_object().ok_or(TantivyBindingError::InternalError(
|
||||
"lindera download urls should be a json map".to_string(),
|
||||
))?;
|
||||
|
||||
for (key, value) in m {
|
||||
let url = value.as_i64().ok_or(TantivyBindingError::InternalError(
|
||||
"lindera download url shoud be string".to_string(),
|
||||
))?;
|
||||
self.resource_map.insert(key.to_string(), url);
|
||||
}
|
||||
self.resource_info = ResourceInfo::from_global_json(&value)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +1,14 @@
|
||||
use serde_json as json;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use super::runtime_option::get_resource_file_path;
|
||||
use super::resource_info::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
|
||||
pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf> {
|
||||
pub fn get_resource_path(
|
||||
helper: &mut FileResourcePathHelper,
|
||||
v: &json::Value,
|
||||
resource_key: &str,
|
||||
) -> Result<PathBuf> {
|
||||
if !v.is_object() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"file config of {} must be object",
|
||||
@ -73,7 +77,7 @@ pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf>
|
||||
resource_key
|
||||
)))?;
|
||||
|
||||
self::get_resource_file_path(resource_name, file_name)
|
||||
helper.get_resource_file_path(resource_name, file_name)
|
||||
}
|
||||
other => Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"unsupported file type {} of {}",
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
use core::{option::Option::Some, result::Result::Ok};
|
||||
use jieba_rs;
|
||||
use lazy_static::lazy_static;
|
||||
use log::warn;
|
||||
use serde_json as json;
|
||||
use std::fs;
|
||||
use std::io::BufReader;
|
||||
use std::{borrow::Cow, path::PathBuf};
|
||||
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
|
||||
|
||||
use crate::analyzer::options;
|
||||
use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
|
||||
lazy_static! {
|
||||
@ -56,6 +57,7 @@ impl TokenStream for JiebaTokenStream {
|
||||
|
||||
fn get_jieba_dict(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<(Vec<String>, Option<String>, Option<PathBuf>)> {
|
||||
let mut words = Vec::<String>::new();
|
||||
let mut user_dict = None;
|
||||
@ -101,7 +103,7 @@ fn get_jieba_dict(
|
||||
|
||||
match params.get("extra_dict_file") {
|
||||
Some(v) => {
|
||||
let path = options::get_resource_path(v, "jieba extra dict file")?;
|
||||
let path = get_resource_path(helper, v, "jieba extra dict file")?;
|
||||
user_dict = Some(path)
|
||||
}
|
||||
_ => {}
|
||||
@ -156,8 +158,11 @@ impl<'a> JiebaTokenizer<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
|
||||
let (words, system_dict, user_dict) = get_jieba_dict(params)?;
|
||||
pub fn from_json(
|
||||
params: &json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<JiebaTokenizer<'a>> {
|
||||
let (words, system_dict, user_dict) = get_jieba_dict(params, helper)?;
|
||||
|
||||
let mut tokenizer =
|
||||
system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {
|
||||
@ -242,8 +247,11 @@ impl Tokenizer for JiebaTokenizer<'static> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json as json;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::JiebaTokenizer;
|
||||
use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
|
||||
|
||||
use tantivy::tokenizer::TokenStream;
|
||||
use tantivy::tokenizer::Tokenizer;
|
||||
|
||||
@ -255,7 +263,8 @@ mod tests {
|
||||
let json_param = json::from_str::<json::Map<String, json::Value>>(¶ms);
|
||||
assert!(json_param.is_ok());
|
||||
|
||||
let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap(), &mut helper);
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
let mut bining = tokenizer.unwrap();
|
||||
let mut stream = bining.token_stream("结巴分词器");
|
||||
@ -280,7 +289,8 @@ mod tests {
|
||||
let json_param = json::from_str::<json::Map<String, json::Value>>(¶ms);
|
||||
assert!(json_param.is_ok());
|
||||
|
||||
let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap(), &mut helper);
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
let mut bining = tokenizer.unwrap();
|
||||
let mut stream = bining.token_stream("milvus结巴分词器中文测试");
|
||||
@ -303,7 +313,8 @@ mod tests {
|
||||
let json_param = json::from_str::<json::Map<String, json::Value>>(¶ms);
|
||||
assert!(json_param.is_ok());
|
||||
|
||||
let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
|
||||
let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
|
||||
let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap(), &mut helper);
|
||||
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
|
||||
let mut bining = tokenizer.unwrap();
|
||||
let mut stream = bining.token_stream("milvus結巴分詞器中文測試");
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use crate::error::{Result, TantivyBindingError};
|
||||
use lingua::{LanguageDetector, LanguageDetectorBuilder};
|
||||
use serde_json as json;
|
||||
@ -164,7 +165,11 @@ impl<'a> LangIdentTokenizer<'a> {
|
||||
|
||||
pub fn from_json<'b>(
|
||||
params: &'b json::Map<String, json::Value>,
|
||||
fc: fn(&json::Map<String, json::Value>) -> Result<TextAnalyzer>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
fc: fn(
|
||||
&json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<TextAnalyzer>,
|
||||
) -> Result<LangIdentTokenizer<'a>> {
|
||||
// init identfier for tokenizer
|
||||
let identifier = params
|
||||
@ -188,12 +193,15 @@ impl<'a> LangIdentTokenizer<'a> {
|
||||
for (name, params) in sub_analyzers {
|
||||
analyzer.add(
|
||||
name,
|
||||
fc(params.as_object().ok_or_else(|| {
|
||||
TantivyBindingError::InvalidArgument(format!(
|
||||
"sub analyzer \"{}\" params must be dict",
|
||||
name
|
||||
))
|
||||
})?)?,
|
||||
fc(
|
||||
params.as_object().ok_or_else(|| {
|
||||
TantivyBindingError::InvalidArgument(format!(
|
||||
"sub analyzer \"{}\" params must be dict",
|
||||
name
|
||||
))
|
||||
})?,
|
||||
helper,
|
||||
)?,
|
||||
);
|
||||
}
|
||||
|
||||
@ -257,9 +265,11 @@ impl Tokenizer for LangIdentTokenizer<'static> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_json as json;
|
||||
use std::sync::Arc;
|
||||
use tantivy::tokenizer::Tokenizer;
|
||||
|
||||
use super::LangIdentTokenizer;
|
||||
use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
|
||||
use crate::analyzer::tokenizers::lang_ident_tokenizer::BoxIdentifier;
|
||||
use crate::analyzer::{create_analyzer, create_analyzer_by_json};
|
||||
use crate::error::Result;
|
||||
@ -276,8 +286,8 @@ mod tests {
|
||||
|
||||
let mut analyzer = LangIdentTokenizer::new(BoxIdentifier::default());
|
||||
let result = || -> Result<()> {
|
||||
analyzer.add("default", create_analyzer(standard_params)?);
|
||||
analyzer.add("cmn", create_analyzer(jieba_params)?);
|
||||
analyzer.add("default", create_analyzer(standard_params, "")?);
|
||||
analyzer.add("cmn", create_analyzer(jieba_params, "")?);
|
||||
Ok(())
|
||||
}();
|
||||
|
||||
@ -304,6 +314,7 @@ mod tests {
|
||||
let builder: std::result::Result<LangIdentTokenizer, crate::error::TantivyBindingError> =
|
||||
LangIdentTokenizer::from_json(
|
||||
json_params.as_object().unwrap(),
|
||||
&mut FileResourcePathHelper::new(Arc::new(ResourceInfo::new())),
|
||||
create_analyzer_by_json,
|
||||
);
|
||||
assert!(builder.is_ok(), "error: {}", builder.err().unwrap());
|
||||
@ -337,6 +348,7 @@ mod tests {
|
||||
let builder: std::result::Result<LangIdentTokenizer, crate::error::TantivyBindingError> =
|
||||
LangIdentTokenizer::from_json(
|
||||
json_params.as_object().unwrap(),
|
||||
&mut FileResourcePathHelper::new(Arc::new(ResourceInfo::new())),
|
||||
create_analyzer_by_json,
|
||||
);
|
||||
assert!(builder.is_ok(), "error: {}", builder.err().unwrap());
|
||||
@ -372,6 +384,7 @@ mod tests {
|
||||
let builder: std::result::Result<LangIdentTokenizer, crate::error::TantivyBindingError> =
|
||||
LangIdentTokenizer::from_json(
|
||||
json_params.as_object().unwrap(),
|
||||
&mut FileResourcePathHelper::new(Arc::new(ResourceInfo::new())),
|
||||
create_analyzer_by_json,
|
||||
);
|
||||
assert!(builder.is_ok(), "error: {}", builder.err().unwrap());
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
use crate::analyzer::options::FileResourcePathHelper;
|
||||
use log::warn;
|
||||
use serde_json as json;
|
||||
use tantivy::tokenizer::*;
|
||||
@ -24,24 +25,29 @@ pub fn icu_builder() -> TextAnalyzerBuilder {
|
||||
|
||||
pub fn lang_ident_builder(
|
||||
params: Option<&json::Map<String, json::Value>>,
|
||||
fc: fn(&json::Map<String, json::Value>) -> Result<TextAnalyzer>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
fc: fn(
|
||||
&json::Map<String, json::Value>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<TextAnalyzer>,
|
||||
) -> Result<TextAnalyzerBuilder> {
|
||||
if params.is_none() {
|
||||
return Err(TantivyBindingError::InvalidArgument(format!(
|
||||
"lang ident tokenizer must be customized"
|
||||
)));
|
||||
}
|
||||
let tokenizer = LangIdentTokenizer::from_json(params.unwrap(), fc)?;
|
||||
let tokenizer = LangIdentTokenizer::from_json(params.unwrap(), helper, fc)?;
|
||||
Ok(TextAnalyzer::builder(tokenizer).dynamic())
|
||||
}
|
||||
|
||||
pub fn jieba_builder(
|
||||
params: Option<&json::Map<String, json::Value>>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
) -> Result<TextAnalyzerBuilder> {
|
||||
if params.is_none() {
|
||||
return Ok(TextAnalyzer::builder(JiebaTokenizer::new()).dynamic());
|
||||
}
|
||||
let tokenizer = JiebaTokenizer::from_json(params.unwrap())?;
|
||||
let tokenizer = JiebaTokenizer::from_json(params.unwrap(), helper)?;
|
||||
Ok(TextAnalyzer::builder(tokenizer).dynamic())
|
||||
}
|
||||
|
||||
@ -83,7 +89,8 @@ pub fn char_group_builder(
|
||||
|
||||
pub fn get_builder_with_tokenizer(
|
||||
params: &json::Value,
|
||||
fc: fn(&json::Map<String, json::Value>) -> Result<TextAnalyzer>,
|
||||
helper: &mut FileResourcePathHelper,
|
||||
fc: fn(&json::Map<String, json::Value>, &mut FileResourcePathHelper) -> Result<TextAnalyzer>,
|
||||
) -> Result<TextAnalyzerBuilder> {
|
||||
let name;
|
||||
let params_map;
|
||||
@ -113,11 +120,11 @@ pub fn get_builder_with_tokenizer(
|
||||
match name {
|
||||
"standard" => Ok(standard_builder()),
|
||||
"whitespace" => Ok(whitespace_builder()),
|
||||
"jieba" => jieba_builder(params_map),
|
||||
"jieba" => jieba_builder(params_map, helper),
|
||||
"lindera" => lindera_builder(params_map),
|
||||
"char_group" => char_group_builder(params_map),
|
||||
"icu" => Ok(icu_builder()),
|
||||
"language_identifier" => lang_ident_builder(params_map, fc),
|
||||
"language_identifier" => lang_ident_builder(params_map, helper, fc),
|
||||
"grpc" => grpc_builder(params_map),
|
||||
other => {
|
||||
warn!("unsupported tokenizer: {}", other);
|
||||
|
||||
@ -147,6 +147,14 @@ impl RustResult {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_vec_i64(value: Vec<i64>) -> Self {
|
||||
RustResult {
|
||||
success: true,
|
||||
value: Value::RustArrayI64(RustArrayI64::from_vec(value)),
|
||||
error: std::ptr::null(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_error(error: String) -> Self {
|
||||
RustResult {
|
||||
success: false,
|
||||
@ -184,6 +192,11 @@ pub extern "C" fn free_rust_result(result: RustResult) {
|
||||
free_rust_array(array);
|
||||
}
|
||||
}
|
||||
Value::RustArrayI64(array) => {
|
||||
if !array.array.is_null() {
|
||||
free_rust_array_i64(array);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
if !result.error.is_null() {
|
||||
|
||||
@ -49,7 +49,7 @@ pub extern "C" fn tantivy_register_tokenizer(
|
||||
let real = ptr as *mut IndexReaderWrapper;
|
||||
let tokenizer_name = cstr_to_str!(tokenizer_name);
|
||||
let params = cstr_to_str!(analyzer_params);
|
||||
let analyzer = create_analyzer(params);
|
||||
let analyzer = create_analyzer(params, "");
|
||||
match analyzer {
|
||||
Ok(text_analyzer) => unsafe {
|
||||
(*real).register_tokenizer(String::from(tokenizer_name), text_analyzer);
|
||||
|
||||
@ -157,7 +157,7 @@ impl AnalyzerBuilder<'_> {
|
||||
Some(type_) => {
|
||||
if !type_.is_string() {
|
||||
return Err(TantivyBindingError::InternalError(format!(
|
||||
"analyzer type shoud be string"
|
||||
"analyzer type should be string"
|
||||
)));
|
||||
}
|
||||
return self.build_template(type_.as_str().unwrap());
|
||||
|
||||
@ -37,7 +37,7 @@ impl IndexWriterWrapperImpl {
|
||||
field_name
|
||||
);
|
||||
|
||||
let tokenizer = create_analyzer(tokenizer_params)?;
|
||||
let tokenizer = create_analyzer(tokenizer_params, "")?;
|
||||
|
||||
let (schema, field) = build_text_schema(field_name, tokenizer_name);
|
||||
let index = if in_ram {
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
use crate::analyzer::create_analyzer_by_json;
|
||||
use crate::analyzer::options::get_global_file_resource_helper;
|
||||
use serde_json::{self, Value};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
@ -38,7 +39,9 @@ pub fn compute_phrase_match_slop(
|
||||
.ok_or("Tokenizer params must be a JSON object")?;
|
||||
|
||||
// 2. Create Analyzer
|
||||
let mut analyzer = create_analyzer_by_json(params_obj)
|
||||
// TODO: support build helper from extra_info
|
||||
let mut helper = get_global_file_resource_helper();
|
||||
let mut analyzer = create_analyzer_by_json(params_obj, &mut helper)
|
||||
.map_err(|e| format!("Failed to create analyzer: {:?}", e))?;
|
||||
|
||||
// 3. Tokenize Query
|
||||
|
||||
@@ -1,7 +1,7 @@
 use libc::{c_char, c_void};
 use tantivy::tokenizer::TextAnalyzer;
 
-use crate::analyzer::{create_analyzer, set_options};
+use crate::analyzer::{create_analyzer, set_options, validate_analyzer};
 use crate::{
     array::RustResult,
     log::init_log,
@@ -10,10 +10,14 @@ use crate::{
 };
 
 #[no_mangle]
-pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> RustResult {
+pub extern "C" fn tantivy_create_analyzer(
+    analyzer_params: *const c_char,
+    extra_info: *const c_char,
+) -> RustResult {
     init_log();
     let params = unsafe { c_str_to_str(analyzer_params).to_string() };
-    let analyzer = create_analyzer(&params);
+    let extra_info_str = unsafe { c_str_to_str(extra_info).to_string() };
+    let analyzer = create_analyzer(&params, &extra_info_str);
     match analyzer {
         Ok(text_analyzer) => RustResult::from_ptr(create_binding(text_analyzer)),
         Err(err) => RustResult::from_error(format!(
@@ -23,6 +27,24 @@ pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> RustResult {
     }
 }
 
+#[no_mangle]
+pub extern "C" fn tantivy_validate_analyzer(
+    analyzer_params: *const c_char,
+    extra_info: *const c_char,
+) -> RustResult {
+    init_log();
+    let params = unsafe { c_str_to_str(analyzer_params).to_string() };
+    let extra_info_str = unsafe { c_str_to_str(extra_info).to_string() };
+    let result = validate_analyzer(&params, &extra_info_str);
+    match result {
+        Ok(ids) => RustResult::from_vec_i64(ids),
+        Err(err) => RustResult::from_error(format!(
+            "validate tokenizer failed with error: {} param: {}",
+            err, params,
+        )),
+    }
+}
+
 #[no_mangle]
 pub extern "C" fn tantivy_clone_analyzer(ptr: *mut c_void) -> *mut c_void {
     let analyzer = ptr as *mut TextAnalyzer;
internal/core/thirdparty/tantivy/tokenizer.h (vendored)
@@ -14,9 +14,20 @@ struct Tokenizer {
     NO_COPY_OR_ASSIGN(Tokenizer);
 
     explicit Tokenizer(std::string&& params) {
-        auto shared_params = std::make_shared<std::string>(std::move(params));
-        auto res =
-            RustResultWrapper(tantivy_create_analyzer(shared_params->c_str()));
+        auto shared_params = std::make_shared<std::string>(params);
+        auto res = RustResultWrapper(
+            tantivy_create_analyzer(shared_params->c_str(), ""));
+        AssertInfo(res.result_->success,
+                   "Tokenizer creation failed: {}",
+                   res.result_->error);
+        ptr_ = res.result_->value.ptr._0;
+    }
+
+    explicit Tokenizer(std::string&& params, std::string&& extra_info) {
+        auto shared_params = std::make_shared<std::string>(params);
+        auto shared_extra_info = std::make_shared<std::string>(extra_info);
+        auto res = RustResultWrapper(tantivy_create_analyzer(
+            shared_params->c_str(), shared_extra_info->c_str()));
         AssertInfo(res.result_->success,
                    "Tokenizer creation failed: {}",
                    res.result_->error);
@@ -69,4 +80,31 @@ set_tokenizer_options(std::string&& params) {
                res.result_->error);
 }
 
+inline std::pair<int64_t*, size_t>
+validate_analyzer(std::string&& params, std::string&& extra_info) {
+    auto shared_params = std::make_shared<std::string>(params);
+    auto shared_extra_info = std::make_shared<std::string>(extra_info);
+    auto res = RustResultWrapper(tantivy_validate_analyzer(
+        shared_params->c_str(), shared_extra_info->c_str()));
+    AssertInfo(res.result_->success,
+               "Validate analyzer params failed: {}",
+               res.result_->error);
+    auto array_wrapper =
+        RustArrayI64Wrapper(std::move(res.result_->value.rust_array_i64._0));
+    auto* array = array_wrapper.array_.array;
+    auto len = array_wrapper.array_.len;
+
+    int64_t* result = nullptr;
+    if (len > 0) {
+        result = static_cast<int64_t*>(malloc(len * sizeof(int64_t)));
+        if (result == nullptr) {
+            throw std::bad_alloc();
+        }
+        std::memcpy(result,
+                    array,
+                    len * sizeof(int64_t));  // Copy the array to the result
+    }
+    return {result, len};
+}
+
 }  // namespace milvus::tantivy
@@ -17,6 +17,8 @@
 package model
 
 import (
+	"slices"
+
 	"github.com/samber/lo"
 
 	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
@@ -52,6 +54,7 @@ type Collection struct {
 	UpdateTimestamp uint64
 	SchemaVersion   int32
 	ShardInfos      map[string]*ShardInfo
+	FileResourceIds []int64
 }
 
 type ShardInfo struct {
@@ -90,6 +93,7 @@ func (c *Collection) ShallowClone() *Collection {
 		UpdateTimestamp: c.UpdateTimestamp,
 		SchemaVersion:   c.SchemaVersion,
 		ShardInfos:      c.ShardInfos,
+		FileResourceIds: c.FileResourceIds,
 	}
 }
 
@@ -127,6 +131,7 @@ func (c *Collection) Clone() *Collection {
 		UpdateTimestamp: c.UpdateTimestamp,
 		SchemaVersion:   c.SchemaVersion,
 		ShardInfos:      shardInfos,
+		FileResourceIds: slices.Clone(c.FileResourceIds),
 	}
 }
 
@@ -232,6 +237,7 @@ func UnmarshalCollectionModel(coll *pb.CollectionInfo) *Collection {
 		UpdateTimestamp: coll.UpdateTimestamp,
 		SchemaVersion:   coll.Schema.Version,
 		ShardInfos:      shardInfos,
+		FileResourceIds: coll.Schema.GetFileResourceIds(),
 	}
 }
 
@@ -283,6 +289,7 @@ func marshalCollectionModelWithConfig(coll *Collection, c *config) *pb.Collectio
 		EnableDynamicField: coll.EnableDynamicField,
 		DbName:             coll.DBName,
 		Version:            coll.SchemaVersion,
+		FileResourceIds:    coll.FileResourceIds,
 	}
 
 	if c.withFields {
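A quick note on the clone semantics above: `ShallowClone` keeps sharing the `FileResourceIds` backing array, while `Clone` copies it with `slices.Clone`, so mutating the original is only visible through the shallow copy. A minimal self-contained sketch (the `collection` struct below is a stand-in for `model.Collection`, not the real type):

package main

import (
	"fmt"
	"slices"
)

// collection is an illustrative stand-in for model.Collection,
// keeping only the new field added in this change.
type collection struct {
	FileResourceIds []int64
}

func (c *collection) shallowClone() *collection {
	// shares the same backing array, like ShallowClone above
	return &collection{FileResourceIds: c.FileResourceIds}
}

func (c *collection) deepClone() *collection {
	// independent copy, like Clone above
	return &collection{FileResourceIds: slices.Clone(c.FileResourceIds)}
}

func main() {
	orig := &collection{FileResourceIds: []int64{100, 101}}
	shallow := orig.shallowClone()
	deep := orig.deepClone()

	orig.FileResourceIds[0] = 999
	fmt.Println(shallow.FileResourceIds[0]) // 999: shared backing array
	fmt.Println(deep.FileResourceIds[0])    // 100: unaffected by the mutation
}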
@@ -1667,17 +1667,20 @@ func (node *QueryNode) ValidateAnalyzer(ctx context.Context, req *querypb.Valida
 	}
 	defer node.lifetime.Done()
 
+	resourceSet := typeutil.NewSet[int64]()
+
 	for _, info := range req.AnalyzerInfos {
-		err := analyzer.ValidateAnalyzer(info.GetParams())
+		ids, err := analyzer.ValidateAnalyzer(info.GetParams())
 		if err != nil {
 			if info.GetName() != "" {
 				return &querypb.ValidateAnalyzerResponse{Status: merr.Status(merr.WrapErrParameterInvalidMsg("validate analyzer failed for field: %s, name: %s, error: %v", info.GetField(), info.GetName(), err))}, nil
 			}
 			return &querypb.ValidateAnalyzerResponse{Status: merr.Status(merr.WrapErrParameterInvalidMsg("validate analyzer failed for field: %s, error: %v", info.GetField(), err))}, nil
 		}
+		resourceSet.Insert(ids...)
 	}
 
-	return &querypb.ValidateAnalyzerResponse{Status: merr.Status(nil)}, nil
+	return &querypb.ValidateAnalyzerResponse{Status: merr.Status(nil), ResourceIds: resourceSet.Collect()}, nil
 }
 
 type deleteRequestStringer struct {
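The handler above folds the per-analyzer ID lists into one de-duplicated list via typeutil's generic set, since several fields may reference the same uploaded file resource. A minimal sketch of that aggregation using a plain map instead of the Milvus typeutil package (the helper name below is illustrative):

package main

import "fmt"

// dedupResourceIDs collects the resource IDs returned for every analyzer
// and drops duplicates, mirroring the set-based aggregation above.
func dedupResourceIDs(perAnalyzer [][]int64) []int64 {
	seen := make(map[int64]struct{})
	out := make([]int64, 0)
	for _, ids := range perAnalyzer {
		for _, id := range ids {
			if _, ok := seen[id]; !ok {
				seen[id] = struct{}{}
				out = append(out, id)
			}
		}
	}
	return out
}

func main() {
	fmt.Println(dedupResourceIDs([][]int64{{100, 101}, {101}, {}})) // [100 101]
}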
@@ -200,6 +200,7 @@ func (t *createCollectionTask) validateSchema(ctx context.Context, schema *schem
 	}
 
 	// validate analyzer params at any streaming node
+	// and set file resource ids to schema
 	if len(analyzerInfos) > 0 {
 		resp, err := t.mixCoord.ValidateAnalyzer(t.ctx, &querypb.ValidateAnalyzerRequest{
 			AnalyzerInfos: analyzerInfos,
@@ -211,6 +212,7 @@
 		if err := merr.Error(resp.GetStatus()); err != nil {
 			return err
 		}
+		schema.FileResourceIds = resp.GetResourceIds()
 	}
 
 	return validateFieldDataType(schema.GetFields())
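For readers tracing the flow, the hunks above boil down to: validate all analyzer params through mixcoord once, and on success stash the returned resource IDs on the schema before it is persisted. A minimal sketch under stand-in types (none of the names below are the Milvus ones):

package main

import (
	"errors"
	"fmt"
)

// schema is a stand-in for schemapb.CollectionSchema with just the new field.
type schema struct {
	FileResourceIds []int64
}

// validateAnalyzers is a stand-in for the ValidateAnalyzer RPC: it either
// rejects the params or returns the file resource IDs they use.
type validateAnalyzers func(params []string) ([]int64, error)

// attachResourceIds mirrors the validateSchema flow above: run validation once
// for all analyzer params and, on success, record the returned IDs on the schema.
func attachResourceIds(s *schema, params []string, validate validateAnalyzers) error {
	if len(params) == 0 {
		return nil
	}
	ids, err := validate(params)
	if err != nil {
		return err
	}
	s.FileResourceIds = ids
	return nil
}

func main() {
	ok := func(params []string) ([]int64, error) { return []int64{100}, nil }
	bad := func(params []string) ([]int64, error) { return nil, errors.New("invalid analyzer") }

	s := &schema{}
	fmt.Println(attachResourceIds(s, []string{`{"tokenizer":"standard"}`}, ok), s.FileResourceIds) // <nil> [100]
	fmt.Println(attachResourceIds(&schema{}, []string{`{}`}, bad))                                 // invalid analyzer
}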
@@ -204,6 +204,7 @@ func newCollectionModel(header *message.CreateCollectionMessageHeader, body *mes
 		UpdateTimestamp: ts,
 		SchemaVersion:   0,
 		ShardInfos:      shardInfos,
+		FileResourceIds: body.CollectionSchema.GetFileResourceIds(),
 	}
 }
@@ -1080,6 +1080,7 @@ func convertModelToDesc(collInfo *model.Collection, aliases []string, dbName str
 		Functions:          model.MarshalFunctionModels(collInfo.Functions),
 		EnableDynamicField: collInfo.EnableDynamicField,
 		Properties:         collInfo.Properties,
+		FileResourceIds:    collInfo.FileResourceIds,
 	}
 	resp.CollectionID = collInfo.CollectionID
 	resp.VirtualChannelNames = collInfo.VirtualChannelNames
@@ -11,11 +11,15 @@ type (
 )
 
 func NewAnalyzer(param string) (Analyzer, error) {
-	return canalyzer.NewAnalyzer(param)
+	return canalyzer.NewAnalyzer(param, "")
 }
 
-func ValidateAnalyzer(param string) error {
-	return canalyzer.ValidateAnalyzer(param)
+func ValidateAnalyzer(param string) ([]int64, error) {
+	return canalyzer.ValidateAnalyzer(param, "")
+}
+
+func UpdateGlobalResourceInfo(resourceMap map[string]int64) error {
+	return canalyzer.UpdateGlobalResourceInfo(resourceMap)
 }
 
 func InitOptions() {
@@ -13,9 +13,11 @@ import (
 	"sync"
 	"unsafe"
 
+	"github.com/cockroachdb/errors"
 	"go.uber.org/zap"
 
 	"github.com/milvus-io/milvus/internal/util/analyzer/interfaces"
+	"github.com/milvus-io/milvus/internal/util/pathutil"
 	"github.com/milvus-io/milvus/pkg/v2/log"
 	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )
@@ -23,8 +25,8 @@ import (
 const (
 	LinderaDictURLKey = "lindera_download_urls"
-	DictPathKey       = "local_dict_path"
+	ResourceMapKey    = "resource_map"
+	ResourcePathKey   = "resource_path"
 	StorageNameKey    = "storage_name"
 )
 
 var initOnce sync.Once
@@ -39,7 +41,7 @@ func UpdateParams() {
 	cfg := paramtable.Get()
 	params := map[string]any{}
 	params[LinderaDictURLKey] = cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
-	params[DictPathKey] = cfg.FunctionCfg.LocalResourcePath.GetValue()
+	params[ResourcePathKey] = pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
 
 	bytes, err := json.Marshal(params)
 	if err != nil {
@@ -55,12 +57,31 @@ func UpdateParams() {
 	}
 }
 
-func NewAnalyzer(param string) (interfaces.Analyzer, error) {
+func UpdateGlobalResourceInfo(resourceMap map[string]int64) error {
+	bytes, err := json.Marshal(map[string]any{"resource_map": resourceMap})
+	if err != nil {
+		return errors.Wrap(err, "marshal global resource info failed")
+	}
+
+	paramPtr := C.CString(string(bytes))
+	defer C.free(unsafe.Pointer(paramPtr))
+
+	status := C.set_tokenizer_option(paramPtr)
+	if err := HandleCStatus(&status, "failed to update global resource info"); err != nil {
+		return errors.Wrap(err, "update global resource info failed")
+	}
+	return nil
+}
+
+func NewAnalyzer(param string, extraInfo string) (interfaces.Analyzer, error) {
 	paramPtr := C.CString(param)
 	defer C.free(unsafe.Pointer(paramPtr))
 
+	extraInfoPtr := C.CString(extraInfo)
+	defer C.free(unsafe.Pointer(extraInfoPtr))
+
 	var ptr C.CTokenizer
-	status := C.create_tokenizer(paramPtr, &ptr)
+	status := C.create_tokenizer(paramPtr, extraInfoPtr, &ptr)
 	if err := HandleCStatus(&status, "failed to create analyzer"); err != nil {
 		return nil, err
 	}
@@ -68,13 +89,21 @@ func NewAnalyzer(param string) (interfaces.Analyzer, error) {
 	return NewCAnalyzer(ptr), nil
 }
 
-func ValidateAnalyzer(param string) error {
+func ValidateAnalyzer(param string, extraInfo string) ([]int64, error) {
 	paramPtr := C.CString(param)
 	defer C.free(unsafe.Pointer(paramPtr))
 
-	status := C.validate_tokenizer(paramPtr)
-	if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
-		return err
+	extraInfoPtr := C.CString(extraInfo)
+	defer C.free(unsafe.Pointer(extraInfoPtr))
+
+	result := C.validate_tokenizer(paramPtr, extraInfoPtr)
+	if err := HandleCStatus(&result.status, "failed to validate tokenizer"); err != nil {
+		return nil, err
 	}
-	return nil
+
+	cIds := unsafe.Slice((*int64)(unsafe.Pointer(result.resource_ids)), result.resource_ids_count)
+	goIds := make([]int64, len(cIds))
+	copy(goIds, cIds)
+	C.free(unsafe.Pointer(result.resource_ids))
+	return goIds, nil
 }
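The ValidateAnalyzer wrapper above follows the usual cgo pattern for returning an array across the boundary: view the C-owned buffer with unsafe.Slice, copy it into Go-managed memory, then free the C buffer so the returned slice never dangles. A minimal sketch of that copy-before-free pattern with the C allocation simulated by a local array (no cgo involved):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	// Stand-in for a C-allocated buffer: in the real binding the pointer and
	// count come back from validate_tokenizer, and the buffer is released with
	// C.free right after copying.
	cBuf := [3]int64{100, 101, 102}
	ptr := unsafe.Pointer(&cBuf[0])
	count := 3

	// View the foreign memory without copying...
	view := unsafe.Slice((*int64)(ptr), count)

	// ...then copy into Go-managed memory so the original buffer can be freed
	// without invalidating the slice handed back to callers.
	goIds := make([]int64, len(view))
	copy(goIds, view)

	fmt.Println(goIds) // [100 101 102]
}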
@@ -4,6 +4,8 @@ import (
 	"context"
 	"fmt"
 	"net"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"
 
@@ -12,6 +14,8 @@ import (
 	"google.golang.org/grpc"
 
 	pb "github.com/milvus-io/milvus-proto/go-api/v2/tokenizerpb"
+	"github.com/milvus-io/milvus/internal/util/pathutil"
+	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )
 
 type mockServer struct {
@@ -32,7 +36,7 @@ func TestAnalyzer(t *testing.T) {
 	// use default analyzer.
 	{
 		m := "{}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -48,7 +52,7 @@ func TestAnalyzer(t *testing.T) {
 
 	{
 		m := ""
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -65,7 +69,7 @@ func TestAnalyzer(t *testing.T) {
 	// use default tokenizer.
 	{
 		m := "{\"tokenizer\": \"standard\"}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -82,7 +86,7 @@ func TestAnalyzer(t *testing.T) {
 	// jieba tokenizer.
 	{
 		m := "{\"tokenizer\": \"jieba\"}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -124,7 +128,7 @@ func TestAnalyzer(t *testing.T) {
 		defer stop()
 
 		m := "{\"tokenizer\": {\"type\":\"grpc\", \"endpoint\":\"http://" + addr + "\"}}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -138,7 +142,7 @@ func TestAnalyzer(t *testing.T) {
 	// lindera tokenizer.
 	{
 		m := "{\"tokenizer\": {\"type\":\"lindera\", \"dict_kind\": \"ipadic\"}}"
-		tokenizer, err := NewAnalyzer(m)
+		tokenizer, err := NewAnalyzer(m, "")
 		require.NoError(t, err)
 		defer tokenizer.Destroy()
 
@@ -156,20 +160,78 @@ func TestValidateAnalyzer(t *testing.T) {
 	// valid analyzer
 	{
 		m := "{\"tokenizer\": \"standard\"}"
-		err := ValidateAnalyzer(m)
+		ids, err := ValidateAnalyzer(m, "")
 		assert.NoError(t, err)
+		assert.Equal(t, len(ids), 0)
 	}
 
 	{
 		m := ""
-		err := ValidateAnalyzer(m)
+		_, err := ValidateAnalyzer(m, "")
 		assert.NoError(t, err)
 	}
 
 	// invalid tokenizer
 	{
 		m := "{\"tokenizer\": \"invalid\"}"
-		err := ValidateAnalyzer(m)
+		_, err := ValidateAnalyzer(m, "")
 		assert.Error(t, err)
 	}
+
+	// with user resource
+	{
+		resourcePath := pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
+		defer os.RemoveAll(resourcePath)
+		UpdateParams()
+		resourceID := int64(100)
+
+		// mock remote resource file
+		dir := filepath.Join(resourcePath, "default", fmt.Sprintf("%d", resourceID))
+		err := os.MkdirAll(dir, os.ModePerm)
+		require.NoError(t, err)
+
+		f, err := os.Create(filepath.Join(dir, "jieba.txt"))
+		require.NoError(t, err)
+
+		f.WriteString("stop")
+		f.Close()
+
+		m := "{\"tokenizer\": \"standard\", \"filter\": [{\"type\": \"stop\", \"stop_words_file\": {\"type\": \"remote\",\"resource_name\": \"jieba_dict\", \"file_name\": \"jieba.txt\"}}]}"
+
+		ids, err := ValidateAnalyzer(m, "{\"resource_map\": {\"jieba_dict\": 100}, \"storage_name\": \"default\"}")
+		require.NoError(t, err)
+		assert.Equal(t, len(ids), 1)
+		assert.Equal(t, ids[0], resourceID)
+	}
+
+	// with user resource and update global resource info
+	{
+		resourcePath := pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
+		defer os.RemoveAll(resourcePath)
+		UpdateParams()
+		resourceID := int64(100)
+
+		// mock remote resource file
+		dir := filepath.Join(resourcePath, fmt.Sprintf("%d", resourceID))
+		err := os.MkdirAll(dir, os.ModePerm)
+		require.NoError(t, err)
+
+		f, err := os.Create(filepath.Join(dir, "jieba.txt"))
+		require.NoError(t, err)
+
+		f.WriteString("stop")
+		f.Close()
+
+		m := "{\"tokenizer\": \"standard\", \"filter\": [{\"type\": \"stop\", \"stop_words_file\": {\"type\": \"remote\",\"resource_name\": \"jieba_dict\", \"file_name\": \"jieba.txt\"}}]}"
+
+		// update global resource info
+		err = UpdateGlobalResourceInfo(map[string]int64{"jieba_dict": resourceID})
+		require.NoError(t, err)
+
+		ids, err := ValidateAnalyzer(m, "")
+		require.NoError(t, err)
+
+		assert.Equal(t, len(ids), 1)
+		assert.Equal(t, ids[0], resourceID)
+	}
 }
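The hand-written extra_info string in the test above has a small JSON shape: a resource_map from resource name to resource ID, plus a storage_name. A sketch of building it programmatically (the Go struct and field names are illustrative; only the JSON keys come from the test):

package main

import (
	"encoding/json"
	"fmt"
)

// extraInfo mirrors the JSON shape used in the test above; it is not a type
// defined in the Milvus codebase.
type extraInfo struct {
	ResourceMap map[string]int64 `json:"resource_map"`
	StorageName string           `json:"storage_name"`
}

func main() {
	info := extraInfo{
		ResourceMap: map[string]int64{"jieba_dict": 100},
		StorageName: "default",
	}
	b, err := json.Marshal(info)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // {"resource_map":{"jieba_dict":100},"storage_name":"default"}
}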
@@ -22,7 +22,7 @@ require (
 	github.com/jolestar/go-commons-pool/v2 v2.1.2
 	github.com/json-iterator/go v1.1.13-0.20220915233716-71ac16282d12
 	github.com/klauspost/compress v1.18.0
-	github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece
+	github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f
 	github.com/minio/minio-go/v7 v7.0.73
 	github.com/panjf2000/ants/v2 v2.11.3
 	github.com/prometheus/client_golang v1.20.5
@@ -482,8 +482,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
 github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece h1:s0TFMZBxADKSzIr7LW/TE3L/WgCuo7QOfzkYX92Xog0=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f h1:YQ61KOySWPEXv8ePkr0Cu5q5iVHN11IIUSTWIiALCE8=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
 github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
 github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
 github.com/minio/minio-go/v7 v7.0.73 h1:qr2vi96Qm7kZ4v7LLebjte+MQh621fFWnv93p12htEo=
@@ -1037,6 +1037,7 @@ message ValidateAnalyzerRequest{
 
 message ValidateAnalyzerResponse{
   common.Status status = 1;
+  repeated int64 resource_ids = 2;
 }
 
 message HighlightOptions{
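Because resource_ids is a new repeated field with a previously unused tag number, the proto change is wire-compatible: older readers simply skip the unknown field. On the Go side, protoc-gen-go emits a nil-safe getter for it, which is what lets callers write resp.GetResourceIds() directly. A self-contained sketch of that getter behavior (the struct below is a local stand-in, not the generated querypb type):

package main

import "fmt"

// validateAnalyzerResponse is an illustrative stand-in for the generated
// querypb.ValidateAnalyzerResponse; only the new repeated field is modeled.
type validateAnalyzerResponse struct {
	ResourceIds []int64
}

// GetResourceIds mirrors the nil-safe getter protoc-gen-go generates for a
// repeated int64 field, so callers need no explicit nil check.
func (r *validateAnalyzerResponse) GetResourceIds() []int64 {
	if r == nil {
		return nil
	}
	return r.ResourceIds
}

func main() {
	var resp *validateAnalyzerResponse        // even a nil response is safe
	fmt.Println(len(resp.GetResourceIds()))   // 0

	resp = &validateAnalyzerResponse{ResourceIds: []int64{100}}
	fmt.Println(resp.GetResourceIds()) // [100]
}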
File diff suppressed because it is too large.