feat: set related resource ids in collection schema (#46423)
Support creating analyzers with file resource info, and return the file resource ids that were used when validating an analyzer. Save the related resource ids in the collection schema.

Related issue: https://github.com/milvus-io/milvus/issues/43687

Release notes (auto-generated by coderabbit.ai):
- Core invariant: analyzer file-resource resolution is deterministic and traceable by threading a FileResourcePathHelper (collecting used resource IDs in a HashSet) through all tokenizer/analyzer construction and validation paths; validate_analyzer(params, extra_info) returns the collected Vec<i64>, which is propagated through the C/Rust/Go layers to callers (CValidateResult → RustResult::from_vec_i64 → Go []int64 → querypb.ValidateAnalyzerResponse.ResourceIds → CollectionSchema.FileResourceIds).
- Logic removed/simplified: ad-hoc, scattered resource-path lookups and per-filter file helpers (e.g., read_synonyms_file and other inline file-reading logic) were consolidated into ResourceInfo + FileResourcePathHelper and a centralized get_resource_path(helper, ...) API; filter/tokenizer builder APIs now accept &mut FileResourcePathHelper, so all file path resolution and ID collection share the same path and bookkeeping logic (redundant duplicated lookups removed).
- Why there is no data loss or behavior regression: the changes are additive and default-preserving: existing call sites pass extra_info = "", so analyzer creation/validation behavior and error paths remain unchanged; the new Collection.FileResourceIds is populated from resp.ResourceIds in validateSchema and round-tripped through marshal/unmarshal (model.Collection ↔ schemapb.CollectionSchema), so schema persistence uses the new list without overwriting other schema fields; the proto change adds a repeated field (resource_ids), which is wire-compatible (older clients ignore the extra field). Concrete code paths: analyzer creation still uses create_analyzer (now with extra_info ""), tokenizer validation still returns errors as before but now also returns IDs via CValidateResult/RustResult, and rootcoord.validateSchema assigns resp.ResourceIds → schema.FileResourceIds.
- New capability added: end-to-end discovery, return, and persistence of the file resource IDs used by analyzers: validate flows now return resource IDs and the system stores them in the collection schema (affects the tantivy analyzer binding, the canalyzer C bindings, internal/util analyzer APIs, the querynode ValidateAnalyzer response, and the rootcoord create_collection flow).

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
This commit is contained in:
parent 512884524b
commit 55feb7ded8
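For orientation, here is a minimal usage sketch of the two Rust entry points this commit adds or changes, condensed from the analyzer.rs hunks below and written as if inside the tantivy-binding crate; the analyzer params and the extra_info JSON are illustrative values only.

    use crate::analyzer::{create_analyzer, validate_analyzer};
    use crate::error::Result;

    fn sketch() -> Result<()> {
        let params = r#"{"tokenizer": "standard"}"#;

        // Existing call sites pass extra_info = "", which falls back to the global
        // file-resource helper, so their behavior is unchanged.
        let _analyzer = create_analyzer(params, "")?;

        // extra_info may carry a resource_map (resource name -> resource id) and an
        // optional storage_name. validate_analyzer builds the analyzer with a
        // FileResourcePathHelper and returns the ids of the file resources that were
        // actually resolved (empty here, since the standard tokenizer reads no files).
        let extra_info = r#"{"resource_map": {"my_dict": 1001}, "storage_name": "storage_a"}"#;
        let resource_ids: Vec<i64> = validate_analyzer(params, extra_info)?;
        assert!(resource_ids.is_empty());
        Ok(())
    }

The ids returned here are what ultimately lands in CollectionSchema.FileResourceIds on the Go side, per the release notes above.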
go.mod (2 changed lines)
@@ -21,7 +21,7 @@ require (
     github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
     github.com/klauspost/compress v1.18.0
     github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
-    github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece
+    github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f
     github.com/minio/minio-go/v7 v7.0.73
     github.com/panjf2000/ants/v2 v2.11.3 // indirect
     github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81 // indirect
go.sum (4 changed lines)
@@ -799,8 +799,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
 github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece h1:s0TFMZBxADKSzIr7LW/TE3L/WgCuo7QOfzkYX92Xog0=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f h1:YQ61KOySWPEXv8ePkr0Cu5q5iVHN11IIUSTWIiALCE8=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
 github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
 github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
 github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
@@ -45,7 +45,7 @@ TEST(CTokenizer, Default) {
     auto analyzer_params = R"({"tokenizer": "standard"})";
     CTokenizer tokenizer;
     {
-        auto status = create_tokenizer(analyzer_params, &tokenizer);
+        auto status = create_tokenizer(analyzer_params, "", &tokenizer);
         ASSERT_EQ(milvus::ErrorCode::Success, status.error_code);
     }
 
@@ -30,9 +30,12 @@ set_tokenizer_option(const char* params) {
 }
 
 CStatus
-create_tokenizer(const char* params, CTokenizer* tokenizer) {
+create_tokenizer(const char* params,
+                 const char* extra_info,
+                 CTokenizer* tokenizer) {
     try {
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
+        auto impl =
+            std::make_unique<milvus::tantivy::Tokenizer>(params, extra_info);
         *tokenizer = impl.release();
         return milvus::SuccessCStatus();
     } catch (std::exception& e) {
@@ -63,13 +66,14 @@ create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len) {
     return impl->CreateTokenStream(std::string(text, text_len)).release();
 }
 
-CStatus
-validate_tokenizer(const char* params) {
+CValidateResult
+validate_tokenizer(const char* params, const char* extra_info) {
     try {
-        auto impl = std::make_unique<milvus::tantivy::Tokenizer>(params);
-        return milvus::SuccessCStatus();
+        auto [ids, count] =
+            milvus::tantivy::validate_analyzer(params, extra_info);
+        return CValidateResult{ids, count, milvus::SuccessCStatus()};
     } catch (std::exception& e) {
-        return milvus::FailureCStatus(&e);
+        return CValidateResult{nullptr, 0, milvus::FailureCStatus(&e)};
     }
 }
 
@@ -27,7 +27,9 @@ CStatus
 set_tokenizer_option(const char* params);
 
 CStatus
-create_tokenizer(const char* params, CTokenizer* tokenizer);
+create_tokenizer(const char* params,
+                 const char* extra_info,
+                 CTokenizer* tokenizer);
 
 CStatus
 clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
@@ -35,8 +37,14 @@ clone_tokenizer(CTokenizer* tokenizer, CTokenizer* rst);
 void
 free_tokenizer(CTokenizer tokenizer);
 
-CStatus
-validate_tokenizer(const char* params);
+typedef struct CValidateResult {
+    int64_t* resource_ids;
+    uint64_t resource_ids_count;
+    CStatus status;
+} CValidateResult;
+
+CValidateResult
+validate_tokenizer(const char* params, const char* extra_info);
 
 CTokenStream
 create_token_stream(CTokenizer tokenizer, const char* text, uint32_t text_len);
@@ -32,7 +32,7 @@ fn bench_lindua_language_identifier_tokenizer(c: &mut Criterion) {
         }
     }
     "#;
-    let mut analyzer = create_analyzer(params);
+    let mut analyzer = create_analyzer(params, "");
     assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
 
     c.bench_function("test", |b| {
@@ -64,7 +64,7 @@ fn bench_whatlang_language_identifier_tokenizer(c: &mut Criterion) {
         }
     }
     "#;
-    let mut analyzer = create_analyzer(params);
+    let mut analyzer = create_analyzer(params, "");
     assert!(analyzer.is_ok(), "error: {}", analyzer.err().unwrap());
 
     c.bench_function("test", |b| {
@@ -497,7 +497,9 @@ const char *tantivy_token_stream_get_token(void *token_stream);
 
 TantivyToken tantivy_token_stream_get_detailed_token(void *token_stream);
 
-RustResult tantivy_create_analyzer(const char *analyzer_params);
+RustResult tantivy_create_analyzer(const char *analyzer_params, const char *extra_info);
+
+RustResult tantivy_validate_analyzer(const char *analyzer_params, const char *extra_info);
 
 void *tantivy_clone_analyzer(void *ptr);
 
@@ -1,40 +1,30 @@
+use log::warn;
 use serde_json as json;
 use std::collections::HashMap;
 use tantivy::tokenizer::*;
 
+use super::options::{get_global_file_resource_helper, FileResourcePathHelper};
 use super::{build_in_analyzer::*, filter::*, tokenizers::get_builder_with_tokenizer};
-use crate::analyzer::filter::{get_stop_words_list, get_string_list};
+use crate::analyzer::filter::{create_filter, get_stop_words_list, get_string_list};
 use crate::error::Result;
 use crate::error::TantivyBindingError;
 
 struct AnalyzerBuilder<'a> {
     filters: HashMap<String, SystemFilter>,
+    helper: &'a mut FileResourcePathHelper,
     params: &'a json::Map<String, json::Value>,
 }
 
-impl AnalyzerBuilder<'_> {
-    fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder {
-        AnalyzerBuilder {
+impl<'a> AnalyzerBuilder<'a> {
+    fn new(
+        params: &'a json::Map<String, json::Value>,
+        helper: &'a mut FileResourcePathHelper,
+    ) -> Result<AnalyzerBuilder<'a>> {
+        Ok(AnalyzerBuilder {
             filters: HashMap::new(),
             params: params,
-        }
-    }
-
-    fn get_tokenizer_params(&self) -> Result<&json::Value> {
-        let tokenizer = self.params.get("tokenizer");
-        if tokenizer.is_none() {
-            return Err(TantivyBindingError::InternalError(format!(
-                "tokenizer name or type must be set"
-            )));
-        }
-        let value = tokenizer.unwrap();
-        if value.is_object() || value.is_string() {
-            return Ok(tokenizer.unwrap());
-        }
-
-        Err(TantivyBindingError::InternalError(format!(
-            "tokenizer name should be string or dict"
-        )))
+            helper: helper,
+        })
     }
 
     fn build_filter(
@@ -73,7 +63,7 @@ impl AnalyzerBuilder<'_> {
                 }
             }
         } else if filter.is_object() {
-            let filter = SystemFilter::try_from(filter.as_object().unwrap())?;
+            let filter = create_filter(filter.as_object().unwrap(), &mut self.helper)?;
             builder = filter.transform(builder);
         }
     }
@@ -110,10 +100,13 @@ impl AnalyzerBuilder<'_> {
         }
     }
 
-    fn build_template(self, type_: &str) -> Result<TextAnalyzer> {
+    fn build_template(mut self, type_: &str) -> Result<TextAnalyzer> {
         match type_ {
             "standard" => Ok(standard_analyzer(self.get_stop_words_option()?)),
-            "chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)),
+            "chinese" => Ok(chinese_analyzer(
+                self.get_stop_words_option()?,
+                &mut self.helper,
+            )),
             "english" => Ok(english_analyzer(self.get_stop_words_option()?)),
             other_ => Err(TantivyBindingError::InternalError(format!(
                 "unknown build-in analyzer type: {}",
@@ -128,7 +121,7 @@ impl AnalyzerBuilder<'_> {
             Some(type_) => {
                 if !type_.is_string() {
                     return Err(TantivyBindingError::InternalError(format!(
-                        "analyzer type shoud be string"
+                        "analyzer type should be string"
                     )));
                 }
                 return self.build_template(type_.as_str().unwrap());
@@ -137,8 +130,25 @@ impl AnalyzerBuilder<'_> {
         };
 
         //build custom analyzer
-        let tokenizer_params = self.get_tokenizer_params()?;
-        let mut builder = get_builder_with_tokenizer(&tokenizer_params, create_analyzer_by_json)?;
+        let tokenizer_params = self.params.get("tokenizer");
+        if tokenizer_params.is_none() {
+            return Err(TantivyBindingError::InternalError(format!(
+                "tokenizer name or type must be set"
+            )));
+        }
+
+        let value = tokenizer_params.unwrap();
+        if !value.is_object() && !value.is_string() {
+            return Err(TantivyBindingError::InternalError(format!(
+                "tokenizer name should be string or dict"
+            )));
+        }
+
+        let mut builder = get_builder_with_tokenizer(
+            tokenizer_params.unwrap(),
+            &mut self.helper,
+            create_analyzer_by_json,
+        )?;
 
         // build and check other options
         builder = self.build_option(builder)?;
@@ -148,30 +158,50 @@
 
 pub fn create_analyzer_by_json(
     analyzer_params: &json::Map<String, json::Value>,
+    helper: &mut FileResourcePathHelper,
 ) -> Result<TextAnalyzer> {
     if analyzer_params.is_empty() {
         return Ok(standard_analyzer(vec![]));
     }
 
-    let builder = AnalyzerBuilder::new(analyzer_params);
+    let builder = AnalyzerBuilder::new(analyzer_params, helper)?;
     builder.build()
 }
 
-pub fn create_analyzer(params: &str) -> Result<TextAnalyzer> {
+pub fn create_helper(extra_info: &str) -> Result<FileResourcePathHelper> {
+    if extra_info.is_empty() {
+        Ok(get_global_file_resource_helper())
+    } else {
+        Ok(FileResourcePathHelper::from_json(
+            &json::from_str::<json::Value>(&extra_info)
+                .map_err(|e| TantivyBindingError::JsonError(e))?,
+        )?)
+    }
+}
+
+pub fn create_analyzer(params: &str, extra_info: &str) -> Result<TextAnalyzer> {
     if params.len() == 0 {
         return Ok(standard_analyzer(vec![]));
     }
 
-    let json_params =
-        json::from_str::<json::Value>(&params).map_err(|e| TantivyBindingError::JsonError(e))?;
+    let json_params = &json::from_str::<json::Map<String, json::Value>>(&params)
+        .map_err(|e| TantivyBindingError::JsonError(e))?;
 
-    create_analyzer_by_json(
-        json_params
-            .as_object()
-            .ok_or(TantivyBindingError::InternalError(
-                "params should be a json map".to_string(),
-            ))?,
-    )
+    let mut helper = create_helper(extra_info)?;
+    create_analyzer_by_json(json_params, &mut helper)
+}
+
+pub fn validate_analyzer(params: &str, extra_info: &str) -> Result<Vec<i64>> {
+    if params.len() == 0 {
+        return Ok(vec![]);
+    }
+
+    let json_params = &json::from_str::<json::Map<String, json::Value>>(&params)
+        .map_err(|e| TantivyBindingError::JsonError(e))?;
+
+    let mut helper = create_helper(extra_info)?;
+    create_analyzer_by_json(json_params, &mut helper)?;
+    Ok(helper.get_resource_ids())
 }
 
 #[cfg(test)]
@@ -185,7 +215,7 @@ mod tests {
         "stop_words": ["_english_"]
     }"#;
 
-        let tokenizer = create_analyzer(&params.to_string());
+        let tokenizer = create_analyzer(&params.to_string(), "");
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
     }
 
@@ -195,7 +225,7 @@ mod tests {
         "type": "chinese"
     }"#;
 
-        let tokenizer = create_analyzer(&params.to_string());
+        let tokenizer = create_analyzer(&params.to_string(), "");
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
         let mut bining = tokenizer.unwrap();
         let mut stream = bining.token_stream("系统安全;,'';lxyz密码");
@@ -219,7 +249,7 @@ mod tests {
         }
     }"#;
 
-        let tokenizer = create_analyzer(&params.to_string());
+        let tokenizer = create_analyzer(&params.to_string(), "");
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
 
         let mut bining = tokenizer.unwrap();
@@ -2,6 +2,7 @@ use tantivy::tokenizer::*;
 
 use super::filter::stop_words;
 use super::filter::*;
+use super::options::FileResourcePathHelper;
 use super::tokenizers::*;
 
 // default build-in analyzer
@@ -15,8 +16,13 @@ pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
     builder.build()
 }
 
-pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
-    let builder = jieba_builder(None).unwrap().filter(CnAlphaNumOnlyFilter);
+pub fn chinese_analyzer(
+    stop_words: Vec<String>,
+    helper: &mut FileResourcePathHelper,
+) -> TextAnalyzer {
+    let builder = jieba_builder(None, helper)
+        .unwrap()
+        .filter(CnAlphaNumOnlyFilter);
     if stop_words.len() > 0 {
         return builder.filter(StopWordFilter::remove(stop_words)).build();
     }
@@ -1,5 +1,6 @@
 use super::filter::FilterBuilder;
 use super::util::read_line_file;
+use crate::analyzer::options::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 use tantivy::tokenizer::SplitCompoundWords;
@@ -8,7 +9,10 @@ const WORD_LIST_KEY: &str = "word_list";
 const WORD_LIST_FILE_KEY: &str = "word_list_file";
 
 impl FilterBuilder for SplitCompoundWords {
-    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
+    fn from_json(
+        params: &json::Map<String, json::Value>,
+        helper: &mut FileResourcePathHelper,
+    ) -> Result<Self> {
         let mut dict = Vec::<String>::new();
         if let Some(value) = params.get(WORD_LIST_KEY) {
             if !value.is_array() {
@@ -29,7 +33,12 @@ impl FilterBuilder for SplitCompoundWords {
         }
 
         if let Some(file_params) = params.get(WORD_LIST_FILE_KEY) {
-            read_line_file(&mut dict, file_params, "decompounder word list file")?;
+            read_line_file(
+                helper,
+                &mut dict,
+                file_params,
+                "decompounder word list file",
+            )?;
         }
 
         if dict.is_empty() {
@@ -49,13 +58,17 @@ impl FilterBuilder for SplitCompoundWords {
 
 #[cfg(test)]
 mod tests {
-    use super::SplitCompoundWords;
-    use crate::analyzer::filter::FilterBuilder;
-    use crate::analyzer::tokenizers::standard_builder;
-    use crate::log::init_log;
-    use serde_json as json;
     use std::collections::HashSet;
     use std::path::Path;
+    use std::sync::Arc;
+
+    use serde_json as json;
+
+    use super::SplitCompoundWords;
+    use crate::analyzer::filter::FilterBuilder;
+    use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
+    use crate::analyzer::tokenizers::standard_builder;
+    use crate::log::init_log;
 
     #[test]
     fn test_decompounder_filter_with_file() {
@@ -74,7 +87,8 @@ mod tests {
         );
         let json_params = json::from_str::<json::Value>(&params).unwrap();
         // let filter = SplitCompoundWords::from_dictionary(vec!["bank", "note"]);
-        let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let filter = SplitCompoundWords::from_json(json_params.as_object().unwrap(), &mut helper);
         assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
         let builder = standard_builder().filter(filter.unwrap());
         let mut analyzer = builder.build();
@@ -4,6 +4,7 @@ use tantivy::tokenizer::*;
 use super::{
     CnAlphaNumOnlyFilter, CnCharOnlyFilter, RegexFilter, RemovePunctFilter, SynonymFilter,
 };
+use crate::analyzer::options::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 
 pub(crate) enum SystemFilter {
@@ -23,7 +24,10 @@ pub(crate) enum SystemFilter {
 }
 
 pub(crate) trait FilterBuilder {
-    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self>
+    fn from_json(
+        params: &json::Map<String, json::Value>,
+        helper: &mut FileResourcePathHelper,
+    ) -> Result<Self>
     where
         Self: Sized;
 }
@@ -109,36 +113,36 @@ impl From<&str> for SystemFilter {
     }
 }
 
-impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
-    type Error = TantivyBindingError;
-
-    fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> {
-        match params.get(&"type".to_string()) {
-            Some(value) => {
-                if !value.is_string() {
-                    return Err(TantivyBindingError::InternalError(
-                        "filter type should be string".to_string(),
-                    ));
-                };
-
-                match value.as_str().unwrap() {
-                    "length" => get_length_filter(params),
-                    "stop" => StopWordFilter::from_json(params).map(|f| SystemFilter::Stop(f)),
-                    "decompounder" => {
-                        SplitCompoundWords::from_json(params).map(|f| SystemFilter::Decompounder(f))
-                    }
-                    "stemmer" => Stemmer::from_json(params).map(|f| SystemFilter::Stemmer(f)),
-                    "regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
-                    "synonym" => SynonymFilter::from_json(params).map(|f| SystemFilter::Synonym(f)),
-                    other => Err(TantivyBindingError::InternalError(format!(
-                        "unsupport filter type: {}",
-                        other
-                    ))),
-                }
-            }
-            None => Err(TantivyBindingError::InternalError(
-                "no type field in filter params".to_string(),
-            )),
-        }
+pub fn create_filter(
+    params: &json::Map<String, json::Value>,
+    helper: &mut FileResourcePathHelper,
+) -> Result<SystemFilter> {
+    match params.get(&"type".to_string()) {
+        Some(value) => {
+            if !value.is_string() {
+                return Err(TantivyBindingError::InternalError(
+                    "filter type should be string".to_string(),
+                ));
+            };
+
+            match value.as_str().unwrap() {
+                "length" => get_length_filter(params),
+                "stop" => StopWordFilter::from_json(params, helper).map(|f| SystemFilter::Stop(f)),
+                "decompounder" => SplitCompoundWords::from_json(params, helper)
+                    .map(|f| SystemFilter::Decompounder(f)),
+                "stemmer" => Stemmer::from_json(params, helper).map(|f| SystemFilter::Stemmer(f)),
+                "regex" => RegexFilter::from_json(params).map(|f| SystemFilter::Regex(f)),
+                "synonym" => {
+                    SynonymFilter::from_json(params, helper).map(|f| SystemFilter::Synonym(f))
+                }
+                other => Err(TantivyBindingError::InternalError(format!(
+                    "unsupport filter type: {}",
+                    other
+                ))),
+            }
+        }
+        None => Err(TantivyBindingError::InternalError(
+            "no type field in filter params".to_string(),
+        )),
     }
 }
@@ -116,7 +116,7 @@ mod tests {
         }]
     }"#;
 
-        let tokenizer = create_analyzer(&params.to_string());
+        let tokenizer = create_analyzer(&params.to_string(), "");
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
 
         let mut bining = tokenizer.unwrap();
@@ -59,7 +59,7 @@ mod tests {
         "filter": ["removepunct"]
     }"#;
 
-        let tokenizer = create_analyzer(&params.to_string());
+        let tokenizer = create_analyzer(&params.to_string(), "");
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
 
         let mut bining = tokenizer.unwrap();
@@ -1,10 +1,14 @@
 use super::filter::FilterBuilder;
+use crate::analyzer::options::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 use tantivy::tokenizer::{Language, Stemmer};
 
 impl FilterBuilder for Stemmer {
-    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
+    fn from_json(
+        params: &json::Map<String, json::Value>,
+        _: &mut FileResourcePathHelper,
+    ) -> Result<Self> {
         let value = params.get("language");
         if value.is_none() || !value.unwrap().is_string() {
             return Err(TantivyBindingError::InternalError(
@@ -1,6 +1,7 @@
 use super::filter::FilterBuilder;
 use super::stop_words::fetch_language_stop_words;
 use super::util::*;
+use crate::analyzer::options::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 use tantivy::tokenizer::StopWordFilter;
@@ -28,14 +29,17 @@ pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
 }
 
 impl FilterBuilder for StopWordFilter {
-    fn from_json(params: &json::Map<String, json::Value>) -> Result<Self> {
+    fn from_json(
+        params: &json::Map<String, json::Value>,
+        helper: &mut FileResourcePathHelper,
+    ) -> Result<Self> {
         let mut dict = Vec::<String>::new();
         if let Some(value) = params.get(STOP_WORDS_LIST_KEY) {
             dict = get_stop_words_list(get_string_list(value, "stop_words")?);
         }
 
         if let Some(file_params) = params.get(STOP_WORDS_FILE_KEY) {
-            read_line_file(&mut dict, file_params, "stop words dict file")?;
+            read_line_file(helper, &mut dict, file_params, "stop words dict file")?;
         }
 
         Ok(StopWordFilter::remove(dict))
@@ -46,11 +50,13 @@ impl FilterBuilder for StopWordFilter {
 mod tests {
     use super::StopWordFilter;
     use crate::analyzer::filter::FilterBuilder;
+    use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
     use crate::analyzer::tokenizers::standard_builder;
     use crate::log::init_log;
     use serde_json as json;
     use std::collections::HashSet;
     use std::path::Path;
+    use std::sync::Arc;
 
     #[test]
     fn test_stop_words_filter_with_file() {
@@ -69,7 +75,8 @@ mod tests {
         );
 
         let json_params = json::from_str::<json::Value>(&params).unwrap();
-        let filter = StopWordFilter::from_json(json_params.as_object().unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let filter = StopWordFilter::from_json(json_params.as_object().unwrap(), &mut helper);
         assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
 
         let builder = standard_builder().filter(filter.unwrap());
@@ -1,4 +1,4 @@
-use crate::analyzer::options::get_resource_path;
+use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 use std::collections::{HashMap, HashSet};
@@ -199,30 +199,16 @@ impl SynonymDict {
     }
 }
 
-fn read_synonyms_file(builder: &mut SynonymDictBuilder, params: &json::Value) -> Result<()> {
-    let path = get_resource_path(params, "synonyms dict file")?;
-    let file = std::fs::File::open(path)?;
-    let reader = std::io::BufReader::new(file);
-    for line in reader.lines() {
-        if let Ok(row_data) = line {
-            builder.add_row(&row_data)?;
-        } else {
-            return Err(TantivyBindingError::InternalError(format!(
-                "read synonyms dict file failed, error: {}",
-                line.unwrap_err().to_string()
-            )));
-        }
-    }
-    Ok(())
-}
-
 #[derive(Clone)]
 pub struct SynonymFilter {
     dict: Arc<SynonymDict>,
 }
 
 impl SynonymFilter {
-    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<SynonymFilter> {
+    pub fn from_json(
+        params: &json::Map<String, json::Value>,
+        helper: &mut FileResourcePathHelper,
+    ) -> Result<SynonymFilter> {
         let expand = params.get("expand").map_or(Ok(true), |v| {
             v.as_bool().ok_or(TantivyBindingError::InvalidArgument(
                 "create synonym filter failed, `expand` must be bool".to_string(),
@@ -246,7 +232,19 @@ impl SynonymFilter {
         }
 
         if let Some(file_params) = params.get("synonyms_file") {
-            read_synonyms_file(&mut builder, file_params)?;
+            let path = get_resource_path(helper, file_params, "synonyms dict file")?;
+            let file = std::fs::File::open(path)?;
+            let reader = std::io::BufReader::new(file);
+            for line in reader.lines() {
+                if let Ok(row_data) = line {
+                    builder.add_row(&row_data)?;
+                } else {
+                    return Err(TantivyBindingError::InternalError(format!(
+                        "read synonyms dict file failed, error: {}",
+                        line.unwrap_err().to_string()
+                    )));
+                }
+            }
         }
 
         Ok(SynonymFilter {
@@ -350,11 +348,14 @@ impl<T: TokenStream> TokenStream for SynonymFilterStream<T> {
 #[cfg(test)]
 mod tests {
     use super::SynonymFilter;
+    use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
     use crate::analyzer::tokenizers::standard_builder;
     use crate::log::init_log;
 
     use serde_json as json;
     use std::collections::HashSet;
     use std::path::Path;
+    use std::sync::Arc;
 
     #[test]
     fn test_synonym_filter() {
@@ -365,7 +366,8 @@ mod tests {
         "synonyms": ["trans => translate, \\=>", "\\\\test, test, tests"]
     }"#;
         let json_params = json::from_str::<json::Value>(&params).unwrap();
-        let filter = SynonymFilter::from_json(json_params.as_object().unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let filter = SynonymFilter::from_json(json_params.as_object().unwrap(), &mut helper);
         assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
         let builder = standard_builder().filter(filter.unwrap());
         let mut analyzer = builder.build();
@@ -402,7 +404,8 @@ mod tests {
         }}"#
         );
         let json_params = json::from_str::<json::Value>(&params).unwrap();
-        let filter = SynonymFilter::from_json(json_params.as_object().unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let filter = SynonymFilter::from_json(json_params.as_object().unwrap(), &mut helper);
         assert!(filter.is_ok(), "error: {}", filter.err().unwrap());
         let builder = standard_builder().filter(filter.unwrap());
         let mut analyzer = builder.build();
@@ -1,4 +1,5 @@
 use crate::analyzer::options::get_resource_path;
+use crate::analyzer::options::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
 use std::io::BufRead;
@@ -26,11 +27,12 @@ pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>>
 }
 
 pub(crate) fn read_line_file(
+    helper: &mut FileResourcePathHelper,
     dict: &mut Vec<String>,
     params: &json::Value,
     key: &str,
 ) -> Result<()> {
-    let path = get_resource_path(params, key)?;
+    let path = get_resource_path(helper, params, key)?;
     let file = std::fs::File::open(path)?;
     let reader = std::io::BufReader::new(file);
     for line in reader.lines() {
@@ -2,10 +2,10 @@ mod analyzer;
 mod build_in_analyzer;
 mod dict;
 mod filter;
-mod options;
 
+pub mod options;
 pub mod tokenizers;
-pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
+pub use self::analyzer::{create_analyzer, create_analyzer_by_json, validate_analyzer};
 pub use self::options::set_options;
 
 pub(crate) use self::build_in_analyzer::standard_analyzer;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/common.rs (new file, vendored, 8 lines)
@@ -0,0 +1,8 @@
+// cache key
+pub(crate) static LINDERA_DOWNLOAD_KEY: &str = "lindera_download_urls";
+pub(crate) static RESOURCE_MAP_KEY: &str = "resource_map";
+
+// normal key
+pub static DEFAULT_DICT_PATH_KEY: &str = "default_dict_path";
+pub static RESOURCE_PATH_KEY: &str = "resource_path";
+pub static RESOURCE_STORAGE_NAME_KEY: &str = "storage_name";
@@ -1,8 +1,13 @@
+mod common;
+mod resource_info;
 mod runtime_option;
 mod util;
 
-pub use self::runtime_option::{get_lindera_download_url, get_options, set_options};
+pub use self::resource_info::{FileResourcePathHelper, ResourceInfo};
+pub use self::runtime_option::{
+    get_global_file_resource_helper, get_lindera_download_url, get_options, set_options,
+};
 
 pub use self::util::get_resource_path;
 
-pub use self::runtime_option::DEFAULT_DICT_PATH_KEY;
+pub use self::common::{DEFAULT_DICT_PATH_KEY, RESOURCE_PATH_KEY};
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/options/resource_info.rs (new file, vendored, 190 lines)
@@ -0,0 +1,190 @@
+// resource options
+use super::common::*;
+use super::runtime_option::get_options;
+use crate::error::{Result, TantivyBindingError};
+use serde_json as json;
+use std::collections::{HashMap, HashSet};
+use std::path::PathBuf;
+use std::sync::Arc;
+
+pub struct ResourceInfo {
+    storage_name: Option<String>,
+    resource_map: HashMap<String, i64>,
+}
+
+impl ResourceInfo {
+    pub fn new() -> Self {
+        Self {
+            storage_name: None,
+            resource_map: HashMap::new(),
+        }
+    }
+
+    pub fn debug(&self) -> String {
+        format!(
+            "storage_name: {:?}, resource_map: {:?}",
+            self.storage_name, self.resource_map
+        )
+    }
+
+    pub fn from_global_json(value: &json::Value) -> Result<Self> {
+        let mut resource_map = HashMap::new();
+        let kv = value
+            .as_object()
+            .ok_or(TantivyBindingError::InternalError(format!(
+                "file resource map should be a json map, but got: {}",
+                json::to_string(value).unwrap()
+            )))?;
+        for (key, value) in kv {
+            let url = value
+                .as_i64()
+                .ok_or(TantivyBindingError::InternalError(format!(
+                    "file resource id should be integer, but got: {}",
+                    json::to_string(value).unwrap()
+                )))?;
+            resource_map.insert(key.to_string(), url);
+        }
+        Ok(Self {
+            storage_name: None,
+            resource_map,
+        })
+    }
+
+    pub fn from_json(value: &json::Value) -> Result<Self> {
+        let mut resource_map = HashMap::new();
+        let m = value
+            .as_object()
+            .ok_or(TantivyBindingError::InternalError(format!(
+                "extra info should be a json map, but got: {}",
+                json::to_string(value).unwrap()
+            )))?;
+
+        if let Some(v) = m.get(RESOURCE_MAP_KEY) {
+            let kv = v
+                .as_object()
+                .ok_or(TantivyBindingError::InternalError(format!(
+                    "file resource map should be a json map, but got: {}",
+                    json::to_string(v).unwrap()
+                )))?;
+            for (key, value) in kv {
+                let url = value
+                    .as_i64()
+                    .ok_or(TantivyBindingError::InternalError(format!(
+                        "file resource id should be integer, but got: {}",
+                        json::to_string(value).unwrap()
+                    )))?;
+                resource_map.insert(key.to_string(), url);
+            }
+        }
+
+        let mut storage_name = None;
+        if let Some(v) = m.get(RESOURCE_STORAGE_NAME_KEY) {
+            let name = v
+                .as_str()
+                .ok_or(TantivyBindingError::InternalError(format!(
+                    "storage_name must set as string, but got: {}",
+                    json::to_string(v).unwrap()
+                )))?
+                .to_string();
+            storage_name = Some(name)
+        }
+
+        Ok(Self {
+            storage_name,
+            resource_map,
+        })
+    }
+}
+
+impl FileResourcePathBuilder for ResourceInfo {
+    fn get_resource_file_path(
+        &self,
+        resource_name: &str,
+        file_name: &str,
+    ) -> Result<(i64, PathBuf)> {
+        let resource_id =
+            self.resource_map
+                .get(resource_name)
+                .ok_or(TantivyBindingError::InternalError(format!(
+                    "file resource: {} not found in local resource list",
+                    resource_name
+                )))?;
+
+        let base_value =
+            get_options(RESOURCE_PATH_KEY).ok_or(TantivyBindingError::InternalError(
+                "local_resource_path config not init success".to_string(),
+            ))?;
+
+        let base = base_value
+            .as_str()
+            .ok_or(TantivyBindingError::InternalError(
+                "local_resource_path must set as string".to_string(),
+            ))?;
+
+        if let Some(storage_name) = &self.storage_name {
+            return Ok((
+                resource_id.clone(),
+                PathBuf::new()
+                    .join(base)
+                    .join(storage_name)
+                    .join(resource_id.to_string())
+                    .join(file_name),
+            ));
+        } else {
+            return Ok((
+                resource_id.clone(),
+                PathBuf::new()
+                    .join(base)
+                    .join(resource_id.to_string())
+                    .join(file_name),
+            ));
+        }
+    }
+}
+
+pub trait FileResourcePathBuilder {
+    fn get_resource_file_path(
+        &self,
+        resource_name: &str,
+        file_name: &str,
+    ) -> Result<(i64, PathBuf)>;
+}
+
+pub struct FileResourcePathHelper {
+    builder: Arc<dyn FileResourcePathBuilder>,
+    ids: HashSet<i64>,
+}
+
+impl FileResourcePathHelper {
+    pub fn new(builder: Arc<dyn FileResourcePathBuilder>) -> Self {
+        Self {
+            builder,
+            ids: HashSet::new(),
+        }
+    }
+
+    pub fn from_json(value: &json::Value) -> Result<Self> {
+        let info = ResourceInfo::from_json(value)?;
+        let builder: Arc<dyn FileResourcePathBuilder> = Arc::new(info);
+        Ok(Self {
+            builder,
+            ids: HashSet::new(),
+        })
+    }
+
+    pub fn get_resource_file_path(
+        &mut self,
+        resource_name: &str,
+        file_name: &str,
+    ) -> Result<PathBuf> {
+        let (resource_id, path) = self
+            .builder
+            .get_resource_file_path(resource_name, file_name)?;
+        self.ids.insert(resource_id);
+        Ok(path)
+    }
+
+    pub fn get_resource_ids(self) -> Vec<i64> {
+        self.ids.into_iter().collect()
+    }
+}
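To make the new file concrete, here is a small hedged sketch of how an extra_info payload flows through FileResourcePathHelper, again written as if inside the tantivy-binding crate. The resource name "user_dict", the id 42, and the paths are made up, and it assumes set_options accepts a plain resource_path entry, as the runtime-option hunks that follow suggest.

    use serde_json as json;

    use crate::analyzer::options::FileResourcePathHelper;
    use crate::analyzer::set_options;
    use crate::error::{Result, TantivyBindingError};

    fn sketch() -> Result<()> {
        // The path builder reads the global resource_path option, so set it once.
        set_options(&r#"{"resource_path": "/tmp/analyzer_resources"}"#.to_string())?;

        // "resource_map" and "storage_name" are the keys ResourceInfo::from_json understands.
        let extra_info = json::from_str::<json::Value>(
            r#"{"resource_map": {"user_dict": 42}, "storage_name": "col_1"}"#,
        )
        .map_err(TantivyBindingError::JsonError)?;

        let mut helper = FileResourcePathHelper::from_json(&extra_info)?;
        // Resolving a file records the owning resource id in the helper and yields
        // /tmp/analyzer_resources/col_1/42/dict.txt here.
        let _path = helper.get_resource_file_path("user_dict", "dict.txt")?;
        // The collected ids are what validate_analyzer ultimately returns.
        assert_eq!(helper.get_resource_ids(), vec![42]);
        Ok(())
    }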
@@ -1,3 +1,5 @@
+use super::common::*;
+use super::resource_info::{FileResourcePathBuilder, FileResourcePathHelper, ResourceInfo};
 use crate::error::{Result, TantivyBindingError};
 use once_cell::sync::Lazy;
 use serde_json as json;
@@ -7,14 +9,6 @@ use std::sync::{Arc, RwLock};
 
 static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));
 
-// cache key
-static LINDERA_DOWNLOAD_KEY: &str = "lindera_download_urls";
-static RESOURCE_MAP_KEY: &str = "resource_map";
-
-// normal key
-pub static DEFAULT_DICT_PATH_KEY: &str = "default_dict_path";
-pub static RESOURCE_PATH_KEY: &str = "resource_path";
-
 pub fn set_options(params: &String) -> Result<()> {
     GLOBAL_OPTIONS.set_json(params)
 }
@@ -27,8 +21,8 @@ pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
     GLOBAL_OPTIONS.get_lindera_download_urls(kind)
 }
 
-pub fn get_resource_file_path(resource_name: &str, file_name: &str) -> Result<PathBuf> {
-    GLOBAL_OPTIONS.get_resource_file_path(resource_name, file_name)
+pub fn get_global_file_resource_helper() -> FileResourcePathHelper {
+    FileResourcePathHelper::new(GLOBAL_OPTIONS.clone())
 }
 
 // analyzer options
@@ -57,35 +51,25 @@ impl RuntimeOption {
         let r = self.inner.read().unwrap();
         r.lindera_download_urls.get(kind).map(|v| v.clone())
     }
+}
 
-    fn get_resource_file_path(&self, resource_name: &str, file_name: &str) -> Result<PathBuf> {
+// file resource
+impl FileResourcePathBuilder for RuntimeOption {
+    fn get_resource_file_path(
+        &self,
+        resource_name: &str,
+        file_name: &str,
+    ) -> Result<(i64, PathBuf)> {
         let r = self.inner.read().unwrap();
-        let resource_id =
-            r.resource_map
-                .get(resource_name)
-                .ok_or(TantivyBindingError::InternalError(format!(
-                    "file resource: {} not found in local resource list",
-                    resource_name
-                )))?;
-        let base = r
-            .params
-            .get(RESOURCE_PATH_KEY)
-            .ok_or(TantivyBindingError::InternalError(
-                "local_resource_path config not init success".to_string(),
-            ))?
-            .as_str()
-            .ok_or("local_resource_path must set as string")?;
-
-        return Ok(PathBuf::new()
-            .join(base)
-            .join(resource_id.to_string())
-            .join(file_name));
+        return r
+            .resource_info
+            .get_resource_file_path(resource_name, file_name);
     }
 }
 
 struct RuntimeOptionInner {
     params: HashMap<String, json::Value>,
-    resource_map: HashMap<String, i64>, // resource name -> resource id
+    resource_info: ResourceInfo, // resource name -> resource id
     lindera_download_urls: HashMap<String, Vec<String>>, // dict name -> url
 }
 
@@ -93,7 +77,7 @@ impl RuntimeOptionInner {
     fn new() -> Self {
         RuntimeOptionInner {
            params: HashMap::new(),
-            resource_map: HashMap::new(),
+            resource_info: ResourceInfo::new(),
            lindera_download_urls: HashMap::new(),
        }
    }
@@ -124,7 +108,7 @@ impl RuntimeOptionInner {
 
         for (key, value) in m {
             let array = value.as_array().ok_or(TantivyBindingError::InternalError(
-                "lindera download urls shoud be list".to_string(),
+                "lindera download urls should be list".to_string(),
             ))?;
 
             if !array.iter().all(|v| v.is_string()) {
@@ -143,18 +127,7 @@ impl RuntimeOptionInner {
         }
 
         if key == RESOURCE_MAP_KEY {
-            self.resource_map = HashMap::new();
-
-            let m = value.as_object().ok_or(TantivyBindingError::InternalError(
-                "lindera download urls should be a json map".to_string(),
-            ))?;
-
-            for (key, value) in m {
-                let url = value.as_i64().ok_or(TantivyBindingError::InternalError(
-                    "lindera download url shoud be string".to_string(),
-                ))?;
-                self.resource_map.insert(key.to_string(), url);
-            }
+            self.resource_info = ResourceInfo::from_global_json(&value)?;
             return Ok(());
         }
 
@@ -1,10 +1,14 @@
 use serde_json as json;
 use std::path::{Path, PathBuf};
 
-use super::runtime_option::get_resource_file_path;
+use super::resource_info::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 
-pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf> {
+pub fn get_resource_path(
+    helper: &mut FileResourcePathHelper,
+    v: &json::Value,
+    resource_key: &str,
+) -> Result<PathBuf> {
     if !v.is_object() {
         return Err(TantivyBindingError::InvalidArgument(format!(
             "file config of {} must be object",
@@ -73,7 +77,7 @@ pub fn get_resource_path(v: &json::Value, resource_key: &str) -> Result<PathBuf>
                 resource_key
             )))?;
 
-            self::get_resource_file_path(resource_name, file_name)
+            helper.get_resource_file_path(resource_name, file_name)
         }
         other => Err(TantivyBindingError::InvalidArgument(format!(
             "unsupported file type {} of {}",
@@ -1,13 +1,14 @@
 use core::{option::Option::Some, result::Result::Ok};
 use jieba_rs;
 use lazy_static::lazy_static;
+use log::warn;
 use serde_json as json;
 use std::fs;
 use std::io::BufReader;
 use std::{borrow::Cow, path::PathBuf};
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
 
-use crate::analyzer::options;
+use crate::analyzer::options::{get_resource_path, FileResourcePathHelper};
 use crate::error::{Result, TantivyBindingError};
 
 lazy_static! {
@@ -56,6 +57,7 @@ impl TokenStream for JiebaTokenStream {
 
 fn get_jieba_dict(
     params: &json::Map<String, json::Value>,
+    helper: &mut FileResourcePathHelper,
 ) -> Result<(Vec<String>, Option<String>, Option<PathBuf>)> {
     let mut words = Vec::<String>::new();
     let mut user_dict = None;
@@ -101,7 +103,7 @@ fn get_jieba_dict(
 
     match params.get("extra_dict_file") {
         Some(v) => {
-            let path = options::get_resource_path(v, "jieba extra dict file")?;
+            let path = get_resource_path(helper, v, "jieba extra dict file")?;
             user_dict = Some(path)
         }
         _ => {}
@@ -156,8 +158,11 @@ impl<'a> JiebaTokenizer<'a> {
         }
     }
 
-    pub fn from_json(params: &json::Map<String, json::Value>) -> Result<JiebaTokenizer<'a>> {
-        let (words, system_dict, user_dict) = get_jieba_dict(params)?;
+    pub fn from_json(
+        params: &json::Map<String, json::Value>,
+        helper: &mut FileResourcePathHelper,
+    ) -> Result<JiebaTokenizer<'a>> {
+        let (words, system_dict, user_dict) = get_jieba_dict(params, helper)?;
 
         let mut tokenizer =
             system_dict.map_or(Ok(jieba_rs::Jieba::empty()), |name| match name.as_str() {
@@ -242,8 +247,11 @@ impl Tokenizer for JiebaTokenizer<'static> {
 #[cfg(test)]
 mod tests {
     use serde_json as json;
+    use std::sync::Arc;
 
     use super::JiebaTokenizer;
+    use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
+
     use tantivy::tokenizer::TokenStream;
     use tantivy::tokenizer::Tokenizer;
 
@@ -255,7 +263,8 @@ mod tests {
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
         assert!(json_param.is_ok());
 
-        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap(), &mut helper);
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
         let mut bining = tokenizer.unwrap();
         let mut stream = bining.token_stream("结巴分词器");
@@ -280,7 +289,8 @@ mod tests {
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
         assert!(json_param.is_ok());
 
-        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap(), &mut helper);
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
         let mut bining = tokenizer.unwrap();
         let mut stream = bining.token_stream("milvus结巴分词器中文测试");
@@ -303,7 +313,8 @@ mod tests {
         let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
         assert!(json_param.is_ok());
 
-        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap());
+        let mut helper = FileResourcePathHelper::new(Arc::new(ResourceInfo::new()));
+        let tokenizer = JiebaTokenizer::from_json(&json_param.unwrap(), &mut helper);
         assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
         let mut bining = tokenizer.unwrap();
         let mut stream = bining.token_stream("milvus結巴分詞器中文測試");
@@ -1,3 +1,4 @@
+use crate::analyzer::options::FileResourcePathHelper;
 use crate::error::{Result, TantivyBindingError};
 use lingua::{LanguageDetector, LanguageDetectorBuilder};
 use serde_json as json;
@@ -164,7 +165,11 @@ impl<'a> LangIdentTokenizer<'a> {
 
     pub fn from_json<'b>(
         params: &'b json::Map<String, json::Value>,
-        fc: fn(&json::Map<String, json::Value>) -> Result<TextAnalyzer>,
+        helper: &mut FileResourcePathHelper,
+        fc: fn(
+            &json::Map<String, json::Value>,
+            helper: &mut FileResourcePathHelper,
+        ) -> Result<TextAnalyzer>,
     ) -> Result<LangIdentTokenizer<'a>> {
         // init identfier for tokenizer
         let identifier = params
@@ -188,12 +193,15 @@ impl<'a> LangIdentTokenizer<'a> {
         for (name, params) in sub_analyzers {
             analyzer.add(
                 name,
-                fc(params.as_object().ok_or_else(|| {
-                    TantivyBindingError::InvalidArgument(format!(
-                        "sub analyzer \"{}\" params must be dict",
-                        name
-                    ))
-                })?)?,
+                fc(
+                    params.as_object().ok_or_else(|| {
+                        TantivyBindingError::InvalidArgument(format!(
+                            "sub analyzer \"{}\" params must be dict",
+                            name
+                        ))
+                    })?,
+                    helper,
+                )?,
             );
         }
 
@@ -257,9 +265,11 @@ impl Tokenizer for LangIdentTokenizer<'static> {
 #[cfg(test)]
 mod tests {
     use serde_json as json;
+    use std::sync::Arc;
     use tantivy::tokenizer::Tokenizer;
 
     use super::LangIdentTokenizer;
+    use crate::analyzer::options::{FileResourcePathHelper, ResourceInfo};
     use crate::analyzer::tokenizers::lang_ident_tokenizer::BoxIdentifier;
     use crate::analyzer::{create_analyzer, create_analyzer_by_json};
     use crate::error::Result;
@@ -276,8 +286,8 @@ mod tests {
 
         let mut analyzer = LangIdentTokenizer::new(BoxIdentifier::default());
         let result = || -> Result<()> {
-            analyzer.add("default", create_analyzer(standard_params)?);
-            analyzer.add("cmn", create_analyzer(jieba_params)?);
+            analyzer.add("default", create_analyzer(standard_params, "")?);
+            analyzer.add("cmn", create_analyzer(jieba_params, "")?);
             Ok(())
         }();
 
@@ -304,6 +314,7 @@ mod tests {
         let builder: std::result::Result<LangIdentTokenizer, crate::error::TantivyBindingError> =
             LangIdentTokenizer::from_json(
                 json_params.as_object().unwrap(),
+                &mut FileResourcePathHelper::new(Arc::new(ResourceInfo::new())),
                 create_analyzer_by_json,
             );
         assert!(builder.is_ok(), "error: {}", builder.err().unwrap());
@@ -337,6 +348,7 @@ mod tests {
         let builder: std::result::Result<LangIdentTokenizer, crate::error::TantivyBindingError> =
             LangIdentTokenizer::from_json(
                 json_params.as_object().unwrap(),
+                &mut FileResourcePathHelper::new(Arc::new(ResourceInfo::new())),
                 create_analyzer_by_json,
             );
         assert!(builder.is_ok(), "error: {}", builder.err().unwrap());
@@ -372,6 +384,7 @@ mod tests {
         let builder: std::result::Result<LangIdentTokenizer, crate::error::TantivyBindingError> =
             LangIdentTokenizer::from_json(
                 json_params.as_object().unwrap(),
+                &mut FileResourcePathHelper::new(Arc::new(ResourceInfo::new())),
                 create_analyzer_by_json,
             );
         assert!(builder.is_ok(), "error: {}", builder.err().unwrap());
@@ -1,3 +1,4 @@
+use crate::analyzer::options::FileResourcePathHelper;
 use log::warn;
 use serde_json as json;
 use tantivy::tokenizer::*;
@@ -24,24 +25,29 @@ pub fn icu_builder() -> TextAnalyzerBuilder {
 
 pub fn lang_ident_builder(
     params: Option<&json::Map<String, json::Value>>,
-    fc: fn(&json::Map<String, json::Value>) -> Result<TextAnalyzer>,
+    helper: &mut FileResourcePathHelper,
+    fc: fn(
+        &json::Map<String, json::Value>,
+        helper: &mut FileResourcePathHelper,
+    ) -> Result<TextAnalyzer>,
 ) -> Result<TextAnalyzerBuilder> {
     if params.is_none() {
         return Err(TantivyBindingError::InvalidArgument(format!(
             "lang ident tokenizer must be customized"
         )));
     }
-    let tokenizer = LangIdentTokenizer::from_json(params.unwrap(), fc)?;
+    let tokenizer = LangIdentTokenizer::from_json(params.unwrap(), helper, fc)?;
     Ok(TextAnalyzer::builder(tokenizer).dynamic())
 }
 
 pub fn jieba_builder(
     params: Option<&json::Map<String, json::Value>>,
+    helper: &mut FileResourcePathHelper,
 ) -> Result<TextAnalyzerBuilder> {
     if params.is_none() {
         return Ok(TextAnalyzer::builder(JiebaTokenizer::new()).dynamic());
     }
-    let tokenizer = JiebaTokenizer::from_json(params.unwrap())?;
+    let tokenizer = JiebaTokenizer::from_json(params.unwrap(), helper)?;
     Ok(TextAnalyzer::builder(tokenizer).dynamic())
 }
 
@@ -83,7 +89,8 @@ pub fn char_group_builder(
 
 pub fn get_builder_with_tokenizer(
     params: &json::Value,
-    fc: fn(&json::Map<String, json::Value>) -> Result<TextAnalyzer>,
+    helper: &mut FileResourcePathHelper,
+    fc: fn(&json::Map<String, json::Value>, &mut FileResourcePathHelper) -> Result<TextAnalyzer>,
 ) -> Result<TextAnalyzerBuilder> {
     let name;
     let params_map;
@@ -113,11 +120,11 @@ pub fn get_builder_with_tokenizer(
     match name {
         "standard" => Ok(standard_builder()),
         "whitespace" => Ok(whitespace_builder()),
-        "jieba" => jieba_builder(params_map),
+        "jieba" => jieba_builder(params_map, helper),
         "lindera" => lindera_builder(params_map),
         "char_group" => char_group_builder(params_map),
         "icu" => Ok(icu_builder()),
-        "language_identifier" => lang_ident_builder(params_map, fc),
+        "language_identifier" => lang_ident_builder(params_map, helper, fc),
         "grpc" => grpc_builder(params_map),
         other => {
             warn!("unsupported tokenizer: {}", other);
@@ -147,6 +147,14 @@ impl RustResult {
         }
     }
 
+    pub fn from_vec_i64(value: Vec<i64>) -> Self {
+        RustResult {
+            success: true,
+            value: Value::RustArrayI64(RustArrayI64::from_vec(value)),
+            error: std::ptr::null(),
+        }
+    }
+
     pub fn from_error(error: String) -> Self {
         RustResult {
             success: false,
@@ -184,6 +192,11 @@ pub extern "C" fn free_rust_result(result: RustResult) {
                 free_rust_array(array);
             }
         }
+        Value::RustArrayI64(array) => {
+            if !array.array.is_null() {
+                free_rust_array_i64(array);
+            }
+        }
         _ => {}
     }
     if !result.error.is_null() {
@@ -49,7 +49,7 @@ pub extern "C" fn tantivy_register_tokenizer(
     let real = ptr as *mut IndexReaderWrapper;
     let tokenizer_name = cstr_to_str!(tokenizer_name);
     let params = cstr_to_str!(analyzer_params);
-    let analyzer = create_analyzer(params);
+    let analyzer = create_analyzer(params, "");
     match analyzer {
         Ok(text_analyzer) => unsafe {
             (*real).register_tokenizer(String::from(tokenizer_name), text_analyzer);
@@ -157,7 +157,7 @@ impl AnalyzerBuilder<'_> {
             Some(type_) => {
                 if !type_.is_string() {
                     return Err(TantivyBindingError::InternalError(format!(
-                        "analyzer type shoud be string"
+                        "analyzer type should be string"
                     )));
                 }
                 return self.build_template(type_.as_str().unwrap());
@@ -37,7 +37,7 @@ impl IndexWriterWrapperImpl {
             field_name
         );
 
-        let tokenizer = create_analyzer(tokenizer_params)?;
+        let tokenizer = create_analyzer(tokenizer_params, "")?;
 
         let (schema, field) = build_text_schema(field_name, tokenizer_name);
         let index = if in_ram {
@@ -1,4 +1,5 @@
 use crate::analyzer::create_analyzer_by_json;
+use crate::analyzer::options::get_global_file_resource_helper;
 use serde_json::{self, Value};
 use std::cmp::Ordering;
 use std::collections::{BinaryHeap, HashMap};
@@ -38,7 +39,9 @@ pub fn compute_phrase_match_slop(
         .ok_or("Tokenizer params must be a JSON object")?;
 
     // 2. Create Analyzer
-    let mut analyzer = create_analyzer_by_json(params_obj)
+    // TODO: support build helper from extra_info
+    let mut helper = get_global_file_resource_helper();
+    let mut analyzer = create_analyzer_by_json(params_obj, &mut helper)
         .map_err(|e| format!("Failed to create analyzer: {:?}", e))?;
 
     // 3. Tokenize Query
@@ -1,7 +1,7 @@
 use libc::{c_char, c_void};
 use tantivy::tokenizer::TextAnalyzer;
 
-use crate::analyzer::{create_analyzer, set_options};
+use crate::analyzer::{create_analyzer, set_options, validate_analyzer};
 use crate::{
     array::RustResult,
     log::init_log,
@@ -10,10 +10,14 @@ use crate::{
 };
 
 #[no_mangle]
-pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> RustResult {
+pub extern "C" fn tantivy_create_analyzer(
+    analyzer_params: *const c_char,
+    extra_info: *const c_char,
+) -> RustResult {
     init_log();
     let params = unsafe { c_str_to_str(analyzer_params).to_string() };
-    let analyzer = create_analyzer(&params);
+    let extra_info_str = unsafe { c_str_to_str(extra_info).to_string() };
+    let analyzer = create_analyzer(&params, &extra_info_str);
     match analyzer {
         Ok(text_analyzer) => RustResult::from_ptr(create_binding(text_analyzer)),
         Err(err) => RustResult::from_error(format!(
@@ -23,6 +27,24 @@ pub extern "C" fn tantivy_create_analyzer(analyzer_params: *const c_char) -> Rus
     }
 }
 
+#[no_mangle]
+pub extern "C" fn tantivy_validate_analyzer(
+    analyzer_params: *const c_char,
+    extra_info: *const c_char,
+) -> RustResult {
+    init_log();
+    let params = unsafe { c_str_to_str(analyzer_params).to_string() };
+    let extra_info_str = unsafe { c_str_to_str(extra_info).to_string() };
+    let result = validate_analyzer(&params, &extra_info_str);
+    match result {
+        Ok(ids) => RustResult::from_vec_i64(ids),
+        Err(err) => RustResult::from_error(format!(
+            "validate tokenizer failed with error: {} param: {}",
+            err, params,
+        )),
+    }
+}
+
 #[no_mangle]
 pub extern "C" fn tantivy_clone_analyzer(ptr: *mut c_void) -> *mut c_void {
     let analyzer = ptr as *mut TextAnalyzer;
internal/core/thirdparty/tantivy/tokenizer.h (vendored, 44 changes)
@@ -14,9 +14,20 @@ struct Tokenizer {
     NO_COPY_OR_ASSIGN(Tokenizer);
 
     explicit Tokenizer(std::string&& params) {
-        auto shared_params = std::make_shared<std::string>(std::move(params));
-        auto res =
-            RustResultWrapper(tantivy_create_analyzer(shared_params->c_str()));
+        auto shared_params = std::make_shared<std::string>(params);
+        auto res = RustResultWrapper(
+            tantivy_create_analyzer(shared_params->c_str(), ""));
+        AssertInfo(res.result_->success,
+                   "Tokenizer creation failed: {}",
+                   res.result_->error);
+        ptr_ = res.result_->value.ptr._0;
+    }
+
+    explicit Tokenizer(std::string&& params, std::string&& extra_info) {
+        auto shared_params = std::make_shared<std::string>(params);
+        auto shared_extra_info = std::make_shared<std::string>(extra_info);
+        auto res = RustResultWrapper(tantivy_create_analyzer(
+            shared_params->c_str(), shared_extra_info->c_str()));
         AssertInfo(res.result_->success,
                    "Tokenizer creation failed: {}",
                    res.result_->error);
@@ -69,4 +80,31 @@ set_tokenizer_options(std::string&& params) {
                res.result_->error);
 }
 
+inline std::pair<int64_t*, size_t>
+validate_analyzer(std::string&& params, std::string&& extra_info) {
+    auto shared_params = std::make_shared<std::string>(params);
+    auto shared_extra_info = std::make_shared<std::string>(extra_info);
+    auto res = RustResultWrapper(tantivy_validate_analyzer(
+        shared_params->c_str(), shared_extra_info->c_str()));
+    AssertInfo(res.result_->success,
+               "Validate analyzer params failed: {}",
+               res.result_->error);
+    auto array_wrapper =
+        RustArrayI64Wrapper(std::move(res.result_->value.rust_array_i64._0));
+    auto* array = array_wrapper.array_.array;
+    auto len = array_wrapper.array_.len;
+
+    int64_t* result = nullptr;
+    if (len > 0) {
+        result = static_cast<int64_t*>(malloc(len * sizeof(int64_t)));
+        if (result == nullptr) {
+            throw std::bad_alloc();
+        }
+        std::memcpy(result,
+                    array,
+                    len * sizeof(int64_t));  // Copy the array to the result
+    }
+    return {result, len};
+}
+
 }  // namespace milvus::tantivy
@@ -17,6 +17,8 @@
 package model
 
 import (
+	"slices"
+
 	"github.com/samber/lo"
 
 	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
@@ -52,6 +54,7 @@ type Collection struct {
 	UpdateTimestamp      uint64
 	SchemaVersion        int32
 	ShardInfos           map[string]*ShardInfo
+	FileResourceIds      []int64
 }
 
 type ShardInfo struct {
@@ -90,6 +93,7 @@ func (c *Collection) ShallowClone() *Collection {
 		UpdateTimestamp:      c.UpdateTimestamp,
 		SchemaVersion:        c.SchemaVersion,
 		ShardInfos:           c.ShardInfos,
+		FileResourceIds:      c.FileResourceIds,
 	}
 }
 
@@ -127,6 +131,7 @@ func (c *Collection) Clone() *Collection {
 		UpdateTimestamp:      c.UpdateTimestamp,
 		SchemaVersion:        c.SchemaVersion,
 		ShardInfos:           shardInfos,
+		FileResourceIds:      slices.Clone(c.FileResourceIds),
 	}
 }
 
@@ -232,6 +237,7 @@ func UnmarshalCollectionModel(coll *pb.CollectionInfo) *Collection {
 		UpdateTimestamp:      coll.UpdateTimestamp,
 		SchemaVersion:        coll.Schema.Version,
 		ShardInfos:           shardInfos,
+		FileResourceIds:      coll.Schema.GetFileResourceIds(),
 	}
 }
 
@@ -283,6 +289,7 @@ func marshalCollectionModelWithConfig(coll *Collection, c *config) *pb.Collectio
 		EnableDynamicField: coll.EnableDynamicField,
 		DbName:             coll.DBName,
 		Version:            coll.SchemaVersion,
+		FileResourceIds:    coll.FileResourceIds,
 	}
 
 	if c.withFields {
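A brief illustrative note on the clone semantics above (not part of the diff): Clone copies the id slice via slices.Clone, while ShallowClone reuses the same backing array, so only the deep clone can be mutated without affecting the original. A minimal sketch, assuming the model package path shown in the import is correct:

```go
package model_test

import (
	"testing"

	"github.com/milvus-io/milvus/internal/metastore/model" // assumed import path
)

func TestFileResourceIdsCloneSemantics(t *testing.T) {
	c := &model.Collection{FileResourceIds: []int64{100, 101}} // hypothetical ids

	deep := c.Clone()           // slices.Clone: independent backing array
	shallow := c.ShallowClone() // shares the backing array with c

	deep.FileResourceIds[0] = 200
	if c.FileResourceIds[0] != 100 {
		t.Fatal("deep clone should not share storage with the original")
	}

	shallow.FileResourceIds[0] = 300
	if c.FileResourceIds[0] != 300 {
		t.Fatal("shallow clone shares storage with the original")
	}
}
```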
@@ -1667,17 +1667,20 @@ func (node *QueryNode) ValidateAnalyzer(ctx context.Context, req *querypb.Valida
 	}
 	defer node.lifetime.Done()
 
+	resourceSet := typeutil.NewSet[int64]()
+
 	for _, info := range req.AnalyzerInfos {
-		err := analyzer.ValidateAnalyzer(info.GetParams())
+		ids, err := analyzer.ValidateAnalyzer(info.GetParams())
 		if err != nil {
 			if info.GetName() != "" {
 				return &querypb.ValidateAnalyzerResponse{Status: merr.Status(merr.WrapErrParameterInvalidMsg("validate analyzer failed for field: %s, name: %s, error: %v", info.GetField(), info.GetName(), err))}, nil
 			}
 			return &querypb.ValidateAnalyzerResponse{Status: merr.Status(merr.WrapErrParameterInvalidMsg("validate analyzer failed for field: %s, error: %v", info.GetField(), err))}, nil
 		}
+		resourceSet.Insert(ids...)
 	}
 
-	return &querypb.ValidateAnalyzerResponse{Status: merr.Status(nil)}, nil
+	return &querypb.ValidateAnalyzerResponse{Status: merr.Status(nil), ResourceIds: resourceSet.Collect()}, nil
 }
 
 type deleteRequestStringer struct {
@@ -200,6 +200,7 @@ func (t *createCollectionTask) validateSchema(ctx context.Context, schema *schem
 	}
 
 	// validate analyzer params at any streaming node
+	// and set file resource ids to schema
 	if len(analyzerInfos) > 0 {
 		resp, err := t.mixCoord.ValidateAnalyzer(t.ctx, &querypb.ValidateAnalyzerRequest{
 			AnalyzerInfos: analyzerInfos,
@@ -211,6 +212,7 @@ func (t *createCollectionTask) validateSchema(ctx context.Context, schema *schem
 		if err := merr.Error(resp.GetStatus()); err != nil {
 			return err
 		}
+		schema.FileResourceIds = resp.GetResourceIds()
 	}
 
 	return validateFieldDataType(schema.GetFields())
@@ -204,6 +204,7 @@ func newCollectionModel(header *message.CreateCollectionMessageHeader, body *mes
 		UpdateTimestamp:      ts,
 		SchemaVersion:        0,
 		ShardInfos:           shardInfos,
+		FileResourceIds:      body.CollectionSchema.GetFileResourceIds(),
 	}
 }
 
@@ -1080,6 +1080,7 @@ func convertModelToDesc(collInfo *model.Collection, aliases []string, dbName str
 		Functions:          model.MarshalFunctionModels(collInfo.Functions),
 		EnableDynamicField: collInfo.EnableDynamicField,
 		Properties:         collInfo.Properties,
+		FileResourceIds:    collInfo.FileResourceIds,
 	}
 	resp.CollectionID = collInfo.CollectionID
 	resp.VirtualChannelNames = collInfo.VirtualChannelNames
@@ -11,11 +11,15 @@ type (
 )
 
 func NewAnalyzer(param string) (Analyzer, error) {
-	return canalyzer.NewAnalyzer(param)
+	return canalyzer.NewAnalyzer(param, "")
 }
 
-func ValidateAnalyzer(param string) error {
-	return canalyzer.ValidateAnalyzer(param)
+func ValidateAnalyzer(param string) ([]int64, error) {
+	return canalyzer.ValidateAnalyzer(param, "")
+}
+
+func UpdateGlobalResourceInfo(resourceMap map[string]int64) error {
+	return canalyzer.UpdateGlobalResourceInfo(resourceMap)
 }
 
 func InitOptions() {
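For orientation, a minimal sketch (not part of the diff) of how a caller inside the repository might drive the updated wrapper API; the package path, the resource name, and the id value 100 are assumptions used only for illustration:

```go
package main

import (
	"fmt"

	"github.com/milvus-io/milvus/internal/util/analyzer" // assumed import path
)

func main() {
	// Register resource name -> id mappings once per node (hypothetical values);
	// analyzers created without explicit extra info then resolve files through this map.
	if err := analyzer.UpdateGlobalResourceInfo(map[string]int64{"jieba_dict": 100}); err != nil {
		panic(err)
	}

	// Validation now also reports which file resource ids the params reference.
	ids, err := analyzer.ValidateAnalyzer(`{"tokenizer": "standard"}`)
	if err != nil {
		panic(err)
	}
	fmt.Println(ids) // empty here: the standard tokenizer reads no resource files
}
```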
@@ -13,9 +13,11 @@ import (
 	"sync"
 	"unsafe"
 
+	"github.com/cockroachdb/errors"
 	"go.uber.org/zap"
 
 	"github.com/milvus-io/milvus/internal/util/analyzer/interfaces"
+	"github.com/milvus-io/milvus/internal/util/pathutil"
 	"github.com/milvus-io/milvus/pkg/v2/log"
 	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )
@@ -23,8 +25,8 @@ import (
 const (
 	LinderaDictURLKey = "lindera_download_urls"
 	ResourceMapKey    = "resource_map"
-	DictPathKey       = "local_dict_path"
 	ResourcePathKey   = "resource_path"
+	StorageNameKey    = "storage_name"
 )
 
 var initOnce sync.Once
@@ -39,7 +41,7 @@ func UpdateParams() {
 	cfg := paramtable.Get()
 	params := map[string]any{}
 	params[LinderaDictURLKey] = cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
-	params[DictPathKey] = cfg.FunctionCfg.LocalResourcePath.GetValue()
+	params[ResourcePathKey] = pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
 
 	bytes, err := json.Marshal(params)
 	if err != nil {
@@ -55,12 +57,31 @@ func UpdateParams() {
 	}
 }
 
-func NewAnalyzer(param string) (interfaces.Analyzer, error) {
+func UpdateGlobalResourceInfo(resourceMap map[string]int64) error {
+	bytes, err := json.Marshal(map[string]any{"resource_map": resourceMap})
+	if err != nil {
+		return errors.Wrap(err, "marshal global resource info failed")
+	}
+
+	paramPtr := C.CString(string(bytes))
+	defer C.free(unsafe.Pointer(paramPtr))
+
+	status := C.set_tokenizer_option(paramPtr)
+	if err := HandleCStatus(&status, "failed to update global resource info"); err != nil {
+		return errors.Wrap(err, "update global resource info failed")
+	}
+	return nil
+}
+
+func NewAnalyzer(param string, extraInfo string) (interfaces.Analyzer, error) {
 	paramPtr := C.CString(param)
 	defer C.free(unsafe.Pointer(paramPtr))
 
+	extraInfoPtr := C.CString(extraInfo)
+	defer C.free(unsafe.Pointer(extraInfoPtr))
+
 	var ptr C.CTokenizer
-	status := C.create_tokenizer(paramPtr, &ptr)
+	status := C.create_tokenizer(paramPtr, extraInfoPtr, &ptr)
 	if err := HandleCStatus(&status, "failed to create analyzer"); err != nil {
 		return nil, err
 	}
@@ -68,13 +89,21 @@ func NewAnalyzer(param string) (interfaces.Analyzer, error) {
 	return NewCAnalyzer(ptr), nil
 }
 
-func ValidateAnalyzer(param string) error {
+func ValidateAnalyzer(param string, extraInfo string) ([]int64, error) {
 	paramPtr := C.CString(param)
 	defer C.free(unsafe.Pointer(paramPtr))
 
-	status := C.validate_tokenizer(paramPtr)
-	if err := HandleCStatus(&status, "failed to create tokenizer"); err != nil {
-		return err
+	extraInfoPtr := C.CString(extraInfo)
+	defer C.free(unsafe.Pointer(extraInfoPtr))
+
+	result := C.validate_tokenizer(paramPtr, extraInfoPtr)
+	if err := HandleCStatus(&result.status, "failed to validate tokenizer"); err != nil {
+		return nil, err
 	}
-	return nil
+
+	cIds := unsafe.Slice((*int64)(unsafe.Pointer(result.resource_ids)), result.resource_ids_count)
+	goIds := make([]int64, len(cIds))
+	copy(goIds, cIds)
+	C.free(unsafe.Pointer(result.resource_ids))
+	return goIds, nil
 }
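The extraInfo string that this cgo layer forwards to Rust is an opaque JSON document; its expected shape (a resource_map of resource name to id plus a storage_name) can be read off the ResourceMapKey/StorageNameKey constants above and the test that follows. A minimal, hypothetical sketch of composing it before calling ValidateAnalyzer:

```go
// Sketch only: the resource name "jieba_dict", id 100, and storage name "default"
// are placeholder values mirroring the test below, not fixed API constants.
package canalyzer_example

import "encoding/json"

func buildExtraInfo() (string, error) {
	b, err := json.Marshal(map[string]any{
		"resource_map": map[string]int64{"jieba_dict": 100}, // resource name -> resource id
		"storage_name": "default",                           // storage namespace for resource files
	})
	if err != nil {
		return "", err
	}
	return string(b), nil
}
```

The resulting string would be passed as the second argument of ValidateAnalyzer, exactly as the new test below does with an inline literal.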
@@ -4,6 +4,8 @@ import (
 	"context"
 	"fmt"
 	"net"
+	"os"
+	"path/filepath"
 	"strings"
 	"testing"
 
@@ -12,6 +14,8 @@ import (
 	"google.golang.org/grpc"
 
 	pb "github.com/milvus-io/milvus-proto/go-api/v2/tokenizerpb"
+	"github.com/milvus-io/milvus/internal/util/pathutil"
+	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )
 
 type mockServer struct {
@@ -32,7 +36,7 @@ func TestAnalyzer(t *testing.T) {
 	// use default analyzer.
 	{
 		m := "{}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -48,7 +52,7 @@ func TestAnalyzer(t *testing.T) {
 
 	{
 		m := ""
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -65,7 +69,7 @@ func TestAnalyzer(t *testing.T) {
 	// use default tokenizer.
 	{
 		m := "{\"tokenizer\": \"standard\"}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -82,7 +86,7 @@ func TestAnalyzer(t *testing.T) {
 	// jieba tokenizer.
 	{
 		m := "{\"tokenizer\": \"jieba\"}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -124,7 +128,7 @@ func TestAnalyzer(t *testing.T) {
 		defer stop()
 
 		m := "{\"tokenizer\": {\"type\":\"grpc\", \"endpoint\":\"http://" + addr + "\"}}"
-		analyzer, err := NewAnalyzer(m)
+		analyzer, err := NewAnalyzer(m, "")
 		assert.NoError(t, err)
 		defer analyzer.Destroy()
 
@@ -138,7 +142,7 @@ func TestAnalyzer(t *testing.T) {
 	// lindera tokenizer.
 	{
 		m := "{\"tokenizer\": {\"type\":\"lindera\", \"dict_kind\": \"ipadic\"}}"
-		tokenizer, err := NewAnalyzer(m)
+		tokenizer, err := NewAnalyzer(m, "")
 		require.NoError(t, err)
 		defer tokenizer.Destroy()
 
@@ -156,20 +160,78 @@ func TestValidateAnalyzer(t *testing.T) {
 	// valid analyzer
 	{
 		m := "{\"tokenizer\": \"standard\"}"
-		err := ValidateAnalyzer(m)
+		ids, err := ValidateAnalyzer(m, "")
 		assert.NoError(t, err)
+		assert.Equal(t, len(ids), 0)
 	}
 
 	{
 		m := ""
-		err := ValidateAnalyzer(m)
+		_, err := ValidateAnalyzer(m, "")
 		assert.NoError(t, err)
 	}
 
 	// invalid tokenizer
 	{
 		m := "{\"tokenizer\": \"invalid\"}"
-		err := ValidateAnalyzer(m)
+		_, err := ValidateAnalyzer(m, "")
 		assert.Error(t, err)
 	}
+
+	// with user resource
+	{
+		resourcePath := pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
+		defer os.RemoveAll(resourcePath)
+		UpdateParams()
+		resourceID := int64(100)
+
+		// mock remote resource file
+		dir := filepath.Join(resourcePath, "default", fmt.Sprintf("%d", resourceID))
+		err := os.MkdirAll(dir, os.ModePerm)
+		require.NoError(t, err)
+
+		f, err := os.Create(filepath.Join(dir, "jieba.txt"))
+		require.NoError(t, err)
+
+		f.WriteString("stop")
+		f.Close()
+
+		m := "{\"tokenizer\": \"standard\", \"filter\": [{\"type\": \"stop\", \"stop_words_file\": {\"type\": \"remote\",\"resource_name\": \"jieba_dict\", \"file_name\": \"jieba.txt\"}}]}"

+		ids, err := ValidateAnalyzer(m, "{\"resource_map\": {\"jieba_dict\": 100}, \"storage_name\": \"default\"}")
+		require.NoError(t, err)
+		assert.Equal(t, len(ids), 1)
+		assert.Equal(t, ids[0], resourceID)
+	}
+
+	// with user resource and update global resource info
+	{
+		resourcePath := pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
+		defer os.RemoveAll(resourcePath)
+		UpdateParams()
+		resourceID := int64(100)
+
+		// mock remote resource file
+		dir := filepath.Join(resourcePath, fmt.Sprintf("%d", resourceID))
+		err := os.MkdirAll(dir, os.ModePerm)
+		require.NoError(t, err)
+
+		f, err := os.Create(filepath.Join(dir, "jieba.txt"))
+		require.NoError(t, err)
+
+		f.WriteString("stop")
+		f.Close()
+
+		m := "{\"tokenizer\": \"standard\", \"filter\": [{\"type\": \"stop\", \"stop_words_file\": {\"type\": \"remote\",\"resource_name\": \"jieba_dict\", \"file_name\": \"jieba.txt\"}}]}"

+		// update global resource info
+		err = UpdateGlobalResourceInfo(map[string]int64{"jieba_dict": resourceID})
+		require.NoError(t, err)
+
+		ids, err := ValidateAnalyzer(m, "")
+		require.NoError(t, err)
+
+		assert.Equal(t, len(ids), 1)
+		assert.Equal(t, ids[0], resourceID)
+	}
 }
@@ -22,7 +22,7 @@ require (
 	github.com/jolestar/go-commons-pool/v2 v2.1.2
 	github.com/json-iterator/go v1.1.13-0.20220915233716-71ac16282d12
 	github.com/klauspost/compress v1.18.0
-	github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece
+	github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f
 	github.com/minio/minio-go/v7 v7.0.73
 	github.com/panjf2000/ants/v2 v2.11.3
 	github.com/prometheus/client_golang v1.20.5
@@ -482,8 +482,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
 github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
 github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece h1:s0TFMZBxADKSzIr7LW/TE3L/WgCuo7QOfzkYX92Xog0=
-github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251215075310-deda9c0dcece/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f h1:YQ61KOySWPEXv8ePkr0Cu5q5iVHN11IIUSTWIiALCE8=
+github.com/milvus-io/milvus-proto/go-api/v2 v2.6.6-0.20251218031911-f415d420437f/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
 github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
 github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
 github.com/minio/minio-go/v7 v7.0.73 h1:qr2vi96Qm7kZ4v7LLebjte+MQh621fFWnv93p12htEo=
@@ -1037,6 +1037,7 @@ message ValidateAnalyzerRequest{
 
 message ValidateAnalyzerResponse{
   common.Status status = 1;
+  repeated int64 resource_ids = 2;
 }
 
 message HighlightOptions{
File diff suppressed because it is too large