Mirror of https://gitee.com/milvus-io/milvus.git (synced 2025-12-06 17:18:35 +08:00)
enhance: add global analyzer options (#44684)
Related issue: https://github.com/milvus-io/milvus/issues/43687

Add global analyzer options, so that Milvus system params no longer have to be merged into the user's analyzer params.

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
parent c33d221536
commit ad9a0cae48
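For context (not part of the diff): the global options travel from Go configuration down through CGO as a single JSON map, which the Rust side caches in the new runtime_option module. A minimal sketch of what such a payload might look like, using the keys defined in runtime_option.rs below; the values here are made up for illustration:

    use serde_json::json;

    fn main() {
        // Hypothetical global analyzer options payload; the keys follow
        // runtime_option.rs below, the values are illustrative only.
        let options = json!({
            "lindera_download_urls": {
                "ipadic": ["https://example.com/mecab-ipadic.tar.gz"]
            },
            "default_dict_path": "/var/lib/milvus/dict"
        });
        // The Go side marshals a map like this and passes the string
        // through set_tokenizer_option -> tantivy_set_analyzer_options.
        println!("{}", options);
    }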
@@ -20,6 +20,18 @@
 using Map = std::map<std::string, std::string>;

+CStatus
+set_tokenizer_option(const char* params) {
+    SCOPE_CGO_CALL_METRIC();
+
+    try {
+        milvus::tantivy::set_tokenizer_options(params);
+        return milvus::SuccessCStatus();
+    } catch (std::exception& e) {
+        return milvus::FailureCStatus(&e);
+    }
+}
+
 CStatus
 create_tokenizer(const char* params, CTokenizer* tokenizer) {
     SCOPE_CGO_CALL_METRIC();
@@ -13,6 +13,7 @@
 #include <stdint.h>

 #include "common/common_type_c.h"
 #include "segcore/token_stream_c.h"
+#include "common/type_c.h"

@@ -22,6 +23,9 @@ extern "C" {

 typedef void* CTokenizer;

+CStatus
+set_tokenizer_option(const char* params);
+
 CStatus
 create_tokenizer(const char* params, CTokenizer* tokenizer);
@@ -500,6 +500,8 @@ void *tantivy_clone_analyzer(void *ptr);

 void tantivy_free_analyzer(void *tokenizer);

+RustResult tantivy_set_analyzer_options(const char *params);
+
 bool tantivy_index_exist(const char *path);

 }  // extern "C"
@@ -2,8 +2,10 @@ mod analyzer;
 mod build_in_analyzer;
 mod dict;
 mod filter;
+mod runtime_option;

 pub mod tokenizers;
 pub use self::analyzer::{create_analyzer, create_analyzer_by_json};
+pub use self::runtime_option::set_options;

 pub(crate) use self::build_in_analyzer::standard_analyzer;
internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/runtime_option.rs (vendored, new file, 144 lines)
@@ -0,0 +1,144 @@
use crate::error::{Result, TantivyBindingError};
use once_cell::sync::Lazy;
use serde_json as json;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

static GLOBAL_OPTIONS: Lazy<Arc<RuntimeOption>> = Lazy::new(|| Arc::new(RuntimeOption::new()));

// cache keys
static LINDERA_DOWNLOAD_KEY: &str = "lindera_download_urls";
static RESOURCE_MAP_KEY: &str = "resource_map";

// normal keys
pub static DEFAULT_DICT_PATH_KEY: &str = "default_dict_path";
pub static RESOURCE_PATH_KEY: &str = "resource_path";

pub fn set_options(params: &String) -> Result<()> {
    GLOBAL_OPTIONS.set_json(params)
}

pub fn get_options(key: &str) -> Option<json::Value> {
    GLOBAL_OPTIONS.get(key)
}

pub fn get_lindera_download_url(kind: &str) -> Option<Vec<String>> {
    GLOBAL_OPTIONS.get_lindera_download_urls(kind)
}

pub fn get_resource_id(name: &str) -> Option<i64> {
    GLOBAL_OPTIONS.get_resource_id(name)
}

// analyzer options
struct RuntimeOption {
    inner: RwLock<RuntimeOptionInner>,
}

impl RuntimeOption {
    fn new() -> Self {
        RuntimeOption {
            inner: RwLock::new(RuntimeOptionInner::new()),
        }
    }

    fn set_json(&self, json_params: &String) -> Result<()> {
        let mut w = self.inner.write().unwrap();
        w.set_json(json_params)
    }

    fn get(&self, key: &str) -> Option<json::Value> {
        let r = self.inner.read().unwrap();
        r.params.get(key).map(|v| v.clone())
    }

    fn get_lindera_download_urls(&self, kind: &str) -> Option<Vec<String>> {
        let r = self.inner.read().unwrap();
        r.lindera_download_urls.get(kind).map(|v| v.clone())
    }

    fn get_resource_id(&self, name: &str) -> Option<i64> {
        let r = self.inner.read().unwrap();
        r.resource_map.get(name).cloned()
    }
}

struct RuntimeOptionInner {
    params: HashMap<String, json::Value>,
    resource_map: HashMap<String, i64>, // resource name -> resource id
    lindera_download_urls: HashMap<String, Vec<String>>, // dict name -> urls
}

impl RuntimeOptionInner {
    fn new() -> Self {
        RuntimeOptionInner {
            params: HashMap::new(),
            resource_map: HashMap::new(),
            lindera_download_urls: HashMap::new(),
        }
    }

    fn set_json(&mut self, json_params: &String) -> Result<()> {
        let v = json::from_str::<json::Value>(json_params)
            .map_err(|e| TantivyBindingError::JsonError(e))?;

        let m = v.as_object().ok_or(TantivyBindingError::InternalError(
            "analyzer params should be a json map".to_string(),
        ))?;

        for (key, value) in m.to_owned() {
            self.set(key, value)?;
        }

        Ok(())
    }

    fn set(&mut self, key: String, value: json::Value) -> Result<()> {
        // cache the lindera download url map
        if key == LINDERA_DOWNLOAD_KEY {
            self.lindera_download_urls = HashMap::new();

            let m = value.as_object().ok_or(TantivyBindingError::InternalError(
                "lindera download urls should be a json map".to_string(),
            ))?;

            for (key, value) in m {
                let array = value.as_array().ok_or(TantivyBindingError::InternalError(
                    "lindera download urls should be a list".to_string(),
                ))?;

                if !array.iter().all(|v| v.is_string()) {
                    return Err(TantivyBindingError::InternalError(
                        "all elements in lindera download urls must be strings".to_string(),
                    ));
                }

                let urls = array
                    .iter()
                    .filter_map(|v| v.as_str().map(|s| s.to_string()))
                    .collect();
                self.lindera_download_urls.insert(key.to_string(), urls);
            }
            return Ok(());
        }

        if key == RESOURCE_MAP_KEY {
            self.resource_map = HashMap::new();

            let m = value.as_object().ok_or(TantivyBindingError::InternalError(
                "resource map should be a json map".to_string(),
            ))?;

            for (key, value) in m {
                let id = value.as_i64().ok_or(TantivyBindingError::InternalError(
                    "resource id should be an integer".to_string(),
                ))?;
                self.resource_map.insert(key.to_string(), id);
            }
            return Ok(());
        }

        self.params.insert(key, value);
        Ok(())
    }
}
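The module above is process-global state behind a RwLock. A usage sketch (not part of the commit), written as if it were a test module inside runtime_option.rs; it assumes only the public functions shown above:

    #[cfg(test)]
    mod tests {
        use super::{get_lindera_download_url, get_resource_id, set_options};

        #[test]
        fn test_set_and_get_options() {
            // The two cache keys are parsed into typed maps; any other
            // top-level key lands in the generic params map.
            let params = r#"{
                "lindera_download_urls": {"ipadic": ["https://example.com/ipadic.tar.gz"]},
                "resource_map": {"stopwords_en": 1}
            }"#
            .to_string();
            set_options(&params).expect("a json map should be accepted");

            assert_eq!(
                get_lindera_download_url("ipadic"),
                Some(vec!["https://example.com/ipadic.tar.gz".to_string()])
            );
            assert_eq!(get_resource_id("stopwords_en"), Some(1));
            // Misses return None rather than an error.
            assert_eq!(get_resource_id("missing"), None);
        }
    }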
@@ -6,6 +6,7 @@ use lindera::mode::Mode;
 use lindera::segmenter::Segmenter;
 use lindera::token::Token as LToken;
 use lindera::tokenizer::Tokenizer as LTokenizer;
+use log::warn;
 use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

 use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
@@ -16,7 +17,9 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
 use lindera::token_filter::BoxTokenFilter as LTokenFilter;

 use crate::analyzer::dict::lindera::load_dictionary_from_kind;
-use crate::analyzer::filter::get_string_list;
+use crate::analyzer::runtime_option::{
+    get_lindera_download_url, get_options, DEFAULT_DICT_PATH_KEY,
+};
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;

@@ -25,10 +28,8 @@ pub struct LinderaTokenStream<'a> {
     pub token: &'a mut Token,
 }

-const DICTKINDKEY: &str = "dict_kind";
-const DICTBUILDDIRKEY: &str = "dict_build_dir";
-const DICTDOWNLOADURLKEY: &str = "download_urls";
-const FILTERKEY: &str = "filter";
+const DICT_KIND_KEY: &str = "dict_kind";
+const FILTER_KEY: &str = "filter";

 impl<'a> TokenStream for LinderaTokenStream<'a> {
     fn advance(&mut self) -> bool {

@@ -67,8 +68,8 @@ impl LinderaTokenizer {
         let kind: DictionaryKind = fetch_lindera_kind(params)?;

         // for download dict online
-        let build_dir = fetch_dict_build_dir(params)?;
-        let download_urls = fetch_dict_download_urls(params)?;
+        let build_dir = fetch_dict_build_dir()?;
+        let download_urls = get_lindera_download_url(kind.as_str()).unwrap_or(vec![]);

         let dictionary = load_dictionary_from_kind(&kind, build_dir, download_urls)?;

@@ -132,7 +133,7 @@ impl DictionaryKindParser for &str {

 fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
     params
-        .get(DICTKINDKEY)
+        .get(DICT_KIND_KEY)
         .ok_or(TantivyBindingError::InvalidArgument(format!(
             "lindera tokenizer dict_kind must be set"
         )))?
@@ -143,21 +144,13 @@ fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<Diction
         .into_dict_kind()
 }

-fn fetch_dict_build_dir(params: &json::Map<String, json::Value>) -> Result<String> {
-    params
-        .get(DICTBUILDDIRKEY)
-        .map_or(Ok("/var/lib/milvus/dict/lindera".to_string()), |v| {
+fn fetch_dict_build_dir() -> Result<String> {
+    get_options(DEFAULT_DICT_PATH_KEY).map_or(Ok("/var/lib/milvus/dict/lindera".to_string()), |v| {
         v.as_str()
             .ok_or(TantivyBindingError::InvalidArgument(format!(
                 "dict build dir must be string"
             )))
             .map(|s| s.to_string())
     })
 }

-fn fetch_dict_download_urls(params: &json::Map<String, json::Value>) -> Result<Vec<String>> {
-    params.get(DICTDOWNLOADURLKEY).map_or(Ok(vec![]), |v| {
-        get_string_list(v, "lindera dict download urls")
-            .map(|s| format!("{}/{}", s, "lindera").to_string())
-    })
-}

@@ -328,7 +321,7 @@ fn fetch_lindera_token_filters(
 ) -> Result<Vec<LTokenFilter>> {
     let mut result: Vec<LTokenFilter> = vec![];

-    match params.get(FILTERKEY) {
+    match params.get(FILTER_KEY) {
         Some(v) => {
             let filter_list = v.as_array().ok_or_else(|| {
                 TantivyBindingError::InvalidArgument(format!("lindera filters should be array"))
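With this change the lindera tokenizer no longer reads download_urls out of the user's params; it asks the global option cache by dictionary kind and falls back to an empty list when nothing is configured. A standalone sketch of that lookup-with-fallback pattern (the resolve_urls helper and the URL are hypothetical, for illustration only):

    use std::collections::HashMap;

    // Sketch of the lookup the tokenizer now performs: a missing entry
    // simply means "no mirror configured", not an error.
    fn resolve_urls(cache: &HashMap<String, Vec<String>>, kind: &str) -> Vec<String> {
        cache.get(kind).cloned().unwrap_or_default()
    }

    fn main() {
        let mut cache = HashMap::new();
        cache.insert(
            "ipadic".to_string(),
            vec!["https://example.com/ipadic.tar.gz".to_string()],
        );
        assert_eq!(resolve_urls(&cache, "ipadic").len(), 1);
        assert!(resolve_urls(&cache, "ko-dic").is_empty()); // falls back to empty
    }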
@@ -131,6 +131,14 @@ pub struct RustResult {
 }

 impl RustResult {
+    pub fn from_success() -> Self {
+        RustResult {
+            success: true,
+            value: Value::None(()),
+            error: std::ptr::null(),
+        }
+    }
+
     pub fn from_ptr(value: *mut c_void) -> Self {
         RustResult {
             success: true,
@@ -1,8 +1,8 @@
 use libc::{c_char, c_void};
 use tantivy::tokenizer::TextAnalyzer;

+use crate::analyzer::{create_analyzer, set_options};
 use crate::{
-    analyzer::create_analyzer,
     array::RustResult,
     log::init_log,
     string_c::c_str_to_str,
@@ -34,3 +34,19 @@ pub extern "C" fn tantivy_clone_analyzer(ptr: *mut c_void) -> *mut c_void {
 pub extern "C" fn tantivy_free_analyzer(tokenizer: *mut c_void) {
     free_binding::<TextAnalyzer>(tokenizer);
 }
+
+#[no_mangle]
+pub extern "C" fn tantivy_set_analyzer_options(params: *const c_char) -> RustResult {
+    init_log();
+    let json_str = unsafe { c_str_to_str(params).to_string() };
+
+    set_options(&json_str).map_or_else(
+        |e| {
+            RustResult::from_error(format!(
+                "set analyzer option failed: {}, params: {}",
+                e, json_str
+            ))
+        },
+        |_| RustResult::from_success(),
+    )
+}
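A sketch of exercising this FFI entry point from a Rust test (not part of the commit; it assumes the RustResult success field is accessible in this module, and that the function tolerates bad JSON by returning a failed result rather than panicking across the boundary):

    #[cfg(test)]
    mod ffi_tests {
        use super::tantivy_set_analyzer_options;
        use std::ffi::CString;

        #[test]
        fn test_set_analyzer_options_ffi() {
            // Exercise the C entry point the same way the Go side does:
            // a NUL-terminated JSON string in, a RustResult out.
            let params = CString::new(r#"{"default_dict_path": "/tmp/dict"}"#).unwrap();
            let res = tantivy_set_analyzer_options(params.as_ptr());
            assert!(res.success);

            // Invalid JSON should come back as a failed RustResult.
            let bad = CString::new("{not json").unwrap();
            let res = tantivy_set_analyzer_options(bad.as_ptr());
            assert!(!res.success);
        }
    }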
internal/core/thirdparty/tantivy/tokenizer.h (vendored, 11 lines changed)
@@ -5,6 +5,7 @@
 #include "rust-hashmap.h"
 #include "tantivy/rust-array.h"
 #include "token-stream.h"
+#include "log/Log.h"

 namespace milvus::tantivy {

@@ -58,4 +59,14 @@ struct Tokenizer {
     void* ptr_;
 };

+void
+set_tokenizer_options(std::string&& params) {
+    auto shared_params = std::make_shared<std::string>(params);
+    auto res =
+        RustResultWrapper(tantivy_set_analyzer_options(shared_params->c_str()));
+    AssertInfo(res.result_->success,
+               "Set analyzer option failed: {}",
+               res.result_->error);
+}
+
 }  // namespace milvus::tantivy
@@ -55,6 +55,7 @@ import (
     "github.com/milvus-io/milvus/internal/registry"
     "github.com/milvus-io/milvus/internal/storage"
     "github.com/milvus-io/milvus/internal/types"
+    "github.com/milvus-io/milvus/internal/util/analyzer"
     "github.com/milvus-io/milvus/internal/util/dependency"
     "github.com/milvus-io/milvus/internal/util/hookutil"
     "github.com/milvus-io/milvus/internal/util/initcore"
@@ -300,6 +301,8 @@ func (node *QueryNode) Init() error {
     }

     node.factory.Init(paramtable.Get())
+    // init analyzer options
+    analyzer.InitOptions()

     localRootPath := paramtable.Get().LocalStorageCfg.Path.GetValue()
     localUsedSize, err := segcore.GetLocalUsedSize(localRootPath)
@@ -17,3 +17,7 @@ func NewAnalyzer(param string) (Analyzer, error) {
 func ValidateAnalyzer(param string) error {
     return canalyzer.ValidateAnalyzer(param)
 }
+
+func InitOptions() {
+    canalyzer.InitOptions()
+}
@ -10,21 +10,52 @@ import "C"
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"path"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/milvus-io/milvus/internal/util/analyzer/interfaces"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
||||
"github.com/milvus-io/milvus/pkg/v2/log"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
|
||||
)
|
||||
|
||||
func NewAnalyzer(param string) (interfaces.Analyzer, error) {
|
||||
param, err := CheckAndFillParams(param)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
const (
|
||||
LinderaDictURLKey = "lindera_download_urls"
|
||||
ResourceMapKey = "resource_map"
|
||||
DictPathKey = "local_dict_path"
|
||||
ResourcePathKey = "resource_path"
|
||||
)
|
||||
|
||||
var initOnce sync.Once
|
||||
|
||||
func InitOptions() {
|
||||
initOnce.Do(func() {
|
||||
UpdateParams()
|
||||
})
|
||||
}
|
||||
|
||||
func UpdateParams() {
|
||||
cfg := paramtable.Get()
|
||||
params := map[string]any{}
|
||||
params[LinderaDictURLKey] = cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
|
||||
params[DictPathKey] = cfg.FunctionCfg.LocalResourcePath.GetValue()
|
||||
|
||||
bytes, err := json.Marshal(params)
|
||||
if err != nil {
|
||||
log.Panic("init analyzer option failed", zap.Error(err))
|
||||
}
|
||||
|
||||
paramPtr := C.CString(string(bytes))
|
||||
defer C.free(unsafe.Pointer(paramPtr))
|
||||
|
||||
status := C.set_tokenizer_option(paramPtr)
|
||||
if err := HandleCStatus(&status, "failed to init segcore analyzer option"); err != nil {
|
||||
log.Panic("init analyzer option failed", zap.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
func NewAnalyzer(param string) (interfaces.Analyzer, error) {
|
||||
paramPtr := C.CString(param)
|
||||
defer C.free(unsafe.Pointer(paramPtr))
|
||||
|
||||
@ -38,11 +69,6 @@ func NewAnalyzer(param string) (interfaces.Analyzer, error) {
|
||||
}
|
||||
|
||||
func ValidateAnalyzer(param string) error {
|
||||
param, err := CheckAndFillParams(param)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
paramPtr := C.CString(param)
|
||||
defer C.free(unsafe.Pointer(paramPtr))
|
||||
|
||||
@ -52,91 +78,3 @@ func ValidateAnalyzer(param string) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func CheckAndFillParams(params string) (string, error) {
|
||||
if len(params) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
var paramMaps map[string]any
|
||||
flag := false
|
||||
err := json.Unmarshal([]byte(params), ¶mMaps)
|
||||
if err != nil {
|
||||
return "", merr.WrapErrAsInputError(fmt.Errorf("unmarshal analyzer params failed with json error: %s", err.Error()))
|
||||
}
|
||||
|
||||
tokenizer, ok := paramMaps["tokenizer"]
|
||||
if !ok {
|
||||
// skip check if no tokenizer params
|
||||
return params, nil
|
||||
}
|
||||
|
||||
switch value := tokenizer.(type) {
|
||||
case string:
|
||||
// return if use build-in tokenizer
|
||||
return params, nil
|
||||
case map[string]any:
|
||||
flag, err = CheckAndFillTokenizerParams(value)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
default:
|
||||
return "", merr.WrapErrAsInputError(fmt.Errorf("analyzer params set tokenizer with unknown type"))
|
||||
}
|
||||
|
||||
// remarshal json params if params map was changed.
|
||||
if flag {
|
||||
bytes, err := json.Marshal(paramMaps)
|
||||
if err != nil {
|
||||
return "", merr.WrapErrAsInputError(fmt.Errorf("marshal analyzer params failed with json error: %s", err.Error()))
|
||||
}
|
||||
return string(bytes), nil
|
||||
}
|
||||
return params, nil
|
||||
}
|
||||
|
||||
// fill some milvus params to tokenizer params
|
||||
func CheckAndFillTokenizerParams(params map[string]any) (bool, error) {
|
||||
v, ok := params["type"]
|
||||
if !ok {
|
||||
return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer must set type"))
|
||||
}
|
||||
|
||||
tokenizerType, ok := v.(string)
|
||||
if !ok {
|
||||
return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer type must be string"))
|
||||
}
|
||||
|
||||
switch tokenizerType {
|
||||
case "lindera":
|
||||
cfg := paramtable.Get()
|
||||
|
||||
if _, ok := params["dict_build_dir"]; ok {
|
||||
return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer dict_build_dir was system params, should not be set"))
|
||||
}
|
||||
// build lindera to LocalResourcePath/lindera/dict_kind
|
||||
params["dict_build_dir"] = path.Join(cfg.FunctionCfg.LocalResourcePath.GetValue(), "lindera")
|
||||
|
||||
v, ok := params["dict_kind"]
|
||||
if !ok {
|
||||
return false, merr.WrapErrAsInputError(fmt.Errorf("lindera tokenizer must set dict_kind"))
|
||||
}
|
||||
dictKind, ok := v.(string)
|
||||
if !ok {
|
||||
return false, merr.WrapErrAsInputError(fmt.Errorf("lindera tokenizer dict kind must be string"))
|
||||
}
|
||||
dictUrlsMap := cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
|
||||
|
||||
if _, ok := params["download_urls"]; ok {
|
||||
return false, merr.WrapErrAsInputError(fmt.Errorf("costom tokenizer download_urls was system params, should not be set"))
|
||||
}
|
||||
|
||||
if value, ok := dictUrlsMap["."+dictKind]; ok {
|
||||
// use download urls set in milvus yaml
|
||||
params["download_urls"] = paramtable.ParseAsStings(value)
|
||||
}
|
||||
return true, nil
|
||||
default:
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
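The removed CheckAndFillParams path is the "merge Milvus params into the user's analyzer params" behavior the commit message refers to: system values were injected into every params string before it crossed into segcore. A before/after sketch of the param shapes (not part of the diff; the paths and URL are hypothetical, the key names come from the removed code above):

    use serde_json::json;

    fn main() {
        // What the user sends; with global options it now passes through untouched.
        let user_params = json!({
            "tokenizer": {"type": "lindera", "dict_kind": "ipadic"}
        });

        // What the removed CheckAndFillParams used to produce by merging
        // system config (dict_build_dir, download_urls) into user params.
        let merged = json!({
            "tokenizer": {
                "type": "lindera",
                "dict_kind": "ipadic",
                "dict_build_dir": "/var/lib/milvus/resource/lindera",
                "download_urls": ["https://example.com/ipadic.tar.gz"]
            }
        });

        println!("before: {}\nafter: {}", merged, user_params);
    }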
@@ -12,7 +12,6 @@ import (
     "google.golang.org/grpc"

     pb "github.com/milvus-io/milvus-proto/go-api/v2/tokenizerpb"
-    "github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )

 type mockServer struct {
@@ -90,7 +89,7 @@ func TestAnalyzer(t *testing.T) {
     tokenStream := analyzer.NewTokenStream("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途")
     defer tokenStream.Destroy()
     for tokenStream.Advance() {
-        fmt.Println(tokenStream.Token())
+        assert.NotEmpty(t, tokenStream.Token())
     }
 }

@@ -152,6 +151,8 @@ func TestAnalyzer(t *testing.T) {
 }

 func TestValidateAnalyzer(t *testing.T) {
+    InitOptions()
+
     // valid analyzer
     {
         m := "{\"tokenizer\": \"standard\"}"
@@ -172,71 +173,3 @@ func TestValidateAnalyzer(t *testing.T) {
         assert.Error(t, err)
     }
 }
-
-func TestCheckAndFillParams(t *testing.T) {
-    paramtable.Init()
-    paramtable.Get().SaveGroup(map[string]string{"function.analyzer.lindera.download_urls.ipadic": "/test/url"})
-
-    // normal case
-    {
-        m := "{\"tokenizer\": {\"type\":\"jieba\"}}"
-        _, err := CheckAndFillParams(m)
-        assert.NoError(t, err)
-    }
-
-    // fill lindera tokenizer download urls and dict local path
-    {
-        m := "{\"tokenizer\": {\"type\":\"lindera\", \"dict_kind\": \"ipadic\"}}"
-        _, err := CheckAndFillParams(m)
-        assert.NoError(t, err)
-    }
-
-    // error with wrong json
-    {
-        m := "{invalid json"
-        _, err := CheckAndFillParams(m)
-        assert.Error(t, err)
-    }
-
-    // skip if use default analyzer
-    {
-        m := "{}"
-        _, err := CheckAndFillParams(m)
-        assert.NoError(t, err)
-    }
-
-    // error tokenizer without type
-    {
-        m := "{\"tokenizer\": {\"dict_kind\": \"ipadic\"}}"
-        _, err := CheckAndFillParams(m)
-        assert.Error(t, err)
-    }
-
-    // error tokenizer type not string
-    {
-        m := "{\"tokenizer\": {\"type\": 1, \"dict_kind\": \"ipadic\"}}"
-        _, err := CheckAndFillParams(m)
-        assert.Error(t, err)
-    }
-
-    // error tokenizer params type
-    {
-        m := "{\"tokenizer\": 1}"
-        _, err := CheckAndFillParams(m)
-        assert.Error(t, err)
-    }
-
-    // error set dict_build_dir by user
-    {
-        m := "{\"tokenizer\": {\"type\": \"lindera\", \"dict_kind\": \"ipadic\", \"dict_build_dir\": \"/tmp/milvus\"}}"
-        _, err := CheckAndFillParams(m)
-        assert.Error(t, err)
-    }
-
-    // error lindera kind not set
-    {
-        m := "{\"tokenizer\": {\"type\": \"lindera\"}}"
-        _, err := CheckAndFillParams(m)
-        assert.Error(t, err)
-    }
-}
@@ -141,7 +141,7 @@ func (p *functionConfig) init(base *BaseTable) {
     p.LocalResourcePath.Init(base.mgr)

     p.LinderaDownloadUrls = ParamGroup{
-        KeyPrefix: "function.analyzer.lindera.download_urls",
+        KeyPrefix: "function.analyzer.lindera.download_urls.",
         Version:   "2.5.16",
     }
     p.LinderaDownloadUrls.Init(base.mgr)
@@ -1218,7 +1218,7 @@ func TestRunAnalyzer(t *testing.T) {

     // run analyzer with invalid params
     _, err = mc.RunAnalyzer(ctx, client.NewRunAnalyzerOption("text doc").WithAnalyzerParamsStr("invalid params}"))
-    common.CheckErr(t, err, false, "json error")
+    common.CheckErr(t, err, false, "JsonError")

     // run analyzer with custom analyzer
     tokens, err = mc.RunAnalyzer(ctx, client.NewRunAnalyzerOption("test doc").