enhance: support setting the lindera dict build dir and download URL in yaml (#43541)
Related: https://github.com/milvus-io/milvus/issues/43120
Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
parent e305a3fa35
commit 4f02b06abc
@@ -1409,3 +1409,5 @@ function:
     enable: true # Whether to enable TEI rerank service
   vllm:
     enable: true # Whether to enable vllm rerank service
+  analyzer:
+    local_resource_path: /var/lib/milvus/analyzer
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock (generated, vendored; 11 lines changed)
@@ -932,6 +932,16 @@ dependencies = [
  "num",
 ]
 
+[[package]]
+name = "fs2"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
+dependencies = [
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "fs4"
 version = "0.6.6"
@@ -3853,6 +3863,7 @@ dependencies = [
  "env_logger",
  "fancy-regex",
  "flate2",
+ "fs2",
  "futures",
  "icu_segmenter",
  "jieba-rs",
@@ -26,6 +26,7 @@ overflow-checks = false
 tantivy = { git = "https://github.com/zilliztech/tantivy.git" }
 tantivy-5 = { package = "tantivy", git = "https://github.com/zilliztech/tantivy", tag = "0.21.1-fix4" }
 lindera = "0.42.4"
+fs2 = "0.4"
 lindera-dictionary = "0.42.4"
 futures = "0.3.21"
 libc = "0.2"
@@ -78,9 +78,9 @@ impl AnalyzerBuilder<'_> {
         for filter in filters {
             if filter.is_string() {
                 let filter_name = filter.as_str().unwrap();
-                let costum = self.filters.remove(filter_name);
-                if !costum.is_none() {
-                    builder = costum.unwrap().transform(builder);
+                let customize = self.filters.remove(filter_name);
+                if !customize.is_none() {
+                    builder = customize.unwrap().transform(builder);
                     continue;
                 }
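Both occurrences of this rename (here and in the second analyzer file below) keep the original is_none()/unwrap() pair. A minimal sketch of an equivalent, more idiomatic shape, assuming transform takes the builder by value as it does here:

    // Hypothetical rewrite of the loop body above using `if let`:
    if let Some(f) = self.filters.remove(filter_name) {
        builder = f.transform(builder);
        continue;
    }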
@@ -1,21 +1,18 @@
 use super::fetch;
 use crate::error::{Result, TantivyBindingError};
 use lindera_dictionary::dictionary_builder::cc_cedict::CcCedictBuilder;
-use tokio::runtime::Runtime;
 
 #[cfg(feature = "lindera-cc-cedict")]
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 
 #[cfg(not(feature = "lindera-cc-cedict"))]
-async fn download(params: &fetch::FetchParams) -> Result<()> {
-    fetch::fetch(params, CcCedictBuilder::new())
-        .await
-        .map_err(|e| {
-            TantivyBindingError::InternalError(format!(
-                "fetch cc_cedict failed with error: {}",
-                e.to_string()
-            ))
-        })
+fn download(params: &fetch::FetchParams) -> Result<()> {
+    fetch::fetch(params, CcCedictBuilder::new()).map_err(|e| {
+        TantivyBindingError::InternalError(format!(
+            "fetch cc_cedict failed with error: {}",
+            e.to_string()
+        ))
+    })
 }
 
 #[cfg(not(feature = "lindera-cc-cedict"))]
@@ -39,8 +36,7 @@ pub fn load_cc_cedict(
         params.download_urls = download_url
     }
 
-    let rt = Runtime::new().unwrap();
-    rt.block_on(download(&params))?;
+    download(&params)?;
     fetch::load(&params)
 }
@@ -1,28 +1,59 @@
 use std::env;
 use std::error::Error;
-use std::path::Path;
+use std::fs;
+use std::io::{self, Cursor, Read, Write};
+use std::path::{Path, PathBuf};
+use std::time::Instant;
+
+use flate2::read::GzDecoder;
+use fs2::FileExt;
+use tar::Archive;
 
 use lindera::dictionary::Dictionary;
 use lindera_dictionary::dictionary::character_definition::CharacterDefinition;
 use lindera_dictionary::dictionary::connection_cost_matrix::ConnectionCostMatrix;
 use lindera_dictionary::dictionary::prefix_dictionary::PrefixDictionary;
 use lindera_dictionary::dictionary::unknown_dictionary::UnknownDictionary;
+use lindera_dictionary::dictionary_builder::DictionaryBuilder;
 use log::{error, info, warn};
 use md5::Context;
 use rand::{rngs::SmallRng, seq::SliceRandom, SeedableRng};
 use reqwest::Client;
 use tokio::runtime::Runtime;
 use tokio::time::sleep;
 use tokio::time::Duration;
 
 use serde_json as json;
-use std::fs;
-use std::path::PathBuf;
 
 use super::common;
 use crate::error::TantivyBindingError;
-use lindera_dictionary::dictionary_builder::DictionaryBuilder;
 
 const MAX_ROUND: usize = 3;
 
+pub struct FileMutexGuard {
+    file: Option<fs::File>,
+    path: PathBuf,
+}
+
+impl FileMutexGuard {
+    fn build(path: PathBuf) -> io::Result<FileMutexGuard> {
+        let flock = fs::File::create(&path)?;
+        flock.lock_exclusive()?;
+        Ok(FileMutexGuard {
+            file: Some(flock),
+            path: path,
+        })
+    }
+}
+
+impl Drop for FileMutexGuard {
+    fn drop(&mut self) {
+        if let Some(file) = self.file.take() {
+            let _ = file.unlock();
+            drop(file); // close the handle before removing the lock file
+            let _ = std::fs::remove_file(&self.path);
+        }
+    }
+}
+
 pub struct FetchParams {
     pub lindera_dir: String,
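FileMutexGuard wraps an advisory file lock from the newly added fs2 crate so that concurrent Milvus processes do not build the same dictionary directory at once; dropping the guard unlocks and removes the lock file. A minimal standalone sketch of the same idea (the path and function name are illustrative, not the binding's API):

    use std::fs::File;
    use fs2::FileExt; // fs2 0.4: lock_exclusive()/unlock() on std::fs::File

    fn with_build_lock(lock_path: &std::path::Path) -> std::io::Result<()> {
        let lock = File::create(lock_path)?;
        lock.lock_exclusive()?; // blocks until no other process holds the lock
        // ... download and build the dictionary here ...
        lock.unlock()?; // the OS also releases the lock when the handle is closed
        Ok(())
    }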
@@ -138,35 +169,20 @@ async fn download_with_retry(
     Err("Failed to download a valid file from all sources".into())
 }
 
-/// Fetch the necessary assets and then build the dictionary using `builder`
-pub async fn fetch(
+pub fn build(
     params: &FetchParams,
     builder: impl DictionaryBuilder,
+    build_dir: &PathBuf,
+    input_dir: &PathBuf,
+    output_dir: &PathBuf,
+    tmp_dir: &PathBuf,
 ) -> Result<(), Box<dyn Error>> {
-    use std::env;
-    use std::fs::{rename, File};
-    use std::io::{self, Cursor, Read, Write};
-    use std::path::{Path, PathBuf};
-    use std::time::Instant;
-
-    use flate2::read::GzDecoder;
-    use tar::Archive;
-
     let start = Instant::now();
     info!(
-        "start fetch lindera dictionary name: {}\n",
-        params.file_name.as_str()
+        "start download and build lindera dictionary. name: {} to {:?}\n",
+        params.file_name.as_str(),
+        output_dir,
     );
-    let build_dir = PathBuf::from(params.lindera_dir.as_str());
-    std::fs::create_dir_all(&build_dir)?;
-
-    let input_dir = build_dir.join(params.input_dir.as_str());
-    let output_dir = build_dir.join(params.output_dir.as_str());
-
-    // Fast path where the data is already in cache
-    if output_dir.is_dir() {
-        return Ok(());
-    }
-
     // Source file path for build package
     let source_path_for_build = &build_dir.join(params.file_name.as_str());
@@ -179,33 +195,32 @@ pub async fn fetch(
         .user_agent(format!("Lindera/{}", env!("CARGO_PKG_VERSION")))
         .build()?;
 
-    let mut dest = File::create(tmp_path.as_path())?;
-    let content = download_with_retry(
+    let mut dest = fs::File::create(tmp_path.as_path())?;
+
+    let rt = Runtime::new().unwrap();
+    let content = rt.block_on(download_with_retry(
         &client,
         params.download_urls.iter().map(|s| s.as_str()).collect(),
         MAX_ROUND,
         params.md5_hash.as_str(),
-    )
-    .await?;
+    ))?;
 
     io::copy(&mut Cursor::new(content.as_slice()), &mut dest)?;
     dest.flush()?;
 
-    rename(tmp_path.clone(), source_path_for_build)?;
+    fs::rename(tmp_path.clone(), source_path_for_build)?;
 
     // Decompress a tar.gz file
-    let tmp_extract_path = Path::new(&build_dir).join(format!("tmp-archive-{}", params.input_dir));
-    let tmp_extracted_path = tmp_extract_path.join(params.input_dir.as_str());
-    let _ = std::fs::remove_dir_all(&tmp_extract_path);
-    std::fs::create_dir_all(&tmp_extract_path)?;
+    let tmp_extracted_path = tmp_dir.join(params.input_dir.as_str());
+    let _ = std::fs::remove_dir_all(&tmp_dir);
+    std::fs::create_dir_all(&tmp_dir)?;
 
-    let mut tar_gz = File::open(source_path_for_build)?;
+    let mut tar_gz = fs::File::open(source_path_for_build)?;
     let mut buffer = Vec::new();
     tar_gz.read_to_end(&mut buffer)?;
     let cursor = Cursor::new(buffer);
     let decoder = GzDecoder::new(cursor);
     let mut archive = Archive::new(decoder);
-    archive.unpack(&tmp_extract_path)?;
+    archive.unpack(&tmp_dir)?;
 
 #[cfg(target_os = "windows")]
 {
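With this hunk the tokio runtime is created inside the now-synchronous fetch path, and only the HTTP download itself runs on it. A minimal sketch of that async-to-sync bridge, assuming reqwest with default features (the function names are illustrative):

    use tokio::runtime::Runtime;

    async fn download_async(url: &str) -> Result<Vec<u8>, reqwest::Error> {
        Ok(reqwest::get(url).await?.bytes().await?.to_vec())
    }

    fn download_blocking(url: &str) -> Result<Vec<u8>, reqwest::Error> {
        // A short-lived runtime keeps every caller fully synchronous.
        let rt = Runtime::new().expect("failed to create tokio runtime");
        rt.block_on(download_async(url))
    }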
@@ -229,10 +244,9 @@ pub async fn fetch(
     {
         // Empty the input directory first to avoid conflicts when renaming the directory later on Linux and macOS systems (which do not support overwriting directories).
         empty_directory(&input_dir)?;
-        rename(tmp_extracted_path, &input_dir)?;
+        fs::rename(tmp_extracted_path, &input_dir)?;
     }
 
-    let _ = std::fs::remove_dir_all(&tmp_extract_path);
     drop(dest);
     let _ = std::fs::remove_file(source_path_for_build);
@@ -265,19 +279,54 @@ pub async fn fetch(
         empty_directory(&output_dir)?;
 
         // Rename tmp_path to output_dir
-        rename(tmp_path, &output_dir)?;
+        fs::rename(tmp_path, &output_dir)?;
     }
 
-    let _ = std::fs::remove_dir_all(&input_dir);
-
     info!(
-        "finish fetch lindera dictionary name: {} duration: {} ms\n",
+        "finish download and build lindera dictionary. name: {} duration: {} ms\n",
         params.file_name.as_str(),
         start.elapsed().as_millis()
     );
     Ok(())
 }
 
+/// Fetch the necessary assets and then build the dictionary using `builder`
+pub fn fetch(params: &FetchParams, builder: impl DictionaryBuilder) -> Result<(), Box<dyn Error>> {
+    let build_dir = PathBuf::from(params.lindera_dir.as_str());
+    std::fs::create_dir_all(&build_dir)?;
+
+    let input_dir = build_dir.join(params.input_dir.as_str());
+    let output_dir = build_dir.join(params.output_dir.as_str());
+    let lock_path = build_dir.join(format!("lindera-{}.lock", params.file_name.as_str()));
+
+    // Skip creating the fs lock if the dictionary is already cached
+    if output_dir.is_dir() {
+        return Ok(());
+    }
+
+    let _flock_guard = FileMutexGuard::build(lock_path)?;
+
+    // Check again after taking the lock: another process may have built the
+    // dictionary while this one was waiting
+    if output_dir.is_dir() {
+        return Ok(());
+    }
+
+    // Decompress a tar.gz file
+    let tmp_dir = Path::new(&build_dir).join(format!("tmp-archive-{}", params.input_dir));
+
+    let build_result = build(
+        params,
+        builder,
+        &build_dir,
+        &input_dir,
+        &output_dir,
+        &tmp_dir,
+    );
+    let _ = std::fs::remove_dir_all(&tmp_dir);
+    let _ = std::fs::remove_dir_all(&input_dir);
+
+    build_result
+}
+
 pub fn load(params: &FetchParams) -> Result<lindera::dictionary::Dictionary, TantivyBindingError> {
     let dict_dir = PathBuf::from(params.lindera_dir.clone()).join(params.output_dir.clone());
     let da_data = fs::read(dict_dir.join(common::DA_DATA))?;
@@ -286,7 +335,7 @@ pub fn load(params: &FetchParams) -> Result<lindera::dictionary::Dictionary, Tan
     let words_data = fs::read(dict_dir.join(common::WORDS_DATA))?;
     let connection_data = fs::read(dict_dir.join(common::CONNECTION_DATA))?;
     let char_definition_data = fs::read(dict_dir.join(common::CHAR_DEFINITION_DATA))?;
-    let unkonwn_data = fs::read(dict_dir.join(common::UNKNOWN_DATA))?;
+    let unknown_data = fs::read(dict_dir.join(common::UNKNOWN_DATA))?;
 
     let dict = Dictionary {
         prefix_dictionary: PrefixDictionary::load(
@@ -305,7 +354,7 @@ pub fn load(params: &FetchParams) -> Result<lindera::dictionary::Dictionary, Tan
                 ))
             },
         )?,
-        unknown_dictionary: UnknownDictionary::load(unkonwn_data.as_slice()).map_err(|e| {
+        unknown_dictionary: UnknownDictionary::load(unknown_data.as_slice()).map_err(|e| {
             TantivyBindingError::InternalError(format!(
                 "lindera load unknown dictionary failed, err:{}",
                 e
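The new fetch() above is a double-checked cache guard: a cheap output_dir.is_dir() probe before taking the cross-process lock, then a second probe after acquiring it, because another process may have finished the build while this one was blocked. Condensed to its skeleton (ensure_built is an illustrative name, not part of the binding):

    fn ensure_built(output_dir: &std::path::Path, lock_path: std::path::PathBuf) -> std::io::Result<()> {
        if output_dir.is_dir() {
            return Ok(()); // fast path: cache hit, no lock taken
        }
        let _guard = FileMutexGuard::build(lock_path)?; // may block on other processes
        if output_dir.is_dir() {
            return Ok(()); // another process built it while we waited
        }
        // ... run the actual download/build under the lock ...
        Ok(()) // _guard unlocks and removes the lock file on drop
    }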
@@ -1,21 +1,18 @@
 use super::fetch;
 use crate::error::{Result, TantivyBindingError};
 use lindera_dictionary::dictionary_builder::ipadic::IpadicBuilder;
-use tokio::runtime::Runtime;
 
 #[cfg(feature = "lindera-ipadic")]
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 
 #[cfg(not(feature = "lindera-ipadic"))]
-async fn download(params: &fetch::FetchParams) -> Result<()> {
-    fetch::fetch(params, IpadicBuilder::new())
-        .await
-        .map_err(|e| {
-            TantivyBindingError::InternalError(format!(
-                "fetch ipadic failed with error: {}",
-                e.to_string()
-            ))
-        })
+fn download(params: &fetch::FetchParams) -> Result<()> {
+    fetch::fetch(params, IpadicBuilder::new()).map_err(|e| {
+        TantivyBindingError::InternalError(format!(
+            "fetch ipadic failed with error: {}",
+            e.to_string()
+        ))
+    })
 }
 
 #[cfg(not(feature = "lindera-ipadic"))]
@@ -40,8 +37,7 @@ pub fn load_ipadic(
         params.download_urls = download_url
     }
 
-    let rt = Runtime::new().unwrap();
-    rt.block_on(download(&params))?;
+    download(&params)?;
     fetch::load(&params)
 }
@@ -1,21 +1,17 @@
 use super::fetch;
 use crate::error::{Result, TantivyBindingError};
-use lindera_dictionary::dictionary_builder::ipadic_neologd::IpadicNeologdBuilder;
-use tokio::runtime::Runtime;
 
 #[cfg(feature = "lindera-ipadic-neologd")]
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
+use lindera_dictionary::dictionary_builder::ipadic_neologd::IpadicNeologdBuilder;
 
 #[cfg(not(feature = "lindera-ipadic-neologd"))]
-async fn download(params: &fetch::FetchParams) -> Result<()> {
-    fetch::fetch(params, IpadicNeologdBuilder::new())
-        .await
-        .map_err(|e| {
-            TantivyBindingError::InternalError(format!(
-                "fetch ipadic-neologd failed with error: {}",
-                e.to_string()
-            ))
-        })
+fn download(params: &fetch::FetchParams) -> Result<()> {
+    fetch::fetch(params, IpadicNeologdBuilder::new()).map_err(|e| {
+        TantivyBindingError::InternalError(format!(
+            "fetch ipadic-neologd failed with error: {}",
+            e.to_string()
+        ))
+    })
 }
 
 #[cfg(not(feature = "lindera-ipadic-neologd"))]
||||
@ -39,8 +35,7 @@ pub fn load_ipadic_neologd(
|
||||
params.download_urls = download_url
|
||||
}
|
||||
|
||||
let rt = Runtime::new().unwrap();
|
||||
rt.block_on(download(¶ms))?;
|
||||
download(¶ms)?;
|
||||
fetch::load(¶ms)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,21 +1,18 @@
 use super::fetch;
 use crate::error::{Result, TantivyBindingError};
 use lindera_dictionary::dictionary_builder::ko_dic::KoDicBuilder;
-use tokio::runtime::Runtime;
 
 #[cfg(feature = "lindera-ko-dic")]
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 
 #[cfg(not(feature = "lindera-ko-dic"))]
-async fn download(params: &fetch::FetchParams) -> Result<()> {
-    fetch::fetch(params, KoDicBuilder::new())
-        .await
-        .map_err(|e| {
-            TantivyBindingError::InternalError(format!(
-                "fetch ko-dic failed with error: {}",
-                e.to_string()
-            ))
-        })
+fn download(params: &fetch::FetchParams) -> Result<()> {
+    fetch::fetch(params, KoDicBuilder::new()).map_err(|e| {
+        TantivyBindingError::InternalError(format!(
+            "fetch ko-dic failed with error: {}",
+            e.to_string()
+        ))
+    })
 }
 
 #[cfg(not(feature = "lindera-ko-dic"))]
|
||||
@ -40,8 +37,7 @@ pub fn load_ko_dic(
|
||||
params.download_urls = download_url
|
||||
}
|
||||
|
||||
let rt = Runtime::new().unwrap();
|
||||
rt.block_on(download(¶ms))?;
|
||||
download(¶ms)?;
|
||||
fetch::load(¶ms)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,21 +1,18 @@
 use super::fetch;
 use crate::error::{Result, TantivyBindingError};
 use lindera_dictionary::dictionary_builder::unidic::UnidicBuilder;
-use tokio::runtime::Runtime;
 
 #[cfg(feature = "lindera-unidic")]
 use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
 
 #[cfg(not(feature = "lindera-unidic"))]
-async fn download(params: &fetch::FetchParams) -> Result<()> {
-    fetch::fetch(params, UnidicBuilder::new())
-        .await
-        .map_err(|e| {
-            TantivyBindingError::InternalError(format!(
-                "fetch unidic failed with error: {}",
-                e.to_string()
-            ))
-        })
+fn download(params: &fetch::FetchParams) -> Result<()> {
+    fetch::fetch(params, UnidicBuilder::new()).map_err(|e| {
+        TantivyBindingError::InternalError(format!(
+            "fetch unidic failed with error: {}",
+            e.to_string()
+        ))
+    })
 }
 
 #[cfg(not(feature = "lindera-unidic"))]
|
||||
@ -39,8 +36,7 @@ pub fn load_unidic(
|
||||
params.download_urls = download_url
|
||||
}
|
||||
|
||||
let rt = Runtime::new().unwrap();
|
||||
rt.block_on(download(¶ms))?;
|
||||
download(¶ms)?;
|
||||
fetch::load(¶ms)
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde_json as json;
 use super::stop_words;
 use crate::error::{Result, TantivyBindingError};
 
-pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
+pub fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
     if !value.is_array() {
         return Err(TantivyBindingError::InternalError(
             format!("{} should be array", label).to_string(),
@@ -16,6 +16,7 @@ use lindera::token_filter::korean_stop_tags::KoreanStopTagsTokenFilter;
 use lindera::token_filter::BoxTokenFilter as LTokenFilter;
 
 use crate::analyzer::dict::lindera::load_dictionary_from_kind;
+use crate::analyzer::filter::get_string_list;
 use crate::error::{Result, TantivyBindingError};
 use serde_json as json;
@@ -25,6 +26,8 @@ pub struct LinderaTokenStream<'a> {
 }
 
 const DICTKINDKEY: &str = "dict_kind";
+const DICTBUILDDIRKEY: &str = "dict_build_dir";
+const DICTDOWNLOADURLKEY: &str = "download_urls";
 const FILTERKEY: &str = "filter";
 
 impl<'a> TokenStream for LinderaTokenStream<'a> {
@@ -62,8 +65,12 @@ impl LinderaTokenizer {
     /// This function will create a new `LinderaTokenizer` with json parameters.
     pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
         let kind: DictionaryKind = fetch_lindera_kind(params)?;
-        let dictionary =
-            load_dictionary_from_kind(&kind, "/var/lib/milvus/dict/lindera".to_string(), vec![])?;
+
+        // for downloading the dict online
+        let build_dir = fetch_dict_build_dir(params)?;
+        let download_urls = fetch_dict_download_urls(params)?;
+
+        let dictionary = load_dictionary_from_kind(&kind, build_dir, download_urls)?;
 
         let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
         let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
@@ -126,18 +133,34 @@ impl DictionaryKindParser for &str {
 fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
     params
         .get(DICTKINDKEY)
-        .ok_or_else(|| {
-            TantivyBindingError::InvalidArgument(format!("lindera tokenizer dict_kind must be set"))
-        })?
+        .ok_or(TantivyBindingError::InvalidArgument(format!(
+            "lindera tokenizer dict_kind must be set"
+        )))?
         .as_str()
-        .ok_or_else(|| {
-            TantivyBindingError::InvalidArgument(format!(
-                "lindera tokenizer dict kind should be string"
-            ))
-        })?
+        .ok_or(TantivyBindingError::InvalidArgument(format!(
+            "lindera tokenizer dict kind should be string"
+        )))?
         .into_dict_kind()
 }
 
+fn fetch_dict_build_dir(params: &json::Map<String, json::Value>) -> Result<String> {
+    params
+        .get(DICTBUILDDIRKEY)
+        .map_or(Ok("/var/lib/milvus/dict/lindera".to_string()), |v| {
+            v.as_str()
+                .ok_or(TantivyBindingError::InvalidArgument(format!(
+                    "dict build dir must be string"
+                )))
+                .map(|s| s.to_string())
+        })
+}
+
+fn fetch_dict_download_urls(params: &json::Map<String, json::Value>) -> Result<Vec<String>> {
+    params.get(DICTDOWNLOADURLKEY).map_or(Ok(vec![]), |v| {
+        get_string_list(v, "lindera dict download urls")
+    })
+}
+
 fn fetch_lindera_tags_from_params(
     params: &json::Map<String, json::Value>,
 ) -> Result<HashSet<String>> {
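Put together, from_json now accepts two optional keys next to dict_kind. A hypothetical params payload, written the way the tests build it (the URL is illustrative; in Milvus proper both keys are injected by the Go layer below and rejected if set by the user):

    let params = r#"{
        "type": "lindera",
        "dict_kind": "ipadic",
        "dict_build_dir": "/var/lib/milvus/analyzer/lindera",
        "download_urls": ["https://example.com/mecab-ipadic.tar.gz"]
    }"#;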
@@ -330,7 +353,6 @@ mod tests {
     use tantivy::tokenizer::Tokenizer;
 
     #[test]
-    #[cfg(feature = "lindera-ipadic")]
     fn test_lindera_tokenizer() {
         let params = r#"{
             "type": "lindera",
@@ -358,7 +380,6 @@ mod tests {
     }
 
     #[test]
-    #[cfg(feature = "lindera-cc-cedict")]
     fn test_lindera_tokenizer_cc() {
         let params = r#"{
             "type": "lindera",
@@ -89,7 +89,7 @@ pub fn get_builder_with_tokenizer(
         }
         _ => {
             return Err(TantivyBindingError::InvalidArgument(format!(
-                "costum tokenizer must set type"
+                "customized tokenizer must set type"
             )))
         }
     }
@@ -82,9 +82,9 @@ impl AnalyzerBuilder<'_> {
         for filter in filters {
             if filter.is_string() {
                 let filter_name = filter.as_str().unwrap();
-                let costum = self.filters.remove(filter_name);
-                if !costum.is_none() {
-                    builder = costum.unwrap().transform(builder);
+                let customize = self.filters.remove(filter_name);
+                if !customize.is_none() {
+                    builder = customize.unwrap().transform(builder);
                     continue;
                 }
@@ -51,7 +51,7 @@ pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBu
         }
         _ => {
             return Err(TantivyBindingError::InvalidArgument(
-                "costum tokenizer must set type".to_string(),
+                "customized tokenizer must set type".to_string(),
             ))
         }
     }
@@ -9,12 +9,22 @@ package ctokenizer
 import "C"
 
 import (
+    "encoding/json"
+    "fmt"
+    "path"
     "unsafe"
 
     "github.com/milvus-io/milvus/internal/util/tokenizerapi"
+    "github.com/milvus-io/milvus/pkg/v2/util/merr"
+    "github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )
 
 func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) {
+    param, err := CheckAndFillParams(param)
+    if err != nil {
+        return nil, err
+    }
+
     paramPtr := C.CString(param)
     defer C.free(unsafe.Pointer(paramPtr))
@@ -27,7 +37,100 @@ func NewTokenizer(param string) (tokenizerapi.Tokenizer, error) {
     return NewCTokenizer(ptr), nil
 }
 
+func CheckAndFillParams(params string) (string, error) {
+    if len(params) == 0 {
+        return "", nil
+    }
+
+    var paramMaps map[string]any
+    flag := false
+    err := json.Unmarshal([]byte(params), &paramMaps)
+    if err != nil {
+        return "", merr.WrapErrAsInputError(fmt.Errorf("unmarshal analyzer params failed with json error: %s", err.Error()))
+    }
+
+    tokenizer, ok := paramMaps["tokenizer"]
+    if !ok {
+        // skip the check if there are no tokenizer params
+        return params, nil
+    }
+
+    switch value := tokenizer.(type) {
+    case string:
+        // return as-is if a built-in tokenizer is used
+        return params, nil
+    case map[string]any:
+        flag, err = CheckAndFillTokenizerParams(value)
+        if err != nil {
+            return "", err
+        }
+    default:
+        return "", merr.WrapErrAsInputError(fmt.Errorf("analyzer params set tokenizer with unknown type"))
+    }
+
+    // re-marshal the json params if the params map was changed
+    if flag {
+        bytes, err := json.Marshal(paramMaps)
+        if err != nil {
+            return "", merr.WrapErrAsInputError(fmt.Errorf("marshal analyzer params failed with json error: %s", err.Error()))
+        }
+        return string(bytes), nil
+    }
+    return params, nil
+}
+
+// fill some milvus system params into the tokenizer params
+func CheckAndFillTokenizerParams(params map[string]any) (bool, error) {
+    v, ok := params["type"]
+    if !ok {
+        return false, merr.WrapErrAsInputError(fmt.Errorf("custom tokenizer must set type"))
+    }
+
+    tokenizerType, ok := v.(string)
+    if !ok {
+        return false, merr.WrapErrAsInputError(fmt.Errorf("custom tokenizer type must be string"))
+    }
+
+    switch tokenizerType {
+    case "lindera":
+        cfg := paramtable.Get()
+
+        if _, ok := params["dict_build_dir"]; ok {
+            return false, merr.WrapErrAsInputError(fmt.Errorf("custom tokenizer dict_build_dir is a system param and should not be set"))
+        }
+        // build the lindera dict under LocalResourcePath/lindera/dict_kind
+        params["dict_build_dir"] = path.Join(cfg.FunctionCfg.LocalResourcePath.GetValue(), "lindera")
+
+        v, ok := params["dict_kind"]
+        if !ok {
+            return false, merr.WrapErrAsInputError(fmt.Errorf("lindera tokenizer must set dict_kind"))
+        }
+        dictKind, ok := v.(string)
+        if !ok {
+            return false, merr.WrapErrAsInputError(fmt.Errorf("lindera tokenizer dict kind must be string"))
+        }
+        dictUrlsMap := cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
+
+        if _, ok := params["download_urls"]; ok {
+            return false, merr.WrapErrAsInputError(fmt.Errorf("custom tokenizer download_urls is a system param and should not be set"))
+        }
+
+        if value, ok := dictUrlsMap["."+dictKind]; ok {
+            // use the download urls set in milvus.yaml
+            params["download_urls"] = paramtable.ParseAsStings(value)
+        }
+        return true, nil
+    default:
+        return false, nil
+    }
+}
+
 func ValidateTokenizer(param string) error {
+    param, err := CheckAndFillParams(param)
+    if err != nil {
+        return err
+    }
+
     paramPtr := C.CString(param)
     defer C.free(unsafe.Pointer(paramPtr))
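For a lindera tokenizer, the net effect is a rewrite of the analyzer params before they cross into the C binding. Hypothetically, with local_resource_path at its default and one configured ipadic URL, the input

    {"tokenizer": {"type": "lindera", "dict_kind": "ipadic"}}

would be re-marshaled as roughly

    {"tokenizer": {"type": "lindera", "dict_kind": "ipadic", "dict_build_dir": "/var/lib/milvus/analyzer/lindera", "download_urls": ["<configured url>"]}}

(key order is not guaranteed after json.Marshal).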
@@ -5,14 +5,19 @@ import (
     "testing"
 
     "github.com/stretchr/testify/assert"
+    "github.com/stretchr/testify/require"
+
+    "github.com/milvus-io/milvus/pkg/v2/util/paramtable"
 )
 
 func TestTokenizer(t *testing.T) {
+    paramtable.Init()
+
     // default tokenizer.
     {
         m := "{\"tokenizer\": \"standard\"}"
         tokenizer, err := NewTokenizer(m)
-        assert.NoError(t, err)
+        require.NoError(t, err)
         defer tokenizer.Destroy()
 
         tokenStream := tokenizer.NewTokenStream("football, basketball, pingpang")
@@ -26,7 +31,7 @@ func TestTokenizer(t *testing.T) {
     {
         m := "{\"tokenizer\": \"jieba\"}"
         tokenizer, err := NewTokenizer(m)
-        assert.NoError(t, err)
+        require.NoError(t, err)
         defer tokenizer.Destroy()
 
         tokenStream := tokenizer.NewTokenStream("张华考上了北京大学;李萍进了中等技术学校;我在百货公司当售货员:我们都有光明的前途")
@@ -35,6 +40,20 @@ func TestTokenizer(t *testing.T) {
             fmt.Println(tokenStream.Token())
         }
     }
+
+    // lindera tokenizer.
+    {
+        m := "{\"tokenizer\": {\"type\":\"lindera\", \"dict_kind\": \"ipadic\"}}"
+        tokenizer, err := NewTokenizer(m)
+        require.NoError(t, err)
+        defer tokenizer.Destroy()
+
+        tokenStream := tokenizer.NewTokenStream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です")
+        defer tokenStream.Destroy()
+        for tokenStream.Advance() {
+            fmt.Println(tokenStream.Token())
+        }
+    }
 }
 
 func TestValidateTokenizer(t *testing.T) {
@@ -58,3 +77,71 @@ func TestValidateTokenizer(t *testing.T) {
         assert.Error(t, err)
     }
 }
+
+func TestCheckAndFillParams(t *testing.T) {
+    paramtable.Init()
+    paramtable.Get().SaveGroup(map[string]string{"function.analyzer.lindera.download_urls.ipadic": "/test/url"})
+
+    // normal case
+    {
+        m := "{\"tokenizer\": {\"type\":\"jieba\"}}"
+        _, err := CheckAndFillParams(m)
+        assert.NoError(t, err)
+    }
+
+    // fill lindera tokenizer download urls and dict local path
+    {
+        m := "{\"tokenizer\": {\"type\":\"lindera\", \"dict_kind\": \"ipadic\"}}"
+        _, err := CheckAndFillParams(m)
+        assert.NoError(t, err)
+    }
+
+    // error with malformed json
+    {
+        m := "{invalid json"
+        _, err := CheckAndFillParams(m)
+        assert.Error(t, err)
+    }
+
+    // skip if the default analyzer is used
+    {
+        m := "{}"
+        _, err := CheckAndFillParams(m)
+        assert.NoError(t, err)
+    }
+
+    // error: tokenizer without type
+    {
+        m := "{\"tokenizer\": {\"dict_kind\": \"ipadic\"}}"
+        _, err := CheckAndFillParams(m)
+        assert.Error(t, err)
+    }
+
+    // error: tokenizer type is not a string
+    {
+        m := "{\"tokenizer\": {\"type\": 1, \"dict_kind\": \"ipadic\"}}"
+        _, err := CheckAndFillParams(m)
+        assert.Error(t, err)
+    }
+
+    // error: wrong tokenizer params type
+    {
+        m := "{\"tokenizer\": 1}"
+        _, err := CheckAndFillParams(m)
+        assert.Error(t, err)
+    }
+
+    // error: dict_build_dir set by the user
+    {
+        m := "{\"tokenizer\": {\"type\": \"lindera\", \"dict_kind\": \"ipadic\", \"dict_build_dir\": \"/tmp/milvus\"}}"
+        _, err := CheckAndFillParams(m)
+        assert.Error(t, err)
+    }
+
+    // error: lindera dict_kind not set
+    {
+        m := "{\"tokenizer\": {\"type\": \"lindera\"}}"
+        _, err := CheckAndFillParams(m)
+        assert.Error(t, err)
+    }
+}
@@ -23,6 +23,8 @@ import (
 type functionConfig struct {
     TextEmbeddingProviders ParamGroup `refreshable:"true"`
     RerankModelProviders   ParamGroup `refreshable:"true"`
+    LocalResourcePath      ParamItem  `refreshable:"true"`
+    LinderaDownloadUrls    ParamGroup `refreshable:"true"`
 }
 
 func (p *functionConfig) init(base *BaseTable) {
@@ -91,6 +93,20 @@ func (p *functionConfig) init(base *BaseTable) {
         },
     }
     p.RerankModelProviders.Init(base.mgr)
 
+    p.LocalResourcePath = ParamItem{
+        Key:          "function.analyzer.local_resource_path",
+        Version:      "2.5.16",
+        Export:       true,
+        DefaultValue: "/var/lib/milvus/analyzer",
+    }
+    p.LocalResourcePath.Init(base.mgr)
+
+    p.LinderaDownloadUrls = ParamGroup{
+        KeyPrefix: "function.analyzer.lindera.download_urls",
+        Version:   "2.5.16",
+    }
+    p.LinderaDownloadUrls.Init(base.mgr)
 }
 
 const (
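These params correspond to milvus.yaml keys under function.analyzer; a hypothetical override, with the download URL purely illustrative and the resource path mirroring the default above:

    function:
      analyzer:
        local_resource_path: /var/lib/milvus/analyzer
        lindera:
          download_urls:
            ipadic: https://example.com/mecab-ipadic.tar.gz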
@@ -1218,7 +1218,7 @@ func TestRunAnalyzer(t *testing.T) {
 
     // run analyzer with invalid params
     _, err = mc.RunAnalyzer(ctx, client.NewRunAnalyzerOption("text doc").WithAnalyzerParamsStr("invalid params}"))
-    common.CheckErr(t, err, false, "JsonError")
+    common.CheckErr(t, err, false, "json error")
 
     // run analyzer with custom analyzer
     tokens, err = mc.RunAnalyzer(ctx, client.NewRunAnalyzerOption("test doc").