mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 14:35:27 +08:00
Support creating analyzers with file resource info, and return the used file resource IDs when validating an analyzer. Save the related resource IDs in the collection schema. Related: https://github.com/milvus-io/milvus/issues/43687 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> - Core invariant: analyzer file-resource resolution is deterministic and traceable by threading a FileResourcePathHelper (collecting used resource IDs in a HashSet) through all tokenizer/analyzer construction and validation paths; validate_analyzer(params, extra_info) returns the collected Vec<i64> which is propagated through C/Rust/Go layers to callers (CValidateResult → RustResult::from_vec_i64 → Go []int64 → querypb.ValidateAnalyzerResponse.ResourceIds → CollectionSchema.FileResourceIds). - Logic removed/simplified: ad‑hoc, scattered resource-path lookups and per-filter file helpers (e.g., read_synonyms_file and other inline file-reading logic) were consolidated into ResourceInfo + FileResourcePathHelper and a centralized get_resource_path(helper, ...) API; filter/tokenizer builder APIs now accept &mut FileResourcePathHelper so all file path resolution and ID collection use the same path and bookkeeping logic (redundant duplicated lookups removed). - Why no data loss or behavior regression: changes are additive and default-preserving — existing call sites pass extra_info = "" so analyzer creation/validation behavior and error paths remain unchanged; new Collection.FileResourceIds is populated from resp.ResourceIds in validateSchema and round‑tripped through marshal/unmarshal (model.Collection ↔ schemapb.CollectionSchema) so schema persistence uses the new list without overwriting other schema fields; proto change adds a repeated field (resource_ids) which is wire‑compatible (older clients ignore extra field).
Concrete code paths: analyzer creation still uses create_analyzer (now with extra_info ""), tokenizer validation still returns errors as before but now also returns IDs via CValidateResult/RustResult, and rootcoord.validateSchema assigns resp.ResourceIds → schema.FileResourceIds. - New capability added: end‑to‑end discovery, return, and persistence of file resource IDs used by analyzers — validate flows now return resource IDs and the system stores them in collection schema (affects tantivy analyzer binding, canalyzer C bindings, internal/util analyzer APIs, querynode ValidateAnalyzer response, and rootcoord/create_collection flow). <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
111 lines
3.6 KiB
C++
111 lines
3.6 KiB
C++
#pragma once
|
|
|
|
#include "tantivy-binding.h"
|
|
#include "rust-binding.h"
|
|
#include "rust-hashmap.h"
|
|
#include "tantivy/rust-array.h"
|
|
#include "token-stream.h"
|
|
#include "log/Log.h"
|
|
|
|
namespace milvus::tantivy {
|
|
|
|
struct Tokenizer {
|
|
public:
|
|
NO_COPY_OR_ASSIGN(Tokenizer);
|
|
|
|
explicit Tokenizer(std::string&& params) {
|
|
auto shared_params = std::make_shared<std::string>(params);
|
|
auto res = RustResultWrapper(
|
|
tantivy_create_analyzer(shared_params->c_str(), ""));
|
|
AssertInfo(res.result_->success,
|
|
"Tokenizer creation failed: {}",
|
|
res.result_->error);
|
|
ptr_ = res.result_->value.ptr._0;
|
|
}
|
|
|
|
explicit Tokenizer(std::string&& params, std::string&& extra_info) {
|
|
auto shared_params = std::make_shared<std::string>(params);
|
|
auto shared_extra_info = std::make_shared<std::string>(extra_info);
|
|
auto res = RustResultWrapper(tantivy_create_analyzer(
|
|
shared_params->c_str(), shared_extra_info->c_str()));
|
|
AssertInfo(res.result_->success,
|
|
"Tokenizer creation failed: {}",
|
|
res.result_->error);
|
|
ptr_ = res.result_->value.ptr._0;
|
|
}
|
|
|
|
explicit Tokenizer(void* _ptr) : ptr_(_ptr) {
|
|
}
|
|
|
|
~Tokenizer() {
|
|
if (ptr_ != nullptr) {
|
|
tantivy_free_analyzer(ptr_);
|
|
}
|
|
}
|
|
|
|
std::unique_ptr<TokenStream>
|
|
CreateTokenStream(std::string&& text) {
|
|
auto shared_text = std::make_shared<std::string>(std::move(text));
|
|
auto token_stream =
|
|
tantivy_create_token_stream(ptr_, shared_text->c_str());
|
|
return std::make_unique<TokenStream>(token_stream, shared_text);
|
|
}
|
|
|
|
std::unique_ptr<Tokenizer>
|
|
Clone() {
|
|
auto newptr = tantivy_clone_analyzer(ptr_);
|
|
return std::make_unique<milvus::tantivy::Tokenizer>(newptr);
|
|
}
|
|
|
|
// CreateTokenStreamCopyText will copy the text and then create token stream based on the text.
|
|
std::unique_ptr<TokenStream>
|
|
CreateTokenStreamCopyText(const std::string& text) {
|
|
auto shared_text = std::make_shared<std::string>(text);
|
|
auto token_stream =
|
|
tantivy_create_token_stream(ptr_, shared_text->c_str());
|
|
return std::make_unique<TokenStream>(token_stream, shared_text);
|
|
}
|
|
|
|
private:
|
|
void* ptr_;
|
|
};
|
|
|
|
void
|
|
set_tokenizer_options(std::string&& params) {
|
|
auto shared_params = std::make_shared<std::string>(params);
|
|
auto res =
|
|
RustResultWrapper(tantivy_set_analyzer_options(shared_params->c_str()));
|
|
AssertInfo(res.result_->success,
|
|
"Set analyzer option failed: {}",
|
|
res.result_->error);
|
|
}
|
|
|
|
inline std::pair<int64_t*, size_t>
|
|
validate_analyzer(std::string&& params, std::string&& extra_info) {
|
|
auto shared_params = std::make_shared<std::string>(params);
|
|
auto shared_extra_info = std::make_shared<std::string>(extra_info);
|
|
auto res = RustResultWrapper(tantivy_validate_analyzer(
|
|
shared_params->c_str(), shared_extra_info->c_str()));
|
|
AssertInfo(res.result_->success,
|
|
"Validate analyzer params failed: {}",
|
|
res.result_->error);
|
|
auto array_wrapper =
|
|
RustArrayI64Wrapper(std::move(res.result_->value.rust_array_i64._0));
|
|
auto* array = array_wrapper.array_.array;
|
|
auto len = array_wrapper.array_.len;
|
|
|
|
int64_t* result = nullptr;
|
|
if (len > 0) {
|
|
result = static_cast<int64_t*>(malloc(len * sizeof(int64_t)));
|
|
if (result == nullptr) {
|
|
throw std::bad_alloc();
|
|
}
|
|
std::memcpy(result,
|
|
array,
|
|
len * sizeof(int64_t)); // Copy the array to the result
|
|
}
|
|
return {result, len};
|
|
}
|
|
|
|
} // namespace milvus::tantivy
|