mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 14:35:27 +08:00
Support create analyzer with file resource info, and return used file resource ids when validating an analyzer. Save the related resource ids in collection schema. related: https://github.com/milvus-io/milvus/issues/43687 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> - Core invariant: analyzer file-resource resolution is deterministic and traceable by threading a FileResourcePathHelper (collecting used resource IDs in a HashSet) through all tokenizer/analyzer construction and validation paths; validate_analyzer(params, extra_info) returns the collected Vec<i64> which is propagated through C/Rust/Go layers to callers (CValidateResult → RustResult::from_vec_i64 → Go []int64 → querypb.ValidateAnalyzerResponse.ResourceIds → CollectionSchema.FileResourceIds). - Logic removed/simplified: ad‑hoc, scattered resource-path lookups and per-filter file helpers (e.g., read_synonyms_file and other inline file-reading logic) were consolidated into ResourceInfo + FileResourcePathHelper and a centralized get_resource_path(helper, ...) API; filter/tokenizer builder APIs now accept &mut FileResourcePathHelper so all file path resolution and ID collection use the same path and bookkeeping logic (redundant duplicated lookups removed). - Why no data loss or behavior regression: changes are additive and default-preserving — existing call sites pass extra_info = "" so analyzer creation/validation behavior and error paths remain unchanged; new Collection.FileResourceIds is populated from resp.ResourceIds in validateSchema and round‑tripped through marshal/unmarshal (model.Collection ↔ schemapb.CollectionSchema) so schema persistence uses the new list without overwriting other schema fields; proto change adds a repeated field (resource_ids) which is wire‑compatible (older clients ignore extra field). 
Concrete code paths: analyzer creation still uses create_analyzer (now with extra_info ""), tokenizer validation still returns errors as before but now also returns IDs via CValidateResult/RustResult, and rootcoord.validateSchema assigns resp.ResourceIds → schema.FileResourceIds. - New capability added: end‑to‑end discovery, return, and persistence of file resource IDs used by analyzers — validate flows now return resource IDs and the system stores them in collection schema (affects tantivy analyzer binding, canalyzer C bindings, internal/util analyzer APIs, querynode ValidateAnalyzer response, and rootcoord/create_collection flow). <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
110 lines
2.9 KiB
Go
110 lines
2.9 KiB
Go
package canalyzer
|
|
|
|
/*
|
|
#cgo pkg-config: milvus_core
|
|
#include <stdlib.h> // free
|
|
#include "segcore/tokenizer_c.h"
|
|
#include "segcore/token_stream_c.h"
|
|
*/
|
|
import "C"
|
|
|
|
import (
|
|
"encoding/json"
|
|
"sync"
|
|
"unsafe"
|
|
|
|
"github.com/cockroachdb/errors"
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/milvus-io/milvus/internal/util/analyzer/interfaces"
|
|
"github.com/milvus-io/milvus/internal/util/pathutil"
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
|
|
)
|
|
|
|
const (
	// LinderaDictURLKey is the option key carrying the download URLs for the
	// Lindera dictionaries, sourced from FunctionCfg.LinderaDownloadUrls.
	LinderaDictURLKey = "lindera_download_urls"
	// ResourceMapKey is the option key carrying the file-resource name→ID map
	// pushed down in UpdateGlobalResourceInfo.
	ResourceMapKey = "resource_map"
	// ResourcePathKey is the option key for this node's local file-resource
	// root path (pathutil.FileResourcePath).
	ResourcePathKey = "resource_path"
	// StorageNameKey is an option key not referenced in this file; presumably
	// it names the storage backend — confirm at segcore call sites.
	StorageNameKey = "storage_name"
)
|
|
|
|
// initOnce guards InitOptions so the segcore analyzer options are pushed at most once.
var initOnce sync.Once
|
|
|
|
func InitOptions() {
|
|
initOnce.Do(func() {
|
|
UpdateParams()
|
|
})
|
|
}
|
|
|
|
func UpdateParams() {
|
|
cfg := paramtable.Get()
|
|
params := map[string]any{}
|
|
params[LinderaDictURLKey] = cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
|
|
params[ResourcePathKey] = pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())
|
|
|
|
bytes, err := json.Marshal(params)
|
|
if err != nil {
|
|
log.Panic("init analyzer option failed", zap.Error(err))
|
|
}
|
|
|
|
paramPtr := C.CString(string(bytes))
|
|
defer C.free(unsafe.Pointer(paramPtr))
|
|
|
|
status := C.set_tokenizer_option(paramPtr)
|
|
if err := HandleCStatus(&status, "failed to init segcore analyzer option"); err != nil {
|
|
log.Panic("init analyzer option failed", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
func UpdateGlobalResourceInfo(resourceMap map[string]int64) error {
|
|
bytes, err := json.Marshal(map[string]any{"resource_map": resourceMap})
|
|
if err != nil {
|
|
return errors.Wrap(err, "marshal global resource info failed")
|
|
}
|
|
|
|
paramPtr := C.CString(string(bytes))
|
|
defer C.free(unsafe.Pointer(paramPtr))
|
|
|
|
status := C.set_tokenizer_option(paramPtr)
|
|
if err := HandleCStatus(&status, "failed to update global resource info"); err != nil {
|
|
return errors.Wrap(err, "update global resource info failed")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func NewAnalyzer(param string, extraInfo string) (interfaces.Analyzer, error) {
|
|
paramPtr := C.CString(param)
|
|
defer C.free(unsafe.Pointer(paramPtr))
|
|
|
|
extraInfoPtr := C.CString(extraInfo)
|
|
defer C.free(unsafe.Pointer(extraInfoPtr))
|
|
|
|
var ptr C.CTokenizer
|
|
status := C.create_tokenizer(paramPtr, extraInfoPtr, &ptr)
|
|
if err := HandleCStatus(&status, "failed to create analyzer"); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return NewCAnalyzer(ptr), nil
|
|
}
|
|
|
|
func ValidateAnalyzer(param string, extraInfo string) ([]int64, error) {
|
|
paramPtr := C.CString(param)
|
|
defer C.free(unsafe.Pointer(paramPtr))
|
|
|
|
extraInfoPtr := C.CString(extraInfo)
|
|
defer C.free(unsafe.Pointer(extraInfoPtr))
|
|
|
|
result := C.validate_tokenizer(paramPtr, extraInfoPtr)
|
|
if err := HandleCStatus(&result.status, "failed to validate tokenizer"); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
cIds := unsafe.Slice((*int64)(unsafe.Pointer(result.resource_ids)), result.resource_ids_count)
|
|
goIds := make([]int64, len(cIds))
|
|
copy(goIds, cIds)
|
|
C.free(unsafe.Pointer(result.resource_ids))
|
|
return goIds, nil
|
|
}
|