milvus/internal/util/analyzer/canalyzer/c_analyzer_factory.go
aoiasd 55feb7ded8
feat: set related resource ids in collection schema (#46423)
Support creating an analyzer with file resource info, and return the used
file resource ids when validating an analyzer.
Save the related resource ids in the collection schema.
relate: https://github.com/milvus-io/milvus/issues/43687

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant: analyzer file-resource resolution is deterministic and
traceable by threading a FileResourcePathHelper (collecting used
resource IDs in a HashSet) through all tokenizer/analyzer construction
and validation paths; validate_analyzer(params, extra_info) returns the
collected Vec<i64>, which is propagated through the C/Rust/Go layers to
callers (CValidateResult → RustResult::from_vec_i64 → Go []int64 →
querypb.ValidateAnalyzerResponse.ResourceIds →
CollectionSchema.FileResourceIds).
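
  A rough Go-side sketch of the last hop of that chain, assuming a querynode handler shaped like the description above (the function name, import paths, and response construction are illustrative, not the actual handler):

  ```go
  package sketch

  import (
      "github.com/milvus-io/milvus/internal/util/analyzer/canalyzer"
      "github.com/milvus-io/milvus/pkg/v2/proto/querypb"
  )

  // Sketch only: the []int64 returned by the cgo wrapper is copied into the
  // repeated resource_ids field of the validation response.
  func validateAnalyzerSketch(params, extraInfo string) (*querypb.ValidateAnalyzerResponse, error) {
      ids, err := canalyzer.ValidateAnalyzer(params, extraInfo)
      if err != nil {
          return nil, err // validation errors surface as before
      }
      return &querypb.ValidateAnalyzerResponse{
          ResourceIds: ids, // new repeated field; older clients ignore it
      }, nil
  }
  ```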

- Logic removed/simplified: ad‑hoc, scattered resource-path lookups and
per-filter file helpers (e.g., read_synonyms_file and other inline
file-reading logic) were consolidated into ResourceInfo +
FileResourcePathHelper and a centralized get_resource_path(helper, ...)
API; filter/tokenizer builder APIs now accept &mut
FileResourcePathHelper so all file path resolution and ID collection use
the same path and bookkeeping logic (redundant duplicated lookups
removed).
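
  A hypothetical Go rendering of that pattern (the real helper is the Rust FileResourcePathHelper in the tantivy binding; the struct and method names below are made up for illustration):

  ```go
  package sketch

  import (
      "path/filepath"
      "strconv"
  )

  // Hypothetical sketch of the "resolve once, record the id" pattern that the
  // centralized helper provides to every filter/tokenizer builder.
  type resourcePathHelper struct {
      root    string           // base directory holding downloaded file resources
      byName  map[string]int64 // resource name -> resource id (from ResourceInfo)
      usedIDs map[int64]struct{}
  }

  // resolve returns the on-disk path for a named resource and records its id,
  // so path resolution and ID collection always go through one code path.
  func (h *resourcePathHelper) resolve(name string) (string, bool) {
      id, ok := h.byName[name]
      if !ok {
          return "", false
      }
      h.usedIDs[id] = struct{}{}
      return filepath.Join(h.root, strconv.FormatInt(id, 10)), true
  }
  ```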

- Why no data loss or behavior regression: changes are additive and
default-preserving — existing call sites pass extra_info = "" so
analyzer creation/validation behavior and error paths remain unchanged;
new Collection.FileResourceIds is populated from resp.ResourceIds in
validateSchema and round‑tripped through marshal/unmarshal
(model.Collection ↔ schemapb.CollectionSchema) so schema persistence
uses the new list without overwriting other schema fields; the proto change
adds a repeated field (resource_ids), which is wire‑compatible (older
clients ignore the unknown field). Concrete code paths: analyzer creation
still uses create_analyzer (now with extra_info ""), tokenizer
validation still returns errors as before but now also returns IDs via
CValidateResult/RustResult, and rootcoord.validateSchema assigns
resp.ResourceIds → schema.FileResourceIds.
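
  A simplified sketch of that rootcoord step (field names follow the description above; the surrounding validateSchema logic and the import paths are assumed):

  ```go
  package sketch

  import (
      "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
      "github.com/milvus-io/milvus/pkg/v2/proto/querypb"
  )

  // Sketch only: after analyzer validation succeeds, the returned resource ids
  // are written onto the collection schema before it is persisted.
  func assignFileResourceIDs(schema *schemapb.CollectionSchema, resp *querypb.ValidateAnalyzerResponse) {
      // Additive assignment: only the new repeated field is set,
      // all other schema fields are left untouched.
      schema.FileResourceIds = resp.ResourceIds
  }
  ```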

- New capability added: end‑to‑end discovery, return, and persistence of
file resource IDs used by analyzers — validate flows now return resource
IDs and the system stores them in collection schema (affects tantivy
analyzer binding, canalyzer C bindings, internal/util analyzer APIs,
querynode ValidateAnalyzer response, and rootcoord/create_collection
flow).
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
2025-12-26 22:49:19 +08:00


package canalyzer

/*
#cgo pkg-config: milvus_core
#include <stdlib.h> // free
#include "segcore/tokenizer_c.h"
#include "segcore/token_stream_c.h"
*/
import "C"

import (
    "encoding/json"
    "sync"
    "unsafe"

    "github.com/cockroachdb/errors"
    "go.uber.org/zap"

    "github.com/milvus-io/milvus/internal/util/analyzer/interfaces"
    "github.com/milvus-io/milvus/internal/util/pathutil"
    "github.com/milvus-io/milvus/pkg/v2/log"
    "github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)

const (
    LinderaDictURLKey = "lindera_download_urls"
    ResourceMapKey    = "resource_map"
    ResourcePathKey   = "resource_path"
    StorageNameKey    = "storage_name"
)

var initOnce sync.Once

func InitOptions() {
    initOnce.Do(func() {
        UpdateParams()
    })
}

func UpdateParams() {
    cfg := paramtable.Get()

    params := map[string]any{}
    params[LinderaDictURLKey] = cfg.FunctionCfg.LinderaDownloadUrls.GetValue()
    params[ResourcePathKey] = pathutil.GetPath(pathutil.FileResourcePath, paramtable.GetNodeID())

    bytes, err := json.Marshal(params)
    if err != nil {
        log.Panic("init analyzer option failed", zap.Error(err))
    }

    paramPtr := C.CString(string(bytes))
    defer C.free(unsafe.Pointer(paramPtr))

    status := C.set_tokenizer_option(paramPtr)
    if err := HandleCStatus(&status, "failed to init segcore analyzer option"); err != nil {
        log.Panic("init analyzer option failed", zap.Error(err))
    }
}

func UpdateGlobalResourceInfo(resourceMap map[string]int64) error {
    bytes, err := json.Marshal(map[string]any{"resource_map": resourceMap})
    if err != nil {
        return errors.Wrap(err, "marshal global resource info failed")
    }

    paramPtr := C.CString(string(bytes))
    defer C.free(unsafe.Pointer(paramPtr))

    status := C.set_tokenizer_option(paramPtr)
    if err := HandleCStatus(&status, "failed to update global resource info"); err != nil {
        return errors.Wrap(err, "update global resource info failed")
    }
    return nil
}

func NewAnalyzer(param string, extraInfo string) (interfaces.Analyzer, error) {
    paramPtr := C.CString(param)
    defer C.free(unsafe.Pointer(paramPtr))
    extraInfoPtr := C.CString(extraInfo)
    defer C.free(unsafe.Pointer(extraInfoPtr))

    var ptr C.CTokenizer
    status := C.create_tokenizer(paramPtr, extraInfoPtr, &ptr)
    if err := HandleCStatus(&status, "failed to create analyzer"); err != nil {
        return nil, err
    }
    return NewCAnalyzer(ptr), nil
}

func ValidateAnalyzer(param string, extraInfo string) ([]int64, error) {
    paramPtr := C.CString(param)
    defer C.free(unsafe.Pointer(paramPtr))
    extraInfoPtr := C.CString(extraInfo)
    defer C.free(unsafe.Pointer(extraInfoPtr))

    result := C.validate_tokenizer(paramPtr, extraInfoPtr)
    if err := HandleCStatus(&result.status, "failed to validate tokenizer"); err != nil {
        return nil, err
    }

    cIds := unsafe.Slice((*int64)(unsafe.Pointer(result.resource_ids)), result.resource_ids_count)
    goIds := make([]int64, len(cIds))
    copy(goIds, cIds)
    C.free(unsafe.Pointer(result.resource_ids))
    return goIds, nil
}