milvus/pkg/proto/worker.proto
aoiasd ee216877bb
enhance: support compaction with file resource in ref mode (#46399)
Add support for DataNode compaction using file resources in ref mode.
SortCompaction and StatsJobs will build text indexes, which may use file
resources.
relate: https://github.com/milvus-io/milvus/issues/43687

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant: file resources (analyzer binaries/metadata) are only
fetched, downloaded and used when the node is configured in Ref mode
(fileresource.IsRefMode via CommonCfg.QNFileResourceMode /
DNFileResourceMode); Sync now carries a version and managers track
per-resource versions/resource IDs so newer resource sets win and older
entries are pruned (RefManager/SynchManager resource maps).
- Logic removed / simplified: component-specific FileResourceMode flags
and an indirection through a long-lived BinlogIO wrapper were
consolidated — file-resource mode moved to CommonCfg, Sync/Download APIs
became version- and context-aware, and compaction/index tasks accept a
ChunkManager directly (binlog IO wrapper creation inlined). This
eliminates duplicated config checks and wrapper indirection while
preserving the same chunk/IO semantics.
- Why no data loss or behavior regression: all file-resource code paths
are gated by the configured mode (default remains "sync"); when not in
ref-mode or when no resources exist, compaction and stats flows follow
existing code paths unchanged. Versioned Sync + resourceID maps ensure
newly synced sets replace older ones and RefManager prunes stale files;
GetFileResources returns an error if requested IDs are missing (prevents
silent use of wrong resources). Analyzer naming/parameter changes add
analyzer_extra_info but default-callers pass "" so existing analyzers
and index contents remain unchanged.
- New capability: DataNode compaction and StatsJobs can now build text
indexes using external file resources in Ref mode — DataCoord exposes
GetFileResources and populates CompactionPlan.file_resources;
SortCompaction/StatsTask download resources via fileresource.Manager,
produce an analyzer_extra_info JSON (storage + resource->id map) via
analyzer.BuildExtraResourceInfo, and propagate analyzer_extra_info into
BuildIndexInfo so the tantivy bindings can load custom analyzers during
text index creation.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
2026-01-06 16:31:31 +08:00

289 lines
7.4 KiB
Protocol Buffer

syntax = "proto3";
package milvus.proto.index;
option go_package = "github.com/milvus-io/milvus/pkg/v2/proto/workerpb";
import "common.proto";
import "schema.proto";
import "data_coord.proto";
import "index_coord.proto";
import "milvus.proto";
import "internal.proto";
// IndexNode groups the job- and task-management RPCs declared in this file.
//
// The job-oriented RPCs (CreateJob, QueryJobs, DropJobs, GetJobStats and
// their V2 variants) are deprecated and carry `option deprecated = true` so
// that generated code and tooling surface the deprecation. CreateTask,
// QueryTask, DropTask and GetMetrics are not deprecated.
service IndexNode {
  // Deprecated.
  rpc CreateJob(CreateJobRequest) returns (common.Status) {
    option deprecated = true;
  }

  // Deprecated.
  rpc QueryJobs(QueryJobsRequest) returns (QueryJobsResponse) {
    option deprecated = true;
  }

  // Deprecated.
  rpc DropJobs(DropJobsRequest) returns (common.Status) {
    option deprecated = true;
  }

  // Deprecated.
  rpc GetJobStats(GetJobStatsRequest) returns (GetJobStatsResponse) {
    option deprecated = true;
  }

  // Deprecated.
  rpc CreateJobV2(CreateJobV2Request) returns (common.Status) {
    option deprecated = true;
  }

  // Deprecated.
  rpc QueryJobsV2(QueryJobsV2Request) returns (QueryJobsV2Response) {
    option deprecated = true;
  }

  // Deprecated.
  rpc DropJobsV2(DropJobsV2Request) returns (common.Status) {
    option deprecated = true;
  }

  // https://wiki.lfaidata.foundation/display/MIL/MEP+8+--+Add+metrics+for+proxy
  rpc GetMetrics(milvus.GetMetricsRequest)
      returns (milvus.GetMetricsResponse) {}

  // Creates a task described by CreateTaskRequest.
  rpc CreateTask(CreateTaskRequest) returns (common.Status) {}

  // Queries a task; the result is returned in QueryTaskResponse.
  rpc QueryTask(QueryTaskRequest) returns (QueryTaskResponse) {}

  // Drops the task identified by DropTaskRequest.
  rpc DropTask(DropTaskRequest) returns (common.Status) {}
}
// CreateTaskRequest is the input of IndexNode.CreateTask.
message CreateTaskRequest {
  // Body of the request, carried as raw bytes.
  // NOTE(review): presumably a serialized task-specific message selected by
  // the taskType property — confirm against the caller.
  bytes payload = 1;

  // Properties of the request. The following keys must be present:
  //   - clusterID
  //   - taskID
  //   - taskType
  //   - taskSlot
  map<string, string> properties = 2;
}
// QueryTaskRequest is the input of IndexNode.QueryTask.
message QueryTaskRequest {
  // Properties of the request. They contain the keys:
  //   - clusterID
  //   - taskID
  //   - taskType
  map<string, string> properties = 1;
}
// QueryTaskResponse is the output of IndexNode.QueryTask.
message QueryTaskResponse {
  // Status of the QueryTask call.
  common.Status status = 1;

  // Body of the response, carried as raw bytes.
  bytes payload = 2;

  // Properties of the response. They contain the key:
  //   - taskState
  map<string, string> properties = 3;
}
// DropTaskRequest is the input of IndexNode.DropTask.
message DropTaskRequest {
  // Properties of the request. They contain the keys:
  //   - clusterID
  //   - taskID
  //   - taskType
  map<string, string> properties = 1;
}
// CreateJobRequest is the CreateIndexRequest: it carries the parameters of an
// index build job. It is the input of the deprecated IndexNode.CreateJob RPC
// and the `index_request` arm of CreateJobV2Request.
message CreateJobRequest {
  string clusterID = 1;
  string index_file_prefix = 2;
  int64 buildID = 3;
  repeated string data_paths = 4;
  int64 index_version = 5;
  int64 indexID = 6;
  string index_name = 7;

  // Storage configuration and index/type parameters (key-value pairs).
  index.StorageConfig storage_config = 8;
  repeated common.KeyValuePair index_params = 9;
  repeated common.KeyValuePair type_params = 10;

  int64 num_rows = 11;
  int32 current_index_version = 12;

  // Collection / partition / segment / field the job refers to.
  int64 collectionID = 13;
  int64 partitionID = 14;
  int64 segmentID = 15;
  int64 fieldID = 16;
  string field_name = 17;
  schema.DataType field_type = 18;

  string store_path = 19;
  int64 store_version = 20;
  string index_store_path = 21;
  int64 dim = 22;
  repeated int64 data_ids = 23;
  repeated index.OptionalFieldInfo optional_scalar_fields = 24;

  // Schema of the field referenced by this request.
  schema.FieldSchema field = 25;

  bool partition_key_isolation = 26;
  int32 current_scalar_index_version = 27;
  int64 task_slot = 28;
  int64 storage_version = 29;
  int64 lack_binlog_rows = 30;
  repeated data.FieldBinlog insert_logs = 31;
  repeated common.KeyValuePair plugin_context = 32;
  string manifest = 33;
}
// QueryJobsRequest is the input of the deprecated IndexNode.QueryJobs RPC.
message QueryJobsRequest {
  string clusterID = 1;

  // IDs of the tasks to query.
  repeated int64 taskIDs = 2;
}
// QueryJobsResponse is the output of the deprecated IndexNode.QueryJobs RPC.
message QueryJobsResponse {
  // Status of the QueryJobs call.
  common.Status status = 1;

  string clusterID = 2;

  // One IndexTaskInfo entry per reported index task.
  repeated IndexTaskInfo index_infos = 3;
}
// DropJobsRequest is the input of the deprecated IndexNode.DropJobs RPC.
message DropJobsRequest {
  string clusterID = 1;

  // IDs of the tasks to drop.
  repeated int64 taskIDs = 2;
}
// GetJobStatsRequest is the (currently field-less) input of the deprecated
// IndexNode.GetJobStats RPC.
message GetJobStatsRequest {}
// GetJobStatsResponse is the output of the deprecated IndexNode.GetJobStats
// RPC. It reports job counters, slot counters and per-job information.
message GetJobStatsResponse {
  // Status of the GetJobStats call.
  common.Status status = 1;

  // Job counters.
  int64 total_job_num = 2;
  int64 in_progress_job_num = 3;
  int64 enqueue_job_num = 4;

  int64 available_slots = 5;

  // Per-job information.
  repeated index.JobInfo job_infos = 6;

  // Deprecated. Marked with the `deprecated` field option so generated code
  // and tooling flag remaining uses; the field number stays allocated for
  // wire compatibility.
  bool enable_disk = 7 [deprecated = true];

  int64 total_slots = 8;
}
// AnalyzeRequest carries the parameters of an analyze job. It is the
// `analyze_request` arm of CreateJobV2Request.
message AnalyzeRequest {
  string clusterID = 1;
  int64 taskID = 2;

  // Collection / partition / field the job refers to.
  int64 collectionID = 3;
  int64 partitionID = 4;
  int64 fieldID = 5;
  string fieldName = 6;
  schema.DataType field_type = 7;

  // Segment statistics keyed by an int64.
  // NOTE(review): the key is presumably the segment ID — confirm.
  map<int64, index.SegmentStats> segment_stats = 8;

  int64 version = 9;
  index.StorageConfig storage_config = 10;
  int64 dim = 11;
  double max_train_size_ratio = 12;
  int64 num_clusters = 13;

  // Schema of the field referenced by this request.
  schema.FieldSchema field = 14;

  // Cluster size parameters.
  double min_cluster_size_ratio = 15;
  double max_cluster_size_ratio = 16;
  int64 max_cluster_size = 17;

  int64 task_slot = 18;
  repeated common.KeyValuePair plugin_context = 19;
}
// CreateStatsRequest carries the parameters of a stats job. It is the
// `stats_request` arm of CreateJobV2Request.
//
// Fields that are documented as deprecated also carry the `deprecated` field
// option so generated code and tooling flag remaining uses. Their field
// numbers stay allocated for wire compatibility.
message CreateStatsRequest {
  string clusterID = 1;
  int64 taskID = 2;
  int64 collectionID = 3;
  int64 partitionID = 4;
  string insert_channel = 5;
  int64 segmentID = 6;
  repeated data.FieldBinlog insert_logs = 7;

  // Deprecated: after sort stats moved, it's not used.
  repeated data.FieldBinlog delta_logs = 8 [deprecated = true];

  index.StorageConfig storage_config = 9;
  schema.CollectionSchema schema = 10;
  index.StatsSubJob subJobType = 11;
  int64 targetSegmentID = 12;
  int64 startLogID = 13;
  int64 endLogID = 14;
  int64 num_rows = 15;

  // Deprecated: after sort stats moved, it's not used.
  int64 collection_ttl = 16 [deprecated = true];

  // Deprecated: after sort stats moved, it's not used.
  uint64 current_ts = 17 [deprecated = true];

  int64 task_version = 18;

  // Deprecated: after sort stats moved, it's not used.
  uint64 binlogMaxSize = 19 [deprecated = true];

  bool enable_json_key_stats = 20;
  int64 json_key_stats_tantivy_memory = 21;
  int64 json_key_stats_data_format = 22;

  // Deprecated: the sort logic has been moved into the compaction process.
  bool enable_json_key_stats_in_sort = 23 [deprecated = true];

  int64 task_slot = 24;
  int64 storage_version = 25;
  int32 current_scalar_index_version = 26;
  repeated common.KeyValuePair plugin_context = 27;
  int64 json_stats_max_shredding_columns = 28;
  double json_stats_shredding_ratio_threshold = 29;
  int64 json_stats_write_batch_size = 30;
  string manifest_path = 31;

  // File resources attached to this stats job.
  repeated internal.FileResourceInfo file_resources = 32;
}
// CreateJobV2Request is the input of the deprecated IndexNode.CreateJobV2
// RPC. Exactly one job-specific request may be set through the `request`
// oneof.
message CreateJobV2Request {
  string clusterID = 1;
  int64 taskID = 2;
  index.JobType job_type = 3;

  // Job-specific request body; the arms are mutually exclusive.
  oneof request {
    AnalyzeRequest analyze_request = 4;
    CreateJobRequest index_request = 5;
    CreateStatsRequest stats_request = 6;
  }
}
// QueryJobsV2Request is the input of the deprecated IndexNode.QueryJobsV2
// RPC.
message QueryJobsV2Request {
  string clusterID = 1;

  // IDs of the tasks to query.
  repeated int64 taskIDs = 2;

  // Job type of the queried tasks.
  index.JobType job_type = 3;
}
// IndexTaskInfo describes one index task. It is returned in
// QueryJobsResponse.index_infos and in IndexJobResults.results.
message IndexTaskInfo {
  int64 buildID = 1;
  common.IndexState state = 2;
  repeated string index_file_keys = 3;
  uint64 serialized_size = 4;

  // Failure reason text.
  // NOTE(review): presumably only meaningful when `state` is a failure state
  // — confirm.
  string fail_reason = 5;

  int32 current_index_version = 6;
  int64 index_store_version = 7;
  uint64 mem_size = 8;
  int32 current_scalar_index_version = 9;
}
// IndexJobResults wraps a list of IndexTaskInfo so it can be used as the
// `index_job_results` arm of QueryJobsV2Response.result.
message IndexJobResults {
  repeated IndexTaskInfo results = 1;
}
// AnalyzeResult describes one analyze task; it is the element type of
// AnalyzeResults.results.
message AnalyzeResult {
  int64 taskID = 1;
  index.JobState state = 2;
  string fail_reason = 3;
  string centroids_file = 4;
}
// AnalyzeResults wraps a list of AnalyzeResult so it can be used as the
// `analyze_job_results` arm of QueryJobsV2Response.result.
message AnalyzeResults {
  repeated AnalyzeResult results = 1;
}
// StatsResult describes one stats task; it is the element type of
// StatsResults.results.
message StatsResult {
  int64 taskID = 1;
  index.JobState state = 2;
  string fail_reason = 3;

  // Collection / partition / segment / channel the result refers to.
  int64 collectionID = 4;
  int64 partitionID = 5;
  int64 segmentID = 6;
  string channel = 7;

  // Binlogs reported by the task.
  repeated data.FieldBinlog insert_logs = 8;
  repeated data.FieldBinlog stats_logs = 9;

  // Text index stats keyed by an int64.
  // NOTE(review): the key is presumably the field ID — confirm.
  map<int64, data.TextIndexStats> text_stats_logs = 10;

  int64 num_rows = 11;
  repeated data.FieldBinlog bm25_logs = 12;

  // JSON key stats keyed by an int64.
  // NOTE(review): the key is presumably the field ID — confirm.
  map<int64, data.JsonKeyStats> json_key_stats_logs = 13;
}
// StatsResults wraps a list of StatsResult so it can be used as the
// `stats_job_results` arm of QueryJobsV2Response.result.
message StatsResults {
  repeated StatsResult results = 1;
}
// QueryJobsV2Response is the output of the deprecated IndexNode.QueryJobsV2
// RPC. The job-type-specific results are carried in the `result` oneof.
message QueryJobsV2Response {
  // Status of the QueryJobsV2 call.
  common.Status status = 1;

  string clusterID = 2;

  // Job-type-specific results; the arms are mutually exclusive.
  oneof result {
    IndexJobResults index_job_results = 3;
    AnalyzeResults analyze_job_results = 4;
    StatsResults stats_job_results = 5;
  }
}
// DropJobsV2Request is the input of the deprecated IndexNode.DropJobsV2 RPC.
message DropJobsV2Request {
  string clusterID = 1;

  // IDs of the tasks to drop.
  repeated int64 taskIDs = 2;

  // Job type of the tasks to drop.
  index.JobType job_type = 3;
}