milvus/pkg/taskcommon/properties.go
wei liu 975c91df16
feat: Add comprehensive snapshot functionality for collections (#44361)
issue: #44358

Implement a complete snapshot management system covering creation,
deletion, listing, description, and restoration across all system
components.

Key features:
- Create snapshots for entire collections
- Drop snapshots by name with proper cleanup
- List snapshots with collection filtering
- Describe snapshot details and metadata
- Restore collections from a snapshot
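
A minimal client-side sketch of these operations. This is hedged: the
snapshot method names (`CreateSnapshot`, `ListSnapshots`, `DropSnapshot`)
and their option constructors are assumptions inferred from the feature
list above, not the verified SDK surface:

```go
// Hypothetical usage sketch only: the snapshot methods and New*Option
// constructors below are assumptions inferred from the feature list.
package main

import (
	"context"
	"log"

	"github.com/milvus-io/milvus/client/v2/milvusclient"
)

func main() {
	ctx := context.Background()
	cli, err := milvusclient.New(ctx, &milvusclient.ClientConfig{Address: "localhost:19530"})
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close(ctx)

	// Create a point-in-time snapshot of an entire collection.
	if err := cli.CreateSnapshot(ctx, milvusclient.NewCreateSnapshotOption("my_collection", "snap_20260106")); err != nil {
		log.Fatal(err)
	}

	// List snapshots filtered by collection.
	snapshots, err := cli.ListSnapshots(ctx, milvusclient.NewListSnapshotsOption("my_collection"))
	if err != nil {
		log.Fatal(err)
	}
	log.Println("snapshots:", snapshots)

	// Drop the snapshot by name once it is no longer needed.
	if err := cli.DropSnapshot(ctx, milvusclient.NewDropSnapshotOption("my_collection", "snap_20260106")); err != nil {
		log.Fatal(err)
	}
}
```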

Components added/modified:
- Client SDK with full snapshot API support and options
- DataCoord snapshot service with metadata management
- Proxy layer with task-based snapshot operations
- Protocol buffer definitions for snapshot RPCs
- Comprehensive unit tests with mockey framework
- Integration tests for end-to-end validation

Technical implementation:
- Snapshot metadata storage in etcd with proper indexing
- File-based snapshot data persistence in object storage
- Garbage collection integration for snapshot cleanup
- Error handling and validation across all operations
- Thread-safe operations with proper locking mechanisms
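
A compressed sketch of the create path these points imply. Every
identifier here (`snapshotManager`, `metaStore`, `manifestWriter`, the
state constants) is an illustrative assumption, not the actual DataCoord
code:

```go
// Illustrative sketch of two-phase snapshot creation: metadata is written
// to etcd in PENDING state first, manifests are persisted to object
// storage, and only then does the state flip to COMMITTED.
package snapshot

import "context"

type snapshotState int

const (
	statePending snapshotState = iota
	stateCommitted
	stateDeleting
)

type snapshotInfo struct {
	CollectionID int64
	Name         string
	State        snapshotState
}

type metaStore interface {
	saveSnapshot(ctx context.Context, s *snapshotInfo) error // upsert into etcd
	dropSnapshot(ctx context.Context, s *snapshotInfo) error
}

type manifestWriter interface {
	writeManifests(ctx context.Context, s *snapshotInfo) error // object storage
}

type snapshotManager struct {
	meta   metaStore
	writer manifestWriter
}

// createSnapshot records a PENDING snapshot in etcd, persists segment
// manifests to object storage, then commits; a failure before commit
// rolls back the pending record so GC can reclaim any orphaned files.
func (m *snapshotManager) createSnapshot(ctx context.Context, collectionID int64, name string) error {
	snap := &snapshotInfo{CollectionID: collectionID, Name: name, State: statePending}
	if err := m.meta.saveSnapshot(ctx, snap); err != nil {
		return err
	}
	if err := m.writer.writeManifests(ctx, snap); err != nil {
		_ = m.meta.dropSnapshot(ctx, snap) // best-effort rollback of the pending record
		return err
	}
	snap.State = stateCommitted
	return m.meta.saveSnapshot(ctx, snap)
}
```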

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
- Core invariant/assumption: snapshots are immutable point‑in‑time
captures identified by (collection, snapshot name/ID); etcd snapshot
metadata is authoritative for lifecycle (PENDING → COMMITTED → DELETING)
and per‑segment manifests live in object storage (Avro / StorageV2). GC
and restore logic must see snapshotRefIndex loaded
(snapshotMeta.IsRefIndexLoaded) before reclaiming or relying on
segment/index files.
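
A minimal sketch of that GC guard. `IsRefIndexLoaded` and
`GetSnapshotBySegment` are named above, but their signatures and the
surrounding types here are assumptions:

```go
package gc

// Hypothetical shape of the snapshot reference index consulted by GC;
// method names mirror the notes, signatures are assumed.
type snapshotMeta interface {
	IsRefIndexLoaded() bool
	GetSnapshotBySegment(segmentID int64) []int64 // IDs of snapshots referencing the segment
}

type garbageCollector struct {
	meta snapshotMeta
}

// canReclaimSegment reports whether a segment's files are safe to delete.
// Until the reference index is loaded, GC must assume any file may still
// be referenced by a snapshot and skip reclamation entirely.
func (g *garbageCollector) canReclaimSegment(segmentID int64) bool {
	if !g.meta.IsRefIndexLoaded() {
		return false // index not ready: err on the side of keeping files
	}
	return len(g.meta.GetSnapshotBySegment(segmentID)) == 0
}
```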

- New capability added: a full end‑to‑end snapshot subsystem: client SDK
APIs (Create/Drop/List/Describe/Restore + restore job queries),
DataCoord SnapshotWriter/Reader (Avro + StorageV2 manifests),
snapshotMeta in meta, SnapshotManager orchestration
(create/drop/describe/list/restore), copy‑segment restore
tasks/inspector/checker, the proxy & RPC surface, GC integration, and
docs/tests. Together these enable point‑in‑time collection snapshots
persisted to object storage, with restores orchestrated across
components.

- Logic removed/simplified and why: duplicated recursive
compaction/delta‑log traversal and ad‑hoc lookup code were consolidated
behind two focused APIs/owners (Handler.GetDeltaLogFromCompactTo for
delta traversal and SnapshotManager/SnapshotReader for snapshot I/O).
MixCoord/coordinator broker paths were converted to thin RPC proxies.
This eliminates multiple implementations of the same traversal/lookup,
reducing divergence and simplifying responsibility boundaries.

- Why this does NOT introduce data loss or regressions: snapshot
create/drop use explicit two‑phase semantics (PENDING → COMMIT/DELETING)
with SnapshotWriter writing manifests and metadata before commit; GC
uses snapshotRefIndex guards and
IsRefIndexLoaded/GetSnapshotBySegment/GetSnapshotByIndex checks to avoid
removing referenced files; restore flow pre‑allocates job IDs, validates
resources (partitions/indexes), performs rollback on failure
(rollbackRestoreSnapshot), and converts/updates segment/index metadata
only after successful copy tasks. Extensive unit and integration tests
exercise pending/deleting/GC/restore/error paths to ensure idempotence
and protection against premature deletion.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
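
The failure-handling shape of the restore flow described above, as a
hedged sketch: `rollbackRestoreSnapshot` is the only name taken from the
notes, and its signature plus every other identifier is an assumption:

```go
package restore

import "context"

// Hypothetical dependencies of the restore flow; only
// rollbackRestoreSnapshot is named in the notes, with an assumed
// signature.
type restoreDeps interface {
	allocJobID(ctx context.Context) (int64, error)
	validateResources(ctx context.Context, collectionID int64) error
	runCopySegmentTasks(ctx context.Context, jobID int64) error
	commitSegmentAndIndexMeta(ctx context.Context, jobID int64) error
	rollbackRestoreSnapshot(ctx context.Context, jobID int64) error
}

// restoreSnapshot pre-allocates a job ID, validates target
// partitions/indexes, runs the copy-segment tasks, and only converts
// segment/index metadata after all copies succeed; any copy failure
// triggers rollback of the partial restore state.
func restoreSnapshot(ctx context.Context, deps restoreDeps, collectionID int64) error {
	jobID, err := deps.allocJobID(ctx)
	if err != nil {
		return err
	}
	if err := deps.validateResources(ctx, collectionID); err != nil {
		return err
	}
	if err := deps.runCopySegmentTasks(ctx, jobID); err != nil {
		_ = deps.rollbackRestoreSnapshot(ctx, jobID) // undo partial restore state
		return err
	}
	return deps.commitSegmentAndIndexMeta(ctx, jobID)
}
```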

---------

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
2026-01-06 10:15:24 +08:00

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package taskcommon

import (
	"fmt"
	"strconv"

	"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
)

// properties keys
const (
	// request
	ClusterIDKey   = "cluster_id"
	TaskIDKey      = "task_id"
	TypeKey        = "task_type"
	SubTypeKey     = "task_sub_type" // optional, only for Stats
	SlotKey        = "task_slot"
	NumRowsKey     = "num_row"      // optional, only for Index, Stats
	TaskVersionKey = "task_version" // optional, only for Index, Stats and Analyze
	// result
	StateKey  = "task_state"
	ReasonKey = "task_reason"
)

// Properties is a string key/value bag used to exchange task metadata
// between coordinators and workers.
type Properties map[string]string

// NewProperties wraps the given map as Properties, allocating an empty
// map when nil is passed so that Append* calls never panic.
func NewProperties(properties map[string]string) Properties {
	if properties == nil {
		properties = map[string]string{}
	}
	return properties
}

// WrapErrTaskPropertyLack returns an error describing a missing property
// for the given task.
func WrapErrTaskPropertyLack(lackProperty string, taskID any) error {
	return fmt.Errorf("cannot find property '%s' for task '%v'", lackProperty, taskID)
}

func (p Properties) AppendClusterID(clusterID string) {
	p[ClusterIDKey] = clusterID
}

func (p Properties) AppendTaskID(taskID int64) {
	p[TaskIDKey] = fmt.Sprintf("%d", taskID)
}

// AppendType stores the task type, falling back to TypeNone for
// unrecognized types.
func (p Properties) AppendType(t Type) {
	switch t {
	case PreImport, Import, Compaction, Index, Stats, Analyze, CopySegment:
		p[TypeKey] = t
	default:
		p[TypeKey] = TypeNone
	}
}

func (p Properties) AppendSubType(subType string) {
	p[SubTypeKey] = subType
}

func (p Properties) AppendTaskSlot(slot int64) {
	p[SlotKey] = fmt.Sprintf("%d", slot)
}

func (p Properties) AppendNumRows(rows int64) {
	p[NumRowsKey] = fmt.Sprintf("%d", rows)
}

func (p Properties) AppendTaskVersion(version int64) {
	p[TaskVersionKey] = fmt.Sprintf("%d", version)
}

func (p Properties) AppendReason(reason string) {
	p[ReasonKey] = reason
}

func (p Properties) AppendTaskState(state State) {
	p[StateKey] = state.String()
}

// GetTaskType returns the stored task type, or an error if it is missing
// or unrecognized.
func (p Properties) GetTaskType() (Type, error) {
	if _, ok := p[TypeKey]; !ok {
		return "", WrapErrTaskPropertyLack(TypeKey, p[TaskIDKey])
	}
	switch p[TypeKey] {
	case PreImport, Import, Compaction, Index, Stats, Analyze, CopySegment:
		return p[TypeKey], nil
	default:
		return p[TypeKey], fmt.Errorf("unrecognized task type '%s', taskID=%s", p[TypeKey], p[TaskIDKey])
	}
}

// GetJobType maps the task type onto the corresponding indexpb job type;
// types without a job mapping yield JobTypeNone.
func (p Properties) GetJobType() (indexpb.JobType, error) {
	taskType, err := p.GetTaskType()
	if err != nil {
		return indexpb.JobType_JobTypeNone, err
	}
	switch taskType {
	case Index:
		return indexpb.JobType_JobTypeIndexJob, nil
	case Stats:
		return indexpb.JobType_JobTypeStatsJob, nil
	case Analyze:
		return indexpb.JobType_JobTypeAnalyzeJob, nil
	default:
		return indexpb.JobType_JobTypeNone, nil
	}
}

func (p Properties) GetSubTaskType() string {
	return p[SubTypeKey]
}

func (p Properties) GetClusterID() (string, error) {
	if _, ok := p[ClusterIDKey]; !ok {
		return "", WrapErrTaskPropertyLack(ClusterIDKey, p[TaskIDKey])
	}
	return p[ClusterIDKey], nil
}

func (p Properties) GetTaskID() (int64, error) {
	if _, ok := p[TaskIDKey]; !ok {
		return 0, WrapErrTaskPropertyLack(TaskIDKey, 0)
	}
	return strconv.ParseInt(p[TaskIDKey], 10, 64)
}

// GetTaskState parses the stored state name via indexpb.JobState_value.
func (p Properties) GetTaskState() (State, error) {
	if _, ok := p[StateKey]; !ok {
		return 0, WrapErrTaskPropertyLack(StateKey, p[TaskIDKey])
	}
	stateStr := p[StateKey]
	if _, ok := indexpb.JobState_value[stateStr]; !ok {
		return None, fmt.Errorf("invalid task state '%v', taskID=%s", stateStr, p[TaskIDKey])
	}
	return State(indexpb.JobState_value[stateStr]), nil
}

func (p Properties) GetTaskReason() string {
	return p[ReasonKey]
}

func (p Properties) GetTaskSlot() (int64, error) {
	if _, ok := p[SlotKey]; !ok {
		return 0, WrapErrTaskPropertyLack(SlotKey, p[TaskIDKey])
	}
	return strconv.ParseInt(p[SlotKey], 10, 64)
}

// GetNumRows returns the stored row count, or 0 when the property is
// missing or malformed.
func (p Properties) GetNumRows() int64 {
	if _, ok := p[NumRowsKey]; !ok {
		return 0
	}
	rows, err := strconv.ParseInt(p[NumRowsKey], 10, 64)
	if err != nil {
		return 0
	}
	return rows
}

// GetTaskVersion returns the stored task version, or 0 when the property
// is missing or malformed.
func (p Properties) GetTaskVersion() int64 {
	if _, ok := p[TaskVersionKey]; !ok {
		return 0
	}
	version, err := strconv.ParseInt(p[TaskVersionKey], 10, 64)
	if err != nil {
		return 0
	}
	return version
}
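
A short usage sketch built only from the functions in this file (the
Import constant is one of the task types matched in AppendType):

```go
package main

import (
	"fmt"
	"log"

	"github.com/milvus-io/milvus/pkg/v2/taskcommon"
)

func main() {
	props := taskcommon.NewProperties(nil)
	props.AppendClusterID("cluster-0")
	props.AppendTaskID(42)
	props.AppendType(taskcommon.Import)
	props.AppendTaskSlot(4)

	// Reading a key that was never set yields a descriptive error.
	if _, err := taskcommon.NewProperties(nil).GetClusterID(); err != nil {
		log.Println(err) // cannot find property 'cluster_id' for task ''
	}

	taskID, err := props.GetTaskID()
	if err != nil {
		log.Fatal(err)
	}
	taskType, _ := props.GetTaskType()
	fmt.Println(taskID, taskType)
}
```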