wei liu 975c91df16
feat: Add comprehensive snapshot functionality for collections (#44361)
issue: #44358

Implement a complete snapshot management system, including creation,
deletion, listing, description, and restoration capabilities across all
system components.

Key features:
- Create snapshots for entire collections
- Drop snapshots by name with proper cleanup
- List snapshots with collection filtering
- Describe snapshot details and metadata
- Restore collections from snapshots
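A minimal usage sketch of the client-side surface, assuming Go client methods
and option constructors of roughly this shape (the names and signatures below
are illustrative assumptions, not the exact SDK API added by this PR):

```go
// Hypothetical round trip over the snapshot API; method and option names are
// assumptions. Assumes "context", "log", and the milvusclient package are imported.
func snapshotRoundTrip(ctx context.Context, cli *milvusclient.Client) error {
	// Create a point-in-time snapshot of the whole collection.
	if err := cli.CreateSnapshot(ctx, milvusclient.NewCreateSnapshotOption("my_collection", "snap_20260106")); err != nil {
		return err
	}
	// List snapshots filtered by collection.
	snapshots, err := cli.ListSnapshots(ctx, milvusclient.NewListSnapshotsOption("my_collection"))
	if err != nil {
		return err
	}
	log.Printf("snapshots: %v", snapshots)
	// Drop the snapshot by name once it is no longer needed.
	return cli.DropSnapshot(ctx, milvusclient.NewDropSnapshotOption("my_collection", "snap_20260106"))
}
```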

Components added/modified:
- Client SDK with full snapshot API support and options
- DataCoord snapshot service with metadata management
- Proxy layer with task-based snapshot operations
- Protocol buffer definitions for snapshot RPCs
- Comprehensive unit tests using the mockey framework
- Integration tests for end-to-end validation

Technical implementation:
- Snapshot metadata storage in etcd with proper indexing
- File-based snapshot data persistence in object storage
- Garbage collection integration for snapshot cleanup
- Error handling and validation across all operations
- Thread-safe operations with proper locking mechanisms
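For concreteness, a sketch of how a snapshot metadata record and its etcd key
might be laid out; the field names and key prefix below are assumptions for
illustration, not the actual schema:

```go
// Hypothetical shape of the snapshot metadata record persisted to etcd; the
// real proto message and key layout may differ. Assumes "fmt" and "time" are imported.
type snapshotInfo struct {
	SnapshotID   int64
	Name         string
	CollectionID int64
	State        string    // "PENDING", "COMMITTED", or "DELETING"
	CreatedAt    time.Time // capture timestamp
	ManifestPath string    // per-segment manifest location in object storage
}

// Keys are indexed by collection so listing can filter cheaply, e.g.
// snapshots/<collectionID>/<snapshotID> (assumed layout).
func snapshotKey(collectionID, snapshotID int64) string {
	return fmt.Sprintf("snapshots/%d/%d", collectionID, snapshotID)
}
```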

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant/assumption: snapshots are immutable point‑in‑time
captures identified by (collection, snapshot name/ID). The etcd snapshot
metadata is authoritative for the lifecycle (PENDING → COMMITTED →
DELETING), and per‑segment manifests live in object storage (Avro /
StorageV2). GC and restore logic must see the snapshotRefIndex loaded
(snapshotMeta.IsRefIndexLoaded) before reclaiming or relying on
segment/index files.
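A minimal sketch of that guard, reusing the accessor names mentioned in these
notes (IsRefIndexLoaded, GetSnapshotBySegment); the surrounding interface is
illustrative, not the real snapshotMeta type:

```go
// refIndexView is an illustrative stand-in for the snapshot metadata's
// reference index; the real snapshotMeta exposes a richer API.
type refIndexView interface {
	IsRefIndexLoaded() bool
	GetSnapshotBySegment(segmentID int64) []int64 // snapshot IDs referencing the segment
}

// canReclaimSegment reports whether GC may drop a segment's files: never
// before the reference index is loaded, and never while any snapshot still
// references the segment.
func canReclaimSegment(meta refIndexView, segmentID int64) bool {
	if !meta.IsRefIndexLoaded() {
		return false
	}
	return len(meta.GetSnapshotBySegment(segmentID)) == 0
}
```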

- New capability added: full end‑to‑end snapshot subsystem — client SDK
APIs (Create/Drop/List/Describe/Restore + restore job queries),
DataCoord SnapshotWriter/Reader (Avro + StorageV2 manifests),
snapshotMeta in meta, SnapshotManager orchestration
(create/drop/describe/list/restore), copy‑segment restore
tasks/inspector/checker, proxy & RPC surface, GC integration, and
docs/tests — enabling point‑in‑time collection snapshots persisted to
object storage and restorations orchestrated across components.

- Logic removed/simplified and why: duplicated recursive
compaction/delta‑log traversal and ad‑hoc lookup code were consolidated
behind two focused APIs/owners (Handler.GetDeltaLogFromCompactTo for
delta traversal and SnapshotManager/SnapshotReader for snapshot I/O).
MixCoord/coordinator broker paths were converted to thin RPC proxies.
This eliminates multiple implementations of the same traversal/lookup,
reducing divergence and simplifying responsibility boundaries.

- Why this does NOT introduce data loss or regressions: snapshot
create/drop use explicit two‑phase semantics (PENDING → COMMIT/DELETING)
with SnapshotWriter writing manifests and metadata before commit; GC
uses snapshotRefIndex guards and
IsRefIndexLoaded/GetSnapshotBySegment/GetSnapshotByIndex checks to avoid
removing referenced files; restore flow pre‑allocates job IDs, validates
resources (partitions/indexes), performs rollback on failure
(rollbackRestoreSnapshot), and converts/updates segment/index metadata
only after successful copy tasks. Extensive unit and integration tests
exercise pending/deleting/GC/restore/error paths to ensure idempotence
and protection against premature deletion.
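A condensed sketch of that two‑phase create path under the same assumptions
(the component names and method signatures below are illustrative, not the
actual SnapshotManager/SnapshotWriter API):

```go
// Minimal stand-ins so the sketch is self-contained; the real components are
// richer (snapshotMeta in meta, SnapshotWriter for Avro/StorageV2 manifests).
// Assumes "context" is imported.
type pendingSnapshot struct{ SnapshotID int64 }

type snapshotStore interface {
	AddPending(ctx context.Context, collectionID int64, name string) (pendingSnapshot, error)
	Commit(ctx context.Context, snapshotID int64) error
	Drop(ctx context.Context, snapshotID int64) error
}

type manifestWriter interface {
	WriteManifests(ctx context.Context, snap pendingSnapshot) error
}

// createSnapshot illustrates the two-phase flow: a PENDING marker is written
// first, manifests are persisted, and only then is the snapshot committed.
// On failure the pending marker is dropped and GC reclaims any partial files.
func createSnapshot(ctx context.Context, meta snapshotStore, writer manifestWriter, collectionID int64, name string) error {
	snap, err := meta.AddPending(ctx, collectionID, name)
	if err != nil {
		return err
	}
	if err := writer.WriteManifests(ctx, snap); err != nil {
		_ = meta.Drop(ctx, snap.SnapshotID) // best-effort rollback; GC handles leftovers
		return err
	}
	return meta.Commit(ctx, snap.SnapshotID)
}
```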
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
2026-01-06 10:15:24 +08:00

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importv2

import (
	"github.com/samber/lo"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/util/conc"
)

type TaskType int

const (
	PreImportTaskType   TaskType = 0
	ImportTaskType      TaskType = 1
	L0PreImportTaskType TaskType = 2
	L0ImportTaskType    TaskType = 3
	CopySegmentTaskType TaskType = 4
)

var ImportTaskTypeName = map[TaskType]string{
	0: "PreImportTask",
	1: "ImportTask",
	2: "L0PreImportTaskType",
	3: "L0ImportTaskType",
	4: "CopySegmentTask",
}

func (t TaskType) String() string {
	return ImportTaskTypeName[t]
}

type TaskFilter func(task Task) bool

func WithStates(states ...datapb.ImportTaskStateV2) TaskFilter {
	return func(task Task) bool {
		for _, state := range states {
			if task.GetState() == state {
				return true
			}
		}
		return false
	}
}

func WithType(taskType TaskType) TaskFilter {
	return func(task Task) bool {
		return task.GetType() == taskType
	}
}
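
// filterTasksExample is an illustrative helper (not part of the original
// file) showing how TaskFilters compose: a task is kept only when every
// filter matches, so WithType and WithStates can be combined freely, e.g.
// filterTasksExample(tasks, WithType(ImportTaskType), WithStates(datapb.ImportTaskStateV2_Pending)).
func filterTasksExample(tasks []Task, filters ...TaskFilter) []Task {
	return lo.Filter(tasks, func(task Task, _ int) bool {
		for _, filter := range filters {
			if !filter(task) {
				return false
			}
		}
		return true
	})
}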

type UpdateAction func(task Task)

func UpdateState(state datapb.ImportTaskStateV2) UpdateAction {
	return func(t Task) {
		switch t.GetType() {
		case PreImportTaskType:
			t.(*PreImportTask).PreImportTask.State = state
		case ImportTaskType:
			t.(*ImportTask).ImportTaskV2.State = state
		case L0PreImportTaskType:
			t.(*L0PreImportTask).PreImportTask.State = state
		case L0ImportTaskType:
			t.(*L0ImportTask).ImportTaskV2.State = state
		case CopySegmentTaskType:
			t.(*CopySegmentTask).state = state
		}
	}
}

func UpdateReason(reason string) UpdateAction {
	return func(t Task) {
		switch t.GetType() {
		case PreImportTaskType:
			t.(*PreImportTask).PreImportTask.Reason = reason
		case ImportTaskType:
			t.(*ImportTask).ImportTaskV2.Reason = reason
		case L0PreImportTaskType:
			t.(*L0PreImportTask).PreImportTask.Reason = reason
		case L0ImportTaskType:
			t.(*L0ImportTask).ImportTaskV2.Reason = reason
		case CopySegmentTaskType:
			t.(*CopySegmentTask).reason = reason
		}
	}
}
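
// applyTaskUpdates is an illustrative helper (not part of the original file)
// showing how UpdateActions compose: callers typically pair a state change
// with a reason, e.g. applyTaskUpdates(task,
// UpdateState(datapb.ImportTaskStateV2_Failed), UpdateReason(err.Error())).
func applyTaskUpdates(task Task, actions ...UpdateAction) {
	for _, action := range actions {
		action(task)
	}
}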

func UpdateFileStat(idx int, fileStat *datapb.ImportFileStats) UpdateAction {
	return func(task Task) {
		var t *datapb.PreImportTask
		switch it := task.(type) {
		case *PreImportTask:
			t = it.PreImportTask
		case *L0PreImportTask:
			t = it.PreImportTask
		}
		if t != nil {
			t.FileStats[idx].FileSize = fileStat.GetFileSize()
			t.FileStats[idx].TotalRows = fileStat.GetTotalRows()
			t.FileStats[idx].TotalMemorySize = fileStat.GetTotalMemorySize()
			t.FileStats[idx].HashedStats = fileStat.GetHashedStats()
		}
	}
}

func UpdateSegmentInfo(info *datapb.ImportSegmentInfo) UpdateAction {
	mergeFn := func(current []*datapb.FieldBinlog, new []*datapb.FieldBinlog) []*datapb.FieldBinlog {
		for _, binlog := range new {
			fieldBinlogs, ok := lo.Find(current, func(log *datapb.FieldBinlog) bool {
				return log.GetFieldID() == binlog.GetFieldID()
			})
			if !ok || fieldBinlogs == nil {
				current = append(current, binlog)
			} else {
				fieldBinlogs.Binlogs = append(fieldBinlogs.Binlogs, binlog.Binlogs...)
			}
		}
		return current
	}
	return func(task Task) {
		var segmentsInfo map[int64]*datapb.ImportSegmentInfo
		switch it := task.(type) {
		case *ImportTask:
			segmentsInfo = it.segmentsInfo
		case *L0ImportTask:
			segmentsInfo = it.segmentsInfo
		}
		if segmentsInfo != nil {
			segment := info.GetSegmentID()
			if _, ok := segmentsInfo[segment]; ok {
				segmentsInfo[segment].ImportedRows = info.GetImportedRows()
				segmentsInfo[segment].Binlogs = mergeFn(segmentsInfo[segment].Binlogs, info.GetBinlogs())
				segmentsInfo[segment].Statslogs = mergeFn(segmentsInfo[segment].Statslogs, info.GetStatslogs())
				segmentsInfo[segment].Deltalogs = mergeFn(segmentsInfo[segment].Deltalogs, info.GetDeltalogs())
				segmentsInfo[segment].Bm25Logs = mergeFn(segmentsInfo[segment].Bm25Logs, info.GetBm25Logs())
				return
			}
			segmentsInfo[segment] = info
		}
	}
}

// UpdateSegmentResult updates segment result for CopySegmentTask
// This includes both binlog information and index metadata
//
// Note: In CopySegmentTask, each source segment maps to a unique target segment (1:1),
// so each segment is only updated once. No merging is needed.
func UpdateSegmentResult(result *datapb.CopySegmentResult) UpdateAction {
	return func(task Task) {
		if it, ok := task.(*CopySegmentTask); ok {
			segment := result.GetSegmentId()
			// Directly replace the segment result since each segment is only updated once
			// The initial empty result was created in NewCopySegmentTask()
			it.segmentResults[segment] = result
		}
	}
}

type Task interface {
	Execute() []*conc.Future[any]
	GetJobID() int64
	GetTaskID() int64
	GetCollectionID() int64
	GetPartitionIDs() []int64
	GetVchannels() []string
	GetType() TaskType
	GetState() datapb.ImportTaskStateV2
	GetReason() string
	GetSchema() *schemapb.CollectionSchema
	GetSlots() int64
	GetBufferSize() int64
	Cancel()
	Clone() Task
}

func WrapLogFields(task Task, fields ...zap.Field) []zap.Field {
	res := []zap.Field{
		zap.Int64("taskID", task.GetTaskID()),
		zap.Int64("jobID", task.GetJobID()),
		zap.Int64("collectionID", task.GetCollectionID()),
		zap.String("type", task.GetType().String()),
	}
	res = append(res, fields...)
	return res
}