issue: #44358

Implement a complete snapshot management system, including creation, deletion, listing, description, and restoration capabilities across all system components.

Key features:
- Create snapshots for entire collections
- Drop snapshots by name with proper cleanup
- List snapshots with collection filtering
- Describe snapshot details and metadata

Components added/modified:
- Client SDK with full snapshot API support and options
- DataCoord snapshot service with metadata management
- Proxy layer with task-based snapshot operations
- Protocol buffer definitions for snapshot RPCs
- Comprehensive unit tests with the mockey framework
- Integration tests for end-to-end validation

Technical implementation:
- Snapshot metadata storage in etcd with proper indexing
- File-based snapshot data persistence in object storage
- Garbage collection integration for snapshot cleanup
- Error handling and validation across all operations
- Thread-safe operations with proper locking mechanisms

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
- Core invariant/assumption: snapshots are immutable point-in-time captures identified by (collection, snapshot name/ID); etcd snapshot metadata is authoritative for lifecycle (PENDING → COMMITTED → DELETING), and per-segment manifests live in object storage (Avro / StorageV2). GC and restore logic must see snapshotRefIndex loaded (snapshotMeta.IsRefIndexLoaded) before reclaiming or relying on segment/index files.
- New capability added: a full end-to-end snapshot subsystem (client SDK APIs for Create/Drop/List/Describe/Restore plus restore job queries, DataCoord SnapshotWriter/Reader with Avro and StorageV2 manifests, snapshotMeta in meta, SnapshotManager orchestration of create/drop/describe/list/restore, copy-segment restore tasks/inspector/checker, the proxy and RPC surface, GC integration, and docs/tests), enabling point-in-time collection snapshots persisted to object storage and restorations orchestrated across components.
- Logic removed/simplified and why: duplicated recursive compaction/delta-log traversal and ad-hoc lookup code were consolidated behind two focused APIs/owners (Handler.GetDeltaLogFromCompactTo for delta traversal and SnapshotManager/SnapshotReader for snapshot I/O). MixCoord/coordinator broker paths were converted to thin RPC proxies. This eliminates multiple implementations of the same traversal/lookup, reducing divergence and simplifying responsibility boundaries.
- Why this does NOT introduce data loss or regressions: snapshot create/drop use explicit two-phase semantics (PENDING → COMMIT/DELETING), with SnapshotWriter writing manifests and metadata before commit; GC uses snapshotRefIndex guards and IsRefIndexLoaded/GetSnapshotBySegment/GetSnapshotByIndex checks to avoid removing referenced files; the restore flow pre-allocates job IDs, validates resources (partitions/indexes), performs rollback on failure (rollbackRestoreSnapshot), and converts/updates segment/index metadata only after successful copy tasks. Extensive unit and integration tests exercise pending/deleting/GC/restore/error paths to ensure idempotence and protect against premature deletion.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
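The two-phase lifecycle and GC guard described above can be summarized in a minimal sketch. This is illustrative only; `SnapshotState` and `canReclaim` are assumed names, not the actual DataCoord API:

```go
package snapshot

// Illustrative sketch; names are assumptions, not the actual DataCoord API.
type SnapshotState int

const (
	SnapshotPending   SnapshotState = iota // manifests and metadata still being written
	SnapshotCommitted                      // immutable point-in-time capture, visible to users
	SnapshotDeleting                       // drop requested; cleanup in progress
)

// canReclaim captures the GC guard described above: files referenced by a
// snapshot may be reclaimed only after the snapshot ref index is loaded
// (IsRefIndexLoaded) and the snapshot has entered the DELETING state.
func canReclaim(state SnapshotState, refIndexLoaded bool) bool {
	return refIndexLoaded && state == SnapshotDeleting
}
```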
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importv2

import (
	"github.com/samber/lo"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/util/conc"
)

// TaskType identifies the kind of import-related task handled by this package.
type TaskType int

const (
	PreImportTaskType   TaskType = 0
	ImportTaskType      TaskType = 1
	L0PreImportTaskType TaskType = 2
	L0ImportTaskType    TaskType = 3
	CopySegmentTaskType TaskType = 4
)

// ImportTaskTypeName maps each TaskType to its human-readable name.
var ImportTaskTypeName = map[TaskType]string{
	PreImportTaskType:   "PreImportTask",
	ImportTaskType:      "ImportTask",
	L0PreImportTaskType: "L0PreImportTask",
	L0ImportTaskType:    "L0ImportTask",
	CopySegmentTaskType: "CopySegmentTask",
}

func (t TaskType) String() string {
	return ImportTaskTypeName[t]
}

// TaskFilter selects tasks matching a predicate.
type TaskFilter func(task Task) bool

// WithStates matches tasks whose state is any of the given states.
func WithStates(states ...datapb.ImportTaskStateV2) TaskFilter {
	return func(task Task) bool {
		for _, state := range states {
			if task.GetState() == state {
				return true
			}
		}
		return false
	}
}

// WithType matches tasks of the given type.
func WithType(taskType TaskType) TaskFilter {
	return func(task Task) bool {
		return task.GetType() == taskType
	}
}
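
// Usage sketch (illustrative; GetBy and the manager that owns the tasks are
// assumptions, not part of this file): filters are typically ANDed together
// when scanning a task table, e.g.
//
//	tasks := manager.GetBy(
//		WithType(ImportTaskType),
//		WithStates(datapb.ImportTaskStateV2_Pending, datapb.ImportTaskStateV2_InProgress),
//	)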

// UpdateAction mutates a task in place; actions are applied by the task
// manager that owns the task.
type UpdateAction func(task Task)

// UpdateState returns an action that sets the task's state, dispatching on
// the concrete task type since each type stores its state differently.
func UpdateState(state datapb.ImportTaskStateV2) UpdateAction {
	return func(t Task) {
		switch t.GetType() {
		case PreImportTaskType:
			t.(*PreImportTask).PreImportTask.State = state
		case ImportTaskType:
			t.(*ImportTask).ImportTaskV2.State = state
		case L0PreImportTaskType:
			t.(*L0PreImportTask).PreImportTask.State = state
		case L0ImportTaskType:
			t.(*L0ImportTask).ImportTaskV2.State = state
		case CopySegmentTaskType:
			t.(*CopySegmentTask).state = state
		}
	}
}

// UpdateReason returns an action that records why a task entered its current
// state (typically a failure message).
func UpdateReason(reason string) UpdateAction {
	return func(t Task) {
		switch t.GetType() {
		case PreImportTaskType:
			t.(*PreImportTask).PreImportTask.Reason = reason
		case ImportTaskType:
			t.(*ImportTask).ImportTaskV2.Reason = reason
		case L0PreImportTaskType:
			t.(*L0PreImportTask).PreImportTask.Reason = reason
		case L0ImportTaskType:
			t.(*L0ImportTask).ImportTaskV2.Reason = reason
		case CopySegmentTaskType:
			t.(*CopySegmentTask).reason = reason
		}
	}
}
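
// Usage sketch (illustrative; Update on a task manager is an assumption, not
// part of this file): actions compose, so a failure can set state and reason
// in one guarded update, e.g.
//
//	manager.Update(task.GetTaskID(),
//		UpdateState(datapb.ImportTaskStateV2_Failed),
//		UpdateReason(err.Error()),
//	)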

// UpdateFileStat returns an action that overwrites the idx-th file statistic
// of a pre-import task with the freshly computed stats.
func UpdateFileStat(idx int, fileStat *datapb.ImportFileStats) UpdateAction {
	return func(task Task) {
		var t *datapb.PreImportTask
		switch it := task.(type) {
		case *PreImportTask:
			t = it.PreImportTask
		case *L0PreImportTask:
			t = it.PreImportTask
		}
		if t != nil {
			t.FileStats[idx].FileSize = fileStat.GetFileSize()
			t.FileStats[idx].TotalRows = fileStat.GetTotalRows()
			t.FileStats[idx].TotalMemorySize = fileStat.GetTotalMemorySize()
			t.FileStats[idx].HashedStats = fileStat.GetHashedStats()
		}
	}
}

// UpdateSegmentInfo returns an action that merges newly reported segment info
// into an import task. Binlogs for the same field are appended rather than
// replaced, since info for the same segment may be reported more than once.
func UpdateSegmentInfo(info *datapb.ImportSegmentInfo) UpdateAction {
	// mergeFn appends new binlogs to the existing entry for the same field,
	// and adds entries for fields seen for the first time.
	mergeFn := func(current []*datapb.FieldBinlog, new []*datapb.FieldBinlog) []*datapb.FieldBinlog {
		for _, binlog := range new {
			fieldBinlogs, ok := lo.Find(current, func(log *datapb.FieldBinlog) bool {
				return log.GetFieldID() == binlog.GetFieldID()
			})
			if !ok || fieldBinlogs == nil {
				current = append(current, binlog)
			} else {
				fieldBinlogs.Binlogs = append(fieldBinlogs.Binlogs, binlog.Binlogs...)
			}
		}
		return current
	}
	return func(task Task) {
		var segmentsInfo map[int64]*datapb.ImportSegmentInfo
		switch it := task.(type) {
		case *ImportTask:
			segmentsInfo = it.segmentsInfo
		case *L0ImportTask:
			segmentsInfo = it.segmentsInfo
		}
		if segmentsInfo != nil {
			segment := info.GetSegmentID()
			if _, ok := segmentsInfo[segment]; ok {
				segmentsInfo[segment].ImportedRows = info.GetImportedRows()
				segmentsInfo[segment].Binlogs = mergeFn(segmentsInfo[segment].Binlogs, info.GetBinlogs())
				segmentsInfo[segment].Statslogs = mergeFn(segmentsInfo[segment].Statslogs, info.GetStatslogs())
				segmentsInfo[segment].Deltalogs = mergeFn(segmentsInfo[segment].Deltalogs, info.GetDeltalogs())
				segmentsInfo[segment].Bm25Logs = mergeFn(segmentsInfo[segment].Bm25Logs, info.GetBm25Logs())
				return
			}
			segmentsInfo[segment] = info
		}
	}
}
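
// Merge example (derived from mergeFn above): if the task currently holds
// binlogs {fieldID 100: [a.log]} and the new info reports
// {fieldID 100: [b.log], fieldID 101: [c.log]}, the merged result is
// {fieldID 100: [a.log, b.log], fieldID 101: [c.log]}. ImportedRows is
// simply overwritten with the latest reported value.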

// UpdateSegmentResult updates the segment result for a CopySegmentTask.
// This includes both binlog information and index metadata.
//
// Note: in a CopySegmentTask, each source segment maps to a unique target
// segment (1:1), so each segment is only updated once and no merging is
// needed.
func UpdateSegmentResult(result *datapb.CopySegmentResult) UpdateAction {
	return func(task Task) {
		if it, ok := task.(*CopySegmentTask); ok {
			segment := result.GetSegmentId()
			// Directly replace the segment result, since each segment is only
			// updated once; the initial empty result was created in
			// NewCopySegmentTask().
			it.segmentResults[segment] = result
		}
	}
}

// Task is the common interface implemented by every task type in this
// package (pre-import, import, their L0 variants, and copy-segment).
type Task interface {
	Execute() []*conc.Future[any]
	GetJobID() int64
	GetTaskID() int64
	GetCollectionID() int64
	GetPartitionIDs() []int64
	GetVchannels() []string
	GetType() TaskType
	GetState() datapb.ImportTaskStateV2
	GetReason() string
	GetSchema() *schemapb.CollectionSchema
	GetSlots() int64
	GetBufferSize() int64
	Cancel()
	Clone() Task
}

// WrapLogFields prepends the task's identifying fields (taskID, jobID,
// collectionID, type) to the given zap fields for consistent logging.
func WrapLogFields(task Task, fields ...zap.Field) []zap.Field {
	res := []zap.Field{
		zap.Int64("taskID", task.GetTaskID()),
		zap.Int64("jobID", task.GetJobID()),
		zap.Int64("collectionID", task.GetCollectionID()),
		zap.String("type", task.GetType().String()),
	}
	res = append(res, fields...)
	return res
}
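
// Usage sketch (illustrative; log is assumed to be a zap-based logger in the
// caller's scope):
//
//	log.Info("import task state updated",
//		WrapLogFields(task, zap.String("state", task.GetState().String()))...)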