mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
issue: #44358 Implement complete snapshot management system including creation, deletion, listing, description, and restoration capabilities across all system components. Key features: - Create snapshots for entire collections - Drop snapshots by name with proper cleanup - List snapshots with collection filtering - Describe snapshot details and metadata Components added/modified: - Client SDK with full snapshot API support and options - DataCoord snapshot service with metadata management - Proxy layer with task-based snapshot operations - Protocol buffer definitions for snapshot RPCs - Comprehensive unit tests with mockey framework - Integration tests for end-to-end validation Technical implementation: - Snapshot metadata storage in etcd with proper indexing - File-based snapshot data persistence in object storage - Garbage collection integration for snapshot cleanup - Error handling and validation across all operations - Thread-safe operations with proper locking mechanisms <!-- This is an auto-generated comment: release notes by coderabbit.ai --> - Core invariant/assumption: snapshots are immutable point‑in‑time captures identified by (collection, snapshot name/ID); etcd snapshot metadata is authoritative for lifecycle (PENDING → COMMITTED → DELETING) and per‑segment manifests live in object storage (Avro / StorageV2). GC and restore logic must see snapshotRefIndex loaded (snapshotMeta.IsRefIndexLoaded) before reclaiming or relying on segment/index files. - New capability added: full end‑to‑end snapshot subsystem — client SDK APIs (Create/Drop/List/Describe/Restore + restore job queries), DataCoord SnapshotWriter/Reader (Avro + StorageV2 manifests), snapshotMeta in meta, SnapshotManager orchestration (create/drop/describe/list/restore), copy‑segment restore tasks/inspector/checker, proxy & RPC surface, GC integration, and docs/tests — enabling point‑in‑time collection snapshots persisted to object storage and restorations orchestrated across components. - Logic removed/simplified and why: duplicated recursive compaction/delta‑log traversal and ad‑hoc lookup code were consolidated behind two focused APIs/owners (Handler.GetDeltaLogFromCompactTo for delta traversal and SnapshotManager/SnapshotReader for snapshot I/O). MixCoord/coordinator broker paths were converted to thin RPC proxies. This eliminates multiple implementations of the same traversal/lookup, reducing divergence and simplifying responsibility boundaries. - Why this does NOT introduce data loss or regressions: snapshot create/drop use explicit two‑phase semantics (PENDING → COMMIT/DELETING) with SnapshotWriter writing manifests and metadata before commit; GC uses snapshotRefIndex guards and IsRefIndexLoaded/GetSnapshotBySegment/GetSnapshotByIndex checks to avoid removing referenced files; restore flow pre‑allocates job IDs, validates resources (partitions/indexes), performs rollback on failure (rollbackRestoreSnapshot), and converts/updates segment/index metadata only after successful copy tasks. Extensive unit and integration tests exercise pending/deleting/GC/restore/error paths to ensure idempotence and protection against premature deletion. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Wei Liu <wei.liu@zilliz.com>
331 lines
13 KiB
Go
331 lines
13 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package proxy
|
|
|
|
import (
|
|
"context"
|
|
"strconv"
|
|
|
|
"go.opentelemetry.io/otel"
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/metrics"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/timerecord"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
func (node *Proxy) CreateSnapshot(ctx context.Context, req *milvuspb.CreateSnapshotRequest) (*commonpb.Status, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-CreateSnapshot")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.String("snapshotName", req.GetName()),
|
|
zap.String("collectionName", req.GetCollectionName()),
|
|
)
|
|
|
|
method := "CreateSnapshot"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, req.GetDbName(), req.GetCollectionName()).Inc()
|
|
t := &createSnapshotTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, req.GetDbName(), req.GetCollectionName()).Inc()
|
|
log.Warn("CreateSnapshot failed to Enqueue",
|
|
zap.Error(err))
|
|
return merr.Status(err), nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, req.GetDbName(), req.GetCollectionName()).Inc()
|
|
log.Warn("CreateSnapshot failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return merr.Status(err), nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, req.GetDbName(), req.GetCollectionName()).Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|
|
|
|
func (node *Proxy) DropSnapshot(ctx context.Context, req *milvuspb.DropSnapshotRequest) (*commonpb.Status, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-DropSnapshot")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.String("snapshotName", req.GetName()),
|
|
)
|
|
|
|
method := "DropSnapshot"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, "", "").Inc()
|
|
t := &dropSnapshotTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, "", "").Inc()
|
|
log.Warn("DropSnapshot failed to Enqueue",
|
|
zap.Error(err))
|
|
return merr.Status(err), nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, "", "").Inc()
|
|
log.Warn("DropSnapshot failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return merr.Status(err), nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, "", "").Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|
|
|
|
func (node *Proxy) DescribeSnapshot(ctx context.Context, req *milvuspb.DescribeSnapshotRequest) (*milvuspb.DescribeSnapshotResponse, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-DescribeSnapshot")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.String("snapshotName", req.GetName()),
|
|
)
|
|
|
|
method := "DescribeSnapshot"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, "", "").Inc()
|
|
t := &describeSnapshotTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, "", "").Inc()
|
|
log.Warn("DescribeSnapshot failed to Enqueue",
|
|
zap.Error(err))
|
|
return &milvuspb.DescribeSnapshotResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, "", "").Inc()
|
|
log.Warn("DescribeSnapshot failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return &milvuspb.DescribeSnapshotResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, "", "").Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|
|
|
|
func (node *Proxy) ListSnapshots(ctx context.Context, req *milvuspb.ListSnapshotsRequest) (*milvuspb.ListSnapshotsResponse, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-ListSnapshots")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.String("collectionName", req.GetCollectionName()))
|
|
|
|
method := "ListSnapshots"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, req.GetCollectionName(), "").Inc()
|
|
t := &listSnapshotsTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, req.GetCollectionName(), "").Inc()
|
|
log.Warn("ListSnapshots failed to Enqueue",
|
|
zap.Error(err))
|
|
return &milvuspb.ListSnapshotsResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, req.GetCollectionName(), "").Inc()
|
|
log.Warn("ListSnapshots failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return &milvuspb.ListSnapshotsResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, req.GetCollectionName(), "").Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|
|
|
|
func (node *Proxy) RestoreSnapshot(ctx context.Context, req *milvuspb.RestoreSnapshotRequest) (*milvuspb.RestoreSnapshotResponse, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-RestoreSnapshot")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.String("snapshotName", req.GetName()),
|
|
)
|
|
|
|
method := "RestoreSnapshot"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, "", "").Inc()
|
|
t := &restoreSnapshotTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, "", "").Inc()
|
|
log.Warn("RestoreSnapshot failed to Enqueue",
|
|
zap.Error(err))
|
|
return &milvuspb.RestoreSnapshotResponse{Status: merr.Status(err)}, nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, "", "").Inc()
|
|
log.Warn("RestoreSnapshot failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return &milvuspb.RestoreSnapshotResponse{Status: merr.Status(err)}, nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, "", "").Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|
|
|
|
func (node *Proxy) GetRestoreSnapshotState(ctx context.Context, req *milvuspb.GetRestoreSnapshotStateRequest) (*milvuspb.GetRestoreSnapshotStateResponse, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-GetRestoreSnapshotState")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.Int64("jobID", req.GetJobId()),
|
|
)
|
|
|
|
method := "GetRestoreSnapshotState"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, "", "").Inc()
|
|
t := &getRestoreSnapshotStateTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, "", "").Inc()
|
|
log.Warn("GetRestoreSnapshotState failed to Enqueue",
|
|
zap.Error(err))
|
|
return &milvuspb.GetRestoreSnapshotStateResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, "", "").Inc()
|
|
log.Warn("GetRestoreSnapshotState failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return &milvuspb.GetRestoreSnapshotStateResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, "", "").Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|
|
|
|
func (node *Proxy) ListRestoreSnapshotJobs(ctx context.Context, req *milvuspb.ListRestoreSnapshotJobsRequest) (*milvuspb.ListRestoreSnapshotJobsResponse, error) {
|
|
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-ListRestoreSnapshotJobs")
|
|
defer sp.End()
|
|
|
|
log := log.Ctx(ctx).With(
|
|
zap.String("collectionName", req.GetCollectionName()),
|
|
)
|
|
|
|
method := "ListRestoreSnapshotJobs"
|
|
tr := timerecord.NewTimeRecorder(method)
|
|
log.Info(rpcReceived(method))
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.TotalLabel, req.GetCollectionName(), "").Inc()
|
|
t := &listRestoreSnapshotJobsTask{
|
|
req: req,
|
|
ctx: ctx,
|
|
Condition: NewTaskCondition(ctx),
|
|
mixCoord: node.mixCoord,
|
|
}
|
|
|
|
err := node.sched.ddQueue.Enqueue(t)
|
|
if err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.AbandonLabel, req.GetCollectionName(), "").Inc()
|
|
log.Warn("ListRestoreSnapshotJobs failed to Enqueue",
|
|
zap.Error(err))
|
|
return &milvuspb.ListRestoreSnapshotJobsResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
if err := t.WaitToFinish(); err != nil {
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.FailLabel, req.GetCollectionName(), "").Inc()
|
|
log.Warn("ListRestoreSnapshotJobs failed to WaitToFinish",
|
|
zap.Error(err))
|
|
return &milvuspb.ListRestoreSnapshotJobsResponse{
|
|
Status: merr.Status(err),
|
|
}, nil
|
|
}
|
|
|
|
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method, metrics.SuccessLabel, req.GetCollectionName(), "").Inc()
|
|
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
|
|
return t.result, nil
|
|
}
|