From 512884524b361f90a3200ba52ad47d5d7476e234 Mon Sep 17 00:00:00 2001 From: "yihao.dai" Date: Fri, 26 Dec 2025 18:59:20 +0800 Subject: [PATCH] enhance: Maintain compatibility with the legacy FlushAll (#46564) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue: https://github.com/milvus-io/milvus/issues/45919 - Core invariant: FlushAll verification must accept both per-channel FlushAllTss (new schema) and the legacy single FlushAllTs; GetFlushAllState chooses the verification path based on which field is present and treats a channel as flushed only if its channel checkpoint timestamp >= the applicable threshold (per-channel timestamp or legacy FlushAllTs). - Logic removed/simplified: The previous mixed/ambiguous checks were split into two focused routines—verifyFlushAllStateByChannelFlushAllTs(logger, channel, flushAllTss) and verifyFlushAllStateByLegacyFlushAllTs(logger, channel, flushAllTs)—and GetFlushAllState now selects one path. This centralizes compatibility logic, eliminates interleaved/duplicated checks, and retains the outer-loop short-circuiting on the first unflushed channel. - Why this does NOT cause data loss or regressions: Changes only affect read-only verification paths (GetFlushAllState/GetFlushState) that compare in-memory channel checkpoints (meta.GetChannelCheckpoint) to provided thresholds; no writes to checkpoints or persisted state occur and FlushAll enqueue/wait behavior is unchanged. Unit tests were added to cover legacy FlushAllTs behavior and the new FlushAllMsgs→FlushAllTs extraction, exercising both code paths. - Enhancement scope and location: Adds backward-compatible support and concrete FlushAllTs extraction from streaming FlushAllMsgs in Proxy (internal/proxy/task_flush_all_streaming.go) and compatibility verifiers in DataCoord (internal/datacoord/services.go), plus corresponding tests (internal/datacoord/services_test.go, internal/proxy/*_test.go). --------- Signed-off-by: bigsheeper --- internal/datacoord/services.go | 67 ++++++++++++---- internal/datacoord/services_test.go | 76 +++++++++++++++++++ internal/proxy/impl.go | 13 ++-- internal/proxy/impl_test.go | 19 ++++- internal/proxy/task_flush_all_streaming.go | 10 +++ .../proxy/task_flush_all_streaming_test.go | 22 ++++++ 6 files changed, 183 insertions(+), 24 deletions(-) diff --git a/internal/datacoord/services.go b/internal/datacoord/services.go index 436e601f62..8a45aba160 100644 --- a/internal/datacoord/services.go +++ b/internal/datacoord/services.go @@ -1550,35 +1550,70 @@ OUTER: return resp, nil } for _, channel := range describeColRsp.GetVirtualChannelNames() { - channelCP := s.meta.GetChannelCheckpoint(channel) - pchannel := funcutil.ToPhysicalChannel(channel) - flushAllTs, ok := req.GetFlushAllTss()[pchannel] - if !ok || flushAllTs == 0 { - log.Warn("FlushAllTs not found for pchannel", zap.String("pchannel", pchannel), zap.Uint64("flushAllTs", flushAllTs)) - resp.Status = merr.Status(merr.WrapErrParameterInvalidMsg("FlushAllTs not found for pchannel %s", pchannel)) + if len(req.GetFlushAllTss()) > 0 { + ok, err := s.verifyFlushAllStateByChannelFlushAllTs(log, channel, req.GetFlushAllTss()) + if err != nil { + resp.Status = merr.Status(err) + return resp, nil + } + if !ok { + allFlushed = false + break OUTER + } + } else if req.GetFlushAllTs() != 0 { + // For compatibility, if deprecated FlushAllTs is provided, use it to verify the flush state. + if !s.verifyFlushAllStateByLegacyFlushAllTs(log, channel, req.GetFlushAllTs()) { + allFlushed = false + break OUTER + } + } else { + resp.Status = merr.Status(merr.WrapErrParameterInvalidMsg("FlushAllTss or FlushAllTs is required")) return resp, nil } - if channelCP == nil || channelCP.GetTimestamp() < flushAllTs { - allFlushed = false - log.RatedInfo(10, "channel unflushed", - zap.String("vchannel", channel), - zap.Uint64("flushAllTs", flushAllTs), - zap.Uint64("channelCP", channelCP.GetTimestamp()), - ) - break OUTER - } } } } if allFlushed { - log.Info("GetFlushAllState all flushed", zap.Any("flushAllTss", req.GetFlushAllTss())) + log.Info("GetFlushAllState all flushed", zap.Any("flushAllTss", req.GetFlushAllTss()), zap.Uint64("FlushAllTs", req.GetFlushAllTs())) } resp.Flushed = allFlushed return resp, nil } +func (s *Server) verifyFlushAllStateByChannelFlushAllTs(logger *log.MLogger, channel string, flushAllTss map[string]uint64) (bool, error) { + channelCP := s.meta.GetChannelCheckpoint(channel) + pchannel := funcutil.ToPhysicalChannel(channel) + flushAllTs, ok := flushAllTss[pchannel] + if !ok || flushAllTs == 0 { + logger.Warn("FlushAllTs not found for pchannel", zap.String("pchannel", pchannel), zap.Uint64("flushAllTs", flushAllTs)) + return false, merr.WrapErrParameterInvalidMsg("FlushAllTs not found for pchannel %s", pchannel) + } + if channelCP == nil || channelCP.GetTimestamp() < flushAllTs { + logger.RatedInfo(10, "channel unflushed", + zap.String("vchannel", channel), + zap.Uint64("flushAllTs", flushAllTs), + zap.Uint64("channelCP", channelCP.GetTimestamp()), + ) + return false, nil + } + return true, nil +} + +func (s *Server) verifyFlushAllStateByLegacyFlushAllTs(logger *log.MLogger, channel string, flushAllTs uint64) bool { + channelCP := s.meta.GetChannelCheckpoint(channel) + if channelCP == nil || channelCP.GetTimestamp() < flushAllTs { + logger.RatedInfo(10, "channel unflushed", + zap.String("vchannel", channel), + zap.Uint64("flushAllTs", flushAllTs), + zap.Uint64("channelCP", channelCP.GetTimestamp()), + ) + return false + } + return true +} + // Deprecated // UpdateSegmentStatistics updates a segment's stats. func (s *Server) UpdateSegmentStatistics(ctx context.Context, req *datapb.UpdateSegmentStatisticsRequest) (*commonpb.Status, error) { diff --git a/internal/datacoord/services_test.go b/internal/datacoord/services_test.go index eeb28f476a..641776638c 100644 --- a/internal/datacoord/services_test.go +++ b/internal/datacoord/services_test.go @@ -2334,6 +2334,82 @@ func TestServer_GetFlushAllState(t *testing.T) { assert.NoError(t, merr.CheckRPCCall(resp, err)) assert.False(t, resp.GetFlushed()) }) + + t.Run("test legacy FlushAllTs provided and flushed", func(t *testing.T) { + server := createTestGetFlushAllStateServer() + + // Mock ListDatabases + mockListDatabases := mockey.Mock(mockey.GetMethod(server.broker, "ListDatabases")).Return(&milvuspb.ListDatabasesResponse{ + Status: merr.Success(), + DbNames: []string{"test-db"}, + }, nil).Build() + defer mockListDatabases.UnPatch() + + // Mock ShowCollections + mockShowCollections := mockey.Mock(mockey.GetMethod(server.broker, "ShowCollections")).Return(&milvuspb.ShowCollectionsResponse{ + Status: merr.Success(), + CollectionIds: []int64{100}, + CollectionNames: []string{"collection1"}, + }, nil).Build() + defer mockShowCollections.UnPatch() + + // Mock DescribeCollectionInternal + mockDescribeCollection := mockey.Mock(mockey.GetMethod(server.broker, "DescribeCollectionInternal")).Return(&milvuspb.DescribeCollectionResponse{ + Status: merr.Success(), + VirtualChannelNames: []string{"channel1"}, + }, nil).Build() + defer mockDescribeCollection.UnPatch() + + // Setup channel checkpoint with timestamp >= deprecated FlushAllTs + server.meta.channelCPs.checkpoints["channel1"] = &msgpb.MsgPosition{Timestamp: 15000} + + req := &milvuspb.GetFlushAllStateRequest{ + FlushAllTs: 15000, // deprecated field + } + + resp, err := server.GetFlushAllState(context.Background(), req) + + assert.NoError(t, merr.CheckRPCCall(resp, err)) + assert.True(t, resp.GetFlushed()) + }) + + t.Run("test legacy FlushAllTs provided and not flushed", func(t *testing.T) { + server := createTestGetFlushAllStateServer() + + // Mock ListDatabases + mockListDatabases := mockey.Mock(mockey.GetMethod(server.broker, "ListDatabases")).Return(&milvuspb.ListDatabasesResponse{ + Status: merr.Success(), + DbNames: []string{"test-db"}, + }, nil).Build() + defer mockListDatabases.UnPatch() + + // Mock ShowCollections + mockShowCollections := mockey.Mock(mockey.GetMethod(server.broker, "ShowCollections")).Return(&milvuspb.ShowCollectionsResponse{ + Status: merr.Success(), + CollectionIds: []int64{100}, + CollectionNames: []string{"collection1"}, + }, nil).Build() + defer mockShowCollections.UnPatch() + + // Mock DescribeCollectionInternal + mockDescribeCollection := mockey.Mock(mockey.GetMethod(server.broker, "DescribeCollectionInternal")).Return(&milvuspb.DescribeCollectionResponse{ + Status: merr.Success(), + VirtualChannelNames: []string{"channel1"}, + }, nil).Build() + defer mockDescribeCollection.UnPatch() + + // Setup channel checkpoint with timestamp < deprecated FlushAllTs + server.meta.channelCPs.checkpoints["channel1"] = &msgpb.MsgPosition{Timestamp: 10000} + + req := &milvuspb.GetFlushAllStateRequest{ + FlushAllTs: 15000, // deprecated field + } + + resp, err := server.GetFlushAllState(context.Background(), req) + + assert.NoError(t, merr.CheckRPCCall(resp, err)) + assert.False(t, resp.GetFlushed()) + }) } func getWatchKV(t *testing.T) kv.WatchKV { diff --git a/internal/proxy/impl.go b/internal/proxy/impl.go index 1b09f377c4..2d138ce2ce 100644 --- a/internal/proxy/impl.go +++ b/internal/proxy/impl.go @@ -4183,16 +4183,12 @@ func (node *Proxy) FlushAll(ctx context.Context, request *milvuspb.FlushAllReque return resp, nil } - logger.Debug(rpcEnqueued(method), - zap.Uint64("BeginTs", ft.BeginTs()), - zap.Uint64("EndTs", ft.EndTs())) + logger.Debug(rpcEnqueued(method)) if err := ft.WaitToFinish(); err != nil { logger.Warn( rpcFailedToWaitToFinish(method), - zap.Error(err), - zap.Uint64("BeginTs", ft.BeginTs()), - zap.Uint64("EndTs", ft.EndTs())) + zap.Error(err)) resp.Status = merr.Status(err) return resp, nil @@ -4868,7 +4864,10 @@ func (node *Proxy) GetFlushState(ctx context.Context, req *milvuspb.GetFlushStat func (node *Proxy) GetFlushAllState(ctx context.Context, req *milvuspb.GetFlushAllStateRequest) (*milvuspb.GetFlushAllStateResponse, error) { ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-GetFlushAllState") defer sp.End() - log := log.Ctx(ctx).With(zap.Any("FlushAllTss", req.GetFlushAllTss())) + log := log.Ctx(ctx).With( + zap.Any("FlushAllTss", req.GetFlushAllTss()), + zap.Uint64("FlushAllTs", req.GetFlushAllTs()), // for compatibility + ) log.Debug("receive GetFlushAllState request") var err error diff --git a/internal/proxy/impl_test.go b/internal/proxy/impl_test.go index d53a63c9a6..0b9dca19a5 100644 --- a/internal/proxy/impl_test.go +++ b/internal/proxy/impl_test.go @@ -24,6 +24,7 @@ import ( "net/http/httptest" "testing" + "github.com/apache/pulsar-client-go/pulsar" "github.com/bytedance/mockey" "github.com/cockroachdb/errors" "github.com/gin-gonic/gin" @@ -49,6 +50,8 @@ import ( "github.com/milvus-io/milvus/pkg/v2/proto/proxypb" "github.com/milvus-io/milvus/pkg/v2/proto/querypb" "github.com/milvus-io/milvus/pkg/v2/proto/rootcoordpb" + "github.com/milvus-io/milvus/pkg/v2/streaming/util/message" + pulsar2 "github.com/milvus-io/milvus/pkg/v2/streaming/walimpls/impls/pulsar" "github.com/milvus-io/milvus/pkg/v2/util/merr" "github.com/milvus-io/milvus/pkg/v2/util/paramtable" "github.com/milvus-io/milvus/pkg/v2/util/ratelimitutil" @@ -494,10 +497,24 @@ func TestProxy_FlushAll_Success(t *testing.T) { node := createTestProxy() defer node.sched.Close() + messageID := pulsar2.NewPulsarID(pulsar.EarliestMessageID()) + msg := message.NewFlushAllMessageBuilderV2(). + WithVChannel("test-vchannel"). + WithHeader(&message.FlushAllMessageHeader{}). + WithBody(&message.FlushAllMessageBody{}). + MustBuildMutable().WithTimeTick(1000). + WithLastConfirmed(messageID) + milvusMsg := message.ImmutableMessageToMilvusMessage(commonpb.WALName_Pulsar.String(), msg.IntoImmutableMessage(messageID)) + mixcoord := &grpcmixcoordclient.Client{} node.mixCoord = mixcoord mockey.Mock((*grpcmixcoordclient.Client).FlushAll).To(func(ctx context.Context, req *datapb.FlushAllRequest, opts ...grpc.CallOption) (*datapb.FlushAllResponse, error) { - return &datapb.FlushAllResponse{Status: successStatus}, nil + return &datapb.FlushAllResponse{ + Status: successStatus, + FlushAllMsgs: map[string]*commonpb.ImmutableMessage{ + "channel1": milvusMsg, + }, + }, nil }).Build() resp, err := node.FlushAll(context.Background(), &milvuspb.FlushAllRequest{}) diff --git a/internal/proxy/task_flush_all_streaming.go b/internal/proxy/task_flush_all_streaming.go index c750f37a36..ff220e0946 100644 --- a/internal/proxy/task_flush_all_streaming.go +++ b/internal/proxy/task_flush_all_streaming.go @@ -20,9 +20,12 @@ import ( "context" "fmt" + "github.com/samber/lo" + "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus/pkg/v2/proto/datapb" + "github.com/milvus-io/milvus/pkg/v2/streaming/util/message" "github.com/milvus-io/milvus/pkg/v2/util/commonpbutil" "github.com/milvus-io/milvus/pkg/v2/util/merr" ) @@ -39,5 +42,12 @@ func (t *flushAllTask) Execute(ctx context.Context) error { FlushAllMsgs: resp.GetFlushAllMsgs(), ClusterInfo: resp.GetClusterInfo(), } + + // Assign the flush all ts to the result for compatibility. + // Use the max time tick of the flush all messages as the flush all ts. + t.result.FlushAllTs = lo.MaxBy(message.MilvusMessagesToImmutableMessages(lo.Values(resp.GetFlushAllMsgs())), func(a, b message.ImmutableMessage) bool { + return a.TimeTick() > b.TimeTick() + }).TimeTick() + return nil } diff --git a/internal/proxy/task_flush_all_streaming_test.go b/internal/proxy/task_flush_all_streaming_test.go index 8e02b4285d..c1fc080749 100644 --- a/internal/proxy/task_flush_all_streaming_test.go +++ b/internal/proxy/task_flush_all_streaming_test.go @@ -20,12 +20,16 @@ import ( "context" "testing" + "github.com/apache/pulsar-client-go/pulsar" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb" + "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus/internal/mocks" "github.com/milvus-io/milvus/pkg/v2/proto/datapb" + "github.com/milvus-io/milvus/pkg/v2/streaming/util/message" + pulsar2 "github.com/milvus-io/milvus/pkg/v2/streaming/walimpls/impls/pulsar" "github.com/milvus-io/milvus/pkg/v2/util/merr" ) @@ -39,13 +43,31 @@ func TestFlushAllTask_Success(t *testing.T) { mixCoord: mixCoord, } + messageID := pulsar2.NewPulsarID(pulsar.EarliestMessageID()) + msg := message.NewFlushAllMessageBuilderV2(). + WithVChannel("test-vchannel"). + WithHeader(&message.FlushAllMessageHeader{}). + WithBody(&message.FlushAllMessageBody{}). + MustBuildMutable().WithTimeTick(1000). + WithLastConfirmed(messageID) + milvusMsg := message.ImmutableMessageToMilvusMessage(commonpb.WALName_Pulsar.String(), msg.IntoImmutableMessage(messageID)) + mixCoord.EXPECT().FlushAll(mock.Anything, mock.Anything).Return(&datapb.FlushAllResponse{ Status: merr.Success(), + FlushAllMsgs: map[string]*commonpb.ImmutableMessage{ + "channel1": milvusMsg, + }, + ClusterInfo: &milvuspb.ClusterInfo{ + ClusterId: "cluster1", + }, }, nil) err := task.Execute(ctx) assert.NoError(t, err) assert.NotNil(t, task.result) + assert.True(t, len(task.result.FlushAllMsgs) > 0) + assert.True(t, task.result.FlushAllTs > 0) + assert.NotNil(t, task.result.ClusterInfo) } func TestFlushAllTask_Failed(t *testing.T) {