milvus/internal/querycoordv2/dist/dist_handler_test.go
wei liu 47949fd883
enhance: Implement rewatch mechanism for etcd failure scenarios (#43829) (#43920)
issue: #43828
pr: #43829 #43909
Implement a robust rewatch mechanism to handle etcd connection failures
and node reconnection scenarios in DataCoord and QueryCoord, along with
heartbeat lag monitoring.

Changes include:
- Implement rewatchDataNodes/rewatchQueryNodes callbacks for etcd
reconnection scenarios
- Add idempotent rewatchNodes method to handle etcd session recovery
gracefully
- Add QueryCoordLastHeartbeatTimeStamp metric for monitoring node
heartbeat lag
- Clean up heartbeat metrics when nodes go down to prevent metric leaks


Signed-off-by: Wei Liu <wei.liu@zilliz.com>
Co-authored-by: Zhen Ye <chyezh@outlook.com>
2025-10-15 14:12:01 +08:00

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dist

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/bytedance/mockey"
	"github.com/cockroachdb/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/mock"
	"github.com/stretchr/testify/suite"

	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
	"github.com/milvus-io/milvus/internal/querycoordv2/session"
	"github.com/milvus-io/milvus/internal/querycoordv2/task"
	"github.com/milvus-io/milvus/pkg/v2/metrics"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"
	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)

type DistHandlerSuite struct {
	suite.Suite
	ctx              context.Context
	meta             *meta.Meta
	broker           *meta.MockBroker
	nodeID           int64
	client           *session.MockCluster
	nodeManager      *session.NodeManager
	scheduler        *task.MockScheduler
	dispatchMockCall *mock.Call
	executedFlagChan chan struct{}
	dist             *meta.DistributionManager
	target           *meta.MockTargetManager
	handler          *distHandler
}

func (suite *DistHandlerSuite) SetupSuite() {
	paramtable.Init()
	suite.nodeID = 1
	suite.client = session.NewMockCluster(suite.T())
	suite.nodeManager = session.NewNodeManager()
	suite.scheduler = task.NewMockScheduler(suite.T())
	suite.dist = meta.NewDistributionManager()
	suite.target = meta.NewMockTargetManager(suite.T())
	suite.ctx = context.Background()
	suite.executedFlagChan = make(chan struct{}, 1)
	suite.scheduler.EXPECT().GetExecutedFlag(mock.Anything).Return(suite.executedFlagChan).Maybe()
	suite.target.EXPECT().GetSealedSegment(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Maybe()
	suite.target.EXPECT().GetDmChannel(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil).Maybe()
	suite.target.EXPECT().GetCollectionTargetVersion(mock.Anything, mock.Anything, mock.Anything).Return(1011).Maybe()
}
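
// TestBasic runs the handler against a healthy mock cluster: GetDataDistribution
// returns one channel, one segment and one leader view, and the test then sleeps
// long enough for the background distribution pull loop to consume that response.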
func (suite *DistHandlerSuite) TestBasic() {
	if suite.dispatchMockCall != nil {
		suite.dispatchMockCall.Unset()
		suite.dispatchMockCall = nil
	}
	suite.target.EXPECT().GetSealedSegmentsByChannel(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(map[int64]*datapb.SegmentInfo{})
	suite.dispatchMockCall = suite.scheduler.EXPECT().Dispatch(mock.Anything).Maybe()
	suite.nodeManager.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
		NodeID:   1,
		Address:  "localhost",
		Hostname: "localhost",
	}))
	suite.client.EXPECT().GetDataDistribution(mock.Anything, mock.Anything, mock.Anything).Return(&querypb.GetDataDistributionResponse{
		Status: merr.Success(),
		NodeID: 1,
		Channels: []*querypb.ChannelVersionInfo{
			{
				Channel:    "test-channel-1",
				Collection: 1,
				Version:    1,
			},
		},
		Segments: []*querypb.SegmentVersionInfo{
			{
				ID:         1,
				Collection: 1,
				Partition:  1,
				Channel:    "test-channel-1",
				Version:    1,
			},
		},
		LeaderViews: []*querypb.LeaderView{
			{
				Collection:    1,
				Channel:       "test-channel-1",
				TargetVersion: 1011,
			},
		},
		LastModifyTs: 1,
	}, nil)
	syncTargetVersionFn := func(collectionID int64) {}
	suite.handler = newDistHandler(suite.ctx, suite.nodeID, suite.client, suite.nodeManager, suite.scheduler, suite.dist, suite.target, syncTargetVersionFn)
	defer suite.handler.stop()
	time.Sleep(3 * time.Second)
}
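
// TestGetDistributionFailed covers the error path: GetDataDistribution always fails,
// and the handler is expected to keep retrying in the background without panicking
// while the test waits.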
func (suite *DistHandlerSuite) TestGetDistributionFailed() {
	if suite.dispatchMockCall != nil {
		suite.dispatchMockCall.Unset()
		suite.dispatchMockCall = nil
	}
	suite.target.EXPECT().GetSealedSegmentsByChannel(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(map[int64]*datapb.SegmentInfo{}).Maybe()
	suite.dispatchMockCall = suite.scheduler.EXPECT().Dispatch(mock.Anything).Maybe()
	suite.nodeManager.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
		NodeID:   1,
		Address:  "localhost",
		Hostname: "localhost",
	}))
	suite.client.EXPECT().GetDataDistribution(mock.Anything, mock.Anything, mock.Anything).Return(nil, errors.New("fake error"))
	syncTargetVersionFn := func(collectionID int64) {}
	suite.handler = newDistHandler(suite.ctx, suite.nodeID, suite.client, suite.nodeManager, suite.scheduler, suite.dist, suite.target, syncTargetVersionFn)
	defer suite.handler.stop()
	time.Sleep(3 * time.Second)
}
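
// TestForcePullDist signals executedFlagChan (the channel returned by the scheduler's
// GetExecutedFlag) before starting the handler, which should trigger an immediate,
// forced distribution pull; the much shorter 300ms wait relies on that fast path.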
func (suite *DistHandlerSuite) TestForcePullDist() {
	if suite.dispatchMockCall != nil {
		suite.dispatchMockCall.Unset()
		suite.dispatchMockCall = nil
	}
	suite.target.EXPECT().GetSealedSegmentsByChannel(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(map[int64]*datapb.SegmentInfo{}).Maybe()
	suite.nodeManager.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
		NodeID:   1,
		Address:  "localhost",
		Hostname: "localhost",
	}))
	suite.client.EXPECT().GetDataDistribution(mock.Anything, mock.Anything, mock.Anything).Return(&querypb.GetDataDistributionResponse{
		Status: merr.Success(),
		NodeID: 1,
		Channels: []*querypb.ChannelVersionInfo{
			{
				Channel:    "test-channel-1",
				Collection: 1,
				Version:    1,
			},
		},
		Segments: []*querypb.SegmentVersionInfo{
			{
				ID:         1,
				Collection: 1,
				Partition:  1,
				Channel:    "test-channel-1",
				Version:    1,
			},
		},
		LeaderViews: []*querypb.LeaderView{
			{
				Collection: 1,
				Channel:    "test-channel-1",
			},
		},
		LastModifyTs: 1,
	}, nil)
	suite.executedFlagChan <- struct{}{}
	syncTargetVersionFn := func(collectionID int64) {}
	suite.handler = newDistHandler(suite.ctx, suite.nodeID, suite.client, suite.nodeManager, suite.scheduler, suite.dist, suite.target, syncTargetVersionFn)
	defer suite.handler.stop()
	time.Sleep(300 * time.Millisecond)
}

// TestHeartbeatMetricsRecording tests that heartbeat metrics are properly recorded
func TestHeartbeatMetricsRecording(t *testing.T) {
	// Arrange: Create test response with a unique nodeID to avoid test interference
	nodeID := time.Now().UnixNano() % 1000000 // Use timestamp-based unique ID
	resp := &querypb.GetDataDistributionResponse{
		Status:       merr.Success(),
		NodeID:       nodeID,
		LastModifyTs: 1,
	}

	// Create mock node
	nodeManager := session.NewNodeManager()
	nodeInfo := session.NewNodeInfo(session.ImmutableNodeInfo{
		NodeID:   nodeID,
		Address:  "localhost:19530",
		Hostname: "localhost",
	})
	nodeManager.Add(nodeInfo)

	// Mock time.Now() to get predictable timestamp
	expectedTimestamp := time.Unix(1640995200, 0) // 2022-01-01 00:00:00 UTC
	mockTimeNow := mockey.Mock(time.Now).Return(expectedTimestamp).Build()
	defer mockTimeNow.UnPatch()

	// Record the initial state of the metric for our specific nodeID
	initialMetricValue := getMetricValueForNode(fmt.Sprint(nodeID))

	// Create dist handler
	ctx := context.Background()
	handler := &distHandler{
		nodeID:      nodeID,
		nodeManager: nodeManager,
		dist:        meta.NewDistributionManager(),
		target:      meta.NewTargetManager(nil, nil),
		scheduler:   task.NewScheduler(ctx, nil, nil, nil, nil, nil, nil),
	}

	// Act: Handle distribution response
	handler.handleDistResp(ctx, resp, false)

	// Assert: Verify our specific metric was recorded with the expected value
	finalMetricValue := getMetricValueForNode(fmt.Sprint(nodeID))

	// Check that the metric value changed and matches our expected timestamp
	assert.NotEqual(t, initialMetricValue, finalMetricValue, "Metric value should have changed")
	assert.Equal(t, float64(expectedTimestamp.UnixNano()), finalMetricValue, "Metric should record the expected timestamp")

	// Clean up: Remove the test metric to avoid affecting other tests
	metrics.QueryCoordLastHeartbeatTimeStamp.DeleteLabelValues(fmt.Sprint(nodeID))
}

// Helper function to get the current metric value for a specific nodeID
func getMetricValueForNode(nodeID string) float64 {
	// Create a temporary registry to capture the current state
	registry := prometheus.NewRegistry()
	registry.MustRegister(metrics.QueryCoordLastHeartbeatTimeStamp)
	metricFamilies, err := registry.Gather()
	if err != nil {
		return -1 // Return -1 if we can't gather metrics
	}
	for _, mf := range metricFamilies {
		if mf.GetName() == "milvus_querycoord_last_heartbeat_timestamp" {
			for _, metric := range mf.GetMetric() {
				for _, label := range metric.GetLabel() {
					if label.GetName() == "node_id" && label.GetValue() == nodeID {
						return metric.GetGauge().GetValue()
					}
				}
			}
		}
	}
	return 0 // Return 0 if metric not found (default value)
}
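
// The PR this file belongs to also cleans up the heartbeat gauge when a node goes
// down. The test below is a minimal companion sketch of that expectation rather than
// the project's actual cleanup test: it drives DeleteLabelValues directly instead of
// the real node-down code path (which is not part of this file), and it assumes
// QueryCoordLastHeartbeatTimeStamp is a gauge vector keyed only by node_id, as the
// helper above implies.
func TestHeartbeatMetricCleanupSketch(t *testing.T) {
	nodeID := fmt.Sprint(time.Now().UnixNano() % 1000000) // unique label to avoid clashing with other tests

	// Simulate a recorded heartbeat for the node.
	metrics.QueryCoordLastHeartbeatTimeStamp.WithLabelValues(nodeID).Set(float64(time.Now().UnixNano()))
	assert.NotEqual(t, float64(0), getMetricValueForNode(nodeID), "gauge should be set after a heartbeat is recorded")

	// Simulate the node going down: the cleanup path is expected to delete the node's label.
	metrics.QueryCoordLastHeartbeatTimeStamp.DeleteLabelValues(nodeID)
	assert.Equal(t, float64(0), getMetricValueForNode(nodeID), "gauge should be removed once the node is gone")
}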

func TestDistHandlerSuite(t *testing.T) {
	suite.Run(t, new(DistHandlerSuite))
}