milvus/internal/datacoord/session/node_manager_test.go
wei liu 3e9e830074
enhance: Implement rewatch mechanism for etcd failure scenarios (#43829)
issue: #43828
Implement robust rewatch mechanism to handle etcd connection failures
and node reconnection scenarios in DataCoord and QueryCoord, along with
heartbeat lag monitoring capabilities.

Changes include:
- Implement rewatchDataNodes/rewatchQueryNodes callbacks for etcd
reconnection scenarios
- Add idempotent rewatchNodes method to handle etcd session recovery
gracefully
- Add QueryCoordLastHeartbeatTimeStamp metric for monitoring node
heartbeat lag
- Clean up heartbeat metrics when nodes go down to prevent metric leaks

---------

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
2025-08-14 10:31:44 +08:00

251 lines
7.2 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package session
import (
"context"
"testing"
"github.com/bytedance/mockey"
"github.com/stretchr/testify/assert"
datanodeclient "github.com/milvus-io/milvus/internal/distributed/datanode/client"
"github.com/milvus-io/milvus/internal/mocks"
"github.com/milvus-io/milvus/internal/types"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)
func TestNodeManager_AddNode(t *testing.T) {
t.Run("add node successfully", func(t *testing.T) {
mockClient := mocks.NewMockDataNodeClient(t)
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return mockClient, nil
}
nm := NewNodeManager(nodeCreator)
err := nm.AddNode(1, "localhost:8080")
assert.NoError(t, err)
client, err := nm.GetClient(1)
assert.NoError(t, err)
assert.Equal(t, mockClient, client)
})
t.Run("add node with error", func(t *testing.T) {
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return nil, assert.AnError
}
nm := NewNodeManager(nodeCreator)
err := nm.AddNode(1, "localhost:8080")
assert.Error(t, err)
})
}
func TestNodeManager_RemoveNode(t *testing.T) {
t.Run("remove existing node", func(t *testing.T) {
mockClient := mocks.NewMockDataNodeClient(t)
mockClient.EXPECT().Close().Return(nil)
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return mockClient, nil
}
nm := NewNodeManager(nodeCreator)
err := nm.AddNode(1, "localhost:8080")
assert.NoError(t, err)
nm.RemoveNode(1)
client, err := nm.GetClient(1)
assert.Error(t, err)
assert.Nil(t, client)
})
t.Run("remove non-existing node", func(t *testing.T) {
nm := NewNodeManager(nil)
nm.RemoveNode(1) // Should not panic
})
}
func TestNodeManager_GetClient(t *testing.T) {
t.Run("get existing client", func(t *testing.T) {
mockClient := mocks.NewMockDataNodeClient(t)
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return mockClient, nil
}
nm := NewNodeManager(nodeCreator)
err := nm.AddNode(1, "localhost:8080")
assert.NoError(t, err)
client, err := nm.GetClient(1)
assert.NoError(t, err)
assert.Equal(t, mockClient, client)
})
t.Run("get non-existing client", func(t *testing.T) {
nm := NewNodeManager(nil)
client, err := nm.GetClient(1)
assert.Error(t, err)
assert.True(t, merr.WrapErrNodeNotFound(1) != nil)
assert.Nil(t, client)
})
}
func TestNodeManager_GetClientIDs(t *testing.T) {
t.Run("get client IDs", func(t *testing.T) {
mockClient := mocks.NewMockDataNodeClient(t)
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return mockClient, nil
}
nm := NewNodeManager(nodeCreator)
err := nm.AddNode(1, "localhost:8080")
assert.NoError(t, err)
err = nm.AddNode(2, "localhost:8081")
assert.NoError(t, err)
ids := nm.GetClientIDs()
assert.Len(t, ids, 2)
assert.Contains(t, ids, int64(1))
assert.Contains(t, ids, int64(2))
})
t.Run("get empty client IDs", func(t *testing.T) {
nm := NewNodeManager(nil)
ids := nm.GetClientIDs()
assert.Empty(t, ids)
})
}
func TestNodeManager_Startup_NewNodes(t *testing.T) {
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return datanodeclient.NewClient(ctx, addr, nodeID, paramtable.Get().DataCoordCfg.WithCredential.GetAsBool())
}
nm := NewNodeManager(nodeCreator)
// Define test nodes
nodes := []*NodeInfo{
{NodeID: 1, Address: "localhost:8080"},
{NodeID: 2, Address: "localhost:8081"},
}
err := nm.Startup(context.Background(), nodes)
assert.NoError(t, err)
// Verify nodes were added
ids := nm.GetClientIDs()
assert.Len(t, ids, 2)
assert.Contains(t, ids, int64(1))
assert.Contains(t, ids, int64(2))
// Verify clients are accessible
_, err = nm.GetClient(1)
assert.NoError(t, err)
_, err = nm.GetClient(2)
assert.NoError(t, err)
}
func TestNodeManager_Startup_RemoveOldNodes(t *testing.T) {
// Mock the Close method using mockey
mockClose := mockey.Mock((*mocks.MockDataNodeClient).Close).Return(nil).Build()
defer mockClose.UnPatch()
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return datanodeclient.NewClient(ctx, addr, nodeID, paramtable.Get().DataCoordCfg.WithCredential.GetAsBool())
}
nm := NewNodeManager(nodeCreator)
// Add initial nodes
err := nm.AddNode(1, "localhost:8080")
assert.NoError(t, err)
err = nm.AddNode(2, "localhost:8081")
assert.NoError(t, err)
// Startup with new set of nodes (removes node 1, keeps node 2, adds node 3)
newNodes := []*NodeInfo{
{NodeID: 2, Address: "localhost:8081"}, // existing node
{NodeID: 3, Address: "localhost:8082"}, // new node
}
err = nm.Startup(context.Background(), newNodes)
assert.NoError(t, err)
// Verify final state
ids := nm.GetClientIDs()
assert.Len(t, ids, 2)
assert.Contains(t, ids, int64(2))
assert.Contains(t, ids, int64(3))
assert.NotContains(t, ids, int64(1))
// Verify node 1 is removed
_, err = nm.GetClient(1)
assert.Error(t, err)
// Verify nodes 2 and 3 are accessible
_, err = nm.GetClient(2)
assert.NoError(t, err)
_, err = nm.GetClient(3)
assert.NoError(t, err)
}
func TestNodeManager_Startup_EmptyNodes(t *testing.T) {
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
return datanodeclient.NewClient(ctx, addr, nodeID, paramtable.Get().DataCoordCfg.WithCredential.GetAsBool())
}
nm := NewNodeManager(nodeCreator)
// Add initial node
err := nm.AddNode(1, "localhost:8080")
assert.NoError(t, err)
// Startup with empty nodes (should remove all existing nodes)
err = nm.Startup(context.Background(), []*NodeInfo{})
assert.NoError(t, err)
// Verify all nodes are removed
ids := nm.GetClientIDs()
assert.Empty(t, ids)
}
func TestNodeManager_Startup_AddNodeError(t *testing.T) {
nodeCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
if nodeID == 1 {
return nil, assert.AnError
}
return datanodeclient.NewClient(ctx, addr, nodeID, paramtable.Get().DataCoordCfg.WithCredential.GetAsBool())
}
nm := NewNodeManager(nodeCreator)
nodes := []*NodeInfo{
{NodeID: 1, Address: "localhost:8080"}, // This will fail
{NodeID: 2, Address: "localhost:8081"},
}
err := nm.Startup(context.Background(), nodes)
assert.Error(t, err)
assert.Contains(t, err.Error(), "assert.AnError")
}