From ca8eee2c47fd418b802b243f93edb3a75d1a2e1c Mon Sep 17 00:00:00 2001 From: wei liu Date: Fri, 15 Mar 2024 10:23:03 +0800 Subject: [PATCH] fix: Set node unreachable when get shard client failed (#31277) issue: #30531 Failing to get a client from `shardClientMgr` doesn't mean the query node is unavailable: because of the ref counter policy in `shardClientMgr`, the client is cleaned up when no collection uses the qn as shard leader. This PR stops setting the node unreachable when getting the shard client fails. Signed-off-by: Wei Liu --- internal/proxy/look_aside_balancer.go | 5 ++--- internal/proxy/look_aside_balancer_test.go | 13 +++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/internal/proxy/look_aside_balancer.go b/internal/proxy/look_aside_balancer.go index 51d4a4b9a8..18e91a47e0 100644 --- a/internal/proxy/look_aside_balancer.go +++ b/internal/proxy/look_aside_balancer.go @@ -221,9 +221,8 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) { qn, err := b.clientMgr.GetClient(ctx, node) if err != nil { - if b.trySetQueryNodeUnReachable(node, err) { - log.Warn("get client failed, set node unreachable", zap.Int64("node", node), zap.Error(err)) - } + // get client from clientMgr failed, which means this qn isn't a shard leader anymore, skip it's health check + log.RatedInfo(10, "get client failed", zap.Int64("node", node), zap.Error(err)) return struct{}{}, nil } diff --git a/internal/proxy/look_aside_balancer_test.go b/internal/proxy/look_aside_balancer_test.go index cfb7b6ec19..ffbd53c409 100644 --- a/internal/proxy/look_aside_balancer_test.go +++ b/internal/proxy/look_aside_balancer_test.go @@ -334,6 +334,19 @@ func (suite *LookAsideBalancerSuite) TestCheckHealthLoop() { }, 5*time.Second, 100*time.Millisecond) } +func (suite *LookAsideBalancerSuite) TestGetClientFailed() { + suite.balancer.metricsUpdateTs.Insert(2, time.Now().UnixMilli()) + + // test get shard client from client mgr return nil + suite.clientMgr.ExpectedCalls = nil + 
suite.clientMgr.EXPECT().GetClient(mock.Anything, int64(2)).Return(nil, errors.New("shard client not found")) + failCounter := atomic.NewInt64(0) + suite.balancer.failedHeartBeatCounter.Insert(2, failCounter) + suite.Eventually(func() bool { + return failCounter.Load() == 0 + }, 10*time.Second, 1*time.Second) +} + func (suite *LookAsideBalancerSuite) TestNodeRecover() { // mock qn down for a while and then recover qn3 := mocks.NewMockQueryNodeClient(suite.T())