From 9abc868d15014eb5f0a600941b4a8995ac0f05e4 Mon Sep 17 00:00:00 2001 From: wei liu Date: Wed, 17 Jan 2024 11:22:52 +0800 Subject: [PATCH] fix: Remove heartbeat lag logic during get shard leaders (#29999) issue: #29677 #29838 During get shard leaders, if a querynode doesn't ack the heartbeat for more than 10s, querycoord will treat it as unavailable and won't return a shard leader on it. But when a querynode has full CPU usage, it can easily get stuck for more than 10s without acking the heartbeat, which leaves no shard leader available for search/query. This PR removes the heartbeat lag logic during get shard leaders. Signed-off-by: Wei Liu --- internal/querycoordv2/handlers.go | 2 -- internal/querycoordv2/services_test.go | 6 ------ 2 files changed, 8 deletions(-) diff --git a/internal/querycoordv2/handlers.go b/internal/querycoordv2/handlers.go index 0d70128605..2bf5412968 100644 --- a/internal/querycoordv2/handlers.go +++ b/internal/querycoordv2/handlers.go @@ -359,8 +359,6 @@ func (s *Server) fillReplicaInfo(replica *meta.Replica, withShardNodes bool) (*m func checkNodeAvailable(nodeID int64, info *session.NodeInfo) error { if info == nil { return merr.WrapErrNodeOffline(nodeID) - } else if time.Since(info.LastHeartbeat()) > Params.QueryCoordCfg.HeartbeatAvailableInterval.GetAsDuration(time.Millisecond) { - return merr.WrapErrNodeOffline(nodeID, fmt.Sprintf("lastHB=%v", info.LastHeartbeat())) } return nil } diff --git a/internal/querycoordv2/services_test.go b/internal/querycoordv2/services_test.go index e5defb3f58..77160df243 100644 --- a/internal/querycoordv2/services_test.go +++ b/internal/querycoordv2/services_test.go @@ -1521,12 +1521,6 @@ func (suite *ServiceSuite) TestGetShardLeadersFailed() { suite.nodeMgr.Add(session.NewNodeInfo(node, "localhost")) } - // Last heartbeat response time too old - suite.fetchHeartbeats(time.Now().Add(-Params.QueryCoordCfg.HeartbeatAvailableInterval.GetAsDuration(time.Millisecond) - 1)) - resp, err = server.GetShardLeaders(ctx, req) - suite.NoError(err) - 
suite.Equal(commonpb.ErrorCode_NoReplicaAvailable, resp.GetStatus().GetErrorCode()) - // Segment not fully loaded for _, node := range suite.nodes { suite.dist.SegmentDistManager.Update(node)