From 17ac58a731913ba2be2310a9679e06764c2c2589 Mon Sep 17 00:00:00 2001 From: congqixia Date: Mon, 1 Dec 2025 14:03:09 +0800 Subject: [PATCH] fix: [2.6] always call handleNodeUp in rewatchNodes for proper stopping balance (#45963) Cherry-pick from master pr: #45961 Related to #45960 When QueryCoord restarts or reconnects to etcd, the rewatchNodes function previously skipped handleNodeUp for QueryNodes in stopping state. This caused stopping balance to fail because necessary components were not initialized: - Task scheduler executor was not added - Dist handler was not started - Node was not registered in resource manager This fix ensures handleNodeUp is always called for new nodes regardless of their stopping state, followed by handleNodeStopping if the node is stopping. This allows the graceful shutdown process to correctly migrate segments and channels away from stopping nodes. Note that ResourceManager.handleNodeUp now returns early, logging a warning, when the node manager already reports the node as stopping, so such a node is not inserted as an incoming node. --------- Signed-off-by: Congqi Xia --- internal/querycoordv2/meta/resource_manager.go | 7 ++++++- internal/querycoordv2/server.go | 6 ++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/internal/querycoordv2/meta/resource_manager.go b/internal/querycoordv2/meta/resource_manager.go index 99402a6aa7..d5abfd6534 100644 --- a/internal/querycoordv2/meta/resource_manager.go +++ b/internal/querycoordv2/meta/resource_manager.go @@ -532,7 +532,12 @@ func (rm *ResourceManager) HandleNodeUp(ctx context.Context, node int64) { } func (rm *ResourceManager) handleNodeUp(ctx context.Context, node int64) { - if nodeInfo := rm.nodeMgr.Get(node); nodeInfo == nil || nodeInfo.IsEmbeddedQueryNodeInStreamingNode() { + nodeInfo := rm.nodeMgr.Get(node) + if nodeInfo == nil || nodeInfo.IsEmbeddedQueryNodeInStreamingNode() { + return + } + if nodeInfo.IsStoppingState() { + log.Warn("node is stopping, skip handle node up in resource manager", zap.Int64("node", node)) return } rm.incomingNode.Insert(node) diff --git a/internal/querycoordv2/server.go b/internal/querycoordv2/server.go index 
53682a4440..48fccc55ff 100644 --- a/internal/querycoordv2/server.go +++ b/internal/querycoordv2/server.go @@ -730,11 +730,13 @@ func (s *Server) rewatchNodes(sessions map[string]*sessionutil.Session) error { Labels: nodeSession.GetServerLabel(), })) + // call handleNodeUp no matter what state the new querynode is in + // all components need this op so that stopping balance can work correctly + s.handleNodeUp(nodeSession.GetServerID()) + if nodeSession.Stopping { s.nodeMgr.Stopping(nodeSession.ServerID) s.handleNodeStopping(nodeSession.ServerID) - } else { - s.handleNodeUp(nodeSession.GetServerID()) } } }