fix: Clean offline node from resource group after qc restart (#33233)

issue: #33200 #33207
pr: #33232
pr#33104 causes offline nodes to be kept in the resource group after qc
recovers; an offline node can then be assigned to a new replica as an rwNode,
and requests sent to those nodes will fail with NodeNotFound.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2024-05-22 14:07:39 +08:00 committed by GitHub
parent 1f23c39700
commit 4b8680894f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -441,6 +441,8 @@ func (s *Server) startQueryCoord() error {
s.nodeMgr.Stopping(node.ServerID)
}
}
s.checkNodeStateInRG()
for _, node := range sessions {
s.handleNodeUp(node.ServerID)
}
@ -762,6 +764,20 @@ func (s *Server) handleNodeDown(node int64) {
s.meta.ResourceManager.HandleNodeDown(node)
}
// checkNodeStateInRG walks every resource group listed by s.meta and
// reconciles each member node against the node manager:
//
//   - a node for which s.nodeMgr.Get returns nil is reported to the
//     resource manager via HandleNodeDown;
//   - a node whose info reports IsStoppingState is reported via
//     HandleNodeStopping;
//   - any other node is left untouched.
//
// NOTE(review): a nil result from nodeMgr.Get presumably means the node is
// no longer registered (offline) — confirm against the node manager.
func (s *Server) checkNodeStateInRG() {
	for _, name := range s.meta.ListResourceGroups() {
		group := s.meta.ResourceManager.GetResourceGroup(name)
		for _, nodeID := range group.GetNodes() {
			switch nodeInfo := s.nodeMgr.Get(nodeID); {
			case nodeInfo == nil:
				s.meta.ResourceManager.HandleNodeDown(nodeID)
			case nodeInfo.IsStoppingState():
				s.meta.ResourceManager.HandleNodeStopping(nodeID)
			}
		}
	}
}
func (s *Server) updateBalanceConfigLoop(ctx context.Context) {
success := s.updateBalanceConfig()
if success {