fix: Clean offline node from replica after qc recover (#33213)

issue: #33200 #33207

PR #33104 removed this logic by mistake, which causes offline nodes to
be kept in the replica after QC recovers; requests sent to an offline QN
then fail with a NodeNotFound error.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2024-05-21 15:41:39 +08:00 committed by GitHub
parent cb480d17c8
commit 33bd6eed28
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -456,6 +456,7 @@ func (s *Server) startQueryCoord() error {
s.nodeMgr.Stopping(node.ServerID)
}
}
s.checkReplicas()
for _, node := range sessions {
s.handleNodeUp(node.ServerID)
}
@ -777,6 +778,33 @@ func (s *Server) handleNodeDown(node int64) {
s.meta.ResourceManager.HandleNodeDown(node)
}
// checkReplicas walks every replica of every known collection and removes the
// nodes that the node manager has no record of (s.nodeMgr.Get returns nil),
// which are treated as offline.
//
// Removal is best-effort: a failure for one replica is logged, together with
// the underlying error, and the scan continues with the remaining replicas.
func (s *Server) checkReplicas() {
	for _, collection := range s.meta.CollectionManager.GetAll() {
		collectionLog := log.With(zap.Int64("collectionID", collection))
		for _, replica := range s.meta.ReplicaManager.GetByCollection(collection) {
			// Collect the replica's nodes that are unknown to the node manager.
			var toRemove []int64
			for _, node := range replica.GetNodes() {
				if s.nodeMgr.Get(node) == nil {
					toRemove = append(toRemove, node)
				}
			}
			if len(toRemove) == 0 {
				continue
			}

			replicaLog := collectionLog.With(
				zap.Int64("replicaID", replica.GetID()),
				zap.Int64s("offlineNodes", toRemove),
			)
			replicaLog.Info("some nodes are offline, remove them from replica", zap.Any("toRemove", toRemove))
			if err := s.meta.ReplicaManager.RemoveNode(replica.GetID(), toRemove...); err != nil {
				// Include the cause so a failed cleanup can be diagnosed from the log.
				replicaLog.Warn("failed to remove offline nodes from replica", zap.Error(err))
			}
		}
	}
}
func (s *Server) updateBalanceConfigLoop(ctx context.Context) {
success := s.updateBalanceConfig()
if success {