From 73be0ba9415192130b013fa118cd31dac63ca751 Mon Sep 17 00:00:00 2001
From: wei liu
Date: Thu, 29 Aug 2024 13:55:01 +0800
Subject: [PATCH] enhance: avoid the coexistence of the old coordinator and the new node/proxy (#35720)

issue: #35719
In standalone mode, block the start process until the new coordinator is
active to avoid the coexistence of the old coordinator and the new
node/proxy
1. In the start/restart process, the new coordinator will become active
immediately and will not be blocked
2. In the rolling upgrade process, the new coordinator will not be
active until the old coordinator is down, and it will be blocked

Signed-off-by: Wei Liu
---
 cmd/roles/roles.go | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/cmd/roles/roles.go b/cmd/roles/roles.go
index dcbe628300..9bc116900a 100644
--- a/cmd/roles/roles.go
+++ b/cmd/roles/roles.go
@@ -429,6 +429,47 @@ func (mr *MilvusRoles) Run() {
 		componentMap[typeutil.QueryCoordRole] = queryCoord
 	}
 
+	// waitCoordBecomeHealthy polls all four coordinators about once per second and
+	// returns once every one of them reports StateCode_Healthy, or when ctx is done.
+	// NOTE(review): assumes rootCoord, dataCoord, indexCoord and queryCoord have all
+	// been started above whenever mr.Local is set; confirm, because Health is
+	// called on each of them unconditionally.
+	waitCoordBecomeHealthy := func() {
+		for {
+			select {
+			case <-ctx.Done():
+				log.Info("wait all coord become healthy loop quit")
+				return
+			default:
+			}
+
+			rcState := rootCoord.Health(ctx)
+			dcState := dataCoord.Health(ctx)
+			icState := indexCoord.Health(ctx)
+			qcState := queryCoord.Health(ctx)
+
+			if rcState == commonpb.StateCode_Healthy && dcState == commonpb.StateCode_Healthy && icState == commonpb.StateCode_Healthy && qcState == commonpb.StateCode_Healthy {
+				log.Info("all coord become healthy")
+				return
+			}
+			log.Info("wait all coord become healthy", zap.String("rootCoord", rcState.String()), zap.String("dataCoord", dcState.String()), zap.String("indexCoord", icState.String()), zap.String("queryCoord", qcState.String()))
+
+			// Pause for a second before polling again, but wake up at once if ctx is
+			// cancelled; the check at the top of the loop then logs and returns.
+			select {
+			case <-ctx.Done():
+			case <-time.After(time.Second):
+			}
+		}
+	}
+
+	// In standalone mode, block the start process until the new coordinator is active to avoid the coexistence of the old coordinator and the new node/proxy
+	// 1. In the start/restart process, the new coordinator will become active immediately and will not be blocked
+	// 2. In the rolling upgrade process, the new coordinator will not be active until the old coordinator is down, and it will be blocked
+	if mr.Local {
+		waitCoordBecomeHealthy()
+	}
+
 	if mr.EnableQueryNode {
 		queryNode = mr.runQueryNode(ctx, local, &wg)
 		componentMap[typeutil.QueryNodeRole] = queryNode