From 3e994242d6ab0dcf38bcf0cfb827620029d6bbdb Mon Sep 17 00:00:00 2001 From: chyezh Date: Tue, 30 Jan 2024 18:07:03 +0800 Subject: [PATCH] fix: panic with datanode negative wait group counter (#30136) issue: #29170 pr: #30135 Signed-off-by: chyezh --- internal/datanode/data_node.go | 3 +-- internal/datanode/event_manager.go | 9 +++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/internal/datanode/data_node.go b/internal/datanode/data_node.go index 20e1e4640d..5e66be922d 100644 --- a/internal/datanode/data_node.go +++ b/internal/datanode/data_node.go @@ -357,9 +357,8 @@ func (node *DataNode) Start() error { node.timeTickSender.start() } - node.stopWaiter.Add(1) // Start node watch node - go node.StartWatchChannels(node.ctx) + node.startWatchChannelsAtBackground(node.ctx) node.stopWaiter.Add(1) go node.flowgraphManager.start(&node.stopWaiter) diff --git a/internal/datanode/event_manager.go b/internal/datanode/event_manager.go index 0077256227..3f45d4b882 100644 --- a/internal/datanode/event_manager.go +++ b/internal/datanode/event_manager.go @@ -38,6 +38,11 @@ import ( const retryWatchInterval = 20 * time.Second +func (node *DataNode) startWatchChannelsAtBackground(ctx context.Context) { + node.stopWaiter.Add(1) + go node.StartWatchChannels(ctx) +} + // StartWatchChannels start loop to watch channel allocation status via kv(etcd for now) func (node *DataNode) StartWatchChannels(ctx context.Context) { defer node.stopWaiter.Done() @@ -61,7 +66,7 @@ func (node *DataNode) StartWatchChannels(ctx context.Context) { case event, ok := <-evtChan: if !ok { log.Warn("datanode failed to watch channel, return") - go node.StartWatchChannels(ctx) + node.startWatchChannelsAtBackground(ctx) return } @@ -69,7 +74,7 @@ func (node *DataNode) StartWatchChannels(ctx context.Context) { log.Warn("datanode watch channel canceled", zap.Error(event.Err())) // https://github.com/etcd-io/etcd/issues/8980 if event.Err() == v3rpc.ErrCompacted { - go node.StartWatchChannels(ctx) 
+ node.startWatchChannelsAtBackground(ctx) return } // if watch loop return due to event canceled, the datanode is not functional anymore