fix: executor/scheduler should use the latest replica meta instead of a replica copy (#45878)

issue: #45865
pr: #45877

---------

Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
Zhen Ye 2025-11-28 08:49:08 +08:00 committed by GitHub
parent 4cdeea5ddd
commit d20b332e63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 9 deletions

View File

@ -228,7 +228,7 @@ func (ex *Executor) loadSegment(task *SegmentTask, step int) error {
) )
// get segment's replica first, then get shard leader by replica // get segment's replica first, then get shard leader by replica
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(ctx, task.CollectionID(), action.Node()) replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
if replica == nil { if replica == nil {
msg := "node doesn't belong to any replica" msg := "node doesn't belong to any replica"
err := merr.WrapErrNodeNotAvailable(action.Node()) err := merr.WrapErrNodeNotAvailable(action.Node())
@ -331,7 +331,7 @@ func (ex *Executor) releaseSegment(task *SegmentTask, step int) {
if ex.meta.CollectionManager.Exist(ctx, task.CollectionID()) { if ex.meta.CollectionManager.Exist(ctx, task.CollectionID()) {
// get segment's replica first, then get shard leader by replica // get segment's replica first, then get shard leader by replica
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(ctx, task.CollectionID(), action.Node()) replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
if replica == nil { if replica == nil {
msg := "node doesn't belong to any replica, try to send release to worker" msg := "node doesn't belong to any replica, try to send release to worker"
err := merr.WrapErrNodeNotAvailable(action.Node()) err := merr.WrapErrNodeNotAvailable(action.Node())
@ -577,7 +577,13 @@ func (ex *Executor) executeDropIndexAction(task *DropIndexTask, step int) {
ex.removeTask(task, step) ex.removeTask(task, step)
}() }()
view := ex.dist.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
if replica == nil {
err = merr.WrapErrNodeNotAvailable(action.Node())
log.Warn("node doesn't belong to any replica", zap.Error(err))
return
}
view := ex.dist.ChannelDistManager.GetShardLeader(task.Shard(), replica)
if view == nil { if view == nil {
err = merr.WrapErrChannelNotFound(task.Shard(), "shard delegator not found") err = merr.WrapErrChannelNotFound(task.Shard(), "shard delegator not found")
log.Warn("failed to get shard leader", zap.Error(err)) log.Warn("failed to get shard leader", zap.Error(err))

View File

@ -507,7 +507,7 @@ func (scheduler *taskScheduler) preAdd(task Task) error {
taskType := GetTaskType(task) taskType := GetTaskType(task)
if taskType == TaskTypeMove { if taskType == TaskTypeMove {
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil { if leader == nil {
return merr.WrapErrServiceInternal("segment's delegator leader not found, stop balancing") return merr.WrapErrServiceInternal("segment's delegator leader not found, stop balancing")
} }
@ -590,6 +590,14 @@ func (scheduler *taskScheduler) preAdd(task Task) error {
return nil return nil
} }
// getReplicaShardLeader returns the shard leader of channelName for the
// replica identified by replicaID.
//
// The replica is fetched from the scheduler's ReplicaManager by ID on each
// call, using the scheduler's own context. When no replica is found for
// replicaID the result is nil; otherwise the result is whatever
// ChannelDistManager.GetShardLeader returns for that replica, which callers
// must also nil-check.
func (scheduler *taskScheduler) getReplicaShardLeader(channelName string, replicaID int64) *meta.DmChannel {
	if current := scheduler.meta.ReplicaManager.Get(scheduler.ctx, replicaID); current != nil {
		return scheduler.distMgr.ChannelDistManager.GetShardLeader(channelName, current)
	}
	// Unknown replica ID: there is no replica to resolve a leader against.
	return nil
}
func (scheduler *taskScheduler) tryPromoteAll() { func (scheduler *taskScheduler) tryPromoteAll() {
// Promote waiting tasks // Promote waiting tasks
toPromote := make([]Task, 0, scheduler.waitQueue.Len()) toPromote := make([]Task, 0, scheduler.waitQueue.Len())
@ -854,7 +862,7 @@ func (scheduler *taskScheduler) isRelated(task Task, node int64) bool {
if task.replica == nil { if task.replica == nil {
continue continue
} }
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil { if leader == nil {
continue continue
} }
@ -883,7 +891,7 @@ func (scheduler *taskScheduler) preProcess(task Task) bool {
case *ChannelAction: case *ChannelAction:
// wait for new delegator becomes leader, then try to remove old leader // wait for new delegator becomes leader, then try to remove old leader
task := task.(*ChannelTask) task := task.(*ChannelTask)
delegator := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) delegator := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Bool("delegator is Nil", delegator == nil)) log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Bool("delegator is Nil", delegator == nil))
if delegator != nil { if delegator != nil {
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Int64("delegator node", delegator.Node), log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Int64("delegator node", delegator.Node),
@ -1144,7 +1152,7 @@ func (scheduler *taskScheduler) checkSegmentTaskStale(task *SegmentTask) error {
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment") return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
} }
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil { if leader == nil {
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task)...) log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task)...)
return merr.WrapErrChannelNotFound(segment.GetInsertChannel(), "failed to get shard delegator") return merr.WrapErrChannelNotFound(segment.GetInsertChannel(), "failed to get shard delegator")
@ -1198,14 +1206,14 @@ func (scheduler *taskScheduler) checkLeaderTaskStale(task *LeaderTask) error {
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment") return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
} }
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil { if leader == nil {
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...) log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator") return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")
} }
case ActionTypeReduce: case ActionTypeReduce:
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica) leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil { if leader == nil {
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...) log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator") return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")

View File

@ -160,6 +160,7 @@ func (suite *TaskSuite) SetupTest() {
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue()) suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.store = querycoord.NewCatalog(suite.kv) suite.store = querycoord.NewCatalog(suite.kv)
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, session.NewNodeManager()) suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, session.NewNodeManager())
suite.meta.ReplicaManager.Put(suite.ctx, suite.replica)
suite.nodeMgr = session.NewNodeManager() suite.nodeMgr = session.NewNodeManager()
suite.dist = meta.NewDistributionManager(suite.nodeMgr) suite.dist = meta.NewDistributionManager(suite.nodeMgr)
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())