fix: executor/scheduler should use the latest replica meta instead of a replica copy (#45878)

issue: #45865
pr: #45877

---------

Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
Zhen Ye 2025-11-28 08:49:08 +08:00 committed by GitHub
parent 4cdeea5ddd
commit d20b332e63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 24 additions and 9 deletions

View File

@ -228,7 +228,7 @@ func (ex *Executor) loadSegment(task *SegmentTask, step int) error {
)
// get segment's replica first, then get shard leader by replica
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(ctx, task.CollectionID(), action.Node())
replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
if replica == nil {
msg := "node doesn't belong to any replica"
err := merr.WrapErrNodeNotAvailable(action.Node())
@ -331,7 +331,7 @@ func (ex *Executor) releaseSegment(task *SegmentTask, step int) {
if ex.meta.CollectionManager.Exist(ctx, task.CollectionID()) {
// get segment's replica first, then get shard leader by replica
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(ctx, task.CollectionID(), action.Node())
replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
if replica == nil {
msg := "node doesn't belong to any replica, try to send release to worker"
err := merr.WrapErrNodeNotAvailable(action.Node())
@ -577,7 +577,13 @@ func (ex *Executor) executeDropIndexAction(task *DropIndexTask, step int) {
ex.removeTask(task, step)
}()
view := ex.dist.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
if replica == nil {
err = merr.WrapErrNodeNotAvailable(action.Node())
log.Warn("node doesn't belong to any replica", zap.Error(err))
return
}
view := ex.dist.ChannelDistManager.GetShardLeader(task.Shard(), replica)
if view == nil {
err = merr.WrapErrChannelNotFound(task.Shard(), "shard delegator not found")
log.Warn("failed to get shard leader", zap.Error(err))

View File

@ -507,7 +507,7 @@ func (scheduler *taskScheduler) preAdd(task Task) error {
taskType := GetTaskType(task)
if taskType == TaskTypeMove {
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil {
return merr.WrapErrServiceInternal("segment's delegator leader not found, stop balancing")
}
@ -590,6 +590,14 @@ func (scheduler *taskScheduler) preAdd(task Task) error {
return nil
}
// getReplicaShardLeader fetches the replica identified by replicaID from the
// replica manager at call time and asks the channel distribution manager for
// the shard leader of channelName within that replica.
//
// It returns nil when the replica manager has no replica for replicaID;
// otherwise it returns whatever GetShardLeader reports for the channel.
func (scheduler *taskScheduler) getReplicaShardLeader(channelName string, replicaID int64) *meta.DmChannel {
	if current := scheduler.meta.ReplicaManager.Get(scheduler.ctx, replicaID); current != nil {
		return scheduler.distMgr.ChannelDistManager.GetShardLeader(channelName, current)
	}
	// Replica is not present in the manager, so there is nothing to resolve a leader against.
	return nil
}
func (scheduler *taskScheduler) tryPromoteAll() {
// Promote waiting tasks
toPromote := make([]Task, 0, scheduler.waitQueue.Len())
@ -854,7 +862,7 @@ func (scheduler *taskScheduler) isRelated(task Task, node int64) bool {
if task.replica == nil {
continue
}
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil {
continue
}
@ -883,7 +891,7 @@ func (scheduler *taskScheduler) preProcess(task Task) bool {
case *ChannelAction:
// wait for new delegator becomes leader, then try to remove old leader
task := task.(*ChannelTask)
delegator := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
delegator := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Bool("delegator is Nil", delegator == nil))
if delegator != nil {
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Int64("delegator node", delegator.Node),
@ -1144,7 +1152,7 @@ func (scheduler *taskScheduler) checkSegmentTaskStale(task *SegmentTask) error {
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
}
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil {
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task)...)
return merr.WrapErrChannelNotFound(segment.GetInsertChannel(), "failed to get shard delegator")
@ -1198,14 +1206,14 @@ func (scheduler *taskScheduler) checkLeaderTaskStale(task *LeaderTask) error {
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
}
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil {
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")
}
case ActionTypeReduce:
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
if leader == nil {
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")

View File

@ -160,6 +160,7 @@ func (suite *TaskSuite) SetupTest() {
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.store = querycoord.NewCatalog(suite.kv)
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, session.NewNodeManager())
suite.meta.ReplicaManager.Put(suite.ctx, suite.replica)
suite.nodeMgr = session.NewNodeManager()
suite.dist = meta.NewDistributionManager(suite.nodeMgr)
suite.broker = meta.NewMockBroker(suite.T())