mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
fix: executor/scheduler should use the latest replica meta, not a replica copy (#45878)
issue: #45865 pr: #45877 --------- Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
parent
4cdeea5ddd
commit
d20b332e63
@ -228,7 +228,7 @@ func (ex *Executor) loadSegment(task *SegmentTask, step int) error {
|
|||||||
)
|
)
|
||||||
|
|
||||||
// get segment's replica first, then get shard leader by replica
|
// get segment's replica first, then get shard leader by replica
|
||||||
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(ctx, task.CollectionID(), action.Node())
|
replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
|
||||||
if replica == nil {
|
if replica == nil {
|
||||||
msg := "node doesn't belong to any replica"
|
msg := "node doesn't belong to any replica"
|
||||||
err := merr.WrapErrNodeNotAvailable(action.Node())
|
err := merr.WrapErrNodeNotAvailable(action.Node())
|
||||||
@ -331,7 +331,7 @@ func (ex *Executor) releaseSegment(task *SegmentTask, step int) {
|
|||||||
|
|
||||||
if ex.meta.CollectionManager.Exist(ctx, task.CollectionID()) {
|
if ex.meta.CollectionManager.Exist(ctx, task.CollectionID()) {
|
||||||
// get segment's replica first, then get shard leader by replica
|
// get segment's replica first, then get shard leader by replica
|
||||||
replica := ex.meta.ReplicaManager.GetByCollectionAndNode(ctx, task.CollectionID(), action.Node())
|
replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
|
||||||
if replica == nil {
|
if replica == nil {
|
||||||
msg := "node doesn't belong to any replica, try to send release to worker"
|
msg := "node doesn't belong to any replica, try to send release to worker"
|
||||||
err := merr.WrapErrNodeNotAvailable(action.Node())
|
err := merr.WrapErrNodeNotAvailable(action.Node())
|
||||||
@ -577,7 +577,13 @@ func (ex *Executor) executeDropIndexAction(task *DropIndexTask, step int) {
|
|||||||
ex.removeTask(task, step)
|
ex.removeTask(task, step)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
view := ex.dist.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
replica := ex.meta.ReplicaManager.Get(ctx, task.ReplicaID())
|
||||||
|
if replica == nil {
|
||||||
|
err = merr.WrapErrNodeNotAvailable(action.Node())
|
||||||
|
log.Warn("node doesn't belong to any replica", zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
view := ex.dist.ChannelDistManager.GetShardLeader(task.Shard(), replica)
|
||||||
if view == nil {
|
if view == nil {
|
||||||
err = merr.WrapErrChannelNotFound(task.Shard(), "shard delegator not found")
|
err = merr.WrapErrChannelNotFound(task.Shard(), "shard delegator not found")
|
||||||
log.Warn("failed to get shard leader", zap.Error(err))
|
log.Warn("failed to get shard leader", zap.Error(err))
|
||||||
|
|||||||
@ -507,7 +507,7 @@ func (scheduler *taskScheduler) preAdd(task Task) error {
|
|||||||
taskType := GetTaskType(task)
|
taskType := GetTaskType(task)
|
||||||
|
|
||||||
if taskType == TaskTypeMove {
|
if taskType == TaskTypeMove {
|
||||||
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
|
||||||
if leader == nil {
|
if leader == nil {
|
||||||
return merr.WrapErrServiceInternal("segment's delegator leader not found, stop balancing")
|
return merr.WrapErrServiceInternal("segment's delegator leader not found, stop balancing")
|
||||||
}
|
}
|
||||||
@ -590,6 +590,14 @@ func (scheduler *taskScheduler) preAdd(task Task) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getReplicaShardLeader returns the shard leader of channelName within the
// replica identified by replicaID.
//
// The replica is fetched from scheduler.meta.ReplicaManager on each call,
// using scheduler.ctx, rather than being passed in by the caller.
//
// It returns nil when ReplicaManager.Get yields no replica for replicaID;
// otherwise it returns the result of ChannelDistManager.GetShardLeader, which
// the callers shown alongside this function also check against nil.
func (scheduler *taskScheduler) getReplicaShardLeader(channelName string, replicaID int64) *meta.DmChannel {
|
||||||
|
// Resolve the replica by ID through the replica manager at call time.
replica := scheduler.meta.ReplicaManager.Get(scheduler.ctx, replicaID)
|
||||||
|
if replica == nil {
|
||||||
|
// No replica found for this ID: report "no leader" to the caller.
return nil
|
||||||
|
}
|
||||||
|
return scheduler.distMgr.ChannelDistManager.GetShardLeader(channelName, replica)
|
||||||
|
}
|
||||||
|
|
||||||
func (scheduler *taskScheduler) tryPromoteAll() {
|
func (scheduler *taskScheduler) tryPromoteAll() {
|
||||||
// Promote waiting tasks
|
// Promote waiting tasks
|
||||||
toPromote := make([]Task, 0, scheduler.waitQueue.Len())
|
toPromote := make([]Task, 0, scheduler.waitQueue.Len())
|
||||||
@ -854,7 +862,7 @@ func (scheduler *taskScheduler) isRelated(task Task, node int64) bool {
|
|||||||
if task.replica == nil {
|
if task.replica == nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
|
||||||
if leader == nil {
|
if leader == nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -883,7 +891,7 @@ func (scheduler *taskScheduler) preProcess(task Task) bool {
|
|||||||
case *ChannelAction:
|
case *ChannelAction:
|
||||||
// wait for new delegator becomes leader, then try to remove old leader
|
// wait for new delegator becomes leader, then try to remove old leader
|
||||||
task := task.(*ChannelTask)
|
task := task.(*ChannelTask)
|
||||||
delegator := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
delegator := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
|
||||||
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Bool("delegator is Nil", delegator == nil))
|
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Bool("delegator is Nil", delegator == nil))
|
||||||
if delegator != nil {
|
if delegator != nil {
|
||||||
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Int64("delegator node", delegator.Node),
|
log.Ctx(scheduler.ctx).Debug("process channelAction", zap.Int64("delegator node", delegator.Node),
|
||||||
@ -1144,7 +1152,7 @@ func (scheduler *taskScheduler) checkSegmentTaskStale(task *SegmentTask) error {
|
|||||||
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
|
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
|
||||||
}
|
}
|
||||||
|
|
||||||
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
|
||||||
if leader == nil {
|
if leader == nil {
|
||||||
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task)...)
|
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task)...)
|
||||||
return merr.WrapErrChannelNotFound(segment.GetInsertChannel(), "failed to get shard delegator")
|
return merr.WrapErrChannelNotFound(segment.GetInsertChannel(), "failed to get shard delegator")
|
||||||
@ -1198,14 +1206,14 @@ func (scheduler *taskScheduler) checkLeaderTaskStale(task *LeaderTask) error {
|
|||||||
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
|
return merr.WrapErrSegmentReduplicate(task.SegmentID(), "target doesn't contain this segment")
|
||||||
}
|
}
|
||||||
|
|
||||||
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
|
||||||
if leader == nil {
|
if leader == nil {
|
||||||
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
|
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
|
||||||
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")
|
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")
|
||||||
}
|
}
|
||||||
|
|
||||||
case ActionTypeReduce:
|
case ActionTypeReduce:
|
||||||
leader := scheduler.distMgr.ChannelDistManager.GetShardLeader(task.Shard(), task.replica)
|
leader := scheduler.getReplicaShardLeader(task.Shard(), task.ReplicaID())
|
||||||
if leader == nil {
|
if leader == nil {
|
||||||
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
|
log.Ctx(task.Context()).Warn("task stale due to leader not found", WrapTaskLog(task, zap.Int64("leaderID", task.leaderID))...)
|
||||||
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")
|
return merr.WrapErrChannelNotFound(task.Shard(), "failed to get shard delegator")
|
||||||
|
|||||||
@ -160,6 +160,7 @@ func (suite *TaskSuite) SetupTest() {
|
|||||||
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
|
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
|
||||||
suite.store = querycoord.NewCatalog(suite.kv)
|
suite.store = querycoord.NewCatalog(suite.kv)
|
||||||
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, session.NewNodeManager())
|
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, session.NewNodeManager())
|
||||||
|
suite.meta.ReplicaManager.Put(suite.ctx, suite.replica)
|
||||||
suite.nodeMgr = session.NewNodeManager()
|
suite.nodeMgr = session.NewNodeManager()
|
||||||
suite.dist = meta.NewDistributionManager(suite.nodeMgr)
|
suite.dist = meta.NewDistributionManager(suite.nodeMgr)
|
||||||
suite.broker = meta.NewMockBroker(suite.T())
|
suite.broker = meta.NewMockBroker(suite.T())
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user