mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
fix: Fix task getting stuck after recovery (#42114)
Submit tasks into the global scheduler after recovery. issue: https://github.com/milvus-io/milvus/issues/42046 --------- Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
parent
08a53c56b1
commit
79b51cbb73
@ -51,8 +51,9 @@ func (ai *analyzeInspector) Stop() {
|
|||||||
func (ai *analyzeInspector) reloadFromMeta() {
|
func (ai *analyzeInspector) reloadFromMeta() {
|
||||||
analyzeTasks := ai.mt.analyzeMeta.GetAllTasks()
|
analyzeTasks := ai.mt.analyzeMeta.GetAllTasks()
|
||||||
for _, t := range analyzeTasks {
|
for _, t := range analyzeTasks {
|
||||||
if t.GetState() == indexpb.JobState_JobStateFinished ||
|
if t.GetState() != indexpb.JobState_JobStateInit &&
|
||||||
t.GetState() == indexpb.JobState_JobStateFailed {
|
t.GetState() != indexpb.JobState_JobStateRetry &&
|
||||||
|
t.GetState() != indexpb.JobState_JobStateInProgress {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
ai.scheduler.Enqueue(newAnalyzeTask(
|
ai.scheduler.Enqueue(newAnalyzeTask(
|
||||||
|
|||||||
@ -553,6 +553,7 @@ func (c *compactionInspector) submitTask(t CompactionTask) error {
|
|||||||
func (c *compactionInspector) restoreTask(t CompactionTask) {
|
func (c *compactionInspector) restoreTask(t CompactionTask) {
|
||||||
c.executingGuard.Lock()
|
c.executingGuard.Lock()
|
||||||
c.executingTasks[t.GetTaskProto().GetPlanID()] = t
|
c.executingTasks[t.GetTaskProto().GetPlanID()] = t
|
||||||
|
c.scheduler.Enqueue(t)
|
||||||
c.executingGuard.Unlock()
|
c.executingGuard.Unlock()
|
||||||
metrics.DataCoordCompactionTaskNum.WithLabelValues(fmt.Sprintf("%d", t.GetTaskProto().GetNodeID()), t.GetTaskProto().GetType().String(), metrics.Executing).Inc()
|
metrics.DataCoordCompactionTaskNum.WithLabelValues(fmt.Sprintf("%d", t.GetTaskProto().GetNodeID()), t.GetTaskProto().GetType().String(), metrics.Executing).Inc()
|
||||||
}
|
}
|
||||||
|
|||||||
@ -75,6 +75,7 @@ func (s *CompactionPlanHandlerSuite) TestScheduleEmpty() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *CompactionPlanHandlerSuite) generateInitTasksForSchedule() {
|
func (s *CompactionPlanHandlerSuite) generateInitTasksForSchedule() {
|
||||||
|
s.handler.scheduler.(*task.MockGlobalScheduler).EXPECT().Enqueue(mock.Anything).Return()
|
||||||
task1 := &mixCompactionTask{
|
task1 := &mixCompactionTask{
|
||||||
meta: s.mockMeta,
|
meta: s.mockMeta,
|
||||||
}
|
}
|
||||||
@ -407,6 +408,8 @@ func (s *CompactionPlanHandlerSuite) TestRemoveTasksByChannel() {
|
|||||||
s.SetupTest()
|
s.SetupTest()
|
||||||
ch := "ch1"
|
ch := "ch1"
|
||||||
|
|
||||||
|
s.handler.scheduler.(*task.MockGlobalScheduler).EXPECT().Enqueue(mock.Anything).Return()
|
||||||
|
|
||||||
t1 := newMixCompactionTask(&datapb.CompactionTask{
|
t1 := newMixCompactionTask(&datapb.CompactionTask{
|
||||||
PlanID: 19530,
|
PlanID: 19530,
|
||||||
Type: datapb.CompactionType_MixCompaction,
|
Type: datapb.CompactionType_MixCompaction,
|
||||||
|
|||||||
@ -62,6 +62,7 @@ func NewImportInspector(ctx context.Context, meta *meta, importMeta ImportMeta,
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *importInspector) Start() {
|
func (s *importInspector) Start() {
|
||||||
|
s.reloadFromMeta()
|
||||||
log.Ctx(s.ctx).Info("start import inspector")
|
log.Ctx(s.ctx).Info("start import inspector")
|
||||||
ticker := time.NewTicker(Params.DataCoordCfg.ImportScheduleInterval.GetAsDuration(time.Second))
|
ticker := time.NewTicker(Params.DataCoordCfg.ImportScheduleInterval.GetAsDuration(time.Second))
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
@ -82,6 +83,21 @@ func (s *importInspector) Close() {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *importInspector) reloadFromMeta() {
|
||||||
|
jobs := s.importMeta.GetJobBy(s.ctx)
|
||||||
|
sort.Slice(jobs, func(i, j int) bool {
|
||||||
|
return jobs[i].GetJobID() < jobs[j].GetJobID()
|
||||||
|
})
|
||||||
|
for _, job := range jobs {
|
||||||
|
tasks := s.importMeta.GetTaskBy(s.ctx, WithJob(job.GetJobID()))
|
||||||
|
for _, task := range tasks {
|
||||||
|
if task.GetState() == datapb.ImportTaskStateV2_InProgress {
|
||||||
|
s.scheduler.Enqueue(task)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *importInspector) inspect() {
|
func (s *importInspector) inspect() {
|
||||||
jobs := s.importMeta.GetJobBy(s.ctx)
|
jobs := s.importMeta.GetJobBy(s.ctx)
|
||||||
sort.Slice(jobs, func(i, j int) bool {
|
sort.Slice(jobs, func(i, j int) bool {
|
||||||
|
|||||||
@ -258,6 +258,77 @@ func (s *ImportInspectorSuite) TestProcessFailed() {
|
|||||||
s.Equal(0, len(task.(*importTask).GetSegmentIDs()))
|
s.Equal(0, len(task.(*importTask).GetSegmentIDs()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *ImportInspectorSuite) TestReloadFromMeta() {
|
||||||
|
// Test case 1: No jobs and tasks
|
||||||
|
s.catalog.EXPECT().ListImportJobs(mock.Anything).Return(nil, nil)
|
||||||
|
s.catalog.EXPECT().ListPreImportTasks(mock.Anything).Return(nil, nil)
|
||||||
|
s.catalog.EXPECT().ListImportTasks(mock.Anything).Return(nil, nil)
|
||||||
|
s.inspector.reloadFromMeta()
|
||||||
|
|
||||||
|
// Test case 2: Jobs with in-progress tasks
|
||||||
|
jobProto := &datapb.ImportJob{
|
||||||
|
JobID: 1,
|
||||||
|
CollectionID: s.collectionID,
|
||||||
|
TimeoutTs: math.MaxUint64,
|
||||||
|
Schema: &schemapb.CollectionSchema{},
|
||||||
|
}
|
||||||
|
job := &importJob{
|
||||||
|
ImportJob: jobProto,
|
||||||
|
tr: timerecord.NewTimeRecorder("import job"),
|
||||||
|
}
|
||||||
|
s.catalog.EXPECT().SaveImportJob(mock.Anything, mock.Anything).Return(nil)
|
||||||
|
err := s.importMeta.AddJob(context.TODO(), job)
|
||||||
|
s.NoError(err)
|
||||||
|
|
||||||
|
// Add an in-progress pre-import task
|
||||||
|
inprogressPreImportTask := &preImportTask{
|
||||||
|
importMeta: s.importMeta,
|
||||||
|
tr: timerecord.NewTimeRecorder("preimport task"),
|
||||||
|
}
|
||||||
|
inprogressPreImportTask.task.Store(&datapb.PreImportTask{
|
||||||
|
JobID: 1,
|
||||||
|
TaskID: 1,
|
||||||
|
CollectionID: s.collectionID,
|
||||||
|
State: datapb.ImportTaskStateV2_InProgress,
|
||||||
|
})
|
||||||
|
s.catalog.EXPECT().SavePreImportTask(mock.Anything, mock.Anything).Return(nil)
|
||||||
|
err = s.importMeta.AddTask(context.TODO(), inprogressPreImportTask)
|
||||||
|
s.NoError(err)
|
||||||
|
|
||||||
|
// Add an in-progress import task
|
||||||
|
inprogressImportTask := &importTask{
|
||||||
|
importMeta: s.importMeta,
|
||||||
|
tr: timerecord.NewTimeRecorder("import task"),
|
||||||
|
}
|
||||||
|
inprogressImportTask.task.Store(&datapb.ImportTaskV2{
|
||||||
|
JobID: 1,
|
||||||
|
TaskID: 2,
|
||||||
|
CollectionID: s.collectionID,
|
||||||
|
State: datapb.ImportTaskStateV2_InProgress,
|
||||||
|
})
|
||||||
|
s.catalog.EXPECT().SaveImportTask(mock.Anything, mock.Anything).Return(nil)
|
||||||
|
err = s.importMeta.AddTask(context.TODO(), inprogressImportTask)
|
||||||
|
s.NoError(err)
|
||||||
|
|
||||||
|
// Add an pending import task
|
||||||
|
pendingImportTask := &importTask{
|
||||||
|
importMeta: s.importMeta,
|
||||||
|
tr: timerecord.NewTimeRecorder("import task"),
|
||||||
|
}
|
||||||
|
pendingImportTask.task.Store(&datapb.ImportTaskV2{
|
||||||
|
JobID: 1,
|
||||||
|
TaskID: 3,
|
||||||
|
CollectionID: s.collectionID,
|
||||||
|
State: datapb.ImportTaskStateV2_Pending,
|
||||||
|
})
|
||||||
|
s.catalog.EXPECT().SaveImportTask(mock.Anything, mock.Anything).Return(nil)
|
||||||
|
err = s.importMeta.AddTask(context.TODO(), pendingImportTask)
|
||||||
|
|
||||||
|
// Mock scheduler expectations
|
||||||
|
s.inspector.scheduler.(*task2.MockGlobalScheduler).EXPECT().Enqueue(mock.Anything).Times(2)
|
||||||
|
s.inspector.reloadFromMeta()
|
||||||
|
}
|
||||||
|
|
||||||
func TestImportInspector(t *testing.T) {
|
func TestImportInspector(t *testing.T) {
|
||||||
suite.Run(t, new(ImportInspectorSuite))
|
suite.Run(t, new(ImportInspectorSuite))
|
||||||
}
|
}
|
||||||
|
|||||||
@ -200,8 +200,9 @@ func (i *indexInspector) reloadFromMeta() {
|
|||||||
segments := i.meta.GetAllSegmentsUnsafe()
|
segments := i.meta.GetAllSegmentsUnsafe()
|
||||||
for _, segment := range segments {
|
for _, segment := range segments {
|
||||||
for _, segIndex := range i.meta.indexMeta.GetSegmentIndexes(segment.GetCollectionID(), segment.ID) {
|
for _, segIndex := range i.meta.indexMeta.GetSegmentIndexes(segment.GetCollectionID(), segment.ID) {
|
||||||
if segIndex.IsDeleted || segIndex.IndexState == commonpb.IndexState_Finished ||
|
if segIndex.IsDeleted || (segIndex.IndexState != commonpb.IndexState_Unissued &&
|
||||||
segIndex.IndexState == commonpb.IndexState_Failed {
|
segIndex.IndexState != commonpb.IndexState_Retry &&
|
||||||
|
segIndex.IndexState != commonpb.IndexState_InProgress) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -98,8 +98,9 @@ func (si *statsInspector) Stop() {
|
|||||||
func (si *statsInspector) reloadFromMeta() {
|
func (si *statsInspector) reloadFromMeta() {
|
||||||
tasks := si.mt.statsTaskMeta.GetAllTasks()
|
tasks := si.mt.statsTaskMeta.GetAllTasks()
|
||||||
for _, st := range tasks {
|
for _, st := range tasks {
|
||||||
if st.GetState() == indexpb.JobState_JobStateFinished ||
|
if st.GetState() != indexpb.JobState_JobStateInit &&
|
||||||
st.GetState() == indexpb.JobState_JobStateFailed {
|
st.GetState() != indexpb.JobState_JobStateRetry &&
|
||||||
|
st.GetState() != indexpb.JobState_JobStateInProgress {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
segment := si.mt.GetHealthySegment(si.ctx, st.GetSegmentID())
|
segment := si.mt.GetHealthySegment(si.ctx, st.GetSegmentID())
|
||||||
|
|||||||
@ -69,10 +69,11 @@ func (s *globalTaskScheduler) Enqueue(task Task) {
|
|||||||
case taskcommon.Init:
|
case taskcommon.Init:
|
||||||
task.SetTaskTime(taskcommon.TimeQueue, time.Now())
|
task.SetTaskTime(taskcommon.TimeQueue, time.Now())
|
||||||
s.pendingTasks.Push(task)
|
s.pendingTasks.Push(task)
|
||||||
case taskcommon.InProgress:
|
case taskcommon.InProgress, taskcommon.Retry:
|
||||||
task.SetTaskTime(taskcommon.TimeStart, time.Now())
|
task.SetTaskTime(taskcommon.TimeStart, time.Now())
|
||||||
s.runningTasks.Insert(task.GetTaskID(), task)
|
s.runningTasks.Insert(task.GetTaskID(), task)
|
||||||
}
|
}
|
||||||
|
log.Ctx(s.ctx).Info("task enqueued", WrapTaskLog(task)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *globalTaskScheduler) AbortAndRemoveTask(taskID int64) {
|
func (s *globalTaskScheduler) AbortAndRemoveTask(taskID int64) {
|
||||||
|
|||||||
@ -41,6 +41,7 @@ func TestGlobalScheduler_Enqueue(t *testing.T) {
|
|||||||
task := NewMockTask(t)
|
task := NewMockTask(t)
|
||||||
task.EXPECT().GetTaskID().Return(1)
|
task.EXPECT().GetTaskID().Return(1)
|
||||||
task.EXPECT().GetTaskState().Return(taskcommon.Init)
|
task.EXPECT().GetTaskState().Return(taskcommon.Init)
|
||||||
|
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
||||||
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
||||||
scheduler.Enqueue(task)
|
scheduler.Enqueue(task)
|
||||||
assert.Equal(t, 1, len(scheduler.(*globalTaskScheduler).pendingTasks.TaskIDs()))
|
assert.Equal(t, 1, len(scheduler.(*globalTaskScheduler).pendingTasks.TaskIDs()))
|
||||||
@ -50,6 +51,7 @@ func TestGlobalScheduler_Enqueue(t *testing.T) {
|
|||||||
task = NewMockTask(t)
|
task = NewMockTask(t)
|
||||||
task.EXPECT().GetTaskID().Return(2)
|
task.EXPECT().GetTaskID().Return(2)
|
||||||
task.EXPECT().GetTaskState().Return(taskcommon.InProgress)
|
task.EXPECT().GetTaskState().Return(taskcommon.InProgress)
|
||||||
|
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
||||||
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
||||||
scheduler.Enqueue(task)
|
scheduler.Enqueue(task)
|
||||||
assert.Equal(t, 1, scheduler.(*globalTaskScheduler).runningTasks.Len())
|
assert.Equal(t, 1, scheduler.(*globalTaskScheduler).runningTasks.Len())
|
||||||
@ -64,6 +66,7 @@ func TestGlobalScheduler_AbortAndRemoveTask(t *testing.T) {
|
|||||||
task := NewMockTask(t)
|
task := NewMockTask(t)
|
||||||
task.EXPECT().GetTaskID().Return(1)
|
task.EXPECT().GetTaskID().Return(1)
|
||||||
task.EXPECT().GetTaskState().Return(taskcommon.Init)
|
task.EXPECT().GetTaskState().Return(taskcommon.Init)
|
||||||
|
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
||||||
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
||||||
task.EXPECT().DropTaskOnWorker(mock.Anything).Return()
|
task.EXPECT().DropTaskOnWorker(mock.Anything).Return()
|
||||||
scheduler.Enqueue(task)
|
scheduler.Enqueue(task)
|
||||||
@ -74,6 +77,7 @@ func TestGlobalScheduler_AbortAndRemoveTask(t *testing.T) {
|
|||||||
task = NewMockTask(t)
|
task = NewMockTask(t)
|
||||||
task.EXPECT().GetTaskID().Return(2)
|
task.EXPECT().GetTaskID().Return(2)
|
||||||
task.EXPECT().GetTaskState().Return(taskcommon.InProgress)
|
task.EXPECT().GetTaskState().Return(taskcommon.InProgress)
|
||||||
|
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
||||||
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
||||||
task.EXPECT().DropTaskOnWorker(mock.Anything).Return()
|
task.EXPECT().DropTaskOnWorker(mock.Anything).Return()
|
||||||
scheduler.Enqueue(task)
|
scheduler.Enqueue(task)
|
||||||
@ -140,6 +144,7 @@ func TestGlobalScheduler_TestSchedule(t *testing.T) {
|
|||||||
task.EXPECT().GetTaskID().Return(1)
|
task.EXPECT().GetTaskID().Return(1)
|
||||||
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
||||||
task.EXPECT().GetTaskState().Return(taskcommon.Init)
|
task.EXPECT().GetTaskState().Return(taskcommon.Init)
|
||||||
|
task.EXPECT().GetTaskType().Return(taskcommon.Compaction)
|
||||||
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
task.EXPECT().SetTaskTime(mock.Anything, mock.Anything).Return()
|
||||||
task.EXPECT().GetTaskSlot().Return(1)
|
task.EXPECT().GetTaskSlot().Return(1)
|
||||||
return task
|
return task
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user