// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datacoord

import (
	"context"
	"sync"
	"time"

	"github.com/hashicorp/golang-lru/v2/expirable"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus/internal/datacoord/allocator"
	"github.com/milvus-io/milvus/internal/datacoord/session"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/metrics"
	"github.com/milvus-io/milvus/pkg/proto/indexpb"
	"github.com/milvus-io/milvus/pkg/proto/workerpb"
	"github.com/milvus-io/milvus/pkg/util/lock"
)

const (
	reqTimeoutInterval = time.Second * 10
)
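
// taskScheduler dispatches index-build, analyze, and stats tasks to worker
// nodes and tracks their lifecycle until they are finished, failed, or dropped.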
type taskScheduler struct {
	sync.RWMutex

	ctx    context.Context
	cancel context.CancelFunc
	wg     sync.WaitGroup

	scheduleDuration       time.Duration
	collectMetricsDuration time.Duration

	// TODO @xiaocai2333: use priority queue
	tasks      map[int64]Task
	notifyChan chan struct{}
	taskLock   *lock.KeyLock[int64]

	meta *meta

	policy                    buildIndexPolicy
	nodeManager               session.WorkerManager
	chunkManager              storage.ChunkManager
	indexEngineVersionManager IndexEngineVersionManager
	handler                   Handler
	allocator                 allocator.Allocator
	compactionHandler         compactionPlanContext

	taskStats *expirable.LRU[UniqueID, Task]
}
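
// newTaskScheduler creates a task scheduler and reloads all unfinished tasks
// from meta so that in-flight work is recovered after a DataCoord restart.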
func newTaskScheduler(
	ctx context.Context,
	metaTable *meta, nodeManager session.WorkerManager,
	chunkManager storage.ChunkManager,
	indexEngineVersionManager IndexEngineVersionManager,
	handler Handler,
	allocator allocator.Allocator,
	compactionHandler compactionPlanContext,
) *taskScheduler {
	ctx, cancel := context.WithCancel(ctx)

	ts := &taskScheduler{
		ctx:                       ctx,
		cancel:                    cancel,
		meta:                      metaTable,
		tasks:                     make(map[int64]Task),
		notifyChan:                make(chan struct{}, 1),
		taskLock:                  lock.NewKeyLock[int64](),
		scheduleDuration:          Params.DataCoordCfg.IndexTaskSchedulerInterval.GetAsDuration(time.Millisecond),
		collectMetricsDuration:    time.Minute,
		policy:                    defaultBuildIndexPolicy,
		nodeManager:               nodeManager,
		chunkManager:              chunkManager,
		handler:                   handler,
		indexEngineVersionManager: indexEngineVersionManager,
		allocator:                 allocator,
		taskStats:                 expirable.NewLRU[UniqueID, Task](512, nil, time.Minute*15),
		compactionHandler:         compactionHandler,
	}
	ts.reloadFromMeta()
	return ts
}
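
// Start launches the scheduling loop and the task-metrics collection loop.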
func (s *taskScheduler) Start() {
	s.wg.Add(2)
	go s.schedule()
	go s.collectTaskMetrics()
}
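
// Stop cancels the scheduler context and waits for both background loops to exit.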
func (s *taskScheduler) Stop() {
	s.cancel()
	s.wg.Wait()
}
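
// reloadFromMeta re-enqueues every unfinished index-build, analyze, and stats
// task recorded in meta. Stats tasks that were already in progress additionally
// re-acquire the compacting/stating marks on their segments; if that fails,
// the stale task is dropped or marked failed before being re-enqueued.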
func (s *taskScheduler) reloadFromMeta() {
	segments := s.meta.GetAllSegmentsUnsafe()
	for _, segment := range segments {
		for _, segIndex := range s.meta.indexMeta.GetSegmentIndexes(segment.GetCollectionID(), segment.ID) {
			if segIndex.IsDeleted {
				continue
			}
			if segIndex.IndexState != commonpb.IndexState_Finished && segIndex.IndexState != commonpb.IndexState_Failed {
				s.enqueue(&indexBuildTask{
					taskID: segIndex.BuildID,
					nodeID: segIndex.NodeID,
					taskInfo: &workerpb.IndexTaskInfo{
						BuildID:    segIndex.BuildID,
						State:      segIndex.IndexState,
						FailReason: segIndex.FailReason,
					},
					queueTime: time.Now(),
					startTime: time.Now(),
					endTime:   time.Now(),
				})
			}
		}
	}

	allAnalyzeTasks := s.meta.analyzeMeta.GetAllTasks()
	for taskID, t := range allAnalyzeTasks {
		if t.State != indexpb.JobState_JobStateFinished && t.State != indexpb.JobState_JobStateFailed {
			s.enqueue(&analyzeTask{
				taskID: taskID,
				nodeID: t.NodeID,
				taskInfo: &workerpb.AnalyzeResult{
					TaskID:     taskID,
					State:      t.State,
					FailReason: t.FailReason,
				},
				queueTime: time.Now(),
				startTime: time.Now(),
				endTime:   time.Now(),
			})
		}
	}

	allStatsTasks := s.meta.statsTaskMeta.GetAllTasks()
	for taskID, t := range allStatsTasks {
		if t.GetState() != indexpb.JobState_JobStateFinished && t.GetState() != indexpb.JobState_JobStateFailed {
			if t.GetState() == indexpb.JobState_JobStateInProgress || t.GetState() == indexpb.JobState_JobStateRetry {
				exist, canDo := s.meta.CheckAndSetSegmentsCompacting(context.TODO(), []UniqueID{t.GetSegmentID()})
				if !exist || !canDo {
					log.Ctx(s.ctx).Warn("segment does not exist or is compacting, skip stats, but this should not have happened; try to remove the stats task",
						zap.Int64("taskID", taskID), zap.Bool("exist", exist), zap.Bool("canDo", canDo))
					err := s.meta.statsTaskMeta.DropStatsTask(t.GetTaskID())
					if err == nil {
						continue
					}
					log.Ctx(s.ctx).Warn("remove stats task failed, set to failed", zap.Int64("taskID", taskID), zap.Error(err))
					t.State = indexpb.JobState_JobStateFailed
					t.FailReason = "segment does not exist or is compacting"
				} else {
					if !s.compactionHandler.checkAndSetSegmentStating(t.GetInsertChannel(), t.GetSegmentID()) {
						s.meta.SetSegmentsCompacting(context.TODO(), []UniqueID{t.GetSegmentID()}, false)
						err := s.meta.statsTaskMeta.DropStatsTask(t.GetTaskID())
						if err == nil {
							continue
						}
						log.Ctx(s.ctx).Warn("remove stats task failed, set to failed", zap.Int64("taskID", taskID), zap.Error(err))
						t.State = indexpb.JobState_JobStateFailed
						t.FailReason = "segment does not exist or is l0 compacting"
					}
				}
			}
			s.enqueue(&statsTask{
				taskID:          taskID,
				segmentID:       t.GetSegmentID(),
				targetSegmentID: t.GetTargetSegmentID(),
				nodeID:          t.NodeID,
				taskInfo: &workerpb.StatsResult{
					TaskID:     taskID,
					State:      t.GetState(),
					FailReason: t.GetFailReason(),
				},
				queueTime:  time.Now(),
				startTime:  time.Now(),
				endTime:    time.Now(),
				subJobType: t.GetSubJobType(),
			})
		}
	}
}

// notify is a non-blocking notification that wakes up the scheduling loop.
func (s *taskScheduler) notify() {
	select {
	case s.notifyChan <- struct{}{}:
	default:
	}
}
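
// enqueue registers the task if it is not already tracked, records its queue
// time, and wakes up the scheduling loop.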
func (s *taskScheduler) enqueue(task Task) {
	defer s.notify()

	s.Lock()
	defer s.Unlock()
	taskID := task.GetTaskID()
	if _, ok := s.tasks[taskID]; !ok {
		s.tasks[taskID] = task
		s.taskStats.Add(taskID, task)
		task.SetQueueTime(time.Now())
		log.Info("taskScheduler enqueue task", zap.Int64("taskID", taskID))
	}
}
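
// AbortTask marks the task as failed with reason "canceled" if it is currently tracked.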
func (s *taskScheduler) AbortTask(taskID int64) {
	log.Info("task scheduler receive abort task request", zap.Int64("taskID", taskID))
	s.RLock()
	task, ok := s.tasks[taskID]
	s.RUnlock()
	if ok {
		s.taskLock.Lock(taskID)
		task.SetState(indexpb.JobState_JobStateFailed, "canceled")
		s.taskLock.Unlock(taskID)
	}
}
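
// schedule is the main scheduling loop; it runs until the scheduler context is canceled.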
func (s *taskScheduler) schedule() {
	// The loop is driven either by notifyChan or by the periodic ticker.
	log.Ctx(s.ctx).Info("task scheduler loop start")
	defer s.wg.Done()
	ticker := time.NewTicker(s.scheduleDuration)
	defer ticker.Stop()
	for {
		select {
		case <-s.ctx.Done():
			log.Ctx(s.ctx).Warn("task scheduler ctx done")
			return
		case _, ok := <-s.notifyChan:
			if ok {
				s.run()
			}
			// !ok means the notify channel has been closed.
		case <-ticker.C:
			s.run()
		}
	}
}
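
// getTask returns the tracked task for taskID, or nil if it is not tracked.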
func (s *taskScheduler) getTask(taskID UniqueID) Task {
	s.RLock()
	defer s.RUnlock()

	return s.tasks[taskID]
}
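
// run snapshots the tracked task IDs, orders them with the scheduling policy,
// and processes them one by one; it stops early when a task cannot be
// dispatched (for example, when no idle worker is available).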
func (s *taskScheduler) run() {
	// schedule policy
	s.RLock()
	taskIDs := make([]UniqueID, 0, len(s.tasks))
	for tID := range s.tasks {
		taskIDs = append(taskIDs, tID)
	}
	s.RUnlock()
	if len(taskIDs) > 0 {
		log.Ctx(s.ctx).Info("task scheduler", zap.Int("task num", len(taskIDs)))
	}

	s.policy(taskIDs)

	for _, taskID := range taskIDs {
		s.taskLock.Lock(taskID)
		ok := s.process(taskID)
		if !ok {
			s.taskLock.Unlock(taskID)
			log.Ctx(s.ctx).Info("there is no idle indexing node, waiting for retry...")
			break
		}
		s.taskLock.Unlock(taskID)
	}
}
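
// removeTask drops the task from the in-memory task table.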
func (s *taskScheduler) removeTask(taskID UniqueID) {
	s.Lock()
	defer s.Unlock()
	delete(s.tasks, taskID)
}
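
// process advances a single task according to its current state. It returns
// false when the task could not be dispatched, signaling run to stop this round.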
func (s *taskScheduler) process(taskID UniqueID) bool {
	task := s.getTask(taskID)

	if !task.CheckTaskHealthy(s.meta) {
		s.removeTask(taskID)
		return true
	}
	state := task.GetState()
	log.Ctx(s.ctx).Info("task is processing", zap.Int64("taskID", taskID),
		zap.String("task type", task.GetTaskType()), zap.String("state", state.String()))

	switch state {
	case indexpb.JobState_JobStateNone:
		s.removeTask(taskID)

	case indexpb.JobState_JobStateInit:
		return s.processInit(task)
	case indexpb.JobState_JobStateFinished, indexpb.JobState_JobStateFailed:
		return s.processFinished(task)
	case indexpb.JobState_JobStateRetry:
		return s.processRetry(task)
	default:
		// state: in_progress
		return s.processInProgress(task)
	}
	return true
}
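
// collectTaskMetrics periodically reports the maximum queueing and running
// time per task type and warns about tasks that exceed the slow-task threshold.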
func (s *taskScheduler) collectTaskMetrics() {
	defer s.wg.Done()

	ticker := time.NewTicker(s.collectMetricsDuration)
	defer ticker.Stop()
	for {
		select {
		case <-s.ctx.Done():
			log.Warn("task scheduler context done")
			return
		case <-ticker.C:
			s.RLock()
			taskIDs := make([]UniqueID, 0, len(s.tasks))
			for tID := range s.tasks {
				taskIDs = append(taskIDs, tID)
			}
			s.RUnlock()

			maxTaskQueueingTime := make(map[string]int64)
			maxTaskRunningTime := make(map[string]int64)

			collectMetricsFunc := func(taskID int64) {
				task := s.getTask(taskID)
				if task == nil {
					return
				}
				s.taskLock.Lock(taskID)
				defer s.taskLock.Unlock(taskID)

				state := task.GetState()
				switch state {
				case indexpb.JobState_JobStateNone:
					return
				case indexpb.JobState_JobStateInit:
					queueingTime := time.Since(task.GetQueueTime())
					if queueingTime > Params.DataCoordCfg.TaskSlowThreshold.GetAsDuration(time.Second) {
						log.Warn("task queueing time is too long", zap.Int64("taskID", taskID),
							zap.Int64("queueing time(ms)", queueingTime.Milliseconds()))
					}

					maxQueueingTime, ok := maxTaskQueueingTime[task.GetTaskType()]
					if !ok || maxQueueingTime < queueingTime.Milliseconds() {
						maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds()
					}
				case indexpb.JobState_JobStateInProgress:
					runningTime := time.Since(task.GetStartTime())
					if runningTime > Params.DataCoordCfg.TaskSlowThreshold.GetAsDuration(time.Second) {
						log.Warn("task running time is too long", zap.Int64("taskID", taskID),
							zap.Int64("running time(ms)", runningTime.Milliseconds()))
					}

					maxRunningTime, ok := maxTaskRunningTime[task.GetTaskType()]
					if !ok || maxRunningTime < runningTime.Milliseconds() {
						maxTaskRunningTime[task.GetTaskType()] = runningTime.Milliseconds()
					}
				}
			}

			for _, taskID := range taskIDs {
				collectMetricsFunc(taskID)
			}

			for taskType, queueingTime := range maxTaskQueueingTime {
				metrics.DataCoordTaskExecuteLatency.
					WithLabelValues(taskType, metrics.Pending).Observe(float64(queueingTime))
			}

			for taskType, runningTime := range maxTaskRunningTime {
				metrics.DataCoordTaskExecuteLatency.
					WithLabelValues(taskType, metrics.Executing).Observe(float64(runningTime))
			}
		}
	}
}
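
// processInit pre-checks the task, picks a worker node, updates the task
// version, assigns the task to the node, and moves the meta state to InProgress.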
func (s *taskScheduler) processInit(task Task) bool {
	// 0. pre-check the task.
	// Determine whether the task can be performed or whether it is actually needed;
	// for example, a FLAT index does not require a real build, so checkPass is false.
	checkPass := task.PreCheck(s.ctx, s)
	if !checkPass {
		return true
	}

	// 1. pick an indexNode client
	nodeID, client := s.nodeManager.PickClient()
	if client == nil {
		log.Ctx(s.ctx).Debug("pick client failed")
		return false
	}
	log.Ctx(s.ctx).Info("pick client success", zap.Int64("taskID", task.GetTaskID()), zap.Int64("nodeID", nodeID))

	// 2. update version
	if err := task.UpdateVersion(s.ctx, nodeID, s.meta, s.compactionHandler); err != nil {
		log.Ctx(s.ctx).Warn("update task version failed", zap.Int64("taskID", task.GetTaskID()), zap.Error(err))
		return false
	}
	log.Ctx(s.ctx).Info("update task version success", zap.Int64("taskID", task.GetTaskID()))

	// 3. assign task to indexNode
	success := task.AssignTask(s.ctx, client, s.meta)
	if !success {
		log.Ctx(s.ctx).Warn("assign task to client failed", zap.Int64("taskID", task.GetTaskID()),
			zap.String("new state", task.GetState().String()), zap.String("fail reason", task.GetFailReason()))
		// If the failure is caused by the task itself, subsequent tasks are not skipped;
		// if etcd fails or the task cannot be sent to the node, subsequent tasks are skipped.
		return false
	}
	log.Ctx(s.ctx).Info("assign task to client success", zap.Int64("taskID", task.GetTaskID()), zap.Int64("nodeID", nodeID))

	// 4. update meta state
	if err := task.UpdateMetaBuildingState(s.meta); err != nil {
		log.Ctx(s.ctx).Warn("update meta building state failed", zap.Int64("taskID", task.GetTaskID()), zap.Error(err))
		task.SetState(indexpb.JobState_JobStateRetry, "update meta building state failed")
		return false
	}
	task.SetStartTime(time.Now())
	queueingTime := task.GetStartTime().Sub(task.GetQueueTime())
	if queueingTime > Params.DataCoordCfg.TaskSlowThreshold.GetAsDuration(time.Second) {
		log.Warn("task queueing time is too long", zap.Int64("taskID", task.GetTaskID()),
			zap.Int64("queueing time(ms)", queueingTime.Milliseconds()))
	}
	metrics.DataCoordTaskExecuteLatency.
		WithLabelValues(task.GetTaskType(), metrics.Pending).Observe(float64(queueingTime.Milliseconds()))
	log.Ctx(s.ctx).Info("update task meta state to InProgress success", zap.Int64("taskID", task.GetTaskID()),
		zap.Int64("nodeID", nodeID))
	return s.processInProgress(task)
}
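
// processFinished persists the final job info, reports execution latency,
// asks the worker to drop the task, and removes it from the scheduler.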
func (s *taskScheduler) processFinished(task Task) bool {
	if err := task.SetJobInfo(s.meta); err != nil {
		log.Ctx(s.ctx).Warn("update task info failed", zap.Error(err))
		return true
	}
	task.SetEndTime(time.Now())
	runningTime := task.GetEndTime().Sub(task.GetStartTime())
	if runningTime > Params.DataCoordCfg.TaskSlowThreshold.GetAsDuration(time.Second) {
		log.Warn("task running time is too long", zap.Int64("taskID", task.GetTaskID()),
			zap.Int64("running time(ms)", runningTime.Milliseconds()))
	}
	metrics.DataCoordTaskExecuteLatency.
		WithLabelValues(task.GetTaskType(), metrics.Executing).Observe(float64(runningTime.Milliseconds()))
	client, exist := s.nodeManager.GetClientByID(task.GetNodeID())
	if exist {
		if !task.DropTaskOnWorker(s.ctx, client) {
			return true
		}
	}
	s.removeTask(task.GetTaskID())
	return true
}
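
// processRetry drops the task from its worker (if the node is still known),
// resets the task, and puts it back into the Init state for rescheduling.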
func (s *taskScheduler) processRetry(task Task) bool {
	client, exist := s.nodeManager.GetClientByID(task.GetNodeID())
	if exist {
		if !task.DropTaskOnWorker(s.ctx, client) {
			return true
		}
	}
	task.SetState(indexpb.JobState_JobStateInit, "")
	task.ResetTask(s.meta)
	return true
}
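
// processInProgress queries the worker for the task result; if the worker is
// no longer registered, the task is moved to the Retry state.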
func (s *taskScheduler) processInProgress(task Task) bool {
	client, exist := s.nodeManager.GetClientByID(task.GetNodeID())
	if exist {
		task.QueryResult(s.ctx, client)
		if task.GetState() == indexpb.JobState_JobStateFinished || task.GetState() == indexpb.JobState_JobStateFailed {
			task.ResetTask(s.meta)
			return s.processFinished(task)
		}
		return true
	}
	task.SetState(indexpb.JobState_JobStateRetry, "node does not exist")
	return true
}