enhance: Add task version monitoring (#42023)

issue: https://github.com/milvus-io/milvus/issues/41123

---------

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
yihao.dai 2025-05-22 23:24:28 +08:00 committed by GitHub
parent 244aa30076
commit e04e5b41ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 105 additions and 8 deletions

View File

@ -88,6 +88,10 @@ func (t *clusteringCompactionTask) GetTaskTime(timeType taskcommon.TimeType) tim
return timeType.GetTaskTime(t.times) return timeType.GetTaskTime(t.times)
} }
// GetTaskVersion reports the version of this clustering compaction task,
// which is the retry counter stored in the task proto converted to int64.
func (t *clusteringCompactionTask) GetTaskVersion() int64 {
	retries := t.GetTaskProto().GetRetryTimes()
	return int64(retries)
}
func (t *clusteringCompactionTask) retryOnError(err error) { func (t *clusteringCompactionTask) retryOnError(err error) {
if err != nil { if err != nil {
log.Warn("clustering compaction task failed", zap.Error(err)) log.Warn("clustering compaction task failed", zap.Error(err))

View File

@ -81,6 +81,10 @@ func (t *l0CompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(t.times) return timeType.GetTaskTime(t.times)
} }
// GetTaskVersion reports the version of this L0 compaction task, which is the
// retry counter stored in the task proto converted to int64.
func (t *l0CompactionTask) GetTaskVersion() int64 {
	retries := t.GetTaskProto().GetRetryTimes()
	return int64(retries)
}
func (t *l0CompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) { func (t *l0CompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), zap.Int64("nodeID", t.GetTaskProto().GetNodeID())) log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), zap.Int64("nodeID", t.GetTaskProto().GetNodeID()))
plan, err := t.BuildCompactionRequest() plan, err := t.BuildCompactionRequest()

View File

@ -57,6 +57,10 @@ func (t *mixCompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time
return timeType.GetTaskTime(t.times) return timeType.GetTaskTime(t.times)
} }
// GetTaskVersion reports the version of this mix compaction task, which is the
// retry counter stored in the task proto converted to int64.
func (t *mixCompactionTask) GetTaskVersion() int64 {
	retries := t.GetTaskProto().GetRetryTimes()
	return int64(retries)
}
func (t *mixCompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) { func (t *mixCompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()),
zap.Int64("PlanID", t.GetTaskProto().GetPlanID()), zap.Int64("PlanID", t.GetTaskProto().GetPlanID()),

View File

@ -50,6 +50,7 @@ type importTask struct {
imeta ImportMeta imeta ImportMeta
tr *timerecord.TimeRecorder tr *timerecord.TimeRecorder
times *taskcommon.Times times *taskcommon.Times
retryTimes int64
} }
func (t *importTask) GetJobID() int64 { func (t *importTask) GetJobID() int64 {
@ -80,6 +81,10 @@ func (t *importTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(t.times) return timeType.GetTaskTime(t.times)
} }
// GetTaskVersion reports the version of this import task, which is the value
// of the retryTimes field on the task struct.
//
// NOTE(review): retryTimes is a plain int64 that CreateTaskOnWorker increments
// when cluster.CreateImport fails. Confirm that this read and that increment
// are serialized by the caller, and whether the counter is expected to survive
// a restart (it does not appear to be written to task meta here).
func (t *importTask) GetTaskVersion() int64 {
	version := t.retryTimes
	return version
}
func (t *importTask) GetReason() string { func (t *importTask) GetReason() string {
return t.task.Load().GetReason() return t.task.Load().GetReason()
} }
@ -142,6 +147,7 @@ func (t *importTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
err = cluster.CreateImport(nodeID, req, t.GetTaskSlot()) err = cluster.CreateImport(nodeID, req, t.GetTaskSlot())
if err != nil { if err != nil {
log.Warn("import failed", WrapTaskLog(t, zap.Error(err))...) log.Warn("import failed", WrapTaskLog(t, zap.Error(err))...)
t.retryTimes++
return return
} }
err = t.imeta.UpdateTask(context.TODO(), t.GetTaskID(), err = t.imeta.UpdateTask(context.TODO(), t.GetTaskID(),

View File

@ -44,6 +44,7 @@ type preImportTask struct {
imeta ImportMeta imeta ImportMeta
tr *timerecord.TimeRecorder tr *timerecord.TimeRecorder
times *taskcommon.Times times *taskcommon.Times
retryTimes int64
} }
func (p *preImportTask) GetJobID() int64 { func (p *preImportTask) GetJobID() int64 {
@ -102,6 +103,10 @@ func (p *preImportTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(p.times) return timeType.GetTaskTime(p.times)
} }
// GetTaskVersion reports the version of this preimport task, which is the
// value of the retryTimes field on the task struct.
//
// NOTE(review): retryTimes is a plain int64 that CreateTaskOnWorker increments
// when cluster.CreatePreImport fails. Confirm that this read and that
// increment are serialized by the caller, and whether the counter is expected
// to survive a restart (it does not appear to be written to task meta here).
func (p *preImportTask) GetTaskVersion() int64 {
	version := p.retryTimes
	return version
}
func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) { func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
log.Info("processing pending preimport task...", WrapTaskLog(p)...) log.Info("processing pending preimport task...", WrapTaskLog(p)...)
job := p.imeta.GetJob(context.TODO(), p.GetJobID()) job := p.imeta.GetJob(context.TODO(), p.GetJobID())
@ -110,6 +115,7 @@ func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster
err := cluster.CreatePreImport(nodeID, req, p.GetTaskSlot()) err := cluster.CreatePreImport(nodeID, req, p.GetTaskSlot())
if err != nil { if err != nil {
log.Warn("preimport failed", WrapTaskLog(p, zap.Error(err))...) log.Warn("preimport failed", WrapTaskLog(p, zap.Error(err))...)
p.retryTimes++
return return
} }
err = p.imeta.UpdateTask(context.TODO(), p.GetTaskID(), err = p.imeta.UpdateTask(context.TODO(), p.GetTaskID(),

View File

@ -251,6 +251,8 @@ func (s *globalTaskScheduler) updateTaskTimeMetrics() {
if !ok || maxQueueingTime < queueingTime.Milliseconds() { if !ok || maxQueueingTime < queueingTime.Milliseconds() {
maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds() maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds()
} }
metrics.TaskVersion.WithLabelValues(task.GetTaskType()).Observe(float64(task.GetTaskVersion()))
} }
collectRunningMetricsFunc := func(task Task) { collectRunningMetricsFunc := func(task Task) {

View File

@ -319,6 +319,51 @@ func (_c *MockTask_GetTaskType_Call) RunAndReturn(run func() string) *MockTask_G
return _c return _c
} }
// GetTaskVersion provides a mock function with no fields. It records the call
// on the mock and yields the configured int64 result, panicking when no return
// value has been specified.
func (_m *MockTask) GetTaskVersion() int64 {
	results := _m.Called()
	if len(results) == 0 {
		panic("no return value specified for GetTaskVersion")
	}

	// A func() int64 return value is invoked to produce the result; any other
	// value must itself be an int64.
	first := results.Get(0)
	if producer, ok := first.(func() int64); ok {
		return producer()
	}
	return first.(int64)
}
// MockTask_GetTaskVersion_Call embeds *mock.Call and shadows its Run/Return
// methods with explicitly typed versions for the method 'GetTaskVersion'.
type MockTask_GetTaskVersion_Call struct {
// Call is the underlying mock call being configured.
*mock.Call
}
// GetTaskVersion is a helper method to define mock.On call. It registers an
// expectation for "GetTaskVersion" on the underlying mock and returns it
// wrapped in the typed call helper.
func (_e *MockTask_Expecter) GetTaskVersion() *MockTask_GetTaskVersion_Call {
	expected := _e.mock.On("GetTaskVersion")
	return &MockTask_GetTaskVersion_Call{Call: expected}
}
// Run installs run as the handler passed to the embedded Call.Run. The mock
// arguments supplied to the handler are ignored because GetTaskVersion takes
// no parameters. The receiver is returned so calls can be chained.
func (_c *MockTask_GetTaskVersion_Call) Run(run func()) *MockTask_GetTaskVersion_Call {
	handler := func(mock.Arguments) { run() }
	_c.Call.Run(handler)
	return _c
}
// Return passes _a0 to the embedded Call.Return as the value for the mocked
// GetTaskVersion call, and returns the receiver so calls can be chained.
func (_c *MockTask_GetTaskVersion_Call) Return(_a0 int64) *MockTask_GetTaskVersion_Call {
_c.Call.Return(_a0)
return _c
}
// RunAndReturn passes the function run itself to the embedded Call.Return.
// The mocked GetTaskVersion invokes a return value of type func() int64 to
// compute its result, so run is evaluated on each mocked call. The receiver is
// returned so calls can be chained.
func (_c *MockTask_GetTaskVersion_Call) RunAndReturn(run func() int64) *MockTask_GetTaskVersion_Call {
_c.Call.Return(run)
return _c
}
// QueryTaskOnWorker provides a mock function with given fields: cluster // QueryTaskOnWorker provides a mock function with given fields: cluster
func (_m *MockTask) QueryTaskOnWorker(cluster session.Cluster) { func (_m *MockTask) QueryTaskOnWorker(cluster session.Cluster) {
_m.Called(cluster) _m.Called(cluster)

View File

@ -32,6 +32,7 @@ type Task interface {
GetTaskSlot() int64 GetTaskSlot() int64
SetTaskTime(timeType taskcommon.TimeType, time time.Time) SetTaskTime(timeType taskcommon.TimeType, time time.Time)
GetTaskTime(timeType taskcommon.TimeType) time.Time GetTaskTime(timeType taskcommon.TimeType) time.Time
GetTaskVersion() int64
CreateTaskOnWorker(nodeID int64, cluster session.Cluster) CreateTaskOnWorker(nodeID int64, cluster session.Cluster)
QueryTaskOnWorker(cluster session.Cluster) QueryTaskOnWorker(cluster session.Cluster)

View File

@ -56,6 +56,10 @@ func (at *analyzeTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(at.times) return timeType.GetTaskTime(at.times)
} }
// GetTaskVersion reports the version of this analyze task by delegating to
// the task's GetVersion accessor.
func (at *analyzeTask) GetTaskVersion() int64 {
	version := at.GetVersion()
	return version
}
func (at *analyzeTask) GetTaskType() taskcommon.Type { func (at *analyzeTask) GetTaskType() taskcommon.Type {
return taskcommon.Analyze return taskcommon.Analyze
} }

View File

@ -100,6 +100,10 @@ func (it *indexBuildTask) GetTaskType() taskcommon.Type {
return taskcommon.Index return taskcommon.Index
} }
// GetTaskVersion reports the version of this index build task, which is the
// value of its IndexVersion field.
func (it *indexBuildTask) GetTaskVersion() int64 {
	version := it.IndexVersion
	return version
}
func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) { func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) {
it.IndexState = commonpb.IndexState(state) it.IndexState = commonpb.IndexState(state)
it.FailReason = failReason it.FailReason = failReason

View File

@ -96,6 +96,10 @@ func (st *statsTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(st.times) return timeType.GetTaskTime(st.times)
} }
// GetTaskVersion reports the version of this stats task by delegating to the
// task's GetVersion accessor.
func (st *statsTask) GetTaskVersion() int64 {
	version := st.GetVersion()
	return version
}
func (st *statsTask) SetState(state indexpb.JobState, failReason string) { func (st *statsTask) SetState(state indexpb.JobState, failReason string) {
st.State = state st.State = state
st.FailReason = failReason st.FailReason = failReason

View File

@ -363,6 +363,18 @@ var (
Name: "task_count", Name: "task_count",
Help: "number of index tasks of each type", Help: "number of index tasks of each type",
}, []string{TaskTypeLabel, TaskStateLabel}) }, []string{TaskTypeLabel, TaskStateLabel})
// TaskVersion is a histogram of task versions (that is, how many times a
// task has been retried), partitioned by task type via TaskTypeLabel.
TaskVersion = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: milvusNamespace,
Subsystem: typeutil.DataCoordRole,
Name: "task_version",
Help: "version of task",
// Uses the shared bucket boundaries declared elsewhere in this package.
Buckets: buckets,
}, []string{
TaskTypeLabel,
})
) )
// RegisterDataCoord registers DataCoord metrics // RegisterDataCoord registers DataCoord metrics
@ -395,6 +407,7 @@ func RegisterDataCoord(registry *prometheus.Registry) {
registry.MustRegister(GarbageCollectorRunCount) registry.MustRegister(GarbageCollectorRunCount)
registry.MustRegister(DataCoordTaskExecuteLatency) registry.MustRegister(DataCoordTaskExecuteLatency)
registry.MustRegister(TaskNum) registry.MustRegister(TaskNum)
registry.MustRegister(TaskVersion)
registerStreamingCoord(registry) registerStreamingCoord(registry)
} }