enhance: Add task version monitoring (#42023)

issue: https://github.com/milvus-io/milvus/issues/41123

---------

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
yihao.dai 2025-05-22 23:24:28 +08:00 committed by GitHub
parent 244aa30076
commit e04e5b41ca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 105 additions and 8 deletions

View File

@ -88,6 +88,10 @@ func (t *clusteringCompactionTask) GetTaskTime(timeType taskcommon.TimeType) tim
return timeType.GetTaskTime(t.times)
}
// GetTaskVersion reports the task version, which for a clustering compaction
// task is the retry count stored in its task proto, converted to int64.
func (t *clusteringCompactionTask) GetTaskVersion() int64 {
	retries := t.GetTaskProto().GetRetryTimes()
	return int64(retries)
}
func (t *clusteringCompactionTask) retryOnError(err error) {
if err != nil {
log.Warn("clustering compaction task failed", zap.Error(err))

View File

@ -81,6 +81,10 @@ func (t *l0CompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(t.times)
}
// GetTaskVersion reports the task version, which for an L0 compaction task is
// the retry count stored in its task proto, converted to int64.
func (t *l0CompactionTask) GetTaskVersion() int64 {
	retries := t.GetTaskProto().GetRetryTimes()
	return int64(retries)
}
func (t *l0CompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), zap.Int64("nodeID", t.GetTaskProto().GetNodeID()))
plan, err := t.BuildCompactionRequest()

View File

@ -57,6 +57,10 @@ func (t *mixCompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time
return timeType.GetTaskTime(t.times)
}
// GetTaskVersion reports the task version, which for a mix compaction task is
// the retry count stored in its task proto, converted to int64.
func (t *mixCompactionTask) GetTaskVersion() int64 {
	retries := t.GetTaskProto().GetRetryTimes()
	return int64(retries)
}
func (t *mixCompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()),
zap.Int64("PlanID", t.GetTaskProto().GetPlanID()),

View File

@ -50,6 +50,7 @@ type importTask struct {
imeta ImportMeta
tr *timerecord.TimeRecorder
times *taskcommon.Times
retryTimes int64
}
func (t *importTask) GetJobID() int64 {
@ -80,6 +81,10 @@ func (t *importTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(t.times)
}
// GetTaskVersion returns the task's retryTimes counter as its version.
// NOTE(review): retryTimes is a plain int64 field read here without any
// synchronization visible in this block; confirm it is never read concurrently
// with the increment performed in CreateTaskOnWorker.
func (t *importTask) GetTaskVersion() int64 {
return t.retryTimes
}
// GetReason returns the reason carried by the currently loaded task value.
func (t *importTask) GetReason() string {
	current := t.task.Load()
	return current.GetReason()
}
@ -142,6 +147,7 @@ func (t *importTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
err = cluster.CreateImport(nodeID, req, t.GetTaskSlot())
if err != nil {
log.Warn("import failed", WrapTaskLog(t, zap.Error(err))...)
t.retryTimes++
return
}
err = t.imeta.UpdateTask(context.TODO(), t.GetTaskID(),

View File

@ -44,6 +44,7 @@ type preImportTask struct {
imeta ImportMeta
tr *timerecord.TimeRecorder
times *taskcommon.Times
retryTimes int64
}
func (p *preImportTask) GetJobID() int64 {
@ -102,6 +103,10 @@ func (p *preImportTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(p.times)
}
// GetTaskVersion returns the task's retryTimes counter as its version.
// NOTE(review): retryTimes is a plain int64 field read here without any
// synchronization visible in this block; confirm it is never read concurrently
// with the increment performed in CreateTaskOnWorker.
func (p *preImportTask) GetTaskVersion() int64 {
return p.retryTimes
}
func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
log.Info("processing pending preimport task...", WrapTaskLog(p)...)
job := p.imeta.GetJob(context.TODO(), p.GetJobID())
@ -110,6 +115,7 @@ func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster
err := cluster.CreatePreImport(nodeID, req, p.GetTaskSlot())
if err != nil {
log.Warn("preimport failed", WrapTaskLog(p, zap.Error(err))...)
p.retryTimes++
return
}
err = p.imeta.UpdateTask(context.TODO(), p.GetTaskID(),

View File

@ -251,6 +251,8 @@ func (s *globalTaskScheduler) updateTaskTimeMetrics() {
if !ok || maxQueueingTime < queueingTime.Milliseconds() {
maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds()
}
metrics.TaskVersion.WithLabelValues(task.GetTaskType()).Observe(float64(task.GetTaskVersion()))
}
collectRunningMetricsFunc := func(task Task) {

View File

@ -319,6 +319,51 @@ func (_c *MockTask_GetTaskType_Call) RunAndReturn(run func() string) *MockTask_G
return _c
}
// GetTaskVersion provides a mock function with no fields.
// It records the call through Called() and returns the configured int64
// result; it panics when no return value has been configured.
func (_m *MockTask) GetTaskVersion() int64 {
ret := _m.Called()
if len(ret) == 0 {
panic("no return value specified for GetTaskVersion")
}
var r0 int64
// A configured func() int64 is invoked to compute the result; otherwise the
// stored value is type-asserted to int64 and returned as-is.
if rf, ok := ret.Get(0).(func() int64); ok {
r0 = rf()
} else {
r0 = ret.Get(0).(int64)
}
return r0
}
// MockTask_GetTaskVersion_Call embeds *mock.Call and shadows its Run/Return
// methods with explicitly typed versions for method 'GetTaskVersion'.
type MockTask_GetTaskVersion_Call struct {
*mock.Call
}
// GetTaskVersion is a helper method to define mock.On call.
// It registers an expectation for "GetTaskVersion" on the underlying mock and
// returns it wrapped in the typed MockTask_GetTaskVersion_Call.
func (_e *MockTask_Expecter) GetTaskVersion() *MockTask_GetTaskVersion_Call {
return &MockTask_GetTaskVersion_Call{Call: _e.mock.On("GetTaskVersion")}
}
// Run installs run as the handler passed to Call.Run and returns the call for
// chaining. The mock.Arguments are discarded because run takes no parameters.
func (_c *MockTask_GetTaskVersion_Call) Run(run func()) *MockTask_GetTaskVersion_Call {
_c.Call.Run(func(args mock.Arguments) {
run()
})
return _c
}
// Return configures _a0 as the call's return value via Call.Return and returns
// the call for chaining.
func (_c *MockTask_GetTaskVersion_Call) Return(_a0 int64) *MockTask_GetTaskVersion_Call {
_c.Call.Return(_a0)
return _c
}
// RunAndReturn stores the function run as the call's return value via
// Call.Return and returns the call for chaining. MockTask.GetTaskVersion
// invokes a stored func() int64 to produce its result.
func (_c *MockTask_GetTaskVersion_Call) RunAndReturn(run func() int64) *MockTask_GetTaskVersion_Call {
_c.Call.Return(run)
return _c
}
// QueryTaskOnWorker provides a mock function with given fields: cluster
func (_m *MockTask) QueryTaskOnWorker(cluster session.Cluster) {
_m.Called(cluster)

View File

@ -32,6 +32,7 @@ type Task interface {
GetTaskSlot() int64
SetTaskTime(timeType taskcommon.TimeType, time time.Time)
GetTaskTime(timeType taskcommon.TimeType) time.Time
GetTaskVersion() int64
CreateTaskOnWorker(nodeID int64, cluster session.Cluster)
QueryTaskOnWorker(cluster session.Cluster)

View File

@ -56,6 +56,10 @@ func (at *analyzeTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(at.times)
}
// GetTaskVersion returns the analyze task's version as reported by GetVersion.
func (at *analyzeTask) GetTaskVersion() int64 {
return at.GetVersion()
}
// GetTaskType identifies this task as taskcommon.Analyze.
func (at *analyzeTask) GetTaskType() taskcommon.Type {
return taskcommon.Analyze
}

View File

@ -100,6 +100,10 @@ func (it *indexBuildTask) GetTaskType() taskcommon.Type {
return taskcommon.Index
}
// GetTaskVersion returns the index build task's IndexVersion as its version.
func (it *indexBuildTask) GetTaskVersion() int64 {
return it.IndexVersion
}
func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) {
it.IndexState = commonpb.IndexState(state)
it.FailReason = failReason

View File

@ -96,6 +96,10 @@ func (st *statsTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
return timeType.GetTaskTime(st.times)
}
// GetTaskVersion returns the stats task's version as reported by GetVersion.
func (st *statsTask) GetTaskVersion() int64 {
return st.GetVersion()
}
func (st *statsTask) SetState(state indexpb.JobState, failReason string) {
st.State = state
st.FailReason = failReason

View File

@ -363,6 +363,18 @@ var (
Name: "task_count",
Help: "number of index tasks of each type",
}, []string{TaskTypeLabel, TaskStateLabel})
// TaskVersion is a histogram of task versions, labeled by task type. For some
// task types the version is the task's retry count.
// NOTE(review): this reuses the shared `buckets` boundaries; confirm they are
// a sensible fit for small integer version/retry values.
TaskVersion = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: milvusNamespace,
		Subsystem: typeutil.DataCoordRole,
		Name:      "task_version",
		Help:      "version of task",
		Buckets:   buckets,
	}, []string{TaskTypeLabel})
)
// RegisterDataCoord registers DataCoord metrics
@ -395,6 +407,7 @@ func RegisterDataCoord(registry *prometheus.Registry) {
registry.MustRegister(GarbageCollectorRunCount)
registry.MustRegister(DataCoordTaskExecuteLatency)
registry.MustRegister(TaskNum)
registry.MustRegister(TaskVersion)
registerStreamingCoord(registry)
}