mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
enhance: Add task version monitoring (#42023)
issue: https://github.com/milvus-io/milvus/issues/41123 --------- Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
parent
244aa30076
commit
e04e5b41ca
@ -88,6 +88,10 @@ func (t *clusteringCompactionTask) GetTaskTime(timeType taskcommon.TimeType) tim
|
||||
return timeType.GetTaskTime(t.times)
|
||||
}
|
||||
|
||||
func (t *clusteringCompactionTask) GetTaskVersion() int64 {
|
||||
return int64(t.GetTaskProto().GetRetryTimes())
|
||||
}
|
||||
|
||||
func (t *clusteringCompactionTask) retryOnError(err error) {
|
||||
if err != nil {
|
||||
log.Warn("clustering compaction task failed", zap.Error(err))
|
||||
|
||||
@ -81,6 +81,10 @@ func (t *l0CompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
||||
return timeType.GetTaskTime(t.times)
|
||||
}
|
||||
|
||||
func (t *l0CompactionTask) GetTaskVersion() int64 {
|
||||
return int64(t.GetTaskProto().GetRetryTimes())
|
||||
}
|
||||
|
||||
func (t *l0CompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), zap.Int64("nodeID", t.GetTaskProto().GetNodeID()))
|
||||
plan, err := t.BuildCompactionRequest()
|
||||
|
||||
@ -57,6 +57,10 @@ func (t *mixCompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time
|
||||
return timeType.GetTaskTime(t.times)
|
||||
}
|
||||
|
||||
func (t *mixCompactionTask) GetTaskVersion() int64 {
|
||||
return int64(t.GetTaskProto().GetRetryTimes())
|
||||
}
|
||||
|
||||
func (t *mixCompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()),
|
||||
zap.Int64("PlanID", t.GetTaskProto().GetPlanID()),
|
||||
|
||||
@ -50,6 +50,7 @@ type importTask struct {
|
||||
imeta ImportMeta
|
||||
tr *timerecord.TimeRecorder
|
||||
times *taskcommon.Times
|
||||
retryTimes int64
|
||||
}
|
||||
|
||||
func (t *importTask) GetJobID() int64 {
|
||||
@ -80,6 +81,10 @@ func (t *importTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
||||
return timeType.GetTaskTime(t.times)
|
||||
}
|
||||
|
||||
func (t *importTask) GetTaskVersion() int64 {
|
||||
return t.retryTimes
|
||||
}
|
||||
|
||||
func (t *importTask) GetReason() string {
|
||||
return t.task.Load().GetReason()
|
||||
}
|
||||
@ -142,6 +147,7 @@ func (t *importTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||
err = cluster.CreateImport(nodeID, req, t.GetTaskSlot())
|
||||
if err != nil {
|
||||
log.Warn("import failed", WrapTaskLog(t, zap.Error(err))...)
|
||||
t.retryTimes++
|
||||
return
|
||||
}
|
||||
err = t.imeta.UpdateTask(context.TODO(), t.GetTaskID(),
|
||||
|
||||
@ -44,6 +44,7 @@ type preImportTask struct {
|
||||
imeta ImportMeta
|
||||
tr *timerecord.TimeRecorder
|
||||
times *taskcommon.Times
|
||||
retryTimes int64
|
||||
}
|
||||
|
||||
func (p *preImportTask) GetJobID() int64 {
|
||||
@ -102,6 +103,10 @@ func (p *preImportTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
||||
return timeType.GetTaskTime(p.times)
|
||||
}
|
||||
|
||||
func (p *preImportTask) GetTaskVersion() int64 {
|
||||
return p.retryTimes
|
||||
}
|
||||
|
||||
func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||
log.Info("processing pending preimport task...", WrapTaskLog(p)...)
|
||||
job := p.imeta.GetJob(context.TODO(), p.GetJobID())
|
||||
@ -110,6 +115,7 @@ func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster
|
||||
err := cluster.CreatePreImport(nodeID, req, p.GetTaskSlot())
|
||||
if err != nil {
|
||||
log.Warn("preimport failed", WrapTaskLog(p, zap.Error(err))...)
|
||||
p.retryTimes++
|
||||
return
|
||||
}
|
||||
err = p.imeta.UpdateTask(context.TODO(), p.GetTaskID(),
|
||||
|
||||
@ -251,6 +251,8 @@ func (s *globalTaskScheduler) updateTaskTimeMetrics() {
|
||||
if !ok || maxQueueingTime < queueingTime.Milliseconds() {
|
||||
maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds()
|
||||
}
|
||||
|
||||
metrics.TaskVersion.WithLabelValues(task.GetTaskType()).Observe(float64(task.GetTaskVersion()))
|
||||
}
|
||||
|
||||
collectRunningMetricsFunc := func(task Task) {
|
||||
|
||||
@ -319,6 +319,51 @@ func (_c *MockTask_GetTaskType_Call) RunAndReturn(run func() string) *MockTask_G
|
||||
return _c
|
||||
}
|
||||
|
||||
// GetTaskVersion provides a mock function with no fields
|
||||
func (_m *MockTask) GetTaskVersion() int64 {
|
||||
ret := _m.Called()
|
||||
|
||||
if len(ret) == 0 {
|
||||
panic("no return value specified for GetTaskVersion")
|
||||
}
|
||||
|
||||
var r0 int64
|
||||
if rf, ok := ret.Get(0).(func() int64); ok {
|
||||
r0 = rf()
|
||||
} else {
|
||||
r0 = ret.Get(0).(int64)
|
||||
}
|
||||
|
||||
return r0
|
||||
}
|
||||
|
||||
// MockTask_GetTaskVersion_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetTaskVersion'
|
||||
type MockTask_GetTaskVersion_Call struct {
|
||||
*mock.Call
|
||||
}
|
||||
|
||||
// GetTaskVersion is a helper method to define mock.On call
|
||||
func (_e *MockTask_Expecter) GetTaskVersion() *MockTask_GetTaskVersion_Call {
|
||||
return &MockTask_GetTaskVersion_Call{Call: _e.mock.On("GetTaskVersion")}
|
||||
}
|
||||
|
||||
func (_c *MockTask_GetTaskVersion_Call) Run(run func()) *MockTask_GetTaskVersion_Call {
|
||||
_c.Call.Run(func(args mock.Arguments) {
|
||||
run()
|
||||
})
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *MockTask_GetTaskVersion_Call) Return(_a0 int64) *MockTask_GetTaskVersion_Call {
|
||||
_c.Call.Return(_a0)
|
||||
return _c
|
||||
}
|
||||
|
||||
func (_c *MockTask_GetTaskVersion_Call) RunAndReturn(run func() int64) *MockTask_GetTaskVersion_Call {
|
||||
_c.Call.Return(run)
|
||||
return _c
|
||||
}
|
||||
|
||||
// QueryTaskOnWorker provides a mock function with given fields: cluster
|
||||
func (_m *MockTask) QueryTaskOnWorker(cluster session.Cluster) {
|
||||
_m.Called(cluster)
|
||||
|
||||
@ -32,6 +32,7 @@ type Task interface {
|
||||
GetTaskSlot() int64
|
||||
SetTaskTime(timeType taskcommon.TimeType, time time.Time)
|
||||
GetTaskTime(timeType taskcommon.TimeType) time.Time
|
||||
GetTaskVersion() int64
|
||||
|
||||
CreateTaskOnWorker(nodeID int64, cluster session.Cluster)
|
||||
QueryTaskOnWorker(cluster session.Cluster)
|
||||
|
||||
@ -56,6 +56,10 @@ func (at *analyzeTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
||||
return timeType.GetTaskTime(at.times)
|
||||
}
|
||||
|
||||
func (at *analyzeTask) GetTaskVersion() int64 {
|
||||
return at.GetVersion()
|
||||
}
|
||||
|
||||
func (at *analyzeTask) GetTaskType() taskcommon.Type {
|
||||
return taskcommon.Analyze
|
||||
}
|
||||
|
||||
@ -100,6 +100,10 @@ func (it *indexBuildTask) GetTaskType() taskcommon.Type {
|
||||
return taskcommon.Index
|
||||
}
|
||||
|
||||
func (it *indexBuildTask) GetTaskVersion() int64 {
|
||||
return it.IndexVersion
|
||||
}
|
||||
|
||||
func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) {
|
||||
it.IndexState = commonpb.IndexState(state)
|
||||
it.FailReason = failReason
|
||||
|
||||
@ -96,6 +96,10 @@ func (st *statsTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
||||
return timeType.GetTaskTime(st.times)
|
||||
}
|
||||
|
||||
func (st *statsTask) GetTaskVersion() int64 {
|
||||
return st.GetVersion()
|
||||
}
|
||||
|
||||
func (st *statsTask) SetState(state indexpb.JobState, failReason string) {
|
||||
st.State = state
|
||||
st.FailReason = failReason
|
||||
|
||||
@ -363,6 +363,18 @@ var (
|
||||
Name: "task_count",
|
||||
Help: "number of index tasks of each type",
|
||||
}, []string{TaskTypeLabel, TaskStateLabel})
|
||||
|
||||
// TaskVersion records the version of each task (i.e. the task's retry times).
|
||||
TaskVersion = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: milvusNamespace,
|
||||
Subsystem: typeutil.DataCoordRole,
|
||||
Name: "task_version",
|
||||
Help: "version of task",
|
||||
Buckets: buckets,
|
||||
}, []string{
|
||||
TaskTypeLabel,
|
||||
})
|
||||
)
|
||||
|
||||
// RegisterDataCoord registers DataCoord metrics
|
||||
@ -395,6 +407,7 @@ func RegisterDataCoord(registry *prometheus.Registry) {
|
||||
registry.MustRegister(GarbageCollectorRunCount)
|
||||
registry.MustRegister(DataCoordTaskExecuteLatency)
|
||||
registry.MustRegister(TaskNum)
|
||||
registry.MustRegister(TaskVersion)
|
||||
|
||||
registerStreamingCoord(registry)
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user