mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
enhance: Add task version monitoring (#42023)
issue: https://github.com/milvus-io/milvus/issues/41123 --------- Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
parent
244aa30076
commit
e04e5b41ca
@ -88,6 +88,10 @@ func (t *clusteringCompactionTask) GetTaskTime(timeType taskcommon.TimeType) tim
|
|||||||
return timeType.GetTaskTime(t.times)
|
return timeType.GetTaskTime(t.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *clusteringCompactionTask) GetTaskVersion() int64 {
|
||||||
|
return int64(t.GetTaskProto().GetRetryTimes())
|
||||||
|
}
|
||||||
|
|
||||||
func (t *clusteringCompactionTask) retryOnError(err error) {
|
func (t *clusteringCompactionTask) retryOnError(err error) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("clustering compaction task failed", zap.Error(err))
|
log.Warn("clustering compaction task failed", zap.Error(err))
|
||||||
|
|||||||
@ -81,6 +81,10 @@ func (t *l0CompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
|||||||
return timeType.GetTaskTime(t.times)
|
return timeType.GetTaskTime(t.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *l0CompactionTask) GetTaskVersion() int64 {
|
||||||
|
return int64(t.GetTaskProto().GetRetryTimes())
|
||||||
|
}
|
||||||
|
|
||||||
func (t *l0CompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
func (t *l0CompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||||
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), zap.Int64("nodeID", t.GetTaskProto().GetNodeID()))
|
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()), zap.Int64("nodeID", t.GetTaskProto().GetNodeID()))
|
||||||
plan, err := t.BuildCompactionRequest()
|
plan, err := t.BuildCompactionRequest()
|
||||||
|
|||||||
@ -57,6 +57,10 @@ func (t *mixCompactionTask) GetTaskTime(timeType taskcommon.TimeType) time.Time
|
|||||||
return timeType.GetTaskTime(t.times)
|
return timeType.GetTaskTime(t.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *mixCompactionTask) GetTaskVersion() int64 {
|
||||||
|
return int64(t.GetTaskProto().GetRetryTimes())
|
||||||
|
}
|
||||||
|
|
||||||
func (t *mixCompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
func (t *mixCompactionTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||||
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()),
|
log := log.With(zap.Int64("triggerID", t.GetTaskProto().GetTriggerID()),
|
||||||
zap.Int64("PlanID", t.GetTaskProto().GetPlanID()),
|
zap.Int64("PlanID", t.GetTaskProto().GetPlanID()),
|
||||||
|
|||||||
@ -50,6 +50,7 @@ type importTask struct {
|
|||||||
imeta ImportMeta
|
imeta ImportMeta
|
||||||
tr *timerecord.TimeRecorder
|
tr *timerecord.TimeRecorder
|
||||||
times *taskcommon.Times
|
times *taskcommon.Times
|
||||||
|
retryTimes int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *importTask) GetJobID() int64 {
|
func (t *importTask) GetJobID() int64 {
|
||||||
@ -80,6 +81,10 @@ func (t *importTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
|||||||
return timeType.GetTaskTime(t.times)
|
return timeType.GetTaskTime(t.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *importTask) GetTaskVersion() int64 {
|
||||||
|
return t.retryTimes
|
||||||
|
}
|
||||||
|
|
||||||
func (t *importTask) GetReason() string {
|
func (t *importTask) GetReason() string {
|
||||||
return t.task.Load().GetReason()
|
return t.task.Load().GetReason()
|
||||||
}
|
}
|
||||||
@ -142,6 +147,7 @@ func (t *importTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
|||||||
err = cluster.CreateImport(nodeID, req, t.GetTaskSlot())
|
err = cluster.CreateImport(nodeID, req, t.GetTaskSlot())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("import failed", WrapTaskLog(t, zap.Error(err))...)
|
log.Warn("import failed", WrapTaskLog(t, zap.Error(err))...)
|
||||||
|
t.retryTimes++
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
err = t.imeta.UpdateTask(context.TODO(), t.GetTaskID(),
|
err = t.imeta.UpdateTask(context.TODO(), t.GetTaskID(),
|
||||||
|
|||||||
@ -44,6 +44,7 @@ type preImportTask struct {
|
|||||||
imeta ImportMeta
|
imeta ImportMeta
|
||||||
tr *timerecord.TimeRecorder
|
tr *timerecord.TimeRecorder
|
||||||
times *taskcommon.Times
|
times *taskcommon.Times
|
||||||
|
retryTimes int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *preImportTask) GetJobID() int64 {
|
func (p *preImportTask) GetJobID() int64 {
|
||||||
@ -102,6 +103,10 @@ func (p *preImportTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
|||||||
return timeType.GetTaskTime(p.times)
|
return timeType.GetTaskTime(p.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (p *preImportTask) GetTaskVersion() int64 {
|
||||||
|
return p.retryTimes
|
||||||
|
}
|
||||||
|
|
||||||
func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster) {
|
||||||
log.Info("processing pending preimport task...", WrapTaskLog(p)...)
|
log.Info("processing pending preimport task...", WrapTaskLog(p)...)
|
||||||
job := p.imeta.GetJob(context.TODO(), p.GetJobID())
|
job := p.imeta.GetJob(context.TODO(), p.GetJobID())
|
||||||
@ -110,6 +115,7 @@ func (p *preImportTask) CreateTaskOnWorker(nodeID int64, cluster session.Cluster
|
|||||||
err := cluster.CreatePreImport(nodeID, req, p.GetTaskSlot())
|
err := cluster.CreatePreImport(nodeID, req, p.GetTaskSlot())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("preimport failed", WrapTaskLog(p, zap.Error(err))...)
|
log.Warn("preimport failed", WrapTaskLog(p, zap.Error(err))...)
|
||||||
|
p.retryTimes++
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
err = p.imeta.UpdateTask(context.TODO(), p.GetTaskID(),
|
err = p.imeta.UpdateTask(context.TODO(), p.GetTaskID(),
|
||||||
|
|||||||
@ -251,6 +251,8 @@ func (s *globalTaskScheduler) updateTaskTimeMetrics() {
|
|||||||
if !ok || maxQueueingTime < queueingTime.Milliseconds() {
|
if !ok || maxQueueingTime < queueingTime.Milliseconds() {
|
||||||
maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds()
|
maxTaskQueueingTime[task.GetTaskType()] = queueingTime.Milliseconds()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
metrics.TaskVersion.WithLabelValues(task.GetTaskType()).Observe(float64(task.GetTaskVersion()))
|
||||||
}
|
}
|
||||||
|
|
||||||
collectRunningMetricsFunc := func(task Task) {
|
collectRunningMetricsFunc := func(task Task) {
|
||||||
|
|||||||
@ -319,6 +319,51 @@ func (_c *MockTask_GetTaskType_Call) RunAndReturn(run func() string) *MockTask_G
|
|||||||
return _c
|
return _c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetTaskVersion provides a mock function with no fields
|
||||||
|
func (_m *MockTask) GetTaskVersion() int64 {
|
||||||
|
ret := _m.Called()
|
||||||
|
|
||||||
|
if len(ret) == 0 {
|
||||||
|
panic("no return value specified for GetTaskVersion")
|
||||||
|
}
|
||||||
|
|
||||||
|
var r0 int64
|
||||||
|
if rf, ok := ret.Get(0).(func() int64); ok {
|
||||||
|
r0 = rf()
|
||||||
|
} else {
|
||||||
|
r0 = ret.Get(0).(int64)
|
||||||
|
}
|
||||||
|
|
||||||
|
return r0
|
||||||
|
}
|
||||||
|
|
||||||
|
// MockTask_GetTaskVersion_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetTaskVersion'
|
||||||
|
type MockTask_GetTaskVersion_Call struct {
|
||||||
|
*mock.Call
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetTaskVersion is a helper method to define mock.On call
|
||||||
|
func (_e *MockTask_Expecter) GetTaskVersion() *MockTask_GetTaskVersion_Call {
|
||||||
|
return &MockTask_GetTaskVersion_Call{Call: _e.mock.On("GetTaskVersion")}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (_c *MockTask_GetTaskVersion_Call) Run(run func()) *MockTask_GetTaskVersion_Call {
|
||||||
|
_c.Call.Run(func(args mock.Arguments) {
|
||||||
|
run()
|
||||||
|
})
|
||||||
|
return _c
|
||||||
|
}
|
||||||
|
|
||||||
|
func (_c *MockTask_GetTaskVersion_Call) Return(_a0 int64) *MockTask_GetTaskVersion_Call {
|
||||||
|
_c.Call.Return(_a0)
|
||||||
|
return _c
|
||||||
|
}
|
||||||
|
|
||||||
|
func (_c *MockTask_GetTaskVersion_Call) RunAndReturn(run func() int64) *MockTask_GetTaskVersion_Call {
|
||||||
|
_c.Call.Return(run)
|
||||||
|
return _c
|
||||||
|
}
|
||||||
|
|
||||||
// QueryTaskOnWorker provides a mock function with given fields: cluster
|
// QueryTaskOnWorker provides a mock function with given fields: cluster
|
||||||
func (_m *MockTask) QueryTaskOnWorker(cluster session.Cluster) {
|
func (_m *MockTask) QueryTaskOnWorker(cluster session.Cluster) {
|
||||||
_m.Called(cluster)
|
_m.Called(cluster)
|
||||||
|
|||||||
@ -32,6 +32,7 @@ type Task interface {
|
|||||||
GetTaskSlot() int64
|
GetTaskSlot() int64
|
||||||
SetTaskTime(timeType taskcommon.TimeType, time time.Time)
|
SetTaskTime(timeType taskcommon.TimeType, time time.Time)
|
||||||
GetTaskTime(timeType taskcommon.TimeType) time.Time
|
GetTaskTime(timeType taskcommon.TimeType) time.Time
|
||||||
|
GetTaskVersion() int64
|
||||||
|
|
||||||
CreateTaskOnWorker(nodeID int64, cluster session.Cluster)
|
CreateTaskOnWorker(nodeID int64, cluster session.Cluster)
|
||||||
QueryTaskOnWorker(cluster session.Cluster)
|
QueryTaskOnWorker(cluster session.Cluster)
|
||||||
|
|||||||
@ -56,6 +56,10 @@ func (at *analyzeTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
|||||||
return timeType.GetTaskTime(at.times)
|
return timeType.GetTaskTime(at.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (at *analyzeTask) GetTaskVersion() int64 {
|
||||||
|
return at.GetVersion()
|
||||||
|
}
|
||||||
|
|
||||||
func (at *analyzeTask) GetTaskType() taskcommon.Type {
|
func (at *analyzeTask) GetTaskType() taskcommon.Type {
|
||||||
return taskcommon.Analyze
|
return taskcommon.Analyze
|
||||||
}
|
}
|
||||||
|
|||||||
@ -100,6 +100,10 @@ func (it *indexBuildTask) GetTaskType() taskcommon.Type {
|
|||||||
return taskcommon.Index
|
return taskcommon.Index
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (it *indexBuildTask) GetTaskVersion() int64 {
|
||||||
|
return it.IndexVersion
|
||||||
|
}
|
||||||
|
|
||||||
func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) {
|
func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) {
|
||||||
it.IndexState = commonpb.IndexState(state)
|
it.IndexState = commonpb.IndexState(state)
|
||||||
it.FailReason = failReason
|
it.FailReason = failReason
|
||||||
|
|||||||
@ -96,6 +96,10 @@ func (st *statsTask) GetTaskTime(timeType taskcommon.TimeType) time.Time {
|
|||||||
return timeType.GetTaskTime(st.times)
|
return timeType.GetTaskTime(st.times)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (st *statsTask) GetTaskVersion() int64 {
|
||||||
|
return st.GetVersion()
|
||||||
|
}
|
||||||
|
|
||||||
func (st *statsTask) SetState(state indexpb.JobState, failReason string) {
|
func (st *statsTask) SetState(state indexpb.JobState, failReason string) {
|
||||||
st.State = state
|
st.State = state
|
||||||
st.FailReason = failReason
|
st.FailReason = failReason
|
||||||
|
|||||||
@ -363,6 +363,18 @@ var (
|
|||||||
Name: "task_count",
|
Name: "task_count",
|
||||||
Help: "number of index tasks of each type",
|
Help: "number of index tasks of each type",
|
||||||
}, []string{TaskTypeLabel, TaskStateLabel})
|
}, []string{TaskTypeLabel, TaskStateLabel})
|
||||||
|
|
||||||
|
// TaskVersion records the version of task(retry times of task).
|
||||||
|
TaskVersion = prometheus.NewHistogramVec(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Namespace: milvusNamespace,
|
||||||
|
Subsystem: typeutil.DataCoordRole,
|
||||||
|
Name: "task_version",
|
||||||
|
Help: "version of task",
|
||||||
|
Buckets: buckets,
|
||||||
|
}, []string{
|
||||||
|
TaskTypeLabel,
|
||||||
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
// RegisterDataCoord registers DataCoord metrics
|
// RegisterDataCoord registers DataCoord metrics
|
||||||
@ -395,6 +407,7 @@ func RegisterDataCoord(registry *prometheus.Registry) {
|
|||||||
registry.MustRegister(GarbageCollectorRunCount)
|
registry.MustRegister(GarbageCollectorRunCount)
|
||||||
registry.MustRegister(DataCoordTaskExecuteLatency)
|
registry.MustRegister(DataCoordTaskExecuteLatency)
|
||||||
registry.MustRegister(TaskNum)
|
registry.MustRegister(TaskNum)
|
||||||
|
registry.MustRegister(TaskVersion)
|
||||||
|
|
||||||
registerStreamingCoord(registry)
|
registerStreamingCoord(registry)
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user