yihao.dai b69f4ab1cd
fix: Fix replicate lag metric calculation to prevent false-positive health (#46120)
This change fixes the calculation by using timestamp subtraction (WAL
confirmed time - Last replicate time). This ensures the lag metric
immediately spikes when replication is blocked, providing reliable
monitoring.

issue: https://github.com/milvus-io/milvus/issues/46116

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
2025-12-07 21:51:12 +08:00

136 lines
4.1 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package replicatestream
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/milvus-io/milvus/pkg/v2/metrics"
"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
"github.com/milvus-io/milvus/pkg/v2/util/timerecord"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// ReplicateMetrics declares the hooks used to record replication metrics.
// NewReplicateMetrics returns the implementation defined in this file.
type ReplicateMetrics interface {
	// Per-message hooks.

	// StartReplicate begins tracking msg so that its latency can be
	// reported once it is confirmed.
	StartReplicate(msg message.ImmutableMessage)
	// OnSent accounts for msg in the replicated message and byte counters.
	OnSent(msg message.ImmutableMessage)
	// OnConfirmed reports the latency of a tracked msg and publishes its
	// time tick as the last replicated time tick.
	OnConfirmed(msg message.ImmutableMessage)

	// Stream connection hooks.

	// OnInitiate registers the stream as disconnected.
	OnInitiate()
	// OnConnect moves the stream from disconnected to connected.
	OnConnect()
	// OnDisconnect moves the stream from connected to disconnected.
	OnDisconnect()
	// OnClose removes the connection series of the target cluster.
	OnClose()
}
// msgMetrics is the per-message state kept between StartReplicate and
// OnConfirmed.
type msgMetrics struct {
	// tr is created in StartReplicate and read in OnConfirmed to obtain the
	// span used as the end-to-end latency observation.
	tr *timerecord.TimeRecorder
}
// replicateMetrics is the ReplicateMetrics implementation returned by
// NewReplicateMetrics.
type replicateMetrics struct {
	// replicateInfo supplies the source/target channel names and the target
	// cluster ID that are used as metric label values.
	replicateInfo *streamingpb.ReplicatePChannelMeta
	// msgsMetrics maps a message ID (in its string form) to the msgMetrics
	// entry tracked for that message.
	msgsMetrics *typeutil.ConcurrentMap[string, msgMetrics]
}
// NewReplicateMetrics builds a ReplicateMetrics whose label values are taken
// from replicateInfo. It starts with an empty set of tracked messages.
func NewReplicateMetrics(replicateInfo *streamingpb.ReplicatePChannelMeta) ReplicateMetrics {
	tracked := typeutil.NewConcurrentMap[string, msgMetrics]()
	return &replicateMetrics{
		msgsMetrics:   tracked,
		replicateInfo: replicateInfo,
	}
}
// StartReplicate stores a fresh time recorder under the message ID of msg.
// OnConfirmed later removes the entry and reads the recorded span.
func (m *replicateMetrics) StartReplicate(msg message.ImmutableMessage) {
	key := msg.MessageID().String()
	entry := msgMetrics{tr: timerecord.NewTimeRecorder("replicate_msg")}
	m.msgsMetrics.Insert(key, entry)
}
// OnSent increments the replicated-message counter by one and adds the
// estimated size of msg to the replicated-bytes counter. Both series are
// labelled with the source channel, target channel and message type.
func (m *replicateMetrics) OnSent(msg message.ImmutableMessage) {
	// Both counters share exactly the same label values.
	labelValues := []string{
		m.replicateInfo.GetSourceChannelName(),
		m.replicateInfo.GetTargetChannelName(),
		msg.MessageType().String(),
	}

	metrics.CDCReplicatedMessagesTotal.WithLabelValues(labelValues...).Inc()
	metrics.CDCReplicatedBytesTotal.WithLabelValues(labelValues...).Add(float64(msg.EstimateSize()))
}
// OnConfirmed removes the tracked entry for msg, observes the span recorded
// since StartReplicate (in whole milliseconds) as end-to-end latency, and sets
// the last replicated time tick gauge to the time tick of msg.
//
// If msg is not currently tracked, nothing is reported.
// NOTE(review): that early return also skips the time tick gauge update;
// confirm this is intended for messages that never went through
// StartReplicate.
func (m *replicateMetrics) OnConfirmed(msg message.ImmutableMessage) {
	entry, tracked := m.msgsMetrics.GetAndRemove(msg.MessageID().String())
	if !tracked {
		return
	}

	latencyMs := float64(entry.tr.RecordSpan().Milliseconds())
	source := m.replicateInfo.GetSourceChannelName()
	target := m.replicateInfo.GetTargetChannelName()

	metrics.CDCReplicateEndToEndLatency.WithLabelValues(source, target).Observe(latencyMs)
	metrics.CDCLastReplicatedTimeTick.WithLabelValues(source, target).Set(float64(msg.TimeTick()))
}
// OnInitiate increments the "disconnected" connection gauge of the target
// cluster. OnConnect decrements that same series.
func (m *replicateMetrics) OnInitiate() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()
	metrics.CDCStreamRPCConnections.WithLabelValues(clusterID, metrics.CDCStatusDisconnected).Inc()
}
// OnDisconnect moves one connection of the target cluster from the
// "connected" gauge to the "disconnected" gauge.
func (m *replicateMetrics) OnDisconnect() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()

	metrics.CDCStreamRPCConnections.WithLabelValues(clusterID, metrics.CDCStatusConnected).Dec()
	metrics.CDCStreamRPCConnections.WithLabelValues(clusterID, metrics.CDCStatusDisconnected).Inc()
}
// OnConnect moves one connection of the target cluster from the
// "disconnected" gauge to the "connected" gauge and increments the reconnect
// counter.
// NOTE(review): the reconnect counter is incremented on every call, so the
// first successful connect is counted as well; confirm that is intended.
func (m *replicateMetrics) OnConnect() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()

	metrics.CDCStreamRPCConnections.WithLabelValues(clusterID, metrics.CDCStatusDisconnected).Dec()
	metrics.CDCStreamRPCConnections.WithLabelValues(clusterID, metrics.CDCStatusConnected).Inc()
	metrics.CDCStreamRPCReconnectTimes.WithLabelValues(clusterID).Inc()
}
// OnClose deletes the connection gauge series whose target-cluster label
// matches this stream's target cluster, regardless of their other labels.
func (m *replicateMetrics) OnClose() {
	matcher := prometheus.Labels{
		metrics.CDCLabelTargetCluster: m.replicateInfo.GetTargetCluster().GetClusterId(),
	}
	metrics.CDCStreamRPCConnections.DeletePartialMatch(matcher)
}