mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 14:35:27 +08:00
fix: Fix replicate lag when server is idle (#46574)
issue: https://github.com/milvus-io/milvus/issues/46116 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> - Core invariant: the metric CDCLastReplicatedTimeTick must reflect the most recent time-tick when replication has effectively processed all pending messages (including idle periods), so reported replicate lag = confirmed WAL tick − last replicated tick can reach zero when the server is idle. - Exact fix (bug): addresses issue #46116 by ensuring the last-replicated metric is updated when the server is idle. Concretely, a new ReplicateMetrics.UpdateLastReplicatedTimeTick(ts uint64) was added and called from OnConfirmed (OnConfirmed now delegates to UpdateLastReplicatedTimeTick(msg.TimeTick())), and from Replicate’s self-controlled-message path when the pending queue is empty — so the code records the time tick before returning ErrReplicateIgnored. - Logic simplified / removed: direct, ad-hoc metric writes in OnConfirmed were replaced by a single UpdateLastReplicatedTimeTick helper on the metrics implementation. The scattered manual set of CDCLastReplicatedTimeTick is consolidated into one method, removing redundant direct metric manipulations and centralizing timestamp conversion (tsoutil.PhysicalTimeSeconds). - No data loss / no behavior regression: this change only updates monitoring metrics and does not alter replication control flow or message processing. Replicate still returns ErrReplicateIgnored for self-controlled messages and does not change message persistence or acknowledgement paths; OnConfirmed continues to be invoked on confirmed messages but now delegates metric recording to the new method. Therefore no replication state, message ordering, or persistence semantics are modified. <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
parent
db3f065a61
commit
e0fd091d41
@ -28,6 +28,7 @@ import (
|
||||
)
|
||||
|
||||
type ReplicateMetrics interface {
|
||||
UpdateLastReplicatedTimeTick(ts uint64)
|
||||
StartReplicate(msg message.ImmutableMessage)
|
||||
OnSent(msg message.ImmutableMessage)
|
||||
OnConfirmed(msg message.ImmutableMessage)
|
||||
@ -53,6 +54,13 @@ func NewReplicateMetrics(replicateInfo *streamingpb.ReplicatePChannelMeta) Repli
|
||||
}
|
||||
}
|
||||
|
||||
func (m *replicateMetrics) UpdateLastReplicatedTimeTick(ts uint64) {
|
||||
metrics.CDCLastReplicatedTimeTick.WithLabelValues(
|
||||
m.replicateInfo.GetSourceChannelName(),
|
||||
m.replicateInfo.GetTargetChannelName(),
|
||||
).Set(tsoutil.PhysicalTimeSeconds(ts))
|
||||
}
|
||||
|
||||
func (m *replicateMetrics) StartReplicate(msg message.ImmutableMessage) {
|
||||
msgID := msg.MessageID().String()
|
||||
m.msgsMetrics.Insert(msgID, msgMetrics{
|
||||
@ -88,10 +96,7 @@ func (m *replicateMetrics) OnConfirmed(msg message.ImmutableMessage) {
|
||||
m.replicateInfo.GetTargetChannelName(),
|
||||
).Observe(float64(replicateDuration.Milliseconds()))
|
||||
|
||||
metrics.CDCLastReplicatedTimeTick.WithLabelValues(
|
||||
m.replicateInfo.GetSourceChannelName(),
|
||||
m.replicateInfo.GetTargetChannelName(),
|
||||
).Set(tsoutil.PhysicalTimeSeconds(msg.TimeTick()))
|
||||
m.UpdateLastReplicatedTimeTick(msg.TimeTick())
|
||||
}
|
||||
|
||||
func (m *replicateMetrics) OnInitiate() {
|
||||
|
||||
@ -168,8 +168,11 @@ func (r *replicateStreamClient) Replicate(msg message.ImmutableMessage) error {
|
||||
case <-r.ctx.Done():
|
||||
return nil
|
||||
default:
|
||||
// TODO: Should be done at streamingnode, but after move it into streamingnode, the metric need to be adjusted.
|
||||
if msg.MessageType().IsSelfControlled() {
|
||||
// If no messages are being replicated, update the last replicated time tick.
|
||||
if r.pendingMessages.Len() == 0 {
|
||||
r.metrics.UpdateLastReplicatedTimeTick(msg.TimeTick())
|
||||
}
|
||||
return ErrReplicateIgnored
|
||||
}
|
||||
r.metrics.StartReplicate(msg)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user