Zhen Ye 369c6eb206
enhance: support remove cluster from replicate topology (#44642)
issue: #44558, #44123
- On replicate topology (A->B, A->C), updating config(A->C) on A and C, and config(B) on B,
removes B from the replicate topology
- Fix some CDC metric errors

Signed-off-by: chyezh <chyezh@outlook.com>
2025-10-13 11:07:58 +08:00

151 lines
4.5 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package replicatestream
import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/milvus-io/milvus/pkg/v2/metrics"
"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
"github.com/milvus-io/milvus/pkg/v2/util/timerecord"
"github.com/milvus-io/milvus/pkg/v2/util/tsoutil"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// ReplicateMetrics is the set of hooks through which replication metrics are
// reported. The per-method notes below describe what the replicateMetrics
// implementation in this file does for each hook.
type ReplicateMetrics interface {
// StartReplicate begins timing msg; the timer is keyed by the message ID.
StartReplicate(msg message.ImmutableMessage)
// OnSent counts msg and its estimated size in the replicated message/byte totals.
OnSent(msg message.ImmutableMessage)
// OnConfirmed observes the time elapsed since StartReplicate for msg and
// updates the replicate lag gauge; it does nothing if msg was never started.
OnConfirmed(msg message.ImmutableMessage)
// OnNoIncomingMessages sets the replicate lag gauge to 0.
OnNoIncomingMessages()
// OnInitiate increments the "disconnected" connection gauge for the target cluster.
OnInitiate()
// OnConnect moves one connection from "disconnected" to "connected" and
// increments the reconnect counter.
OnConnect()
// OnDisconnect moves one connection from "connected" to "disconnected".
OnDisconnect()
// OnClose deletes the connection gauge series labeled with the target cluster.
OnClose()
}
// msgMetrics holds the per-message state tracked between StartReplicate and
// OnConfirmed.
type msgMetrics struct {
// tr is created in StartReplicate and read via RecordSpan in OnConfirmed.
tr *timerecord.TimeRecorder
}
// replicateMetrics is the ReplicateMetrics implementation returned by
// NewReplicateMetrics.
type replicateMetrics struct {
// replicateInfo supplies the source/target channel names and the target
// cluster ID used as metric label values.
replicateInfo *streamingpb.ReplicatePChannelMeta
msgsMetrics *typeutil.ConcurrentMap[string, msgMetrics] // message id -> msgMetrics
}
// NewReplicateMetrics returns a ReplicateMetrics bound to replicateInfo, with
// an empty map of per-message records.
func NewReplicateMetrics(replicateInfo *streamingpb.ReplicatePChannelMeta) ReplicateMetrics {
	rm := new(replicateMetrics)
	rm.replicateInfo = replicateInfo
	rm.msgsMetrics = typeutil.NewConcurrentMap[string, msgMetrics]()
	return rm
}
// StartReplicate stores a new time recorder for msg, keyed by the string form
// of its message ID, so that OnConfirmed can later read the elapsed span.
func (m *replicateMetrics) StartReplicate(msg message.ImmutableMessage) {
	key := msg.MessageID().String()
	record := msgMetrics{tr: timerecord.NewTimeRecorder("replicate_msg")}
	m.msgsMetrics.Insert(key, record)
}
// OnSent increments the replicated-messages counter by one and adds the
// estimated size of msg to the replicated-bytes counter. Both series are
// labeled with the source channel, the target channel and the message type.
func (m *replicateMetrics) OnSent(msg message.ImmutableMessage) {
	// Both counters share the same label values, in the same order.
	labelValues := []string{
		m.replicateInfo.GetSourceChannelName(),
		m.replicateInfo.GetTargetChannelName(),
		msg.MessageType().String(),
	}

	metrics.CDCReplicatedMessagesTotal.WithLabelValues(labelValues...).Inc()

	size := float64(msg.EstimateSize())
	metrics.CDCReplicatedBytesTotal.WithLabelValues(labelValues...).Add(size)
}
// OnConfirmed removes the record stored for msg by StartReplicate and reports
// two values labeled with the source and target channel names:
//   - the span read from the record's time recorder, in milliseconds, as an
//     end-to-end latency observation;
//   - the current wall-clock time minus the physical time derived from the
//     message's time tick, in milliseconds, as the replicate lag.
//
// If no record exists for msg, nothing is reported.
func (m *replicateMetrics) OnConfirmed(msg message.ImmutableMessage) {
	record, found := m.msgsMetrics.GetAndRemove(msg.MessageID().String())
	if !found {
		return
	}

	elapsed := record.tr.RecordSpan()
	src := m.replicateInfo.GetSourceChannelName()
	dst := m.replicateInfo.GetTargetChannelName()
	metrics.CDCReplicateEndToEndLatency.
		WithLabelValues(src, dst).
		Observe(float64(elapsed.Milliseconds()))

	lag := time.Since(tsoutil.PhysicalTime(msg.TimeTick()))
	metrics.CDCReplicateLag.
		WithLabelValues(src, dst).
		Set(float64(lag.Milliseconds()))
}
// OnNoIncomingMessages is called when there are no incoming messages.
// It sets the replicate lag gauge for this source/target channel pair to 0.
func (m *replicateMetrics) OnNoIncomingMessages() {
	info := m.replicateInfo
	lagGauge := metrics.CDCReplicateLag.WithLabelValues(
		info.GetSourceChannelName(),
		info.GetTargetChannelName(),
	)
	lagGauge.Set(0)
}
// OnInitiate increments the connection gauge for the target cluster under the
// "disconnected" status label.
func (m *replicateMetrics) OnInitiate() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()
	metrics.CDCStreamRPCConnections.
		WithLabelValues(clusterID, metrics.CDCStatusDisconnected).
		Inc()
}
// OnDisconnect moves one connection of the target cluster from the
// "connected" gauge to the "disconnected" gauge.
func (m *replicateMetrics) OnDisconnect() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()
	conns := metrics.CDCStreamRPCConnections

	conns.WithLabelValues(clusterID, metrics.CDCStatusConnected).Dec()
	conns.WithLabelValues(clusterID, metrics.CDCStatusDisconnected).Inc()
}
// OnConnect moves one connection of the target cluster from the
// "disconnected" gauge to the "connected" gauge, then increments the
// reconnect counter for that cluster.
func (m *replicateMetrics) OnConnect() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()
	conns := metrics.CDCStreamRPCConnections

	conns.WithLabelValues(clusterID, metrics.CDCStatusDisconnected).Dec()
	conns.WithLabelValues(clusterID, metrics.CDCStatusConnected).Inc()

	metrics.CDCStreamRPCReconnectTimes.WithLabelValues(clusterID).Inc()
}
// OnClose deletes every connection gauge series whose target-cluster label
// equals this stream's target cluster ID, regardless of its other labels.
func (m *replicateMetrics) OnClose() {
	clusterID := m.replicateInfo.GetTargetCluster().GetClusterId()
	match := prometheus.Labels{metrics.CDCLabelTargetCluster: clusterID}
	metrics.CDCStreamRPCConnections.DeletePartialMatch(match)
}