fix: pulsar cannot work properly if backlog is exceeded (#42653)

issue: #42649

- The sync operations of different pchannels now run concurrently.
- Add an option to clear the pulsar backlog automatically (see the config sketch below).
- Make the pulsar walimpls recoverable from a backlog-exceeded state.
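
For reference, a minimal milvus.yaml sketch of the knobs this change introduces or retunes (values are the new defaults taken from the diff below):

```yaml
pulsar:
  # New option: clear the pulsar backlog every time this many bytes have been written.
  # Set to 0 to keep the previous behavior (truncation subscriber; backlog protection must then be disabled on the broker).
  backlogAutoClearBytes: 100m
streaming:
  walTruncate:
    sampleInterval: 30m     # was 1m
    retentionInterval: 26h  # was 5m
```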

Signed-off-by: chyezh <chyezh@outlook.com>
Zhen Ye 2025-06-13 14:28:37 +08:00 committed by GitHub
parent 78c39edbce
commit 1f66b650e9
11 changed files with 308 additions and 55 deletions

View File

@ -227,6 +227,16 @@ pulsar:
namespace: default # A Pulsar namespace is the administrative unit nomenclature within a tenant.
requestTimeout: 60 # pulsar client global request timeout in seconds
enableClientMetrics: false # Whether to register pulsar client metrics into milvus metrics path.
# Perform a backlog cleanup every time the given number of bytes has been written.
# Milvus uses a pulsar reader to read messages, so there is no pulsar subscriber while Milvus is running.
# If the pulsar cluster enables backlog protection (backlogQuotaDefaultLimitBytes), a backlog-exceeded error is reported and write operations fail.
# Setting this option to a non-zero value creates a subscription that seeks to the latest position in order to clear the pulsar backlog.
# If this option is non-zero, the wal data in pulsar is protected only by the retention policy,
# so the pulsar admin should configure a retention time long enough to avoid losing wal messages.
# If this option is zero, no such subscription is created, so the pulsar cluster must disable backlog protection; otherwise Milvus cannot recover once the backlog is exceeded.
# Moreover, if this option is zero, Milvus uses a truncation subscriber to protect the wal data in pulsar when subscriptionExpirationTimeMinutes is disabled.
# In that case the pulsar retention policy can be set shorter to save storage space.
backlogAutoClearBytes: 100m
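
For context, backlog protection itself is configured on the pulsar broker, not in milvus.yaml. A purely illustrative broker.conf sketch (the quota key is the one named in the comment above; the policy value is one of Pulsar's standard backlog policies and describes a typical setup, not something Milvus configures):

```properties
# Illustrative pulsar broker.conf settings (not part of milvus.yaml).
backlogQuotaDefaultLimitBytes=10737418240              # 10 GiB backlog quota; a negative value disables backlog protection
backlogQuotaDefaultRetentionPolicy=producer_exception  # writes fail once the quota is exceeded
```

With backlogAutoClearBytes set to a non-zero value Milvus clears its own backlog, so such a quota can stay enabled; with it set to 0, backlog protection must be disabled as described above.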
# If you want to enable kafka, needs to comment the pulsar configs
# kafka:
@ -1269,14 +1279,18 @@ streaming:
# If the persist operation exceeds this timeout, the wal recovery module is closed immediately.
gracefulCloseTimeout: 3s
walTruncate:
# The interval of sampling wal checkpoint when truncate, 1m by default.
# The interval at which the wal checkpoint is sampled for truncation, 30m by default.
# Every time a checkpoint is persisted, it is sampled as a candidate truncate checkpoint.
# More samples mean more frequent truncation and higher memory usage.
sampleInterval: 1m
# The retention interval of wal truncate, 5m by default.
sampleInterval: 30m
# The retention interval of wal truncate, 26h by default.
# If a sampled checkpoint is older than this interval, it is used to truncate the wal.
# Greater the interval, more wal storage usage, more redundant data in wal
retentionInterval: 5m
# The greater the interval, the more wal storage is used and the more redundant data is kept in the wal.
# Because the current query path does not guarantee that reads never happen before the truncate point,
# the retention interval should be greater than dataCoord.segment.maxLife to avoid message loss on the query path.
# If the wal is pulsar, subscription expiration should be disabled in pulsar to avoid message loss,
# because the wal truncate operation is implemented with a pulsar consumer.
retentionInterval: 26h
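
The 26h default follows from the constraint above; a minimal sketch, assuming the default dataCoord.segment.maxLife of 86400 seconds (24 hours):

```yaml
dataCoord:
  segment:
    maxLife: 86400          # assumed default: segment lifetime in seconds (24h)
streaming:
  walTruncate:
    retentionInterval: 26h  # must exceed maxLife; 26h leaves a 2h margin over 24h
```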
# Any configuration related to the knowhere vector search engine
knowhere:

View File

@ -26,6 +26,7 @@ import (
"github.com/apache/arrow/go/v17/parquet/compress"
"github.com/klauspost/compress/zstd"
"github.com/milvus-io/milvus/pkg/v2/util/hardware"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)
@ -119,8 +120,8 @@ func (zstdCodec) getConcurrency() int {
// But most of the time we only serialize 16MB of data for one binlog generation,
// so 1 is enough for most cases and avoids using too much memory.
concurrent := paramtable.Get().CommonCfg.StorageZstdConcurrency.GetAsInt()
if concurrent < 0 {
return 0
if concurrent <= 0 {
return hardware.GetCPUNum()
}
return concurrent
}

View File

@ -1,6 +1,7 @@
package inspector
import (
"sync"
"time"
"go.uber.org/zap"
@ -27,6 +28,8 @@ type timeTickSyncInspectorImpl struct {
taskNotifier *syncutil.AsyncTaskNotifier[struct{}]
syncNotifier *syncNotifier
operators *typeutil.ConcurrentMap[string, TimeTickSyncOperator]
wg sync.WaitGroup
working typeutil.ConcurrentSet[string]
}
func (s *timeTickSyncInspectorImpl) TriggerSync(pChannelInfo types.PChannelInfo, persisted bool) {
@ -71,22 +74,40 @@ func (s *timeTickSyncInspectorImpl) background() {
case <-s.taskNotifier.Context().Done():
return
case <-ticker.C:
s.operators.Range(func(_ string, operator TimeTickSyncOperator) bool {
operator.Sync(s.taskNotifier.Context(), false)
s.operators.Range(func(name string, _ TimeTickSyncOperator) bool {
s.asyncSync(name, false)
return true
})
case <-s.syncNotifier.WaitChan():
signals := s.syncNotifier.Get()
for pchannel, persisted := range signals {
if operator, ok := s.operators.Get(pchannel.Name); ok {
operator.Sync(s.taskNotifier.Context(), persisted)
}
s.asyncSync(pchannel.Name, persisted)
}
}
}
}
// asyncSync syncs the pchannel in a goroutine.
func (s *timeTickSyncInspectorImpl) asyncSync(pchannelName string, persisted bool) {
if !s.working.Insert(pchannelName) {
// A sync operation for this pchannel is already in progress; skip this trigger.
return
}
s.wg.Add(1)
go func() {
defer func() {
s.wg.Done()
s.working.Remove(pchannelName)
}()
if operator, ok := s.operators.Get(pchannelName); ok {
operator.Sync(s.taskNotifier.Context(), persisted)
}
}()
}
func (s *timeTickSyncInspectorImpl) Close() {
s.taskNotifier.Cancel()
s.taskNotifier.BlockUntilFinish()
s.wg.Wait()
}

View File

@ -55,6 +55,6 @@ func TestTruncatorConfig(t *testing.T) {
paramtable.Init()
cfg := newTruncatorConfig()
assert.Equal(t, 1*time.Minute, cfg.sampleInterval)
assert.Equal(t, 5*time.Minute, cfg.retentionInterval)
assert.Equal(t, 30*time.Minute, cfg.sampleInterval)
assert.Equal(t, 26*time.Hour, cfg.retentionInterval)
}

View File

@ -0,0 +1,115 @@
package pulsar
import (
"sync"
"time"
"github.com/apache/pulsar-client-go/pulsar"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
"github.com/milvus-io/milvus/pkg/v2/util/retry"
"github.com/milvus-io/milvus/pkg/v2/util/syncutil"
)
const (
backlogClearHelperName = "backlog-clear"
)
// backlogClearHelper is a helper to clear the backlog of pulsar.
type backlogClearHelper struct {
log.Binder
notifier *syncutil.AsyncTaskNotifier[struct{}]
cond *syncutil.ContextCond
written int64
threshold int64
channelName types.PChannelInfo
c pulsar.Client
}
// newBacklogClearHelper creates a new backlog clear helper.
func newBacklogClearHelper(c pulsar.Client, channelName types.PChannelInfo, threshold int64) *backlogClearHelper {
h := &backlogClearHelper{
notifier: syncutil.NewAsyncTaskNotifier[struct{}](),
cond: syncutil.NewContextCond(&sync.Mutex{}),
written: threshold, // trigger the backlog clear immediately.
threshold: threshold,
channelName: channelName,
c: c,
}
h.SetLogger(log.With(zap.String("channel", channelName.String())))
go h.background()
return h
}
// ObserveAppend observes the append traffic.
func (h *backlogClearHelper) ObserveAppend(size int) {
h.cond.L.Lock()
h.written += int64(size)
if h.written >= h.threshold {
h.cond.UnsafeBroadcast()
}
h.cond.L.Unlock()
}
// background is the background goroutine to clear the backlog.
func (h *backlogClearHelper) background() {
defer func() {
h.notifier.Finish(struct{}{})
h.Logger().Info("backlog clear helper exit")
}()
for {
h.cond.L.Lock()
for h.written < h.threshold {
if err := h.cond.Wait(h.notifier.Context()); err != nil {
return
}
}
h.written = 0
h.cond.L.Unlock()
if err := retry.Do(h.notifier.Context(), func() error {
if h.notifier.Context().Err() != nil {
return h.notifier.Context().Err()
}
if err := h.performBacklogClear(); err != nil {
h.Logger().Warn("failed to perform backlog clear", zap.Error(err))
return err
}
h.Logger().Debug("perform backlog clear done")
return nil
}, retry.AttemptAlways()); err != nil {
return
}
}
}
// performBacklogClear performs the backlog clear.
func (h *backlogClearHelper) performBacklogClear() error {
cursor, err := h.c.Subscribe(pulsar.ConsumerOptions{
Topic: h.channelName.Name,
SubscriptionName: backlogClearHelperName,
Type: pulsar.Exclusive,
MaxPendingChunkedMessage: 0,
StartMessageIDInclusive: true,
})
if err != nil {
return errors.Wrap(err, "when create subscription")
}
defer cursor.Close()
if err := cursor.SeekByTime(time.Now()); err != nil {
return errors.Wrap(err, "when seeking to the latest message")
}
return nil
}
// Close closes the backlog clear helper.
func (h *backlogClearHelper) Close() {
h.notifier.Cancel()
h.notifier.BlockUntilFinish()
}
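
A minimal usage sketch of the helper above, mirroring how the opener and wal in the following files wire it up (client, channel, and payload are placeholders, not part of this diff):

```go
// Sketch only: client is an existing pulsar.Client and channel a types.PChannelInfo.
threshold := paramtable.Get().PulsarCfg.BacklogAutoClearBytes.GetAsSize() // "100m" parses to 100 MiB
helper := newBacklogClearHelper(client, channel, threshold)
defer helper.Close()

// Report every append; once `threshold` bytes have been observed,
// the background goroutine subscribes and seeks to now to clear the backlog.
helper.ObserveAppend(len(payload))
```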

View File

@ -8,6 +8,8 @@ import (
"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
"github.com/milvus-io/milvus/pkg/v2/streaming/walimpls"
"github.com/milvus-io/milvus/pkg/v2/streaming/walimpls/helper"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/syncutil"
)
const (
@ -26,36 +28,40 @@ func (o *openerImpl) Open(ctx context.Context, opt *walimpls.OpenOption) (walimp
if err := opt.Validate(); err != nil {
return nil, err
}
var p pulsar.Producer
if opt.Channel.AccessMode == types.AccessModeRW {
var err error
p, err = o.c.CreateProducer(pulsar.ProducerOptions{
Topic: opt.Channel.Name,
// TODO: current go pulsar client does not support fencing, we should enable it after go pulsar client supports it.
// ProducerAccessMode: pulsar.ProducerAccessModeExclusiveWithFencing,
})
if err != nil {
return nil, err
}
// Initialize a persistent cursor to protect the topic from being retention.
cursor, err := o.c.Subscribe(pulsar.ConsumerOptions{
Topic: opt.Channel.Name,
SubscriptionName: truncateCursorSubscriptionName,
Type: pulsar.Exclusive,
MaxPendingChunkedMessage: 1,
SubscriptionInitialPosition: pulsar.SubscriptionPositionEarliest,
})
if err != nil {
return nil, err
var backlogClearHelper *backlogClearHelper
if opt.Channel.AccessMode == types.AccessModeRW {
backlogAutoClearBytes := paramtable.Get().PulsarCfg.BacklogAutoClearBytes.GetAsSize()
if backlogAutoClearBytes > 0 {
backlogClearHelper = newBacklogClearHelper(o.c, opt.Channel, backlogAutoClearBytes)
} else {
// Initialize a persistent cursor to protect the topic data from being removed by retention.
cursor, err := o.c.Subscribe(pulsar.ConsumerOptions{
Topic: opt.Channel.Name,
SubscriptionName: truncateCursorSubscriptionName,
Type: pulsar.Exclusive,
MaxPendingChunkedMessage: 0,
SubscriptionInitialPosition: pulsar.SubscriptionPositionEarliest,
})
if err != nil {
return nil, err
}
cursor.Close()
}
cursor.Close()
}
return &walImpl{
WALHelper: helper.NewWALHelper(opt),
p: p,
c: o.c,
}, nil
w := &walImpl{
WALHelper: helper.NewWALHelper(opt),
c: o.c,
p: syncutil.NewFuture[pulsar.Producer](),
notifier: syncutil.NewAsyncTaskNotifier[struct{}](),
backlogClearHelper: backlogClearHelper,
}
if opt.Channel.AccessMode == types.AccessModeRW {
// Because a pulsar producer cannot be created while the topic's backlog quota is exceeded,
// the producer is initialized in the background with backoff retry.
w.initProducerAtBackground()
}
return w, nil
}
// Close closes the opener resources.

View File

@ -2,8 +2,11 @@ package pulsar
import (
"context"
"time"
"github.com/apache/pulsar-client-go/pulsar"
"github.com/cenkalti/backoff/v4"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
@ -11,14 +14,59 @@ import (
"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
"github.com/milvus-io/milvus/pkg/v2/streaming/walimpls"
"github.com/milvus-io/milvus/pkg/v2/streaming/walimpls/helper"
"github.com/milvus-io/milvus/pkg/v2/util/syncutil"
)
var _ walimpls.WALImpls = (*walImpl)(nil)
type walImpl struct {
*helper.WALHelper
c pulsar.Client
p pulsar.Producer
c pulsar.Client
p *syncutil.Future[pulsar.Producer]
notifier *syncutil.AsyncTaskNotifier[struct{}]
backlogClearHelper *backlogClearHelper
}
// initProducerAtBackground initializes the producer in the background.
func (w *walImpl) initProducerAtBackground() {
if w.Channel().AccessMode != types.AccessModeRW {
panic("producer should not be initialized on a wal that is not in read-write mode")
}
defer w.notifier.Finish(struct{}{})
backoff := backoff.NewExponentialBackOff()
backoff.InitialInterval = 10 * time.Millisecond
backoff.MaxInterval = 10 * time.Second
backoff.MaxElapsedTime = 0
backoff.Reset()
for {
if err := w.initProducer(); err == nil {
return
}
select {
case <-time.After(backoff.NextBackOff()):
continue
case <-w.notifier.Context().Done():
return
}
}
}
// initProducer initializes the producer.
func (w *walImpl) initProducer() error {
p, err := w.c.CreateProducer(pulsar.ProducerOptions{
Topic: w.Channel().Name,
// TODO: the current go pulsar client does not support fencing; enable this once it supports it.
// ProducerAccessMode: pulsar.ProducerAccessModeExclusiveWithFencing,
})
if err != nil {
w.Log().Warn("create producer failed", zap.Error(err))
return err
}
w.Log().Info("pulsar create producer done")
w.p.Set(p)
return nil
}
func (w *walImpl) WALName() string {
@ -29,10 +77,19 @@ func (w *walImpl) Append(ctx context.Context, msg message.MutableMessage) (messa
if w.Channel().AccessMode != types.AccessModeRW {
panic("write on a wal that is not in read-write mode")
}
id, err := w.p.Send(ctx, &pulsar.ProducerMessage{
p, err := w.p.GetWithContext(ctx)
if err != nil {
return nil, errors.Wrap(err, "get producer from future")
}
id, err := p.Send(ctx, &pulsar.ProducerMessage{
Payload: msg.Payload(),
Properties: msg.Properties().ToRawMap(),
})
if w.backlogClearHelper != nil {
// Observe the append traffic even if the send fails,
// because a failed write may still have been persisted to the pulsar topic.
w.backlogClearHelper.ObserveAppend(len(msg.Payload()))
}
if err != nil {
w.Log().RatedWarn(1, "send message to pulsar failed", zap.Error(err))
return nil, err
@ -80,6 +137,10 @@ func (w *walImpl) Truncate(ctx context.Context, id message.MessageID) error {
if w.Channel().AccessMode != types.AccessModeRW {
panic("truncate on a wal that is not in read-write mode")
}
if w.backlogClearHelper != nil {
// If the backlog clear helper is enabled, the truncation cursor is never created, so truncation makes no sense; skip it.
return nil
}
cursor, err := w.c.Subscribe(pulsar.ConsumerOptions{
Topic: w.Channel().Name,
SubscriptionName: truncateCursorSubscriptionName,
@ -95,7 +156,13 @@ func (w *walImpl) Truncate(ctx context.Context, id message.MessageID) error {
}
func (w *walImpl) Close() {
if w.p != nil {
w.p.Close() // close producer
w.notifier.Cancel()
w.notifier.BlockUntilFinish()
// close producer if it is initialized
if w.p.Ready() {
w.p.Get().Close()
}
if w.backlogClearHelper != nil {
w.backlogClearHelper.Close()
}
}

View File

@ -5659,8 +5659,9 @@ type streamingConfig struct {
WALRecoveryPersistInterval ParamItem `refreshable:"true"`
WALRecoveryMaxDirtyMessage ParamItem `refreshable:"true"`
WALRecoveryGracefulCloseTimeout ParamItem `refreshable:"true"`
WALTruncateSampleInterval ParamItem `refreshable:"true"`
WALTruncateRetentionInterval ParamItem `refreshable:"true"`
WALTruncateSampleInterval ParamItem `refreshable:"true"`
WALTruncateRetentionInterval ParamItem `refreshable:"true"`
}
func (p *streamingConfig) init(base *BaseTable) {
@ -5888,10 +5889,10 @@ If that persist operation exceeds this timeout, the wal recovery module will clo
p.WALTruncateSampleInterval = ParamItem{
Key: "streaming.walTruncate.sampleInterval",
Version: "2.6.0",
Doc: `The interval of sampling wal checkpoint when truncate, 1m by default.
Doc: `The interval at which the wal checkpoint is sampled for truncation, 30m by default.
Every time a checkpoint is persisted, it is sampled as a candidate truncate checkpoint.
More samples mean more frequent truncation and higher memory usage.`,
DefaultValue: "1m",
DefaultValue: "30m",
Export: true,
}
p.WALTruncateSampleInterval.Init(base.mgr)
@ -5899,10 +5900,14 @@ More samples, more frequent truncate, more memory usage.`,
p.WALTruncateRetentionInterval = ParamItem{
Key: "streaming.walTruncate.retentionInterval",
Version: "2.6.0",
Doc: `The retention interval of wal truncate, 5m by default.
Doc: `The retention interval of wal truncate, 26h by default.
If a sampled checkpoint is older than this interval, it is used to truncate the wal.
Greater the interval, more wal storage usage, more redundant data in wal`,
DefaultValue: "5m",
The greater the interval, the more wal storage is used and the more redundant data is kept in the wal.
Because the current query path does not guarantee that reads never happen before the truncate point,
the retention interval should be greater than dataCoord.segment.maxLife to avoid message loss on the query path.
If the wal is pulsar, subscription expiration should be disabled in pulsar to avoid message loss,
because the wal truncate operation is implemented with a pulsar consumer.`,
DefaultValue: "26h",
Export: true,
}
p.WALTruncateRetentionInterval.Init(base.mgr)

View File

@ -650,8 +650,8 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, float64(0.6), params.StreamingCfg.FlushMemoryThreshold.GetAsFloat())
assert.Equal(t, float64(0.2), params.StreamingCfg.FlushGrowingSegmentBytesHwmThreshold.GetAsFloat())
assert.Equal(t, float64(0.1), params.StreamingCfg.FlushGrowingSegmentBytesLwmThreshold.GetAsFloat())
assert.Equal(t, 1*time.Minute, params.StreamingCfg.WALTruncateSampleInterval.GetAsDurationByParse())
assert.Equal(t, 5*time.Minute, params.StreamingCfg.WALTruncateRetentionInterval.GetAsDurationByParse())
assert.Equal(t, 30*time.Minute, params.StreamingCfg.WALTruncateSampleInterval.GetAsDurationByParse())
assert.Equal(t, 26*time.Hour, params.StreamingCfg.WALTruncateRetentionInterval.GetAsDurationByParse())
params.Save(params.StreamingCfg.WALBalancerTriggerInterval.Key, "50s")
params.Save(params.StreamingCfg.WALBalancerBackoffInitialInterval.Key, "50s")

View File

@ -875,6 +875,8 @@ type PulsarConfig struct {
// Enable Client side metrics
EnableClientMetrics ParamItem `refreshable:"false"`
BacklogAutoClearBytes ParamItem `refreshable:"false"`
}
func (p *PulsarConfig) Init(base *BaseTable) {
@ -1007,6 +1009,23 @@ To share a Pulsar instance among multiple Milvus instances, you can change this
Export: true,
}
p.EnableClientMetrics.Init(base.mgr)
p.BacklogAutoClearBytes = ParamItem{
Key: "pulsar.backlogAutoClearBytes",
Version: "2.6.0",
DefaultValue: "100m",
Doc: `Perform a backlog cleanup every time the given number of bytes has been written.
Milvus uses a pulsar reader to read messages, so there is no pulsar subscriber while Milvus is running.
If the pulsar cluster enables backlog protection (backlogQuotaDefaultLimitBytes), a backlog-exceeded error is reported and write operations fail.
Setting this option to a non-zero value creates a subscription that seeks to the latest position in order to clear the pulsar backlog.
If this option is non-zero, the wal data in pulsar is protected only by the retention policy,
so the pulsar admin should configure a retention time long enough to avoid losing wal messages.
If this option is zero, no such subscription is created, so the pulsar cluster must disable backlog protection; otherwise Milvus cannot recover once the backlog is exceeded.
Moreover, if this option is zero, Milvus uses a truncation subscriber to protect the wal data in pulsar when subscriptionExpirationTimeMinutes is disabled.
In that case the pulsar retention policy can be set shorter to save storage space.`,
Export: true,
}
p.BacklogAutoClearBytes.Init(base.mgr)
}
// --- kafka ---

View File

@ -193,6 +193,11 @@ func TestServiceParam(t *testing.T) {
assert.Equal(t, "60", Params.RequestTimeout.GetValue())
})
t.Run("pulsar_backlog_auto_clear_bytes", func(t *testing.T) {
Params := &SParams.PulsarCfg
assert.Equal(t, int64(100*1024*1024), Params.BacklogAutoClearBytes.GetAsSize())
})
t.Run("test rocksmqConfig", func(t *testing.T) {
Params := &SParams.RocksmqCfg