milvus/internal/streamingcoord/server/broadcaster/ack_callback_scheduler.go
Zhen Ye 19e5e9f910
enhance: broadcaster will lock resource until message acked (#44508)
issue: #43897

- Return LastConfirmedMessageID from the WAL append operation.
- Add a resource-key-based locker for the broadcast-ack operation to
protect the coord state when executing DDL.
- The resource-key-based lock is held until the broadcast operation is
acked.
- ResourceKey supports shared and exclusive locks.
- Add FastAck to execute the ack right after the broadcast is done, to
speed up DDL.
- The ack callback now supports the broadcast message result.
- Add a tombstone to the broadcaster to avoid repeatedly committing DDL
and the ABA issue.

---------

Signed-off-by: chyezh <chyezh@outlook.com>
2025-09-24 20:58:05 +08:00

package broadcaster

import (
	"context"
	"sort"
	"time"

	"github.com/cenkalti/backoff/v4"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/streamingcoord/server/broadcaster/registry"
	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
	"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
	"github.com/milvus-io/milvus/pkg/v2/util/syncutil"
)

// newAckCallbackScheduler creates a new ack callback scheduler.
func newAckCallbackScheduler(logger *log.MLogger) *ackCallbackScheduler {
	s := &ackCallbackScheduler{
		notifier:           syncutil.NewAsyncTaskNotifier[struct{}](),
		pending:            make(chan *broadcastTask, 16),
		triggerChan:        make(chan struct{}, 1),
		rkLocker:           newResourceKeyLocker(newBroadcasterMetrics()),
		tombstoneScheduler: newTombstoneScheduler(logger),
	}
	s.SetLogger(logger)
	return s
}
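
// ackCallbackScheduler serializes the ack callbacks of broadcast tasks whose
// resource-keys conflict, executing them in broadcastID order.
//
// A minimal lifecycle sketch (broadcastTask, broadcastTaskManager and the
// registry wiring come from the surrounding package; treat this as an
// illustration, not a prescribed call sequence):
//
//	s := newAckCallbackScheduler(logger)
//	s.Initialize(recoveredTasks, tombstoneIDs, manager) // sorts recovered tasks and starts the background loop
//	s.AddTask(task)                                     // hand over a task once its broadcast is done
//	defer s.Close()                                     // stop the scheduler, then its tombstone scheduler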
type ackCallbackScheduler struct {
	log.Binder
	notifier           *syncutil.AsyncTaskNotifier[struct{}] // lifecycle notifier of the background goroutine.
	pending            chan *broadcastTask                   // incoming tasks whose broadcast is done.
	triggerChan        chan struct{}                         // rescan signal sent when an ack callback finishes.
	tombstoneScheduler *tombstoneScheduler
	pendingAckedTasks  []*broadcastTask // should already be sorted by broadcastID.
	// For tasks that hold conflicting resource-keys (protected by the resource-key lock),
	// the broadcastID is always increasing:
	// the task with the smaller broadcastID happens-before the task with the larger broadcastID.
	// Meanwhile, on any vchannel shared by two such tasks, the timetick order matches the broadcastID order,
	// so the task with the smaller broadcastID is always acked before the task with the larger one.
	// Hence executing the tasks in broadcastID order guarantees that the ack order matches the WAL order.
	rkLocker *resourceKeyLocker // locks the resource-keys of the ack operation.
	// It is not the same instance as the resourceKeyLocker in the broadcastTaskManager,
	// because it is only used to check whether a resource-key is locked at ack time.
	// For a primary milvus cluster it makes no difference, since the execution order is already protected by the broadcastTaskManager.
	// But for a secondary milvus cluster this rkLocker is necessary to protect the resource-keys at ack time, so the execution order is not broken.
}

// Initialize initializes the ack scheduler with a list of broadcast tasks.
func (s *ackCallbackScheduler) Initialize(tasks []*broadcastTask, tombstoneIDs []uint64, bm *broadcastTaskManager) {
	// When initializing, the tasks in the recovery info may be out of order, so sort them by broadcastID.
	sortByBroadcastID(tasks)
	s.tombstoneScheduler.Initialize(bm, tombstoneIDs)
	s.pendingAckedTasks = tasks
	go s.background()
}

// AddTask adds a new broadcast task into the ack scheduler.
func (s *ackCallbackScheduler) AddTask(task *broadcastTask) {
	select {
	case <-s.notifier.Context().Done():
		panic("unreachable: ack scheduler is closing when adding new task")
	case s.pending <- task:
	}
}

// Close closes the ack scheduler.
func (s *ackCallbackScheduler) Close() {
	s.notifier.Cancel()
	s.notifier.BlockUntilFinish()
	// Close the tombstone scheduler after the ack scheduler is closed.
	s.tombstoneScheduler.Close()
}
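
// The background loop wakes up for three reasons: scheduler shutdown, a newly
// added task, or a retrigger from a finished ack callback. triggerChan has
// capacity 1, so one retrigger can be buffered while the loop is busy; further
// senders block until the loop drains it, and every wakeup rescans the whole
// pending list via triggerAckCallback.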

// background is the background task of the ack scheduler.
func (s *ackCallbackScheduler) background() {
	defer func() {
		s.notifier.Finish(struct{}{})
		s.Logger().Info("ack scheduler background exit")
	}()
	s.Logger().Info("ack scheduler background start")
	for {
		s.triggerAckCallback()
		select {
		case <-s.notifier.Context().Done():
			return
		case task := <-s.pending:
			s.addBroadcastTask(task)
		case <-s.triggerChan:
		}
	}
}

// addBroadcastTask adds a broadcast task into the pending acked tasks.
func (s *ackCallbackScheduler) addBroadcastTask(task *broadcastTask) error {
	s.pendingAckedTasks = append(s.pendingAckedTasks, task)
	// This sort is redundant at runtime: tasks that hold conflicting
	// resource-keys always arrive in broadcastID order, so the slice is
	// already sorted.
	sortByBroadcastID(s.pendingAckedTasks)
	return nil
}

// triggerAckCallback triggers the ack callback.
func (s *ackCallbackScheduler) triggerAckCallback() {
	pendingTasks := make([]*broadcastTask, 0, len(s.pendingAckedTasks))
	for _, task := range s.pendingAckedTasks {
		if task.State() != streamingpb.BroadcastTaskState_BROADCAST_TASK_STATE_PENDING &&
			task.State() != streamingpb.BroadcastTaskState_BROADCAST_TASK_STATE_WAIT_ACK &&
			task.State() != streamingpb.BroadcastTaskState_BROADCAST_TASK_STATE_REPLICATED {
			s.Logger().Info("task cannot be acked, skip the ack callback", zap.Uint64("broadcastID", task.Header().BroadcastID))
			continue
		}
		g, err := s.rkLocker.FastLock(task.Header().ResourceKeys.Collect()...)
		if err != nil {
			s.Logger().Warn("lock is occupied, delay the ack callback", zap.Uint64("broadcastID", task.Header().BroadcastID), zap.Error(err))
			pendingTasks = append(pendingTasks, task)
			continue
		}
		// Execute the ack callback in background.
		go s.doAckCallback(task, g)
	}
	s.pendingAckedTasks = pendingTasks
}
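
// doAckCallback runs with the task's resource-key locks held. After releasing
// them it pokes triggerChan, so tasks that were delayed because they conflicted
// on those keys are rescanned by triggerAckCallback.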

// doAckCallback executes the ack callback.
func (s *ackCallbackScheduler) doAckCallback(bt *broadcastTask, g *lockGuards) (err error) {
	defer func() {
		g.Unlock()
		s.triggerChan <- struct{}{}
		if err == nil {
			s.Logger().Info("execute ack callback done", zap.Uint64("broadcastID", bt.Header().BroadcastID))
		} else {
			s.Logger().Warn("execute ack callback failed", zap.Uint64("broadcastID", bt.Header().BroadcastID), zap.Error(err))
		}
	}()
	s.Logger().Info("start to execute ack callback", zap.Uint64("broadcastID", bt.Header().BroadcastID))

	msg, results := bt.BroadcastResult()
	appendResults := make(map[string]*message.AppendResult, len(results))
	for vchannel, result := range results {
		appendResults[vchannel] = &message.AppendResult{
			MessageID:              result.MessageID,
			LastConfirmedMessageID: result.LastConfirmedMessageID,
			TimeTick:               result.TimeTick,
		}
	}
	// Call the ack callback until done.
	if err := s.callMessageAckCallbackUntilDone(s.notifier.Context(), msg, appendResults); err != nil {
		return err
	}
	// The catalog write is reliable, so the ack callback can be marked done without retrying.
	if err := bt.MarkAckCallbackDone(s.notifier.Context()); err != nil {
		return err
	}
	s.tombstoneScheduler.AddPending(bt.Header().BroadcastID)
	return nil
}
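
// Note: MaxElapsedTime = 0 below disables the elapsed-time cutoff of the
// exponential backoff, so NextBackOff never returns backoff.Stop and the
// callback is retried indefinitely; cancelling the scheduler context is the
// only way out of the loop.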

// callMessageAckCallbackUntilDone calls the message ack callback until done.
func (s *ackCallbackScheduler) callMessageAckCallbackUntilDone(ctx context.Context, msg message.BroadcastMutableMessage, result map[string]*message.AppendResult) error {
	bo := backoff.NewExponentialBackOff()
	bo.InitialInterval = 10 * time.Millisecond
	bo.MaxInterval = 10 * time.Second
	bo.MaxElapsedTime = 0
	bo.Reset()
	for {
		err := registry.CallMessageAckCallback(ctx, msg, result)
		if err == nil {
			return nil
		}
		nextInterval := bo.NextBackOff()
		s.Logger().Warn("failed to call message ack callback, wait for retry...",
			log.FieldMessage(msg),
			zap.Duration("nextInterval", nextInterval),
			zap.Error(err))
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(nextInterval):
		}
	}
}
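
// sortByBroadcastID sorts the broadcast tasks in ascending broadcastID order in place.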
func sortByBroadcastID(tasks []*broadcastTask) {
	sort.Slice(tasks, func(i, j int) bool {
		return tasks[i].Header().BroadcastID < tasks[j].Header().BroadcastID
	})
}