mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 03:13:22 +08:00
issue: #41544 - add a truncator implementation into wal recovery storage. - add metrics for recovery storage. --------- Signed-off-by: chyezh <chyezh@outlook.com>
138 lines
4.5 KiB
Go
138 lines
4.5 KiB
Go
package recovery
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/streaming/walimpls"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/syncutil"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/tsoutil"
|
|
)
|
|
|
|
// newSamplingTruncator creates a new sampling truncator.
|
|
func newSamplingTruncator(
|
|
checkpoint *WALCheckpoint,
|
|
truncator walimpls.WALImpls,
|
|
recoveryMetrics *recoveryMetrics,
|
|
) *samplingTruncator {
|
|
st := &samplingTruncator{
|
|
notifier: syncutil.NewAsyncTaskNotifier[struct{}](),
|
|
cfg: newTruncatorConfig(),
|
|
truncator: truncator,
|
|
mu: sync.Mutex{},
|
|
checkpointSamples: []*WALCheckpoint{checkpoint},
|
|
lastTruncatedCheckpoint: nil,
|
|
lastSampled: time.Now(),
|
|
metrics: recoveryMetrics,
|
|
}
|
|
go st.background()
|
|
return st
|
|
}
|
|
|
|
// samplingTruncator is a sampling truncator that samples the incoming checkpoint and truncates the WAL.
|
|
type samplingTruncator struct {
|
|
log.Binder
|
|
notifier *syncutil.AsyncTaskNotifier[struct{}]
|
|
cfg *truncatorConfig
|
|
truncator walimpls.WALImpls
|
|
|
|
mu sync.Mutex
|
|
checkpointSamples []*WALCheckpoint // the samples of checkpoints
|
|
lastTruncatedCheckpoint *WALCheckpoint // the last truncated checkpoint
|
|
lastSampled time.Time // the last time the checkpoint is sampled
|
|
metrics *recoveryMetrics
|
|
}
|
|
|
|
// SampleCheckpoint samples the incoming checkpoint and adds it to the checkpoint samples.
|
|
func (t *samplingTruncator) SampleCheckpoint(checkpoint *WALCheckpoint) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
if time.Since(t.lastSampled) < t.cfg.sampleInterval {
|
|
return
|
|
}
|
|
|
|
if len(t.checkpointSamples) == 0 || t.checkpointSamples[len(t.checkpointSamples)-1].MessageID.LT(checkpoint.MessageID) {
|
|
t.checkpointSamples = append(t.checkpointSamples, checkpoint)
|
|
}
|
|
t.lastSampled = time.Now()
|
|
}
|
|
|
|
// background starts the background task of the sampling truncator.
|
|
func (t *samplingTruncator) background() {
|
|
ticker := time.NewTicker(t.cfg.sampleInterval / 2)
|
|
defer func() {
|
|
ticker.Stop()
|
|
t.notifier.Finish(struct{}{})
|
|
t.Logger().Info("sampling truncator background task exit")
|
|
}()
|
|
|
|
for {
|
|
select {
|
|
case <-t.notifier.Context().Done():
|
|
return
|
|
case <-ticker.C:
|
|
t.applyTruncate()
|
|
}
|
|
}
|
|
}
|
|
|
|
// consumeCheckpointSamples consumes the checkpoint samples and returns the truncate checkpoint.
|
|
func (t *samplingTruncator) consumeCheckpointSamples() *WALCheckpoint {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
|
|
targetCheckpointIdx := -1
|
|
for i := 0; i < len(t.checkpointSamples); i++ {
|
|
if time.Since(tsoutil.PhysicalTime(t.checkpointSamples[i].TimeTick)) < t.cfg.retentionInterval {
|
|
break
|
|
}
|
|
targetCheckpointIdx = i
|
|
}
|
|
if targetCheckpointIdx >= 0 {
|
|
checkpoint := t.checkpointSamples[targetCheckpointIdx]
|
|
t.checkpointSamples = t.checkpointSamples[targetCheckpointIdx+1:]
|
|
return checkpoint
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// applyTruncate applies the truncate operation.
|
|
func (t *samplingTruncator) applyTruncate() {
|
|
truncateCheckpoint := t.consumeCheckpointSamples()
|
|
if truncateCheckpoint == nil {
|
|
t.Logger().Debug("no checkpoint sample can be used to truncate wal")
|
|
return
|
|
}
|
|
logger := t.Logger().With(zap.String("messageID", truncateCheckpoint.MessageID.String()), zap.Uint64("timeTick", truncateCheckpoint.TimeTick))
|
|
if t.lastTruncatedCheckpoint != nil {
|
|
logger = logger.With(zap.String("lastMessageID", t.lastTruncatedCheckpoint.MessageID.String()), zap.Uint64("lastTimeTick", t.lastTruncatedCheckpoint.TimeTick))
|
|
if truncateCheckpoint.MessageID.EQ(t.lastTruncatedCheckpoint.MessageID) {
|
|
logger.Debug("checkpoint sample is the same, ignore the operation", zap.String("messageID", truncateCheckpoint.MessageID.String()))
|
|
t.lastTruncatedCheckpoint = truncateCheckpoint
|
|
t.metrics.ObServeTruncateMetrics(truncateCheckpoint.TimeTick)
|
|
return
|
|
} else if truncateCheckpoint.MessageID.LT(t.lastTruncatedCheckpoint.MessageID) {
|
|
logger.Warn("checkpoint sample is not in order, the wal may be corrupted", zap.String("targetMessageID", truncateCheckpoint.MessageID.String()))
|
|
return
|
|
}
|
|
}
|
|
|
|
err := t.truncator.Truncate(t.notifier.Context(), truncateCheckpoint.MessageID)
|
|
if err != nil {
|
|
logger.Warn("failed to truncate wal, the checkpoint sample is lost", zap.Error(err))
|
|
return
|
|
}
|
|
logger.Info("truncate wal")
|
|
t.lastTruncatedCheckpoint = truncateCheckpoint
|
|
t.metrics.ObServeTruncateMetrics(truncateCheckpoint.TimeTick)
|
|
}
|
|
|
|
// Close closes the sampling truncator.
|
|
func (t *samplingTruncator) Close() {
|
|
t.notifier.Cancel()
|
|
t.notifier.BlockAndGetResult()
|
|
}
|