wei liu 990a25e51a
fix: Prevent delete records loss during slow segment loading [QueryNodeV2] (#43527)
issue: #42884
Fixes an issue where delete records for a segment are lost from the
delete buffer if `load segment` execution on the delegator is too slow,
causing `syncTargetVersion` or other cleanup operations to clear them
prematurely.

Changes include:
- Introduced `Pin` and `Unpin` methods in `DeleteBuffer` interface and
its implementations (`doubleCacheBuffer`, `listDeleteBuffer`).
- Added a `pinnedTimestamps` map to track timestamps protected from
cleanup by specific segments.
- Modified `LoadSegments` in `shardDelegator` to `Pin` relevant segment
delete records before loading and `Unpin` them afterwards.
- Added `isPinned` check in `UnRegister` and `TryDiscard` methods of
`listDeleteBuffer` to skip cleanup if corresponding timestamps are
pinned.
- Added comprehensive unit tests for `Pin`, `Unpin`, and `isPinned`
functionality, covering basic, multiple pins, concurrent, and edge
cases.

This ensures the integrity of delete records by preventing their
premature removal from the delete buffer during segment loading.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
2025-07-24 01:00:54 +08:00

304 lines
7.5 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package deletebuffer
import (
"context"
"sort"
"sync"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/querynodev2/segments"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/util/tsoutil"
)
// errBufferFull is returned by cacheBlock.Put when adding the entry would push
// the block's accumulated size past its maxSize.
var errBufferFull = errors.New("buffer full")
// timed is the type constraint for entries stored in a delete buffer.
type timed interface {
// Timestamp returns the entry's timestamp; cacheBlock.ListAfter searches on it.
Timestamp() uint64
// Size returns the entry's size, which cacheBlock.Put counts against maxSize.
Size() int64
// EntryNum returns the entry's record count, summed into a block's entryNum.
EntryNum() int64
}
// DeleteBuffer is the interface for delete buffer.
// T is the buffered entry type and must satisfy timed.
type DeleteBuffer[T timed] interface {
// Put adds an entry to the buffer.
Put(T)
// ListAfter returns the buffered entries whose timestamp is greater than or
// equal to the provided timestamp.
ListAfter(uint64) []T
// SafeTs returns the timestamp the buffer currently reports as its safe point.
// NOTE(review): the exact guarantee is implementation-defined — confirm with callers.
SafeTs() uint64
// TryDiscard presumably asks the buffer to drop data older than the provided
// timestamp; an implementation may treat it as a no-op (doubleCacheBuffer does).
TryDiscard(uint64)
// Size returns current size information of delete buffer: entryNum and memory
Size() (entryNum, memorySize int64)
// RegisterL0 registers L0 segments with the buffer.
RegisterL0(segments ...segments.Segment)
// ListL0 lists all registered L0 segments.
ListL0() []segments.Segment
// UnRegister cleans delete data older than ts, including L0 segments and delete buffer.
UnRegister(ts uint64)
// Clear cleans up the delete buffer.
Clear()
// Pin/Unpin methods for protecting specific timestamps from cleanup.
// Pin records that segmentID protects ts; Unpin removes that record.
Pin(ts uint64, segmentID int64)
Unpin(ts uint64, segmentID int64)
}
// NewDoubleCacheDeleteBuffer creates a DeleteBuffer backed by a doubleCacheBuffer.
//
// startTs seeds both the buffer's safe timestamp and the headTs of the initial
// head block; maxSize is the size limit given to that initial head block.
// The L0 segment list and the pinned-timestamp map start out empty.
func NewDoubleCacheDeleteBuffer[T timed](startTs uint64, maxSize int64) DeleteBuffer[T] {
	buf := new(doubleCacheBuffer[T])
	buf.maxSize = maxSize
	buf.ts = startTs
	buf.head = newCacheBlock[T](startTs, maxSize)
	buf.l0Segments = make([]segments.Segment, 0)
	buf.pinnedTimestamps = make(map[uint64]map[int64]struct{})
	return buf
}
// doubleCacheBuffer implements DeleteBuffer with fixed sized double cache.
// Entries are written to head; when head reports it is full, head becomes tail
// and a new head is started (see evict), so at most two blocks are held.
type doubleCacheBuffer[T timed] struct {
// mut is taken by the methods that read or modify the buffer state.
mut sync.RWMutex
// head receives new entries; tail is the previous head (nil until the first eviction).
head, tail *cacheBlock[T]
// maxSize is the size limit of the initial head block; evict gives new heads maxSize/2.
maxSize int64
// ts is the value returned by SafeTs; evict sets it to the tail block's headTs.
ts uint64
// maintain l0 segment list
l0Segments []segments.Segment
// track pinned timestamps to prevent cleanup
// map[timestamp]map[segmentID]struct{} - tracks which segments pin which timestamps
pinnedTimestamps map[uint64]map[int64]struct{}
}
// RegisterL0 appends the given L0 segments to the buffer's segment list and
// logs each registration. Nil entries in segmentList are skipped.
func (c *doubleCacheBuffer[T]) RegisterL0(segmentList ...segments.Segment) {
	c.mut.Lock()
	defer c.mut.Unlock()

	for _, candidate := range segmentList {
		// Skip nil segments so they never end up in the registered list.
		if candidate == nil {
			continue
		}
		c.l0Segments = append(c.l0Segments, candidate)
		log.Info("register l0 from delete buffer",
			zap.Int64("segmentID", candidate.ID()),
			zap.Time("startPosition", tsoutil.PhysicalTime(candidate.StartPosition().GetTimestamp())),
		)
	}
}
// ListL0 returns the currently registered L0 segments.
// The returned slice is the buffer's own slice, not a copy, so callers should
// treat it as read-only.
func (c *doubleCacheBuffer[T]) ListL0() []segments.Segment {
	c.mut.RLock()
	registered := c.l0Segments
	c.mut.RUnlock()
	return registered
}
// UnRegister releases and drops every registered L0 segment whose start
// position timestamp is strictly less than ts; the remaining segments are kept
// in their original order. Pinned timestamps are not consulted here.
func (c *doubleCacheBuffer[T]) UnRegister(ts uint64) {
	c.mut.Lock()
	defer c.mut.Unlock()

	var kept []segments.Segment
	for _, seg := range c.l0Segments {
		// Segments starting at or after ts survive the cleanup.
		if seg.StartPosition().GetTimestamp() >= ts {
			kept = append(kept, seg)
			continue
		}

		seg.Release(context.TODO())
		log.Info("unregister l0 from delete buffer",
			zap.Int64("segmentID", seg.ID()),
			zap.Time("startPosition", tsoutil.PhysicalTime(seg.StartPosition().GetTimestamp())),
			zap.Time("cleanTs", tsoutil.PhysicalTime(ts)),
		)
	}
	c.l0Segments = kept
}
// Clear releases every registered L0 segment, empties the L0 list and rotates
// the cache blocks: the current head becomes tail and a fresh, empty head is
// created from the buffer's ts and maxSize.
//
// NOTE(review): the previous head is retained as tail rather than dropped, and
// pinnedTimestamps is left untouched — confirm this is the intended "clear".
func (c *doubleCacheBuffer[T]) Clear() {
	c.mut.Lock()
	defer c.mut.Unlock()

	for i := range c.l0Segments {
		c.l0Segments[i].Release(context.TODO())
	}
	c.l0Segments = nil

	// reset cache block
	previousHead := c.head
	c.head = newCacheBlock[T](c.ts, c.maxSize)
	c.tail = previousHead
}
// SafeTs returns the buffer's current safe timestamp.
//
// The value is read under the read lock because evict, which runs while Put
// holds the write lock, updates it; an unguarded read would be a data race.
func (c *doubleCacheBuffer[T]) SafeTs() uint64 {
	c.mut.RLock()
	defer c.mut.RUnlock()
	return c.ts
}
// TryDiscard implements DeleteBuffer. It is a no-op for doubleCacheBuffer:
// the body is empty and the timestamp argument is ignored.
func (c *doubleCacheBuffer[T]) TryDiscard(_ uint64) {
}
// Put implements DeleteBuffer.
// The entry is offered to the head block first; if the head reports that it is
// full, the blocks are rotated via evict and the entry starts the new head.
func (c *doubleCacheBuffer[T]) Put(entry T) {
	c.mut.Lock()
	defer c.mut.Unlock()

	if putErr := c.head.Put(entry); errors.Is(putErr, errBufferFull) {
		c.evict(entry.Timestamp(), entry)
	}
}
// ListAfter implements DeleteBuffer.
// It collects the matching entries of the tail block followed by those of the
// head block into a newly built slice; nil blocks are skipped.
func (c *doubleCacheBuffer[T]) ListAfter(ts uint64) []T {
	c.mut.RLock()
	defer c.mut.RUnlock()

	var merged []T
	// Tail goes first: it holds the entries written before the current head.
	for _, block := range [...]*cacheBlock[T]{c.tail, c.head} {
		if block == nil {
			continue
		}
		merged = append(merged, block.ListAfter(ts)...)
	}
	return merged
}
// Size returns the total entry count and memory size held by the buffer,
// summed over the head and tail blocks; a nil block contributes nothing.
func (c *doubleCacheBuffer[T]) Size() (entryNum int64, memorySize int64) {
	c.mut.RLock()
	defer c.mut.RUnlock()

	for _, block := range [...]*cacheBlock[T]{c.head, c.tail} {
		if block == nil {
			continue
		}
		count, bytes := block.Size()
		entryNum += count
		memorySize += bytes
	}
	return entryNum, memorySize
}
// evict sets head as tail and evicts tail.
// The new head starts at newTs, already contains entry, and is limited to half
// of the buffer's maxSize. The buffer's ts becomes the headTs of the new tail.
// It takes no lock itself; Put calls it while holding the write lock.
func (c *doubleCacheBuffer[T]) evict(newTs uint64, entry T) {
	fresh := new(cacheBlock[T])
	fresh.headTs = newTs
	fresh.maxSize = c.maxSize / 2
	fresh.size = entry.Size()
	fresh.entryNum = entry.EntryNum()
	fresh.data = []T{entry}

	// Rotate: the previous tail (if any) is dropped here.
	c.tail, c.head = c.head, fresh
	c.ts = c.tail.headTs
}
// newCacheBlock builds a cacheBlock that starts at ts, is limited to maxSize
// and is pre-populated with elements. The block's entryNum and size are the
// sums of EntryNum() and Size() over the given elements; maxSize is not
// enforced against them here.
func newCacheBlock[T timed](ts uint64, maxSize int64, elements ...T) *cacheBlock[T] {
	block := &cacheBlock[T]{
		headTs:  ts,
		maxSize: maxSize,
		data:    elements,
	}
	for i := range elements {
		block.entryNum += elements[i].EntryNum()
		block.size += elements[i].Size()
	}
	return block
}
// cacheBlock is a size-limited, append-only run of entries.
type cacheBlock[T timed] struct {
// mut is taken by Put (write) and ListAfter (read).
mut sync.RWMutex
// headTs is the timestamp the block was created with.
headTs uint64
// entryNum is the sum of EntryNum() over the stored entries.
entryNum int64
// size is the sum of Size() over the stored entries.
size int64
// maxSize is the limit Put enforces on size.
maxSize int64
// data holds the entries in insertion order; ListAfter binary-searches it by
// timestamp, so it presumably expects ascending timestamps — confirm with callers.
data []T
}
// Put appends entry to the block and updates its size and entry counters.
// It returns errBufferFull, leaving the block unchanged, when adding the entry
// would make the block's size exceed maxSize.
func (c *cacheBlock[T]) Put(entry T) error {
	c.mut.Lock()
	defer c.mut.Unlock()

	incoming := entry.Size()
	if projected := c.size + incoming; projected > c.maxSize {
		return errBufferFull
	}

	c.entryNum += entry.EntryNum()
	c.size += incoming
	c.data = append(c.data, entry)
	return nil
}
// ListAfter returns the entries whose timestamp is at or after ts, or nil when
// there is none. The position is found by binary search, so the block's data
// is assumed to be ordered by ascending timestamp. The result is a sub-slice
// of the block's own data, not a copy.
func (c *cacheBlock[T]) ListAfter(ts uint64) []T {
	c.mut.RLock()
	defer c.mut.RUnlock()

	total := len(c.data)
	first := sort.Search(total, func(i int) bool {
		return c.data[i].Timestamp() >= ts
	})
	if first < total {
		return c.data[first:]
	}
	// not found
	return nil
}
// Size returns the block's accumulated entry count and memory size.
//
// The counters are read under the block's read lock, matching ListAfter,
// because Put updates them while holding the write lock.
func (c *cacheBlock[T]) Size() (entryNum, memorySize int64) {
	c.mut.RLock()
	defer c.mut.RUnlock()
	return c.entryNum, c.size
}
// Pin records that segmentID protects timestamp ts from cleanup.
// Several segments may pin the same timestamp; pinning the same pair twice has
// no additional effect on the recorded set.
func (c *doubleCacheBuffer[T]) Pin(ts uint64, segmentID int64) {
	c.mut.Lock()
	defer c.mut.Unlock()

	holders := c.pinnedTimestamps[ts]
	if holders == nil {
		// First pin on this timestamp: create its segment set.
		holders = make(map[int64]struct{})
		c.pinnedTimestamps[ts] = holders
	}
	holders[segmentID] = struct{}{}

	log.Info("pin timestamp for segment",
		zap.Uint64("timestamp", ts),
		zap.Int64("segmentID", segmentID),
		zap.Time("physicalTime", tsoutil.PhysicalTime(ts)),
	)
}
// Unpin removes segmentID's protection of timestamp ts. When no segment pins
// the timestamp any more, the timestamp's entry is removed as well. Unpinning
// a pair that was never pinned only emits the log line.
func (c *doubleCacheBuffer[T]) Unpin(ts uint64, segmentID int64) {
	c.mut.Lock()
	defer c.mut.Unlock()

	holders, found := c.pinnedTimestamps[ts]
	if found {
		delete(holders, segmentID)
	}
	if found && len(holders) == 0 {
		// Last holder gone: drop the now-empty set for this timestamp.
		delete(c.pinnedTimestamps, ts)
	}

	log.Info("unpin timestamp for segment",
		zap.Uint64("timestamp", ts),
		zap.Int64("segmentID", segmentID),
		zap.Time("physicalTime", tsoutil.PhysicalTime(ts)),
	)
	// Note: doubleCacheBuffer doesn't implement cleanup logic in TryDiscard,
	// so no cleanup is triggered here
}