fix: Implement NeededFields feature in RecordReader (#43523)

Related to #43522

Currently, passing a partial schema to the storage v2 packed reader may
trigger a SEGV during the clustering compaction unit test.

This patch implements `NeededFields` separately in each `RecordReader`
implementation. For now, the v2 reader treats it as a no-op; column pruning
there will be supported once the packed reader supports this API.
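
As a rough usage sketch (the `binlogs` slice, `collectionSchema`, `downloadFn`, `pkFieldID`, and `clusteringKeyFieldID` below are illustrative placeholders, not part of this patch), a caller keeps passing the full collection schema and requests column pruning through the new option:

```go
// Illustrative only: binlogs, collectionSchema, downloadFn, pkFieldID and
// clusteringKeyFieldID are assumed to be provided by the caller.
requiredFields := typeutil.NewSet[int64]()
requiredFields.Insert(0, 1, pkFieldID, clusteringKeyFieldID) // RowID, Timestamp, PK, clustering key

rr, err := storage.NewBinlogRecordReader(ctx,
	binlogs,
	collectionSchema, // full schema; pruning is requested via WithNeededFields
	storage.WithVersion(storage.StorageV1),
	storage.WithDownloader(downloadFn),
	storage.WithNeededFields(requiredFields),
)
if err != nil {
	return err
}
defer rr.Close()
// StorageV1 narrows the composite reader's schema to the needed fields;
// for StorageV2 the option is currently a no-op.
```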

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
congqixia 2025-07-24 00:22:54 +08:00 committed by GitHub
parent abb3aeacdf
commit 1cf8ed505f
5 changed files with 56 additions and 15 deletions


@@ -866,18 +866,13 @@ func (t *clusteringCompactionTask) scalarAnalyzeSegment(
log.Debug("binlogNum", zap.Int("binlogNum", binlogNum))
expiredFilter := compaction.NewEntityFilter(nil, t.plan.GetCollectionTtl(), t.currentTime)
schema := &schemapb.CollectionSchema{
Fields: make([]*schemapb.FieldSchema, 0),
}
binlogs := make([]*datapb.FieldBinlog, 0)
requiredFields := typeutil.NewSet[int64]()
requiredFields.Insert(0, 1, t.primaryKeyField.GetFieldID(), t.clusteringKeyField.GetFieldID())
for _, field := range t.plan.GetSchema().GetFields() {
if requiredFields.Contain(field.GetFieldID()) {
schema.Fields = append(schema.Fields, field)
}
}
selectedFields := lo.Filter(t.plan.GetSchema().GetFields(), func(field *schemapb.FieldSchema, _ int) bool {
return requiredFields.Contain(field.GetFieldID())
})
switch segment.GetStorageVersion() {
case storage.StorageV1:
@@ -894,13 +889,14 @@ func (t *clusteringCompactionTask) scalarAnalyzeSegment(
}
rr, err := storage.NewBinlogRecordReader(ctx,
binlogs,
schema,
t.plan.GetSchema(),
storage.WithDownloader(func(ctx context.Context, paths []string) ([][]byte, error) {
return t.binlogIO.Download(ctx, paths)
}),
storage.WithVersion(segment.StorageVersion),
storage.WithBufferSize(t.bufferSize),
storage.WithStorageConfig(t.compactionParams.StorageConfig),
storage.WithNeededFields(requiredFields),
)
if err != nil {
log.Warn("new binlog record reader wrong", zap.Error(err))
@@ -908,7 +904,7 @@ func (t *clusteringCompactionTask) scalarAnalyzeSegment(
}
pkIter := storage.NewDeserializeReader(rr, func(r storage.Record, v []*storage.Value) error {
return storage.ValueDeserializer(r, v, schema.Fields)
return storage.ValueDeserializer(r, v, selectedFields)
})
defer pkIter.Close()
analyzeResult, remained, err := t.iterAndGetScalarAnalyzeResult(pkIter, expiredFilter)


@@ -32,6 +32,7 @@ import (
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
const (
@@ -61,6 +62,7 @@ type rwOptions struct {
multiPartUploadSize int64
columnGroups []storagecommon.ColumnGroup
storageConfig *indexpb.StorageConfig
neededFields typeutil.Set[int64]
}
func (o *rwOptions) validate() error {
@@ -141,6 +143,12 @@ func WithStorageConfig(storageConfig *indexpb.StorageConfig) RwOption {
}
}
func WithNeededFields(neededFields typeutil.Set[int64]) RwOption {
return func(options *rwOptions) {
options.neededFields = neededFields
}
}
func makeBlobsReader(ctx context.Context, binlogs []*datapb.FieldBinlog, downloader downloaderFn) (ChunkedBlobsReader, error) {
if len(binlogs) == 0 {
return func() ([]*Blob, error) {
@@ -212,7 +220,7 @@ func makeBlobsReader(ctx context.Context, binlogs []*datapb.FieldBinlog, downloa
}, nil
}
func NewBinlogRecordReader(ctx context.Context, binlogs []*datapb.FieldBinlog, schema *schemapb.CollectionSchema, option ...RwOption) (RecordReader, error) {
func NewBinlogRecordReader(ctx context.Context, binlogs []*datapb.FieldBinlog, schema *schemapb.CollectionSchema, option ...RwOption) (rr RecordReader, err error) {
rwOptions := DefaultReaderOptions()
for _, opt := range option {
opt(rwOptions)
@@ -222,11 +230,13 @@ func NewBinlogRecordReader(ctx context.Context, binlogs []*datapb.FieldBinlog, s
}
switch rwOptions.version {
case StorageV1:
blobsReader, err := makeBlobsReader(ctx, binlogs, rwOptions.downloader)
var blobsReader ChunkedBlobsReader
blobsReader, err = makeBlobsReader(ctx, binlogs, rwOptions.downloader)
if err != nil {
return nil, err
}
return newCompositeBinlogRecordReader(schema, blobsReader)
rr, err = newCompositeBinlogRecordReader(schema, blobsReader)
case StorageV2:
if len(binlogs) <= 0 {
return nil, sio.EOF
@@ -245,9 +255,17 @@ func NewBinlogRecordReader(ctx context.Context, binlogs []*datapb.FieldBinlog, s
paths[j] = append(paths[j], logPath)
}
}
return newPackedRecordReader(paths, schema, rwOptions.bufferSize, rwOptions.storageConfig)
rr, err = newPackedRecordReader(paths, schema, rwOptions.bufferSize, rwOptions.storageConfig)
default:
return nil, merr.WrapErrServiceInternal(fmt.Sprintf("unsupported storage version %d", rwOptions.version))
}
return nil, merr.WrapErrServiceInternal(fmt.Sprintf("unsupported storage version %d", rwOptions.version))
if err != nil {
return nil, err
}
if rwOptions.neededFields != nil {
rr.SetNeededFields(rwOptions.neededFields)
}
return rr, nil
}
func NewBinlogRecordWriter(ctx context.Context, collectionID, partitionID, segmentID UniqueID,


@@ -44,6 +44,7 @@ type Record interface {
type RecordReader interface {
Next() (Record, error)
SetNeededFields(fields typeutil.Set[int64])
Close() error
}
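
Since `SetNeededFields` is now part of the interface, every `RecordReader` implementation has to provide it. A minimal sketch inside the `storage` package (the `noopPruneReader` type is hypothetical, shown only to illustrate the contract; readers that cannot prune yet can no-op, as the v2 packed reader does below):

```go
// Hypothetical reader that does not support column pruning yet.
type noopPruneReader struct{}

func (r *noopPruneReader) Next() (Record, error)                 { return nil, sio.EOF }
func (r *noopPruneReader) SetNeededFields(_ typeutil.Set[int64]) {} // pruning ignored for now
func (r *noopPruneReader) Close() error                          { return nil }

// Compile-time check that the extended interface is satisfied.
var _ RecordReader = (*noopPruneReader)(nil)
```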


@@ -151,6 +151,23 @@ func (crr *CompositeBinlogRecordReader) Next() (Record, error) {
return r, nil
}
func (crr *CompositeBinlogRecordReader) SetNeededFields(neededFields typeutil.Set[int64]) {
if neededFields == nil {
return
}
crr.schema = &schemapb.CollectionSchema{
Fields: lo.Filter(crr.schema.GetFields(), func(field *schemapb.FieldSchema, _ int) bool {
return neededFields.Contain(field.GetFieldID())
}),
}
index := make(map[FieldID]int16)
for i, f := range crr.schema.Fields {
index[f.FieldID] = int16(i)
}
crr.index = index
}
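
For illustration (the field IDs here are hypothetical), the narrowing works like this:

```go
// Illustration only; field IDs are hypothetical.
// Schema before: RowID(0), Timestamp(1), pk(100), vector(101), text(102)
needed := typeutil.NewSet[int64]()
needed.Insert(0, 1, 100)
crr.SetNeededFields(needed)
// Schema after:  RowID(0), Timestamp(1), pk(100)
// crr.index:     {0: 0, 1: 1, 100: 2}  // FieldID -> column position, rebuilt for the pruned schema
```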
func (crr *CompositeBinlogRecordReader) Close() error {
if crr.brs != nil {
for _, er := range crr.brs {
@@ -1042,6 +1059,10 @@ func (crr *simpleArrowRecordReader) Next() (Record, error) {
return &crr.r, nil
}
func (crr *simpleArrowRecordReader) SetNeededFields(_ typeutil.Set[int64]) {
// no-op for simple arrow record reader
}
func (crr *simpleArrowRecordReader) Close() error {
if crr.closer != nil {
crr.closer()


@@ -95,6 +95,11 @@ func (pr *packedRecordReader) Next() (Record, error) {
}
}
func (pr *packedRecordReader) SetNeededFields(fields typeutil.Set[int64]) {
// TODO, push down SetNeededFields to packedReader after implemented
// no-op for now
}
func (pr *packedRecordReader) Close() error {
if pr.reader != nil {
return pr.reader.Close()