enhance: [Add Field] Use consistent schema for single buffer (#41891)

Related to #41873

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
congqixia 2025-05-17 19:46:22 +08:00 committed by GitHub
parent a3d5ad135e
commit b8d7045539
14 changed files with 56 additions and 36 deletions
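
In short, callers now pass the collection schema down explicitly instead of letting the writers and serializer re-read it from the meta cache at sync time, so a buffer is always flushed with the schema it was built against. Below is a minimal sketch of the resulting call shapes, assembled from the signatures in this diff; the variables mc, schema, cm, alloc and pack are placeholders for illustration, not part of the commit:

// Assumed in scope: mc metacache.MetaCache, schema *schemapb.CollectionSchema,
// cm storage.ChunkManager, alloc allocator.Interface, pack *syncmgr.SyncPack.
task := syncmgr.NewSyncTask().
	WithAllocator(alloc).
	WithMetaCache(mc).
	WithSchema(schema). // schema snapshot captured when the buffer was created
	WithSyncPack(pack)
// Both writer constructors now take the schema as their second argument.
bw := syncmgr.NewBulkPackWriter(mc, schema, cm, alloc)
bwV2 := syncmgr.NewBulkPackWriterV2(mc, schema, cm, alloc, packed.DefaultWriteBufferSize, 0)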


@ -281,7 +281,7 @@ func (s *ClusteringCompactionTaskStorageV2Suite) initStorageV2Segments(rows int,
channelName := fmt.Sprintf("by-dev-rootcoord-dml_0_%dv0", CollectionID)
deleteData := storage.NewDeleteData([]storage.PrimaryKey{storage.NewInt64PrimaryKey(100)}, []uint64{tsoutil.ComposeTSByTime(getMilvusBirthday().Add(time.Second), 0)})
pack := new(syncmgr.SyncPack).WithCollectionID(CollectionID).WithPartitionID(PartitionID).WithSegmentID(segmentID).WithChannelName(channelName).WithInsertData(genInsertData(rows, segmentID, genCollectionSchema())).WithDeleteData(deleteData)
- bw := syncmgr.NewBulkPackWriterV2(mc, cm, s.mockAlloc, packed.DefaultWriteBufferSize, 0)
+ bw := syncmgr.NewBulkPackWriterV2(mc, genCollectionSchema(), cm, s.mockAlloc, packed.DefaultWriteBufferSize, 0)
return bw.Write(context.Background(), pack)
}


@ -318,7 +318,7 @@ func (s *MixCompactionTaskStorageV2Suite) initStorageV2Segments(rows int, seed i
channelName := fmt.Sprintf("by-dev-rootcoord-dml_0_%dv0", CollectionID)
pack := new(syncmgr.SyncPack).WithCollectionID(CollectionID).WithPartitionID(PartitionID).WithSegmentID(seed).WithChannelName(channelName).WithInsertData(getInsertData(rows, seed, s.meta.GetSchema()))
- bw := syncmgr.NewBulkPackWriterV2(mc, cm, alloc, packed.DefaultWriteBufferSize, 0)
+ bw := syncmgr.NewBulkPackWriterV2(mc, s.meta.Schema, cm, alloc, packed.DefaultWriteBufferSize, 0)
return bw.Write(context.Background(), pack)
}


@ -89,7 +89,11 @@ func NewSyncTask(ctx context.Context,
syncPack.WithBM25Stats(bm25Stats)
}
- task := syncmgr.NewSyncTask().WithAllocator(allocator).WithMetaCache(metaCache).WithSyncPack(syncPack)
+ task := syncmgr.NewSyncTask().
+ WithAllocator(allocator).
+ WithMetaCache(metaCache).
+ WithSchema(metaCache.Schema()). // TODO specify import schema if needed
+ WithSyncPack(syncPack)
return task, nil
}


@ -1,6 +1,7 @@
package syncmgr
import (
+ "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/allocator"
"github.com/milvus-io/milvus/internal/flushcommon/metacache"
"github.com/milvus-io/milvus/internal/storage"
@ -52,6 +53,11 @@ func (t *SyncTask) WithMetaCache(metacache metacache.MetaCache) *SyncTask {
return t
}
+ func (t *SyncTask) WithSchema(schema *schemapb.CollectionSchema) *SyncTask {
+ t.schema = schema
+ return t
+ }
func (t *SyncTask) WithMetaWriter(metaWriter MetaWriter) *SyncTask {
t.metaWriter = metaWriter
return t


@ -23,6 +23,7 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"
+ "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/allocator"
"github.com/milvus-io/milvus/internal/flushcommon/metacache"
"github.com/milvus-io/milvus/internal/storage"
@ -41,6 +42,7 @@ type PackWriter interface {
type BulkPackWriter struct {
metaCache metacache.MetaCache
+ schema *schemapb.CollectionSchema
chunkManager storage.ChunkManager
allocator allocator.Interface
writeRetryOpts []retry.Option
@ -50,11 +52,14 @@ type BulkPackWriter struct {
sizeWritten int64
}
- func NewBulkPackWriter(metaCache metacache.MetaCache, chunkManager storage.ChunkManager,
+ func NewBulkPackWriter(metaCache metacache.MetaCache,
+ schema *schemapb.CollectionSchema,
+ chunkManager storage.ChunkManager,
allocator allocator.Interface, writeRetryOpts ...retry.Option,
) *BulkPackWriter {
return &BulkPackWriter{
metaCache: metaCache,
+ schema: schema,
chunkManager: chunkManager,
allocator: allocator,
writeRetryOpts: writeRetryOpts,
@ -163,7 +168,7 @@ func (bw *BulkPackWriter) writeInserts(ctx context.Context, pack *SyncPack) (map
return make(map[int64]*datapb.FieldBinlog), nil
}
- serializer, err := NewStorageSerializer(bw.metaCache)
+ serializer, err := NewStorageSerializer(bw.metaCache, bw.schema)
if err != nil {
return nil, err
}
@ -192,7 +197,7 @@ func (bw *BulkPackWriter) writeStats(ctx context.Context, pack *SyncPack) (map[i
if len(pack.insertData) == 0 {
return make(map[int64]*datapb.FieldBinlog), nil
}
- serializer, err := NewStorageSerializer(bw.metaCache)
+ serializer, err := NewStorageSerializer(bw.metaCache, bw.schema)
if err != nil {
return nil, err
}
@ -240,7 +245,7 @@ func (bw *BulkPackWriter) writeBM25Stasts(ctx context.Context, pack *SyncPack) (
return make(map[int64]*datapb.FieldBinlog), nil
}
- serializer, err := NewStorageSerializer(bw.metaCache)
+ serializer, err := NewStorageSerializer(bw.metaCache, bw.schema)
if err != nil {
return nil, err
}
@ -297,7 +302,7 @@ func (bw *BulkPackWriter) writeDelta(ctx context.Context, pack *SyncPack) (*data
if pack.deltaData == nil {
return &datapb.FieldBinlog{}, nil
}
- s, err := NewStorageSerializer(bw.metaCache)
+ s, err := NewStorageSerializer(bw.metaCache, bw.schema)
if err != nil {
return nil, err
}


@ -49,7 +49,7 @@ func TestNextID(t *testing.T) {
i++
return rt, nil
}
- bw := NewBulkPackWriter(nil, nil, al)
+ bw := NewBulkPackWriter(nil, nil, nil, al)
bw.prefetchIDs(new(SyncPack).WithFlush())
t.Run("normal_next", func(t *testing.T) {
@ -116,6 +116,7 @@ func TestBulkPackWriter_Write(t *testing.T) {
bw := &BulkPackWriter{
metaCache: mc,
+ schema: schema,
chunkManager: cm,
allocator: allocator.NewLocalAllocator(10000, 100000),
}


@ -49,17 +49,18 @@ type BulkPackWriterV2 struct {
multiPartUploadSize int64
}
- func NewBulkPackWriterV2(metaCache metacache.MetaCache, chunkManager storage.ChunkManager,
+ func NewBulkPackWriterV2(metaCache metacache.MetaCache, schema *schemapb.CollectionSchema, chunkManager storage.ChunkManager,
allocator allocator.Interface, bufferSize, multiPartUploadSize int64, writeRetryOpts ...retry.Option,
) *BulkPackWriterV2 {
return &BulkPackWriterV2{
BulkPackWriter: &BulkPackWriter{
metaCache: metaCache,
+ schema: schema,
chunkManager: chunkManager,
allocator: allocator,
writeRetryOpts: writeRetryOpts,
},
- schema: metaCache.Schema(),
+ schema: schema,
bufferSize: bufferSize,
multiPartUploadSize: multiPartUploadSize,
}


@ -121,7 +121,7 @@ func (s *PackWriterV2Suite) TestPackWriterV2_Write() {
pack := new(SyncPack).WithCollectionID(collectionID).WithPartitionID(partitionID).WithSegmentID(segmentID).WithChannelName(channelName).WithInsertData(genInsertData(rows, s.schema)).WithDeleteData(deletes)
- bw := NewBulkPackWriterV2(mc, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
+ bw := NewBulkPackWriterV2(mc, s.schema, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
gotInserts, _, _, _, _, err := bw.Write(context.Background(), pack)
s.NoError(err)
@ -139,7 +139,7 @@ func (s *PackWriterV2Suite) TestWriteEmptyInsertData() {
mc.EXPECT().Schema().Return(s.schema).Maybe()
pack := new(SyncPack).WithCollectionID(collectionID).WithPartitionID(partitionID).WithSegmentID(segmentID).WithChannelName(channelName)
- bw := NewBulkPackWriterV2(mc, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
+ bw := NewBulkPackWriterV2(mc, s.schema, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
_, _, _, _, _, err := bw.Write(context.Background(), pack)
s.NoError(err)
@ -168,7 +168,7 @@ func (s *PackWriterV2Suite) TestNoPkField() {
buf.Append(data)
pack := new(SyncPack).WithCollectionID(collectionID).WithPartitionID(partitionID).WithSegmentID(segmentID).WithChannelName(channelName).WithInsertData([]*storage.InsertData{buf})
- bw := NewBulkPackWriterV2(mc, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
+ bw := NewBulkPackWriterV2(mc, s.schema, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
_, _, _, _, _, err := bw.Write(context.Background(), pack)
s.Error(err)
@ -185,7 +185,7 @@ func (s *PackWriterV2Suite) TestAllocIDExhausedError() {
mc.EXPECT().Schema().Return(s.schema).Maybe()
pack := new(SyncPack).WithCollectionID(collectionID).WithPartitionID(partitionID).WithSegmentID(segmentID).WithChannelName(channelName).WithInsertData(genInsertData(rows, s.schema))
- bw := NewBulkPackWriterV2(mc, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
+ bw := NewBulkPackWriterV2(mc, s.schema, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
_, _, _, _, _, err := bw.Write(context.Background(), pack)
s.Error(err)
@ -206,7 +206,7 @@ func (s *PackWriterV2Suite) TestWriteInsertDataError() {
buf.Append(data)
pack := new(SyncPack).WithCollectionID(collectionID).WithPartitionID(partitionID).WithSegmentID(segmentID).WithChannelName(channelName).WithInsertData([]*storage.InsertData{buf})
- bw := NewBulkPackWriterV2(mc, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
+ bw := NewBulkPackWriterV2(mc, s.schema, s.cm, s.logIDAlloc, packed.DefaultWriteBufferSize, 0)
_, _, _, _, _, err := bw.Write(context.Background(), pack)
s.Error(err)


@ -41,8 +41,7 @@ type storageV1Serializer struct {
metacache metacache.MetaCache
}
- func NewStorageSerializer(metacache metacache.MetaCache) (*storageV1Serializer, error) {
- schema := metacache.Schema()
+ func NewStorageSerializer(metacache metacache.MetaCache, schema *schemapb.CollectionSchema) (*storageV1Serializer, error) {
pkField := lo.FindOrElse(schema.GetFields(), nil, func(field *schemapb.FieldSchema) bool { return field.GetIsPrimaryKey() })
if pkField == nil {
return nil, merr.WrapErrServiceInternal("cannot find pk field")


@ -91,10 +91,10 @@ func (s *StorageV1SerializerSuite) SetupSuite() {
}
func (s *StorageV1SerializerSuite) SetupTest() {
- s.mockCache.EXPECT().Schema().Return(s.schema)
+ s.mockCache.EXPECT().Schema().Return(s.schema).Maybe()
var err error
- s.serializer, err = NewStorageSerializer(s.mockCache)
+ s.serializer, err = NewStorageSerializer(s.mockCache, s.schema)
s.Require().NoError(err)
}
@ -253,8 +253,7 @@ func (s *StorageV1SerializerSuite) TestSerializeDelete() {
func (s *StorageV1SerializerSuite) TestBadSchema() {
mockCache := metacache.NewMockMetaCache(s.T())
- mockCache.EXPECT().Schema().Return(&schemapb.CollectionSchema{}).Once()
- _, err := NewStorageSerializer(mockCache)
+ _, err := NewStorageSerializer(mockCache, &schemapb.CollectionSchema{})
s.Error(err)
}


@ -25,6 +25,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
+ "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/allocator"
"github.com/milvus-io/milvus/internal/flushcommon/metacache"
"github.com/milvus-io/milvus/internal/json"
@ -62,6 +63,7 @@ type SyncTask struct {
metacache metacache.MetaCache
metaWriter MetaWriter
+ schema *schemapb.CollectionSchema // schema for when buffer created, could be different from the current one in metacache
pack *SyncPack
@ -126,14 +128,14 @@ func (t *SyncTask) Run(ctx context.Context) (err error) {
switch segmentInfo.GetStorageVersion() {
case storage.StorageV2:
- writer := NewBulkPackWriterV2(t.metacache, t.chunkManager, t.allocator, t.syncBufferSize, t.multiPartUploadSize, t.writeRetryOpts...)
+ writer := NewBulkPackWriterV2(t.metacache, t.schema, t.chunkManager, t.allocator, t.syncBufferSize, t.multiPartUploadSize, t.writeRetryOpts...)
t.insertBinlogs, t.deltaBinlog, t.statsBinlogs, t.bm25Binlogs, t.flushedSize, err = writer.Write(ctx, t.pack)
if err != nil {
log.Warn("failed to write sync data with storage v2 format", zap.Error(err))
return err
}
default:
- writer := NewBulkPackWriter(t.metacache, t.chunkManager, t.allocator, t.writeRetryOpts...)
+ writer := NewBulkPackWriter(t.metacache, t.schema, t.chunkManager, t.allocator, t.writeRetryOpts...)
t.insertBinlogs, t.deltaBinlog, t.statsBinlogs, t.bm25Binlogs, t.flushedSize, err = writer.Write(ctx, t.pack)
if err != nil {
log.Warn("failed to write sync data", zap.Error(err))


@ -159,7 +159,8 @@ func (s *SyncTaskSuite) getSuiteSyncTask(pack *SyncPack) *SyncTask {
WithChannelName(s.channelName)).
WithAllocator(s.allocator).
WithChunkManager(s.chunkManager).
- WithMetaCache(s.metacache)
+ WithMetaCache(s.metacache).
+ WithSchema(s.schema)
return task
}
@ -235,7 +236,7 @@ func (s *SyncTaskSuite) runTestRunNormal(storageVersion int64) {
MsgID: []byte{1, 2, 3, 4},
Timestamp: 100,
}))
- task.WithMetaWriter(BrokerMetaWriter(s.broker, 1))
+ task.WithMetaWriter(BrokerMetaWriter(s.broker, 1)).WithSchema(s.schema)
if storageVersion == storage.StorageV2 {
task.WithMultiPartUploadSize(0)
task.WithSyncBufferSize(packed.DefaultWriteBufferSize)
@ -255,7 +256,7 @@ func (s *SyncTaskSuite) runTestRunNormal(storageVersion int64) {
MsgID: []byte{1, 2, 3, 4},
Timestamp: 100,
}))
- task.WithMetaWriter(BrokerMetaWriter(s.broker, 1))
+ task.WithMetaWriter(BrokerMetaWriter(s.broker, 1)).WithSchema(s.schema)
if storageVersion == storage.StorageV2 {
task.WithMultiPartUploadSize(0)
task.WithSyncBufferSize(packed.DefaultWriteBufferSize)
@ -274,7 +275,7 @@ func (s *SyncTaskSuite) runTestRunNormal(storageVersion int64) {
MsgID: []byte{1, 2, 3, 4},
Timestamp: 100,
}))
- task.WithMetaWriter(BrokerMetaWriter(s.broker, 1))
+ task.WithMetaWriter(BrokerMetaWriter(s.broker, 1)).WithSchema(s.schema)
if storageVersion == storage.StorageV2 {
task.WithMultiPartUploadSize(0)
task.WithSyncBufferSize(packed.DefaultWriteBufferSize)
@ -303,7 +304,7 @@ func (s *SyncTaskSuite) TestRunL0Segment() {
MsgID: []byte{1, 2, 3, 4},
Timestamp: 100,
}))
- task.WithMetaWriter(BrokerMetaWriter(s.broker, 1))
+ task.WithMetaWriter(BrokerMetaWriter(s.broker, 1)).WithSchema(s.schema)
err := task.Run(ctx)
s.NoError(err)
@ -324,7 +325,7 @@ func (s *SyncTaskSuite) TestRunL0Segment() {
MsgID: []byte{1, 2, 3, 4},
Timestamp: 100,
}))
- task.WithMetaWriter(BrokerMetaWriter(s.broker, 1))
+ task.WithMetaWriter(BrokerMetaWriter(s.broker, 1)).WithSchema(s.schema)
err := task.Run(ctx)
s.NoError(err)


@ -32,10 +32,11 @@ func (buf *segmentBuffer) IsFull() bool {
return buf.insertBuffer.IsFull() || buf.deltaBuffer.IsFull()
}
- func (buf *segmentBuffer) Yield() (insert []*storage.InsertData, bm25stats map[int64]*storage.BM25Stats, delete *storage.DeleteData) {
+ func (buf *segmentBuffer) Yield() (insert []*storage.InsertData, bm25stats map[int64]*storage.BM25Stats, delete *storage.DeleteData, schema *schemapb.CollectionSchema) {
insert = buf.insertBuffer.Yield()
bm25stats = buf.insertBuffer.YieldStats()
delete = buf.deltaBuffer.Yield()
+ schema = buf.insertBuffer.collSchema
return
}


@ -371,19 +371,19 @@ func (wb *writeBufferBase) getOrCreateBuffer(segmentID int64) *segmentBuffer {
return buffer
}
- func (wb *writeBufferBase) yieldBuffer(segmentID int64) ([]*storage.InsertData, map[int64]*storage.BM25Stats, *storage.DeleteData, *TimeRange, *msgpb.MsgPosition) {
+ func (wb *writeBufferBase) yieldBuffer(segmentID int64) ([]*storage.InsertData, map[int64]*storage.BM25Stats, *storage.DeleteData, *schemapb.CollectionSchema, *TimeRange, *msgpb.MsgPosition) {
buffer, ok := wb.buffers[segmentID]
if !ok {
- return nil, nil, nil, nil, nil
+ return nil, nil, nil, nil, nil, nil
}
// remove buffer and move it to sync manager
delete(wb.buffers, segmentID)
start := buffer.EarliestPosition()
timeRange := buffer.GetTimeRange()
- insert, bm25, delta := buffer.Yield()
+ insert, bm25, delta, schema := buffer.Yield()
- return insert, bm25, delta, timeRange, start
+ return insert, bm25, delta, schema, timeRange, start
}
type InsertData struct {
@ -543,7 +543,7 @@ func (wb *writeBufferBase) getSyncTask(ctx context.Context, segmentID int64) (sy
var totalMemSize float64 = 0
var tsFrom, tsTo uint64
- insert, bm25, delta, timeRange, startPos := wb.yieldBuffer(segmentID)
+ insert, bm25, delta, schema, timeRange, startPos := wb.yieldBuffer(segmentID)
if timeRange != nil {
tsFrom, tsTo = timeRange.timestampMin, timeRange.timestampMax
}
@ -600,6 +600,7 @@ func (wb *writeBufferBase) getSyncTask(ctx context.Context, segmentID int64) (sy
WithAllocator(wb.allocator).
WithMetaWriter(wb.metaWriter).
WithMetaCache(wb.metaCache).
+ WithSchema(schema).
WithSyncPack(pack).
WithMultiPartUploadSize(packed.DefaultMultiPartUploadSize)
return task, nil
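
Taken together, the segment buffer now remembers which schema it was created with, and that snapshot travels with the yielded data into the sync task, so a buffer filled before a schema change (for example an added field) is flushed with the schema it actually contains rather than the latest one in the meta cache. A rough sketch of the flow inside getSyncTask after this change, with pack assembly and error handling omitted; the NewSyncTask constructor at the start of the chain is assumed, since the hunk above only shows the builder calls:

// Sketch only: schema comes from the yielded buffer, not from wb.metaCache.Schema().
insert, bm25, delta, schema, timeRange, startPos := wb.yieldBuffer(segmentID)
// ... tsFrom/tsTo, memory accounting and the SyncPack (pack) are derived from these values ...
task := syncmgr.NewSyncTask().
	WithAllocator(wb.allocator).
	WithMetaWriter(wb.metaWriter).
	WithMetaCache(wb.metaCache).
	WithSchema(schema). // schema snapshot from the buffer being flushed
	WithSyncPack(pack).
	WithMultiPartUploadSize(packed.DefaultMultiPartUploadSize)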