// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package syncmgr

import (
	"context"
	"encoding/base64"
	"math"
	"path"

	"github.com/apache/arrow/go/v17/arrow/array"
	"github.com/apache/arrow/go/v17/arrow/memory"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/allocator"
	"github.com/milvus-io/milvus/internal/flushcommon/metacache"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/storagecommon"
	"github.com/milvus-io/milvus/internal/storagev2/packed"
	"github.com/milvus-io/milvus/internal/util/hookutil"
	"github.com/milvus-io/milvus/pkg/v2/common"
	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/indexcgopb"
	"github.com/milvus-io/milvus/pkg/v2/proto/indexpb"
	"github.com/milvus-io/milvus/pkg/v2/util/metautil"
	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
	"github.com/milvus-io/milvus/pkg/v2/util/retry"
	"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)

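// BulkPackWriterV2 writes a SyncPack in the storage v2 packed (column-group)
// format. It embeds BulkPackWriter for the stats, delta and BM25 blobs and
// adds the packed-writer specific state: column groups, storage config and
// the manifest path of the previous write.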
type BulkPackWriterV2 struct {
	*BulkPackWriter
	schema              *schemapb.CollectionSchema
	bufferSize          int64
	multiPartUploadSize int64
	storageConfig       *indexpb.StorageConfig
	columnGroups        []storagecommon.ColumnGroup
	manifestPath        string
}

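// NewBulkPackWriterV2 constructs a BulkPackWriterV2. curManifestPath carries
// the manifest produced by an earlier write of the same segment; pass an
// empty string for the first write.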
func NewBulkPackWriterV2(metaCache metacache.MetaCache, schema *schemapb.CollectionSchema, chunkManager storage.ChunkManager,
	allocator allocator.Interface, bufferSize, multiPartUploadSize int64,
	storageConfig *indexpb.StorageConfig, columnGroups []storagecommon.ColumnGroup, curManifestPath string, writeRetryOpts ...retry.Option,
) *BulkPackWriterV2 {
	return &BulkPackWriterV2{
		BulkPackWriter: &BulkPackWriter{
			metaCache:      metaCache,
			schema:         schema,
			chunkManager:   chunkManager,
			allocator:      allocator,
			writeRetryOpts: writeRetryOpts,
		},
		schema:              schema,
		bufferSize:          bufferSize,
		multiPartUploadSize: multiPartUploadSize,
		storageConfig:       storageConfig,
		columnGroups:        columnGroups,
		manifestPath:        curManifestPath,
	}
}

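// Write persists every blob of the SyncPack: insert data (as packed column
// groups), stats, deltas and BM25 stats. It returns the field binlogs per
// category, the written manifest path (empty when the manifest writer is not
// used) and the total size written.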
func (bw *BulkPackWriterV2) Write(ctx context.Context, pack *SyncPack) (
	inserts map[int64]*datapb.FieldBinlog,
	deltas *datapb.FieldBinlog,
	stats map[int64]*datapb.FieldBinlog,
	bm25Stats map[int64]*datapb.FieldBinlog,
	manifest string,
	size int64,
	err error,
) {
	if inserts, manifest, err = bw.writeInserts(ctx, pack); err != nil {
		log.Error("failed to write insert data", zap.Error(err))
		return
	}
	if stats, err = bw.writeStats(ctx, pack); err != nil {
		log.Error("failed to process stats blob", zap.Error(err))
		return
	}
	if deltas, err = bw.writeDelta(ctx, pack); err != nil {
		log.Error("failed to process delta blob", zap.Error(err))
		return
	}
	if bm25Stats, err = bw.writeBM25Stasts(ctx, pack); err != nil {
		log.Error("failed to process bm25 stats blob", zap.Error(err))
		return
	}
	size = bw.sizeWritten
	return
}

// getRootPath returns the rootPath the current task shall use.
// When storageConfig is set, use the rootPath configured in it;
// otherwise, fall back to chunkManager.RootPath().
func (bw *BulkPackWriterV2) getRootPath() string {
	if bw.storageConfig != nil {
		return bw.storageConfig.RootPath
	}
	return bw.chunkManager.RootPath()
}

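// getBucketName returns the bucket the current task shall use: the one from
// storageConfig when set, otherwise the configured MinIO bucket.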
func (bw *BulkPackWriterV2) getBucketName() string {
	if bw.storageConfig != nil {
		return bw.storageConfig.BucketName
	}
	return paramtable.Get().ServiceParam.MinioCfg.BucketName.GetValue()
}

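// writeInserts serializes the buffered insert data into an Arrow record,
// derives the timestamp range from the timestamp column, attaches an
// encryption context when cluster encryption is enabled, and writes the
// record into storage with retry.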
func (bw *BulkPackWriterV2) writeInserts(ctx context.Context, pack *SyncPack) (map[int64]*datapb.FieldBinlog, string, error) {
	if len(pack.insertData) == 0 {
		return make(map[int64]*datapb.FieldBinlog), "", nil
	}

	rec, err := bw.serializeBinlog(ctx, pack)
	if err != nil {
		return nil, "", err
	}
	defer rec.Release()

	tsArray := rec.Column(common.TimeStampField).(*array.Int64)
	rows := rec.Len()
	var tsFrom uint64 = math.MaxUint64
	var tsTo uint64 = 0
	for i := 0; i < rows; i++ {
		ts := typeutil.Timestamp(tsArray.Value(i))
		if ts < tsFrom {
			tsFrom = ts
		}
		if ts > tsTo {
			tsTo = ts
		}
	}

	var pluginContextPtr *indexcgopb.StoragePluginContext
	if hookutil.IsClusterEncryptionEnabled() {
		ez := hookutil.GetEzByCollProperties(bw.schema.GetProperties(), pack.collectionID)
		if ez != nil {
			unsafe := hookutil.GetCipher().GetUnsafeKey(ez.EzID, ez.CollectionID)
			if len(unsafe) > 0 {
				pluginContext := indexcgopb.StoragePluginContext{
					EncryptionZoneId: ez.EzID,
					CollectionId:     ez.CollectionID,
					EncryptionKey:    base64.StdEncoding.EncodeToString(unsafe),
				}
				pluginContextPtr = &pluginContext
			}
		}
	}

	var logs map[int64]*datapb.FieldBinlog
	var manifestPath string
	if err := retry.Do(ctx, func() error {
		var err error
		logs, manifestPath, err = bw.writeInsertsIntoStorage(ctx, pluginContextPtr, pack, rec, tsFrom, tsTo)
		if err != nil {
			log.Warn("failed to write inserts into storage",
				zap.Int64("collectionID", pack.collectionID),
				zap.Int64("segmentID", pack.segmentID),
				zap.Error(err))
			return err
		}
		return nil
	}, bw.writeRetryOpts...); err != nil {
		return nil, "", err
	}
	return logs, manifestPath, nil
}

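// GetManifestInfo resolves the base path and version for the manifest writer.
// On the first write (empty manifest path) it derives the base path from the
// segment ID path and returns version -1; otherwise both are unmarshalled
// from the stored manifest path.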
func (bw *BulkPackWriterV2) GetManifestInfo(pack *SyncPack) (basePath string, version int64, err error) {
	// An empty manifest path means this is the first write;
	// initialize the manifest with version -1.
	if bw.manifestPath == "" {
		k := metautil.JoinIDPath(pack.collectionID, pack.partitionID, pack.segmentID)
		logicalPath := path.Join(bw.getRootPath(), common.SegmentInsertLogPath, k)
		bucketName := bw.getBucketName()
		// if storage config is not passed, use the common config
		storageType := paramtable.Get().CommonCfg.StorageType.GetValue()
		if bw.storageConfig != nil {
			storageType = bw.storageConfig.GetStorageType()
		}
		if storageType != "local" {
			basePath = path.Join(bucketName, logicalPath)
		}
		return basePath, -1, nil
	}
	return packed.UnmarshalManfestPath(bw.manifestPath)
}

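// writeInsertsIntoStorage writes the record either through the manifest-based
// packed writer (when Loon FFI is enabled or a manifest already exists) or
// through the plain packed writer with pre-allocated binlog paths, and
// collects a FieldBinlog per column group.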
func (bw *BulkPackWriterV2) writeInsertsIntoStorage(_ context.Context,
	pluginContextPtr *indexcgopb.StoragePluginContext,
	pack *SyncPack,
	rec storage.Record,
	tsFrom typeutil.Timestamp,
	tsTo typeutil.Timestamp,
) (map[int64]*datapb.FieldBinlog, string, error) {
	logs := make(map[int64]*datapb.FieldBinlog)
	columnGroups := bw.columnGroups
	bucketName := bw.getBucketName()
	var err error

	doWrite := func(w storage.RecordWriter) error {
		if err = w.Write(rec); err != nil {
			if closeErr := w.Close(); closeErr != nil {
				log.Error("failed to close writer after write failed", zap.Error(closeErr))
			}
			return err
		}
		// close first to get the stats & output paths
		return w.Close()
	}

	var manifestPath string
	if paramtable.Get().CommonCfg.UseLoonFFI.GetAsBool() || bw.manifestPath != "" {
		basePath, version, err := bw.GetManifestInfo(pack)
		if err != nil {
			return nil, "", err
		}
		w, err := storage.NewPackedRecordManifestWriter(bucketName, basePath, version, bw.schema, bw.bufferSize, bw.multiPartUploadSize, columnGroups, bw.storageConfig, pluginContextPtr)
		if err != nil {
			return nil, "", err
		}
		if err = doWrite(w); err != nil {
			return nil, "", err
		}
		for _, columnGroup := range columnGroups {
			columnGroupID := columnGroup.GroupID
			logs[columnGroupID] = &datapb.FieldBinlog{
				FieldID:     columnGroupID,
				ChildFields: columnGroup.Fields,
				Binlogs: []*datapb.Binlog{
					{
						LogSize:       int64(w.GetColumnGroupWrittenCompressed(columnGroup.GroupID)),
						MemorySize:    int64(w.GetColumnGroupWrittenUncompressed(columnGroup.GroupID)),
						LogPath:       w.GetWrittenPaths(columnGroupID),
						EntriesNum:    w.GetWrittenRowNum(),
						TimestampFrom: tsFrom,
						TimestampTo:   tsTo,
					},
				},
			}
		}
		manifestPath = w.GetWrittenManifest()
	} else {
		paths := make([]string, 0)
		for _, columnGroup := range columnGroups {
			id, err := bw.allocator.AllocOne()
			if err != nil {
				return nil, "", err
			}
			path := metautil.BuildInsertLogPath(bw.getRootPath(), pack.collectionID, pack.partitionID, pack.segmentID, columnGroup.GroupID, id)
			paths = append(paths, path)
		}
		w, err := storage.NewPackedRecordWriter(bucketName, paths, bw.schema, bw.bufferSize, bw.multiPartUploadSize, columnGroups, bw.storageConfig, pluginContextPtr)
		if err != nil {
			return nil, "", err
		}
		if err = doWrite(w); err != nil {
			return nil, "", err
		}
		// workaround to store row num
		for _, columnGroup := range columnGroups {
			columnGroupID := columnGroup.GroupID
			logs[columnGroupID] = &datapb.FieldBinlog{
				FieldID:     columnGroupID,
				ChildFields: columnGroup.Fields,
				Binlogs: []*datapb.Binlog{
					{
						LogSize:       int64(w.GetColumnGroupWrittenCompressed(columnGroup.GroupID)),
						MemorySize:    int64(w.GetColumnGroupWrittenUncompressed(columnGroup.GroupID)),
						LogPath:       w.GetWrittenPaths(columnGroupID),
						EntriesNum:    w.GetWrittenRowNum(),
						TimestampFrom: tsFrom,
						TimestampTo:   tsTo,
					},
				},
			}
		}
	}
	return logs, manifestPath, nil
}

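// serializeBinlog converts the buffered insert data into a single Arrow
// record following the collection schema and wraps it with a field-to-column
// mapping.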
func (bw *BulkPackWriterV2) serializeBinlog(_ context.Context, pack *SyncPack) (storage.Record, error) {
	if len(pack.insertData) == 0 {
		return nil, nil
	}
	arrowSchema, err := storage.ConvertToArrowSchema(bw.schema, true)
	if err != nil {
		return nil, err
	}
	builder := array.NewRecordBuilder(memory.DefaultAllocator, arrowSchema)
	defer builder.Release()

	for _, chunk := range pack.insertData {
		if err := storage.BuildRecord(builder, chunk, bw.schema); err != nil {
			return nil, err
		}
	}
	rec := builder.NewRecord()

	allFields := typeutil.GetAllFieldSchemas(bw.schema)
	field2Col := make(map[storage.FieldID]int, len(allFields))
	for c, field := range allFields {
		field2Col[field.FieldID] = c
	}
	return storage.NewSimpleArrowRecord(rec, field2Col), nil
}