mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
Cherry-pick from master pr: #45061 #45488 #45803 #46017 #44991 #45132 #45723 #45726 #45798 #45897 #45918 #44998 This feature integrates the Storage V2 (Loon) FFI interface as a unified storage layer for segment loading and index building in Milvus. It enables manifest-based data access, replacing the traditional binlog-based approach with a more efficient columnar storage format. Key changes: ### Segment Self-Managed Loading Architecture - Move segment loading orchestration from Go layer to C++ segcore - Add NewSegmentWithLoadInfo() API for passing load info during segment creation - Implement SetLoadInfo() and Load() methods in SegmentInterface - Support parallel loading of indexed and non-indexed fields - Enable both sealed and growing segments to self-manage loading ### Storage V2 FFI Integration - Integrate milvus-storage library's FFI interface for packed columnar data - Add manifest path support throughout the data path (SegmentInfo, LoadInfo) - Implement ManifestReader for generating manifests from binlogs - Support zero-copy data exchange using Arrow C Data Interface - Add ToCStorageConfig() for Go-to-C storage config conversion ### Manifest-Based Index Building - Extend FileManagerContext to carry loon_ffi_properties - Implement GetFieldDatasFromManifest() using Arrow C Stream interface - Support manifest-based reading in DiskFileManagerImpl and MemFileManagerImpl - Add fallback to traditional segment insert files when manifest unavailable ### Compaction Pipeline Updates - Include manifest path in all compaction task builders (clustering, L0, mix) - Update BulkPackWriterV2 to return manifest path - Propagate manifest metadata through compaction pipeline ### Configuration & Protocol - Add common.storageV2.useLoonFFI config option (default: false) - Add manifest_path field to SegmentLoadInfo and related proto messages - Add manifest field to compaction segment messages ### Bug Fixes - Fix mmap settings not applied during segment load (key typo fix) - Populate index info after segment loading to prevent redundant load tasks - Fix memory corruption by removing premature transaction handle destruction Related issues: #44956, #45060, #39173 ## Individual Cherry-Picked Commits 1. **e1c923b5cc** - fix: apply mmap settings correctly during segment load (#46017) 2. **63b912370b** - enhance: use milvus-storage internal C++ Reader API for Loon FFI (#45897) 3. **bfc192faa5** - enhance: Resolve issues integrating loon FFI (#45918) 4. **fb18564631** - enhance: support manifest-based index building with Loon FFI reader (#45726) 5. **b9ec2392b9** - enhance: integrate StorageV2 FFI interface for manifest-based segment loading (#45798) 6. **66db3c32e6** - enhance: integrate Storage V2 FFI interface for unified storage access (#45723) 7. **ae789273ac** - fix: populate index info after segment loading to prevent redundant load tasks (#45803) 8. **49688b0be2** - enhance: Move segment loading logic from Go layer to segcore for self-managed loading (#45488) 9. **5b2df88bac** - enhance: [StorageV2] Integrate FFI interface for packed reader (#45132) 10. **91ff5706ac** - enhance: [StorageV2] add manifest path support for FFI integration (#44991) 11. **2192bb4a85** - enhance: add NewSegmentWithLoadInfo API to support segment self-managed loading (#45061) 12. **4296b01da0** - enhance: update delta log serialization APIs to integrate storage V2 (#44998) ## Technical Details ### Architecture Changes - **Before**: Go layer orchestrated segment loading, making multiple CGO calls - **After**: Segments autonomously manage loading in C++ layer with single entry point ### Storage Access Pattern - **Before**: Read individual binlog files through Go storage layer - **After**: Read manifest file that references packed columnar data via FFI ### Benefits - Reduced cross-language call overhead - Better resource management at C++ level - Improved I/O performance through batched streaming reads - Cleaner separation of concerns between Go and C++ layers - Foundation for proactive schema evolution handling --------- Signed-off-by: Ted Xu <ted.xu@zilliz.com> Signed-off-by: Congqi Xia <congqi.xia@zilliz.com> Co-authored-by: Ted Xu <ted.xu@zilliz.com>
200 lines
5.6 KiB
Go
200 lines
5.6 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package metacache
|
|
|
|
import (
|
|
"go.uber.org/zap"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/msgpb"
|
|
"github.com/milvus-io/milvus/internal/flushcommon/metacache/pkoracle"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/internal/storagecommon"
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
|
)
|
|
|
|
type SegmentInfo struct {
|
|
segmentID int64
|
|
partitionID int64
|
|
state commonpb.SegmentState
|
|
startPosition *msgpb.MsgPosition
|
|
checkpoint *msgpb.MsgPosition
|
|
startPosRecorded bool
|
|
flushedRows int64
|
|
bufferRows int64
|
|
syncingRows int64
|
|
bfs pkoracle.PkStat
|
|
bm25stats *SegmentBM25Stats
|
|
level datapb.SegmentLevel
|
|
syncingTasks int32
|
|
storageVersion int64
|
|
binlogs []*datapb.FieldBinlog
|
|
statslogs []*datapb.FieldBinlog
|
|
deltalogs []*datapb.FieldBinlog
|
|
bm25logs []*datapb.FieldBinlog
|
|
currentSplit []storagecommon.ColumnGroup
|
|
manifestPath string
|
|
}
|
|
|
|
func (s *SegmentInfo) SegmentID() int64 {
|
|
return s.segmentID
|
|
}
|
|
|
|
func (s *SegmentInfo) PartitionID() int64 {
|
|
return s.partitionID
|
|
}
|
|
|
|
func (s *SegmentInfo) State() commonpb.SegmentState {
|
|
return s.state
|
|
}
|
|
|
|
// NumOfRows returns sum of number of rows,
|
|
// including flushed, syncing and buffered
|
|
func (s *SegmentInfo) NumOfRows() int64 {
|
|
return s.flushedRows + s.syncingRows + s.bufferRows
|
|
}
|
|
|
|
// FlushedRows return flushed rows number.
|
|
func (s *SegmentInfo) FlushedRows() int64 {
|
|
return s.flushedRows
|
|
}
|
|
|
|
func (s *SegmentInfo) StartPosition() *msgpb.MsgPosition {
|
|
return s.startPosition
|
|
}
|
|
|
|
func (s *SegmentInfo) Checkpoint() *msgpb.MsgPosition {
|
|
return s.checkpoint
|
|
}
|
|
|
|
func (s *SegmentInfo) GetHistory() []*storage.PkStatistics {
|
|
return s.bfs.GetHistory()
|
|
}
|
|
|
|
func (s *SegmentInfo) GetBloomFilterSet() pkoracle.PkStat {
|
|
return s.bfs
|
|
}
|
|
|
|
func (s *SegmentInfo) GetBM25Stats() *SegmentBM25Stats {
|
|
return s.bm25stats
|
|
}
|
|
|
|
func (s *SegmentInfo) Level() datapb.SegmentLevel {
|
|
return s.level
|
|
}
|
|
|
|
func (s *SegmentInfo) BufferRows() int64 {
|
|
return s.bufferRows
|
|
}
|
|
|
|
func (s *SegmentInfo) SyncingRows() int64 {
|
|
return s.syncingRows
|
|
}
|
|
|
|
func (s *SegmentInfo) GetStorageVersion() int64 {
|
|
return s.storageVersion
|
|
}
|
|
|
|
func (s *SegmentInfo) GetCurrentSplit() []storagecommon.ColumnGroup {
|
|
return s.currentSplit
|
|
}
|
|
|
|
func (s *SegmentInfo) Binlogs() []*datapb.FieldBinlog {
|
|
return s.binlogs
|
|
}
|
|
|
|
func (s *SegmentInfo) Statslogs() []*datapb.FieldBinlog {
|
|
return s.statslogs
|
|
}
|
|
|
|
func (s *SegmentInfo) Deltalogs() []*datapb.FieldBinlog {
|
|
return s.deltalogs
|
|
}
|
|
|
|
func (s *SegmentInfo) Bm25logs() []*datapb.FieldBinlog {
|
|
return s.bm25logs
|
|
}
|
|
|
|
func (s *SegmentInfo) ManifestPath() string {
|
|
return s.manifestPath
|
|
}
|
|
|
|
func (s *SegmentInfo) Clone() *SegmentInfo {
|
|
return &SegmentInfo{
|
|
segmentID: s.segmentID,
|
|
partitionID: s.partitionID,
|
|
state: s.state,
|
|
startPosition: s.startPosition,
|
|
checkpoint: s.checkpoint,
|
|
startPosRecorded: s.startPosRecorded,
|
|
flushedRows: s.flushedRows,
|
|
bufferRows: s.bufferRows,
|
|
syncingRows: s.syncingRows,
|
|
bfs: s.bfs,
|
|
level: s.level,
|
|
syncingTasks: s.syncingTasks,
|
|
bm25stats: s.bm25stats,
|
|
storageVersion: s.storageVersion,
|
|
binlogs: s.binlogs,
|
|
statslogs: s.statslogs,
|
|
deltalogs: s.deltalogs,
|
|
bm25logs: s.bm25logs,
|
|
currentSplit: s.currentSplit,
|
|
manifestPath: s.manifestPath,
|
|
}
|
|
}
|
|
|
|
func NewSegmentInfo(info *datapb.SegmentInfo, bfs pkoracle.PkStat, bm25Stats *SegmentBM25Stats) *SegmentInfo {
|
|
level := info.GetLevel()
|
|
if level == datapb.SegmentLevel_Legacy {
|
|
level = datapb.SegmentLevel_L1
|
|
}
|
|
// legacy split also share same field here
|
|
// shall be checked by caller
|
|
var currentSplit []storagecommon.ColumnGroup
|
|
if info.GetStorageVersion() == storage.StorageV2 && len(info.Binlogs) > 0 {
|
|
currentSplit = make([]storagecommon.ColumnGroup, 0, len(info.Binlogs))
|
|
for _, group := range info.Binlogs {
|
|
currentSplit = append(currentSplit, storagecommon.ColumnGroup{
|
|
GroupID: group.GetFieldID(),
|
|
Fields: group.GetChildFields(),
|
|
})
|
|
}
|
|
log.Info("recover split info", zap.Int64("segmentID", info.GetID()), zap.Stringers("columnGroup", currentSplit))
|
|
}
|
|
return &SegmentInfo{
|
|
segmentID: info.GetID(),
|
|
partitionID: info.GetPartitionID(),
|
|
state: info.GetState(),
|
|
flushedRows: info.GetNumOfRows(),
|
|
startPosition: info.GetStartPosition(),
|
|
checkpoint: info.GetDmlPosition(),
|
|
startPosRecorded: true,
|
|
level: level,
|
|
bfs: bfs,
|
|
bm25stats: bm25Stats,
|
|
storageVersion: info.GetStorageVersion(),
|
|
binlogs: info.GetBinlogs(),
|
|
statslogs: info.GetStatslogs(),
|
|
deltalogs: info.GetDeltalogs(),
|
|
bm25logs: info.GetBm25Statslogs(),
|
|
currentSplit: currentSplit,
|
|
manifestPath: info.GetManifestPath(),
|
|
}
|
|
}
|