enhance: Estimate index task slot using field size instead of segment size (#46275)

issue: #45186

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2025-12-23 11:23:22 +08:00 committed by GitHub
parent 5e525eb3bf
commit 5911cb44e0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 41 additions and 9 deletions

View File

@ -179,8 +179,9 @@ func (i *indexInspector) createIndexForSegment(ctx context.Context, segment *Seg
indexParams := i.meta.indexMeta.GetIndexParams(segment.CollectionID, indexID)
indexType := GetIndexType(indexParams)
isVectorIndex := vecindexmgr.GetVecIndexMgrInstance().IsVecIndex(indexType)
segSize := segment.getSegmentSize()
taskSlot := calculateIndexTaskSlot(segSize, isVectorIndex)
fieldID := i.meta.indexMeta.GetFieldIDByIndexID(segment.CollectionID, indexID)
fieldSize := segment.getFieldBinlogSize(fieldID)
taskSlot := calculateIndexTaskSlot(fieldSize, isVectorIndex)
// rewrite the index type if needed, and this final index type will be persisted in the meta
if isVectorIndex && Params.KnowhereConfig.Enable.GetAsBool() {
@ -219,7 +220,9 @@ func (i *indexInspector) createIndexForSegment(ctx context.Context, segment *Seg
log.Info("indexInspector create index for segment success",
zap.Int64("segmentID", segment.ID),
zap.Int64("indexID", indexID),
zap.Int64("segment size", segSize),
zap.Int64("fieldID", fieldID),
zap.Int64("segment size", segment.getSegmentSize()),
zap.Int64("field size", fieldSize),
zap.Int64("task slot", taskSlot))
return nil
}
@ -237,7 +240,9 @@ func (i *indexInspector) reloadFromMeta() {
indexParams := i.meta.indexMeta.GetIndexParams(segment.CollectionID, segIndex.IndexID)
indexType := GetIndexType(indexParams)
isVectorIndex := vecindexmgr.GetVecIndexMgrInstance().IsVecIndex(indexType)
taskSlot := calculateIndexTaskSlot(segment.getSegmentSize(), isVectorIndex)
fieldID := i.meta.indexMeta.GetFieldIDByIndexID(segment.CollectionID, segIndex.IndexID)
fieldSize := segment.getFieldBinlogSize(fieldID)
taskSlot := calculateIndexTaskSlot(fieldSize, isVectorIndex)
i.scheduler.Enqueue(newIndexBuildTask(
model.CloneSegmentIndex(segIndex),

View File

@ -532,6 +532,29 @@ func (s *SegmentInfo) getSegmentSize() int64 {
return s.size.Load()
}
// getFieldBinlogSize returns the summed memory size of the binlogs recorded
// for fieldID in this segment.
//
// A binlog entry is counted when its own field ID equals fieldID, or, failing
// that, when fieldID appears in the entry's child-field list (once per
// occurrence in that list). When the resulting sum is not positive, e.g.
// because no entry refers to fieldID, the whole segment size is returned
// instead.
func (s *SegmentInfo) getFieldBinlogSize(fieldID int64) int64 {
	var total int64
	for _, fieldBinlog := range s.GetBinlogs() {
		// hits is how many times this entry's logs contribute to the total.
		hits := 0
		if fieldBinlog.GetFieldID() == fieldID {
			hits = 1
		} else {
			for _, child := range fieldBinlog.GetChildFields() {
				if child == fieldID {
					hits++
				}
			}
		}
		if hits == 0 {
			continue
		}

		var entrySize int64
		for _, binlog := range fieldBinlog.GetBinlogs() {
			entrySize += binlog.GetMemorySize()
		}
		total += int64(hits) * entrySize
	}

	if total > 0 {
		return total
	}
	// Nothing usable was found for this field; use the segment size instead.
	return s.getSegmentSize()
}
// Any edits on deltalogs of flushed segments will reset deltaRowcount to -1
func (s *SegmentInfo) getDeltaCount() int64 {
if s.deltaRowcount.Load() < 0 || s.GetState() != commonpb.SegmentState_Flushed {

View File

@ -110,6 +110,7 @@ func TestGetSegmentSize(t *testing.T) {
SegmentInfo: &datapb.SegmentInfo{
Binlogs: []*datapb.FieldBinlog{
{
FieldID: 1,
Binlogs: []*datapb.Binlog{
{
LogID: 1,
@ -143,6 +144,9 @@ func TestGetSegmentSize(t *testing.T) {
assert.Equal(t, int64(3), segment.getSegmentSize())
assert.Equal(t, int64(3), segment.getSegmentSize())
assert.Equal(t, int64(1), segment.getFieldBinlogSize(1))
// field 2 has no binlogs, so getFieldBinlogSize falls back to getSegmentSize
assert.Equal(t, int64(3), segment.getFieldBinlogSize(2))
}
func TestIsDeltaLogExists(t *testing.T) {

View File

@ -373,17 +373,17 @@ func getSortStatus(sorted bool) string {
return "unsorted"
}
func calculateIndexTaskSlot(segmentSize int64, isVectorIndex bool) int64 {
func calculateIndexTaskSlot(fieldSize int64, isVectorIndex bool) int64 {
defaultSlots := Params.DataCoordCfg.IndexTaskSlotUsage.GetAsInt64()
if !isVectorIndex {
defaultSlots = Params.DataCoordCfg.ScalarIndexTaskSlotUsage.GetAsInt64()
}
if segmentSize > 512*1024*1024 {
taskSlot := max(segmentSize/512/1024/1024, 1) * defaultSlots
if fieldSize > 512*1024*1024 {
taskSlot := max(fieldSize/512/1024/1024, 1) * defaultSlots
return max(taskSlot, 1)
} else if segmentSize > 100*1024*1024 {
} else if fieldSize > 100*1024*1024 {
return max(defaultSlots/4, 1)
} else if segmentSize > 10*1024*1024 {
} else if fieldSize > 10*1024*1024 {
return max(defaultSlots/16, 1)
}
return max(defaultSlots/64, 1)