fix: Fix uneven import segment sizes (#33605)

The data coordinator has already computed the appropriate number of import segments,
so the data node can simply pick a segment at random for each batch instead of tracking
per-segment imported sizes; random selection keeps the imported data evenly distributed
across segments.

issue: https://github.com/milvus-io/milvus/issues/33604
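
To illustrate the idea: because the coordinator allocates enough segments up front, assigning each fixed-size batch to a uniformly random candidate segment leaves every segment near totalSize/numSegments. Below is a minimal, standalone Go sketch of that behavior (hypothetical segment IDs and sizes, chosen to mirror the unit test added in this commit), not the Milvus implementation itself:

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	// Hypothetical pre-allocated segments, mirroring the unit test added in this commit.
	segments := []int64{100, 101, 102, 103}
	importedSize := make(map[int64]int64)

	totalSize := int64(8) << 30  // 8 GiB to import
	batchSize := int64(16) << 20 // 16 MiB per sync batch

	// Assign each batch to a uniformly random candidate segment.
	for remaining := totalSize; remaining > 0; remaining -= batchSize {
		picked := segments[rand.Intn(len(segments))]
		importedSize[picked] += batchSize
	}

	// Each segment lands near totalSize/len(segments) = 2 GiB.
	for _, id := range segments {
		fmt.Printf("segment %d: %.2f GiB\n", id, float64(importedSize[id])/float64(1<<30))
	}
}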

---------

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
yihao.dai 2024-06-05 15:41:51 +08:00 committed by GitHub
parent 545d4725fb
commit bbdf99a45e
3 changed files with 60 additions and 23 deletions


@@ -224,7 +224,6 @@ func (t *ImportTask) sync(task *ImportTask, hashedData HashedData) ([]*conc.Futu
 	log.Info("start to sync import data", WrapLogFields(task)...)
 	futures := make([]*conc.Future[struct{}], 0)
 	syncTasks := make([]syncmgr.Task, 0)
-	segmentImportedSizes := make(map[int64]int)
 	for channelIdx, datas := range hashedData {
 		channel := task.GetVchannels()[channelIdx]
 		for partitionIdx, data := range datas {
@@ -232,13 +231,11 @@
 				continue
 			}
 			partitionID := task.GetPartitionIDs()[partitionIdx]
-			size := data.GetMemorySize()
-			segmentID := PickSegment(task, segmentImportedSizes, channel, partitionID, size)
+			segmentID := PickSegment(task, channel, partitionID)
 			syncTask, err := NewSyncTask(task.ctx, task, segmentID, partitionID, channel, data)
 			if err != nil {
 				return nil, nil, err
 			}
-			segmentImportedSizes[segmentID] += size
 			future := t.syncMgr.SyncData(task.ctx, syncTask)
 			futures = append(futures, future)
 			syncTasks = append(syncTasks, syncTask)


@@ -19,7 +19,9 @@ package importv2
 import (
 	"context"
 	"fmt"
+	"math/rand"
 	"strconv"
+	"time"
 
 	"github.com/samber/lo"
 	"go.uber.org/zap"
@@ -34,7 +36,6 @@ import (
 	"github.com/milvus-io/milvus/pkg/common"
 	"github.com/milvus-io/milvus/pkg/log"
 	"github.com/milvus-io/milvus/pkg/util/merr"
-	"github.com/milvus-io/milvus/pkg/util/paramtable"
 	"github.com/milvus-io/milvus/pkg/util/typeutil"
 )
@@ -99,28 +100,13 @@ func NewImportSegmentInfo(syncTask syncmgr.Task, task *ImportTask) (*datapb.Impo
 	}, nil
 }
 
-func PickSegment(task *ImportTask, segmentImportedSizes map[int64]int, vchannel string, partitionID int64, sizeToImport int) int64 {
+func PickSegment(task *ImportTask, vchannel string, partitionID int64) int64 {
 	candidates := lo.Filter(task.req.GetRequestSegments(), func(info *datapb.ImportRequestSegment, _ int) bool {
 		return info.GetVchannel() == vchannel && info.GetPartitionID() == partitionID
 	})
 
-	segmentMaxSize := paramtable.Get().DataCoordCfg.SegmentMaxSize.GetAsInt() * 1024 * 1024
-	for _, candidate := range candidates {
-		sizeImported := segmentImportedSizes[candidate.GetSegmentID()]
-		if sizeImported+sizeToImport <= segmentMaxSize {
-			return candidate.GetSegmentID()
-		}
-	}
-
-	segmentID := lo.MinBy(task.GetSegmentsInfo(), func(s1, s2 *datapb.ImportSegmentInfo) bool {
-		return segmentImportedSizes[s1.GetSegmentID()] < segmentImportedSizes[s2.GetSegmentID()]
-	}).GetSegmentID()
-	log.Warn("failed to pick an appropriate segment, opt for the smallest one instead",
-		WrapLogFields(task, zap.Int64("segmentID", segmentID),
-			zap.Int("sizeToImport", sizeToImport),
-			zap.Int("sizeImported", segmentImportedSizes[segmentID]),
-			zap.Int("segmentMaxSize", segmentMaxSize))...)
-	return segmentID
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+	return candidates[r.Intn(len(candidates))].GetSegmentID()
 }
 
 func CheckRowsEqual(schema *schemapb.CollectionSchema, data *storage.InsertData) error {


@@ -113,3 +113,57 @@ func Test_UnsetAutoID(t *testing.T) {
 		}
 	}
 }
+
+func Test_PickSegment(t *testing.T) {
+	const (
+		vchannel    = "ch-0"
+		partitionID = 10
+	)
+	task := &ImportTask{
+		req: &datapb.ImportRequest{
+			RequestSegments: []*datapb.ImportRequestSegment{
+				{
+					SegmentID:   100,
+					PartitionID: partitionID,
+					Vchannel:    vchannel,
+				},
+				{
+					SegmentID:   101,
+					PartitionID: partitionID,
+					Vchannel:    vchannel,
+				},
+				{
+					SegmentID:   102,
+					PartitionID: partitionID,
+					Vchannel:    vchannel,
+				},
+				{
+					SegmentID:   103,
+					PartitionID: partitionID,
+					Vchannel:    vchannel,
+				},
+			},
+		},
+	}
+
+	importedSize := map[int64]int{}
+	totalSize := 8 * 1024 * 1024 * 1024
+	batchSize := 16 * 1024 * 1024
+
+	for totalSize > 0 {
+		picked := PickSegment(task, vchannel, partitionID)
+		importedSize[picked] += batchSize
+		totalSize -= batchSize
+	}
+	expectSize := 2 * 1024 * 1024 * 1024
+	fn := func(actual int) {
+		t.Logf("actual=%d, expect*0.8=%f, expect*1.2=%f", actual, float64(expectSize)*0.8, float64(expectSize)*1.2)
+		assert.True(t, float64(actual) > float64(expectSize)*0.8)
+		assert.True(t, float64(actual) < float64(expectSize)*1.2)
+	}
+	fn(importedSize[int64(100)])
+	fn(importedSize[int64(101)])
+	fn(importedSize[int64(102)])
+	fn(importedSize[int64(103)])
+}