// milvus/internal/datacoord/compaction_view_forcemerge_test.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datacoord

import (
	"fmt"
	"testing"
	"time"

	"github.com/samber/lo"
	"github.com/stretchr/testify/assert"
)

func TestForceMergeSegmentView_GetGroupLabel(t *testing.T) {
	label := &CompactionGroupLabel{
		CollectionID: 1,
		PartitionID:  10,
		Channel:      "ch1",
	}
	view := &ForceMergeSegmentView{
		label: label,
	}
	assert.Equal(t, label, view.GetGroupLabel())
}

func TestForceMergeSegmentView_GetSegmentsView(t *testing.T) {
	segments := []*SegmentView{
		{ID: 1, Size: 1024},
		{ID: 2, Size: 2048},
	}
	view := &ForceMergeSegmentView{
		segments: segments,
	}
	assert.Equal(t, segments, view.GetSegmentsView())
	assert.Len(t, view.GetSegmentsView(), 2)
}

func TestForceMergeSegmentView_Append(t *testing.T) {
	view := &ForceMergeSegmentView{
		segments: []*SegmentView{
			{ID: 1, Size: 1024},
		},
	}
	newSegments := []*SegmentView{
		{ID: 2, Size: 2048},
		{ID: 3, Size: 3072},
	}
	view.Append(newSegments...)
	assert.Len(t, view.segments, 3)
	assert.Equal(t, int64(1), view.segments[0].ID)
	assert.Equal(t, int64(2), view.segments[1].ID)
	assert.Equal(t, int64(3), view.segments[2].ID)
}

func TestForceMergeSegmentView_String(t *testing.T) {
	label := &CompactionGroupLabel{
		CollectionID: 1,
		PartitionID:  10,
		Channel:      "ch1",
	}
	view := &ForceMergeSegmentView{
		label: label,
		segments: []*SegmentView{
			{ID: 1},
			{ID: 2},
		},
		triggerID: 12345,
	}
	str := view.String()
	assert.Contains(t, str, "ForceMerge")
	assert.Contains(t, str, "segments=2")
	assert.Contains(t, str, "triggerID=12345")
}
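
// Trigger and ForceTrigger are expected to panic on a ForceMergeSegmentView,
// presumably because force-merge plans are generated only through
// ForceTriggerAll (exercised in TestForceMergeSegmentView_Complete below).
// The tests assert only that a panic occurs, not its message.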
func TestForceMergeSegmentView_Trigger(t *testing.T) {
	view := &ForceMergeSegmentView{
		triggerID: 100,
	}
	assert.Panics(t, func() {
		view.Trigger()
	})
}

func TestForceMergeSegmentView_ForceTrigger(t *testing.T) {
	view := &ForceMergeSegmentView{
		triggerID: 100,
	}
	assert.Panics(t, func() {
		view.ForceTrigger()
	})
}

func TestForceMergeSegmentView_GetTriggerID(t *testing.T) {
	view := &ForceMergeSegmentView{
		triggerID: 12345,
	}
	assert.Equal(t, int64(12345), view.GetTriggerID())
}
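
// TestForceMergeSegmentView_Complete builds a fully populated view, including
// a CollectionTopology and a target size, and checks that ForceTriggerAll
// returns one result view for this single-group input along with a non-empty
// reason string.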
func TestForceMergeSegmentView_Complete(t *testing.T) {
	label := &CompactionGroupLabel{
		CollectionID: 100,
		PartitionID:  200,
		Channel:      "test-channel",
	}
	segments := []*SegmentView{
		{ID: 1, Size: 1024 * 1024 * 1024},
		{ID: 2, Size: 512 * 1024 * 1024},
	}
	topology := &CollectionTopology{
		CollectionID:     100,
		NumReplicas:      1,
		IsStandaloneMode: false,
		QueryNodeMemory:  map[int64]uint64{1: 8 * 1024 * 1024 * 1024},
		DataNodeMemory:   map[int64]uint64{1: 8 * 1024 * 1024 * 1024},
	}
	view := &ForceMergeSegmentView{
		label:         label,
		segments:      segments,
		triggerID:     99999,
		collectionTTL: 24 * time.Hour,
		targetSize:    2048 * 1024 * 1024,
		topology:      topology,
	}
	// Verify the String output.
	str := view.String()
	assert.Contains(t, str, "ForceMerge")

	views, reason := view.ForceTriggerAll()
	assert.Len(t, views, 1)
	assert.NotEmpty(t, reason)
}
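
// TestGroupingAlgorithmsComparison runs the three grouping strategies used by
// force merge (adaptiveGroupSegments, largerGroupingSegments, and
// maxFullSegmentsGrouping) over the same inputs and compares them on three
// metrics: number of groups, number of full target-size segments produced,
// and number of undersized tails. Segment sizes in the cases below are given
// in GB and converted to bytes before grouping.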
func TestGroupingAlgorithmsComparison(t *testing.T) {
	type testCase struct {
		name       string
		segments   []float64
		targetSize float64
	}
	testCases := []testCase{
		{
			name:       "perfect fit - 5x2GB to 5GB",
			segments:   []float64{2, 2, 2, 2, 2},
			targetSize: 5,
		},
		{
			name:       "varying sizes - example from discussion",
			segments:   []float64{1.2, 1.3, 1.4, 1.8, 1.8, 1.8, 1.8, 1.8},
			targetSize: 3,
		},
		{
			name:       "small segments",
			segments:   []float64{0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5},
			targetSize: 2,
		},
		{
			name:       "large segments",
			segments:   []float64{3, 3, 3, 3},
			targetSize: 5,
		},
		{
			name:       "mixed sizes",
			segments:   []float64{0.5, 1, 1.5, 2, 2.5, 3},
			targetSize: 4,
		},
		{
			name:       "many small segments",
			segments:   []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
			targetSize: 3,
		},
		{
			name:       "uneven distribution",
			segments:   []float64{0.3, 0.4, 2.5, 0.3, 2.8, 0.5, 2.2},
			targetSize: 3,
		},
		{
			name:       "single segment",
			segments:   []float64{5},
			targetSize: 3,
		},
		{
			name:       "two segments perfect",
			segments:   []float64{2.5, 2.5},
			targetSize: 5,
		},
		{
			name:       "fibonacci-like sizes",
			segments:   []float64{1, 1, 2, 3, 5, 8},
			targetSize: 10,
		},
		{
			name:       "near-perfect split - tests greedy vs optimal",
			segments:   []float64{1.5, 1.5, 1.5, 1.5, 1.5, 1.5},
			targetSize: 3,
		},
		{
			name:       "strategic grouping - [2.8,0.3] vs [2.8,0.2,0.1]",
			segments:   []float64{2.8, 0.2, 0.1, 2.8, 0.3},
			targetSize: 3,
		},
		{
			name:       "tail optimization - many small + one large",
			segments:   []float64{0.5, 0.5, 0.5, 0.5, 0.5, 2.5},
			targetSize: 3,
		},
		{
			name:       "alternating sizes for different strategies",
			segments:   []float64{1.0, 2.5, 1.0, 2.5, 1.0, 2.5},
			targetSize: 4,
		},
		{
			name:       "edge case - slightly over target creates decision point",
			segments:   []float64{2.1, 2.1, 2.1, 2.1, 2.1},
			targetSize: 4,
		},
		{
			name:       "optimal vs greedy - can fit 3 full or 2 full + small tail",
			segments:   []float64{1.8, 1.8, 1.8, 1.8, 1.8, 1.5},
			targetSize: 3,
		},
		{
			name:       "many segments with complex optimal solution",
			segments:   []float64{0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8},
			targetSize: 2,
		},
		{
			name:       "greedy stops early, optimal continues",
			segments:   []float64{2.8, 0.2, 0.1, 2.8, 0.3},
			targetSize: 3.0,
		},
		{
			name:       "Balanced vs Larger - distribution vs grouping",
			segments:   []float64{1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
			targetSize: 2.5,
		},
		{
			name: "MaxFull achieves theoretical maximum when possible",
			// Perfect case: 6x1.5GB segments, target=3GB.
			// Total=9GB, so the theoretical max is 3 full segments.
			segments:   []float64{1.5, 1.5, 1.5, 1.5, 1.5, 1.5},
			targetSize: 3.0,
		},
		{
			name:       "Larger creates fewer compaction tasks",
			segments:   lo.Times(20, func(i int) float64 { return 0.5 }),
			targetSize: 2.0,
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Convert the GB sizes to SegmentViews with sizes in bytes.
			segments := make([]*SegmentView, len(tc.segments))
			for i, size := range tc.segments {
				segments[i] = &SegmentView{
					ID:   int64(i + 1),
					Size: size * 1024 * 1024 * 1024,
				}
			}
			targetSize := tc.targetSize * 1024 * 1024 * 1024
			totalSize := sumSegmentSize(segments)
			theoreticalMaxFull := int(totalSize / targetSize)

			// Run all three algorithms on the same input.
			groupsBalanced := adaptiveGroupSegments(segments, targetSize)
			groupsLarger := largerGroupingSegments(segments, targetSize)
			groupsMax := maxFullSegmentsGrouping(segments, targetSize)

			// Helper to count full segments and undersized tails per grouping.
			countMetrics := func(groups [][]*SegmentView) (numGroups, numFull, numTails int) {
				numGroups = len(groups)
				for _, group := range groups {
					groupSize := sumSegmentSize(group)
					full := int(groupSize / targetSize)
					remainder := groupSize - float64(full)*targetSize
					numFull += full
					// Sizes are in bytes, so any remainder above this tiny
					// epsilon counts as a tail.
					if remainder > 0.01 {
						numTails++
					}
				}
				return
			}
			// Helper to verify every segment is used exactly once.
			verifyAllSegmentsUsed := func(groups [][]*SegmentView) bool {
				seen := make(map[int64]int)
				for _, group := range groups {
					for _, seg := range group {
						seen[seg.ID]++
					}
				}
				if len(seen) != len(segments) {
					return false
				}
				for _, count := range seen {
					if count != 1 {
						return false
					}
				}
				return true
			}
			// Verify all algorithms use each segment exactly once.
			assert.True(t, verifyAllSegmentsUsed(groupsBalanced), "adaptiveGroupSegments: all segments must be used exactly once")
			assert.True(t, verifyAllSegmentsUsed(groupsLarger), "largerGroupingSegments: all segments must be used exactly once")
			assert.True(t, verifyAllSegmentsUsed(groupsMax), "maxFullSegmentsGrouping: all segments must be used exactly once")

			// Collect metrics.
			adaptiveGroups, adaptiveFull, adaptiveTails := countMetrics(groupsBalanced)
			largerGroups, largerFull, largerTails := countMetrics(groupsLarger)
			maxGroups, maxFull, maxTails := countMetrics(groupsMax)
			t.Logf("Total size: %.1f GB, Target: %.1f GB, Theoretical max full: %d",
				totalSize/(1024*1024*1024), targetSize/(1024*1024*1024), theoreticalMaxFull)
			t.Logf("Adaptive: %d groups, %d full, %d tails", adaptiveGroups, adaptiveFull, adaptiveTails)
			t.Logf("Larger: %d groups, %d full, %d tails", largerGroups, largerFull, largerTails)
			t.Logf("MaxFull: %d groups, %d full, %d tails", maxGroups, maxFull, maxTails)

			// Assertions:
			// 1. maxFullSegmentsGrouping should produce the most full segments.
			assert.GreaterOrEqual(t, maxFull, largerFull, "maxFullSegmentsGrouping should produce at least as many full segments as largerGroupingSegments")
			// 2. maxFullSegmentsGrouping must not exceed the theoretical maximum.
			assert.LessOrEqual(t, maxFull, theoreticalMaxFull, "cannot exceed theoretical maximum")
			// 3. Every algorithm must process every segment.
			for _, groups := range [][][]*SegmentView{groupsBalanced, groupsLarger, groupsMax} {
				totalProcessed := 0
				for _, group := range groups {
					totalProcessed += len(group)
				}
				assert.Equal(t, len(segments), totalProcessed)
			}
		})
	}
}
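
// TestAdaptiveGroupSegments checks the dispatch behavior suggested by the
// sub-test names: for small inputs the adaptive strategy is expected to use
// maxFullSegmentsGrouping, and for inputs above defaultMaxFullSegmentThreshold
// (assumed here to be below 200, per the comment in the third sub-test) it is
// expected to fall back to the cheaper largerGroupingSegments.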
func TestAdaptiveGroupSegments(t *testing.T) {
	t.Run("empty segments", func(t *testing.T) {
		groups := adaptiveGroupSegments(nil, 5*1024*1024*1024)
		assert.Nil(t, groups)
	})
	t.Run("uses maxFull for small segment count", func(t *testing.T) {
		segments := []*SegmentView{
			{ID: 1, Size: 1.5 * 1024 * 1024 * 1024},
			{ID: 2, Size: 1.5 * 1024 * 1024 * 1024},
			{ID: 3, Size: 1.5 * 1024 * 1024 * 1024},
			{ID: 4, Size: 1.5 * 1024 * 1024 * 1024},
		}
		groups := adaptiveGroupSegments(segments, 3*1024*1024*1024)
		// Should produce 2 groups with 2 full segments
		assert.Equal(t, 2, len(groups))
	})
	t.Run("uses larger for large segment count", func(t *testing.T) {
		// Create 200 segments (> defaultMaxFullSegmentThreshold)
		segments := make([]*SegmentView, 200)
		for i := 0; i < 200; i++ {
			segments[i] = &SegmentView{
				ID:   int64(i),
				Size: 1 * 1024 * 1024 * 1024,
			}
		}
		groups := adaptiveGroupSegments(segments, 3*1024*1024*1024)
		// Should use larger algorithm
		assert.NotNil(t, groups)
		assert.Greater(t, len(groups), 0)
	})
}
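
// Edge cases for largerGroupingSegments (mirrored for maxFullSegmentsGrouping
// below): a nil input should yield no groups, and a single undersized segment
// should come back as one singleton group.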
func TestLargerGroupingSegments(t *testing.T) {
	t.Run("empty segments", func(t *testing.T) {
		groups := largerGroupingSegments(nil, 5*1024*1024*1024)
		assert.Nil(t, groups)
	})
	t.Run("single segment", func(t *testing.T) {
		segments := []*SegmentView{
			{ID: 1, Size: 3 * 1024 * 1024 * 1024},
		}
		groups := largerGroupingSegments(segments, 5*1024*1024*1024)
		assert.Equal(t, 1, len(groups))
		assert.Equal(t, 1, len(groups[0]))
	})
}

func TestMaxFullSegmentsGrouping(t *testing.T) {
	t.Run("empty segments", func(t *testing.T) {
		groups := maxFullSegmentsGrouping(nil, 5*1024*1024*1024)
		assert.Nil(t, groups)
	})
	t.Run("single segment", func(t *testing.T) {
		segments := []*SegmentView{
			{ID: 1, Size: 3 * 1024 * 1024 * 1024},
		}
		groups := maxFullSegmentsGrouping(segments, 5*1024*1024*1024)
		assert.Equal(t, 1, len(groups))
		assert.Equal(t, 1, len(groups[0]))
	})
	t.Run("perfect fit achieves theoretical maximum", func(t *testing.T) {
		segments := []*SegmentView{
			{ID: 1, Size: 2.5 * 1024 * 1024 * 1024},
			{ID: 2, Size: 2.5 * 1024 * 1024 * 1024},
			{ID: 3, Size: 2.5 * 1024 * 1024 * 1024},
			{ID: 4, Size: 2.5 * 1024 * 1024 * 1024},
		}
		targetSize := 5.0 * 1024 * 1024 * 1024
		groups := maxFullSegmentsGrouping(segments, targetSize)
		totalFull := 0
		for _, group := range groups {
			groupSize := sumSegmentSize(group)
			totalFull += int(groupSize / targetSize)
		}
		// Total is 10GB, so this should produce exactly 2 full 5GB segments.
		assert.Equal(t, 2, totalFull)
	})
}

func TestSumSegmentSize(t *testing.T) {
	segments := []*SegmentView{
		{ID: 1, Size: 1024 * 1024 * 1024},
		{ID: 2, Size: 512 * 1024 * 1024},
	}
	totalSize := sumSegmentSize(segments)
	expected := 1.5 * 1024 * 1024 * 1024
	assert.InDelta(t, expected, totalSize, 1)
}
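
// groupByPartitionChannel is exercised below with two labels that share a
// channel but differ in partition: segments are expected to be bucketed per
// (collection, partition, channel) label, yielding one two-segment group and
// one single-segment group.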
func TestGroupByPartitionChannel(t *testing.T) {
	label1 := &CompactionGroupLabel{
		CollectionID: 1,
		PartitionID:  10,
		Channel:      "ch1",
	}
	label2 := &CompactionGroupLabel{
		CollectionID: 1,
		PartitionID:  20,
		Channel:      "ch1",
	}
	segments := []*SegmentView{
		{ID: 1, label: label1},
		{ID: 2, label: label1},
		{ID: 3, label: label2},
	}
	groups := groupByPartitionChannel(segments)
	assert.Equal(t, 2, len(groups))

	// One group should hold the two label1 segments, the other the single
	// label2 segment.
	var twoSegGroups, oneSegGroups int
	for _, segs := range groups {
		if len(segs) == 2 {
			twoSegGroups++
		} else if len(segs) == 1 {
			oneSegGroups++
		}
	}
	assert.Equal(t, 1, twoSegGroups)
	assert.Equal(t, 1, oneSegGroups)
}

func TestGroupByPartitionChannel_EmptySegments(t *testing.T) {
	groups := groupByPartitionChannel([]*SegmentView{})
	assert.Empty(t, groups)
}

func TestGroupByPartitionChannel_SameLabel(t *testing.T) {
	label := &CompactionGroupLabel{
		CollectionID: 1,
		PartitionID:  10,
		Channel:      "ch1",
	}
	segments := []*SegmentView{
		{ID: 1, label: label},
		{ID: 2, label: label},
		{ID: 3, label: label},
	}
	groups := groupByPartitionChannel(segments)
	assert.Equal(t, 1, len(groups))
	for _, segs := range groups {
		assert.Equal(t, 3, len(segs))
	}
}

// Benchmark tests
func BenchmarkLargerGroupingSegments(b *testing.B) {
	sizes := []int{10, 50, 100, 500}
	for _, n := range sizes {
		b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
			segments := make([]*SegmentView, n)
			for i := 0; i < n; i++ {
				segments[i] = &SegmentView{
					ID:   int64(i),
					Size: float64((i%10+1)*100*1024*1024 + i*1024*1024),
				}
			}
			targetSize := float64(3 * 1024 * 1024 * 1024)
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				largerGroupingSegments(segments, targetSize)
			}
		})
	}
}

func BenchmarkMaxFullSegmentsGrouping(b *testing.B) {
	sizes := []int{10, 50, 100}
	for _, n := range sizes {
		b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
			segments := make([]*SegmentView, n)
			for i := 0; i < n; i++ {
				segments[i] = &SegmentView{
					ID:   int64(i),
					Size: float64((i%10+1)*100*1024*1024 + i*1024*1024),
				}
			}
			targetSize := float64(3 * 1024 * 1024 * 1024)
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				maxFullSegmentsGrouping(segments, targetSize)
			}
		})
	}
}
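
// BenchmarkGroupingAlgorithmsComparison benchmarks all three strategies over
// the same synthetic size distribution. maxFullSegmentsGrouping is skipped
// above n=200 because of its higher algorithmic complexity (noted as O(n³) in
// the comment inside the loop).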
func BenchmarkGroupingAlgorithmsComparison(b *testing.B) {
	sizes := []int{10, 50, 100, 200, 500}
	targetSize := float64(3 * 1024 * 1024 * 1024)
	for _, n := range sizes {
		segments := make([]*SegmentView, n)
		for i := 0; i < n; i++ {
			segments[i] = &SegmentView{
				ID:   int64(i),
				Size: float64((i%10+1)*100*1024*1024 + i*1024*1024),
			}
		}
		b.Run(fmt.Sprintf("adaptive/n=%d", n), func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				adaptiveGroupSegments(segments, targetSize)
			}
		})
		b.Run(fmt.Sprintf("larger/n=%d", n), func(b *testing.B) {
			for i := 0; i < b.N; i++ {
				largerGroupingSegments(segments, targetSize)
			}
		})
		// Only test maxFull with smaller sizes due to O(n³) complexity
		if n <= 200 {
			b.Run(fmt.Sprintf("maxFull/n=%d", n), func(b *testing.B) {
				for i := 0; i < b.N; i++ {
					maxFullSegmentsGrouping(segments, targetSize)
				}
			})
		}
	}
}

func BenchmarkGroupByPartitionChannel(b *testing.B) {
	sizes := []int{10, 100, 1000}
	for _, n := range sizes {
		b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
			segments := make([]*SegmentView, n)
			for i := 0; i < n; i++ {
				label := &CompactionGroupLabel{
					CollectionID: 1,
					PartitionID:  int64(i % 5),
					Channel:      fmt.Sprintf("ch%d", i%3),
				}
				segments[i] = &SegmentView{
					ID:    int64(i),
					label: label,
				}
			}
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				_ = groupByPartitionChannel(segments)
			}
		})
	}
}