mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
enhance: limit the gc concurrency when cpu is high (#43059)
issue: #42833 Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
parent
1d9a9a993d
commit
e97e44d56e
@ -679,6 +679,7 @@ dataCoord:
|
||||
dropTolerance: 10800 # The retention duration of the binlog files of the deleted segments before they are cleared, unit: second.
|
||||
removeConcurrent: 32 # number of concurrent goroutines to remove dropped s3 objects
|
||||
scanInterval: 168 # orphan file (file on oss but has not been registered on meta) on object storage garbage collection scanning interval in hours
|
||||
slowDownCPUUsageThreshold: 0.6 # The CPU usage threshold at which the garbage collection will be slowed down
|
||||
enableActiveStandby: false
|
||||
brokerTimeout: 5000 # 5000ms, dataCoord broker rpc timeout
|
||||
autoBalance: true # Enable auto balance
|
||||
|
||||
@ -39,6 +39,7 @@ import (
|
||||
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/conc"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/funcutil"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/hardware"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/metautil"
|
||||
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
|
||||
@ -73,13 +74,42 @@ type garbageCollector struct {
|
||||
wg sync.WaitGroup
|
||||
cmdCh chan gcCmd
|
||||
pauseUntil atomic.Time
|
||||
|
||||
systemMetricsListener *hardware.SystemMetricsListener
|
||||
}
|
||||
|
||||
// gcCmd is a control command delivered to the garbage collector through its
// cmdCh channel and consumed by the control loop.
type gcCmd struct {
|
||||
// cmdType selects which GC command is being requested.
cmdType datapb.GcCommand
|
||||
// duration is a time span attached to the command.
// NOTE(review): presumably the pause length for pause-style commands — confirm against the control loop.
duration time.Duration
|
||||
// done is a signal-only channel carried with the command.
// NOTE(review): presumably closed by the control loop once the command has been handled — confirm.
done chan struct{}
|
||||
}
|
||||
|
||||
// newSystemMetricsListener creates a system metrics listener for garbage collector.
|
||||
// used to slow down the garbage collector when cpu usage is high.
|
||||
func newSystemMetricsListener(opt *GcOption) *hardware.SystemMetricsListener {
|
||||
return &hardware.SystemMetricsListener{
|
||||
Cooldown: 15 * time.Second,
|
||||
Context: false,
|
||||
Condition: func(metrics hardware.SystemMetrics, listener *hardware.SystemMetricsListener) bool { return true },
|
||||
Callback: func(metrics hardware.SystemMetrics, listener *hardware.SystemMetricsListener) {
|
||||
isSlowDown := listener.Context.(bool)
|
||||
if metrics.UsedRatio() > paramtable.Get().DataCoordCfg.GCSlowDownCPUUsageThreshold.GetAsFloat() {
|
||||
if !isSlowDown {
|
||||
log.Info("garbage collector slow down...", zap.Float64("cpuUsage", metrics.UsedRatio()))
|
||||
opt.removeObjectPool.Resize(1)
|
||||
listener.Context = true
|
||||
}
|
||||
return
|
||||
}
|
||||
if isSlowDown {
|
||||
log.Info("garbage collector slow down finished", zap.Float64("cpuUsage", metrics.UsedRatio()))
|
||||
opt.removeObjectPool.Resize(paramtable.Get().DataCoordCfg.GCRemoveConcurrent.GetAsInt())
|
||||
listener.Context = false
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// newGarbageCollector creates a garbage collector with the given meta, handler and option.
|
||||
func newGarbageCollector(meta *meta, handler Handler, opt GcOption) *garbageCollector {
|
||||
log.Info("GC with option",
|
||||
@ -97,6 +127,7 @@ func newGarbageCollector(meta *meta, handler Handler, opt GcOption) *garbageColl
|
||||
handler: handler,
|
||||
option: opt,
|
||||
cmdCh: make(chan gcCmd),
|
||||
systemMetricsListener: newSystemMetricsListener(&opt),
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,6 +213,9 @@ func (gc *garbageCollector) work(ctx context.Context) {
|
||||
|
||||
// startControlLoop starts the control loop for garbageCollector.
|
||||
func (gc *garbageCollector) startControlLoop(_ context.Context) {
|
||||
hardware.RegisterSystemMetricsListener(gc.systemMetricsListener)
|
||||
defer hardware.UnregisterSystemMetricsListener(gc.systemMetricsListener)
|
||||
|
||||
for {
|
||||
select {
|
||||
case cmd := <-gc.cmdCh:
|
||||
@ -337,6 +371,7 @@ func (gc *garbageCollector) recycleUnusedBinLogWithChecker(ctx context.Context,
|
||||
|
||||
// ignore error since it could be cleaned up next time
|
||||
file := chunkInfo.FilePath
|
||||
|
||||
future := gc.option.removeObjectPool.Submit(func() (struct{}, error) {
|
||||
logger := logger.With(zap.String("file", file))
|
||||
logger.Info("garbageCollector recycleUnusedBinlogFiles remove file...")
|
||||
|
||||
@ -56,11 +56,11 @@ func (m *sealWorker) loop() {
|
||||
timer := time.NewTicker(m.timePolicyCheckInterval)
|
||||
listener := &hardware.SystemMetricsListener{
|
||||
Cooldown: 30 * time.Second,
|
||||
Condition: func(sm hardware.SystemMetrics) bool {
|
||||
Condition: func(sm hardware.SystemMetrics, _ *hardware.SystemMetricsListener) bool {
|
||||
memoryThreshold := m.statsManager.getConfig().memoryThreshold
|
||||
return sm.UsedRatio() > memoryThreshold
|
||||
},
|
||||
Callback: func(sm hardware.SystemMetrics) {
|
||||
Callback: func(sm hardware.SystemMetrics, _ *hardware.SystemMetricsListener) {
|
||||
select {
|
||||
case memoryNotifier <- policy.PolicyNodeMemory(sm.UsedRatio()):
|
||||
// the repeated notify can be ignored.
|
||||
|
||||
@ -41,9 +41,10 @@ func (s SystemMetrics) String() string {
|
||||
// SystemMetricsListener is a listener that listens for system metrics.
// On each metrics update the watcher calls Condition with the latest metrics
// and the listener itself; when Condition returns true it calls Callback and
// then schedules the next trigger one Cooldown later.
|
||||
type SystemMetricsListener struct {
|
||||
// nextTriggerInstant is set by the watcher to now+Cooldown after the callback runs.
// NOTE(review): presumably the listener is skipped until this instant has passed — confirm in updateMetrics.
nextTriggerInstant time.Time
|
||||
// Context is arbitrary listener-owned state; Condition and Callback receive
// the listener and may read or replace it.
Context any
|
||||
// Cooldown is the delay added to the current time after each callback to compute nextTriggerInstant.
Cooldown time.Duration
|
||||
Condition func(SystemMetrics) bool // condition to trigger the callback
|
||||
Callback func(SystemMetrics) // callback function if the condition met, should be non-blocking.
|
||||
Condition func(SystemMetrics, *SystemMetricsListener) bool // condition to trigger the callback
|
||||
Callback func(SystemMetrics, *SystemMetricsListener) // callback function if the condition met, should be non-blocking.
|
||||
}
|
||||
|
||||
// RegisterSystemMetricsListener registers a listener into global default systemMetricsWatcher.
|
||||
@ -63,10 +64,10 @@ func getSystemMetricsWatcher() *SystemMericsWatcher {
|
||||
logger := log.With(log.FieldComponent("system-metrics"))
|
||||
warningLoggerListener := &SystemMetricsListener{
|
||||
Cooldown: 1 * time.Minute,
|
||||
Condition: func(stats SystemMetrics) bool {
|
||||
Condition: func(stats SystemMetrics, listener *SystemMetricsListener) bool {
|
||||
return stats.UsedRatio() > 0.9
|
||||
},
|
||||
Callback: func(sm SystemMetrics) {
|
||||
Callback: func(sm SystemMetrics, listener *SystemMetricsListener) {
|
||||
logger.Warn("memory used ratio is extremely high", zap.String("memory", sm.String()), zap.Float64("usedRatio", sm.UsedRatio()))
|
||||
},
|
||||
}
|
||||
@ -150,8 +151,8 @@ func (w *SystemMericsWatcher) updateMetrics() {
|
||||
// cool down.
|
||||
continue
|
||||
}
|
||||
if l.Condition(stats) {
|
||||
l.Callback(stats)
|
||||
if l.Condition(stats, l) {
|
||||
l.Callback(stats, l)
|
||||
l.nextTriggerInstant = now.Add(l.Cooldown)
|
||||
}
|
||||
}
|
||||
|
||||
@ -13,14 +13,19 @@ func TestListener(t *testing.T) {
|
||||
called := atomic.NewInt32(0)
|
||||
l := &SystemMetricsListener{
|
||||
Cooldown: 100 * time.Millisecond,
|
||||
Condition: func(stats SystemMetrics) bool {
|
||||
Context: false,
|
||||
Condition: func(stats SystemMetrics, listener *SystemMetricsListener) bool {
|
||||
assert.NotZero(t, stats.UsedMemoryBytes)
|
||||
assert.NotZero(t, stats.TotalMemoryBytes)
|
||||
assert.NotZero(t, stats.UsedRatio())
|
||||
assert.NotEmpty(t, stats.String())
|
||||
assert.False(t, listener.Context.(bool))
|
||||
listener.Context = true
|
||||
return true
|
||||
},
|
||||
Callback: func(sm SystemMetrics) {
|
||||
Callback: func(sm SystemMetrics, listener *SystemMetricsListener) {
|
||||
ctx := listener.Context.(bool)
|
||||
assert.True(t, ctx)
|
||||
assert.NotZero(t, sm.UsedMemoryBytes)
|
||||
assert.NotZero(t, sm.TotalMemoryBytes)
|
||||
assert.NotZero(t, sm.UsedRatio())
|
||||
@ -37,6 +42,7 @@ func TestListener(t *testing.T) {
|
||||
|
||||
l2 := &SystemMetricsListener{
|
||||
Cooldown: 100 * time.Millisecond,
|
||||
Context: false,
|
||||
Condition: l.Condition,
|
||||
Callback: l.Callback,
|
||||
}
|
||||
|
||||
@ -4025,6 +4025,7 @@ type dataCoordConfig struct {
|
||||
GCDropTolerance ParamItem `refreshable:"false"`
|
||||
GCRemoveConcurrent ParamItem `refreshable:"false"`
|
||||
GCScanIntervalInHour ParamItem `refreshable:"false"`
|
||||
GCSlowDownCPUUsageThreshold ParamItem `refreshable:"false"`
|
||||
EnableActiveStandby ParamItem `refreshable:"false"`
|
||||
|
||||
BindIndexNodeMode ParamItem `refreshable:"false"`
|
||||
@ -4703,6 +4704,15 @@ During compaction, the size of segment # of rows is able to exceed segment max #
|
||||
}
|
||||
p.GCScanIntervalInHour.Init(base.mgr)
|
||||
|
||||
p.GCSlowDownCPUUsageThreshold = ParamItem{
|
||||
Key: "dataCoord.gc.slowDownCPUUsageThreshold",
|
||||
Version: "2.6.0",
|
||||
DefaultValue: "0.6",
|
||||
Doc: "The CPU usage threshold at which the garbage collection will be slowed down",
|
||||
Export: true,
|
||||
}
|
||||
p.GCSlowDownCPUUsageThreshold.Init(base.mgr)
|
||||
|
||||
// Do not set this to an incredibly small value; make sure it is at least 10 minutes.
|
||||
p.GCMissingTolerance = ParamItem{
|
||||
Key: "dataCoord.gc.missingTolerance",
|
||||
|
||||
@ -527,6 +527,9 @@ func TestComponentParam(t *testing.T) {
|
||||
params.Save("datacoord.gracefulStopTimeout", "100")
|
||||
assert.Equal(t, 100*time.Second, Params.GracefulStopTimeout.GetAsDuration(time.Second))
|
||||
|
||||
assert.Equal(t, 0.6, Params.GCSlowDownCPUUsageThreshold.GetAsFloat())
|
||||
params.Save("dataCoord.gc.slowDownCPUUsageThreshold", "0.5")
|
||||
assert.Equal(t, 0.5, Params.GCSlowDownCPUUsageThreshold.GetAsFloat())
|
||||
params.Save("dataCoord.compaction.gcInterval", "100")
|
||||
assert.Equal(t, float64(100), Params.CompactionGCIntervalInSeconds.GetAsDuration(time.Second).Seconds())
|
||||
params.Save("dataCoord.compaction.dropTolerance", "100")
|
||||
|
||||
@ -98,7 +98,7 @@ func (s *SealSuite) TestSealByTotalGrowingSegmentsSize() {
|
||||
var segments []*datapb.SegmentInfo
|
||||
segments, err = c.ShowSegments(collectionName)
|
||||
s.NoError(err)
|
||||
s.NotEmpty(segments)
|
||||
// The segment may still be in the growing state, or may not yet be visible in meta, right after the insert.
|
||||
flushedSegments := lo.Filter(segments, func(segment *datapb.SegmentInfo, _ int) bool {
|
||||
return segment.GetState() == commonpb.SegmentState_Flushed
|
||||
})
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user