enhance: Refine QueryNode task parallelism based on CPU core count (#42166)

issue: #42165
Implement dynamic task execution capacity calculation based on QueryNode
CPU core count instead of static configuration for better resource
utilization.

Changes include:
- Add CpuCoreNum() method and WithCpuCoreNum() option to NodeInfo
- Implement GetTaskExecutionCap() for dynamic capacity calculation
- Add QueryNodeTaskParallelismFactor parameter for tuning
- Update proto definition to include cpu_core_num field
- Add unit tests for new functionality

This allows QueryCoord to automatically adjust task parallelism based on
actual hardware resources.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2025-06-11 13:20:35 +08:00 committed by GitHub
parent 499e9a0a73
commit e7c0a6ffbb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 1180 additions and 1066 deletions

View File

@ -140,6 +140,7 @@ func (dh *distHandler) handleDistResp(ctx context.Context, resp *querypb.GetData
session.WithSegmentCnt(len(resp.GetSegments())), session.WithSegmentCnt(len(resp.GetSegments())),
session.WithChannelCnt(len(resp.GetChannels())), session.WithChannelCnt(len(resp.GetChannels())),
session.WithMemCapacity(resp.GetMemCapacityInMB()), session.WithMemCapacity(resp.GetMemCapacityInMB()),
session.WithChannelCnt(int(resp.GetCpuNum())),
) )
dh.updateSegmentsDistribution(ctx, resp) dh.updateSegmentsDistribution(ctx, resp)
dh.updateChannelsDistribution(ctx, resp) dh.updateChannelsDistribution(ctx, resp)

View File

@ -176,6 +176,12 @@ func (n *NodeInfo) MemCapacity() float64 {
return n.stats.getMemCapacity() return n.stats.getMemCapacity()
} }
func (n *NodeInfo) CPUNum() int64 {
n.mu.RLock()
defer n.mu.RUnlock()
return n.stats.getCPUNum()
}
func (n *NodeInfo) SetLastHeartbeat(time time.Time) { func (n *NodeInfo) SetLastHeartbeat(time time.Time) {
n.lastHeartbeat.Store(time.UnixNano()) n.lastHeartbeat.Store(time.UnixNano())
} }
@ -241,3 +247,9 @@ func WithMemCapacity(capacity float64) StatsOption {
n.setMemCapacity(capacity) n.setMemCapacity(capacity)
} }
} }
func WithCPUNum(num int64) StatsOption {
return func(n *NodeInfo) {
n.setCPUNum(num)
}
}

View File

@ -88,6 +88,58 @@ func (s *NodeManagerSuite) TestNodeInfo() {
s.NotNil(node.LastHeartbeat()) s.NotNil(node.LastHeartbeat())
} }
// TestCPUNumFunctionality tests the newly added CPU core number functionality
func (s *NodeManagerSuite) TestCPUNumFunctionality() {
node := NewNodeInfo(ImmutableNodeInfo{
NodeID: 1,
Address: "localhost:19530",
Hostname: "test-host",
})
// Test initial CPU core number
s.Equal(int64(0), node.CPUNum())
// Test WithCPUNum option
node.UpdateStats(WithCPUNum(8))
s.Equal(int64(8), node.CPUNum())
// Test updating CPU core number
node.UpdateStats(WithCPUNum(16))
s.Equal(int64(16), node.CPUNum())
// Test multiple stats update including CPU core number
node.UpdateStats(
WithSegmentCnt(100),
WithChannelCnt(5),
WithMemCapacity(4096.0),
WithCPUNum(32),
)
s.Equal(int64(32), node.CPUNum())
s.Equal(100, node.SegmentCnt())
s.Equal(5, node.ChannelCnt())
s.Equal(4096.0, node.MemCapacity())
}
// TestMemCapacityFunctionality tests memory capacity related methods
func (s *NodeManagerSuite) TestMemCapacityFunctionality() {
node := NewNodeInfo(ImmutableNodeInfo{
NodeID: 1,
Address: "localhost:19530",
Hostname: "test-host",
})
// Test initial memory capacity
s.Equal(float64(0), node.MemCapacity())
// Test WithMemCapacity option
node.UpdateStats(WithMemCapacity(1024.5))
s.Equal(1024.5, node.MemCapacity())
// Test updating memory capacity
node.UpdateStats(WithMemCapacity(2048.75))
s.Equal(2048.75, node.MemCapacity())
}
func TestNodeManagerSuite(t *testing.T) { func TestNodeManagerSuite(t *testing.T) {
suite.Run(t, new(NodeManagerSuite)) suite.Run(t, new(NodeManagerSuite))
} }

View File

@ -20,6 +20,7 @@ type stats struct {
segmentCnt int segmentCnt int
channelCnt int channelCnt int
memCapacityInMB float64 memCapacityInMB float64
CPUNum int64
} }
func (s *stats) setSegmentCnt(cnt int) { func (s *stats) setSegmentCnt(cnt int) {
@ -46,6 +47,14 @@ func (s *stats) getMemCapacity() float64 {
return s.memCapacityInMB return s.memCapacityInMB
} }
func (s *stats) setCPUNum(num int64) {
s.CPUNum = num
}
func (s *stats) getCPUNum() int64 {
return s.CPUNum
}
func newStats() stats { func newStats() stats {
return stats{} return stats{}
} }

View File

@ -19,6 +19,7 @@ package task
import ( import (
"context" "context"
"fmt" "fmt"
"math"
"sync" "sync"
"time" "time"
@ -54,6 +55,7 @@ var segmentsVersion = semver.Version{
} }
type Executor struct { type Executor struct {
nodeID int64
doneCh chan struct{} doneCh chan struct{}
wg sync.WaitGroup wg sync.WaitGroup
meta *meta.Meta meta *meta.Meta
@ -96,6 +98,17 @@ func (ex *Executor) Stop() {
ex.wg.Wait() ex.wg.Wait()
} }
func (ex *Executor) GetTaskExecutionCap() int32 {
nodeInfo := ex.nodeMgr.Get(ex.nodeID)
if nodeInfo == nil || nodeInfo.CPUNum() == 0 {
return Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32()
}
ret := int32(math.Ceil(float64(nodeInfo.CPUNum()) * Params.QueryCoordCfg.QueryNodeTaskParallelismFactor.GetAsFloat()))
return ret
}
// Execute executes the given action, // Execute executes the given action,
// does nothing and returns false if the action is already committed, // does nothing and returns false if the action is already committed,
// returns true otherwise. // returns true otherwise.
@ -104,7 +117,7 @@ func (ex *Executor) Execute(task Task, step int) bool {
if exist { if exist {
return false return false
} }
if ex.executingTaskNum.Inc() > Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32() { if ex.executingTaskNum.Inc() > ex.GetTaskExecutionCap() {
ex.executingTasks.Remove(task.Index()) ex.executingTasks.Remove(task.Index())
ex.executingTaskNum.Dec() ex.executingTaskNum.Dec()
return false return false

View File

@ -1305,6 +1305,7 @@ func (node *QueryNode) GetDataDistribution(ctx context.Context, req *querypb.Get
LeaderViews: leaderViews, LeaderViews: leaderViews,
LastModifyTs: lastModifyTs, LastModifyTs: lastModifyTs,
MemCapacityInMB: float64(hardware.GetMemoryCount() / 1024 / 1024), MemCapacityInMB: float64(hardware.GetMemoryCount() / 1024 / 1024),
CpuNum: int64(hardware.GetCPUNum()),
}, nil }, nil
} }

View File

@ -616,6 +616,7 @@ message GetDataDistributionResponse {
repeated LeaderView leader_views = 5; repeated LeaderView leader_views = 5;
int64 lastModifyTs = 6; int64 lastModifyTs = 6;
double memCapacityInMB = 7; double memCapacityInMB = 7;
int64 cpu_num = 8;
} }
message LeaderView { message LeaderView {

File diff suppressed because it is too large Load Diff

View File

@ -2120,6 +2120,9 @@ type queryCoordConfig struct {
BalanceSegmentBatchSize ParamItem `refreshable:"true"` BalanceSegmentBatchSize ParamItem `refreshable:"true"`
BalanceChannelBatchSize ParamItem `refreshable:"true"` BalanceChannelBatchSize ParamItem `refreshable:"true"`
EnableBalanceOnMultipleCollections ParamItem `refreshable:"true"` EnableBalanceOnMultipleCollections ParamItem `refreshable:"true"`
// query node task parallelism factor
QueryNodeTaskParallelismFactor ParamItem `refreshable:"true"`
} }
func (p *queryCoordConfig) init(base *BaseTable) { func (p *queryCoordConfig) init(base *BaseTable) {
@ -2743,6 +2746,15 @@ If this parameter is set false, Milvus simply searches the growing segments with
Export: false, Export: false,
} }
p.EnableBalanceOnMultipleCollections.Init(base.mgr) p.EnableBalanceOnMultipleCollections.Init(base.mgr)
p.QueryNodeTaskParallelismFactor = ParamItem{
Key: "queryCoord.queryNodeTaskParallelismFactor",
Version: "2.5.14",
DefaultValue: "1",
Doc: "the parallelism factor for query node task, which permit query node execute cpuNum * parallelismFactor tasks in parallel",
Export: false,
}
p.QueryNodeTaskParallelismFactor.Init(base.mgr)
} }
// ///////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////

View File

@ -389,6 +389,10 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, 5, Params.BalanceSegmentBatchSize.GetAsInt()) assert.Equal(t, 5, Params.BalanceSegmentBatchSize.GetAsInt())
assert.Equal(t, 1, Params.BalanceChannelBatchSize.GetAsInt()) assert.Equal(t, 1, Params.BalanceChannelBatchSize.GetAsInt())
assert.Equal(t, true, Params.EnableBalanceOnMultipleCollections.GetAsBool()) assert.Equal(t, true, Params.EnableBalanceOnMultipleCollections.GetAsBool())
assert.Equal(t, 1, Params.QueryNodeTaskParallelismFactor.GetAsInt())
params.Save("queryCoord.queryNodeTaskParallelismFactor", "2")
assert.Equal(t, 2, Params.QueryNodeTaskParallelismFactor.GetAsInt())
}) })
t.Run("test queryNodeConfig", func(t *testing.T) { t.Run("test queryNodeConfig", func(t *testing.T) {