enhance: Refine QueryNode task parallelism based on CPU core count (#42166)

issue: #42165
Implement dynamic task execution capacity calculation based on QueryNode
CPU core count instead of static configuration for better resource
utilization.

Changes include:
- Add CpuCoreNum() method and WithCpuCoreNum() option to NodeInfo
- Implement GetTaskExecutionCap() for dynamic capacity calculation
- Add QueryNodeTaskParallelismFactor parameter for tuning
- Update proto definition to include cpu_core_num field
- Add unit tests for new functionality

This allows QueryCoord to automatically adjust task parallelism based on
actual hardware resources.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2025-06-11 13:20:35 +08:00 committed by GitHub
parent 499e9a0a73
commit e7c0a6ffbb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 1180 additions and 1066 deletions

View File

@ -140,6 +140,7 @@ func (dh *distHandler) handleDistResp(ctx context.Context, resp *querypb.GetData
session.WithSegmentCnt(len(resp.GetSegments())),
session.WithChannelCnt(len(resp.GetChannels())),
session.WithMemCapacity(resp.GetMemCapacityInMB()),
session.WithChannelCnt(int(resp.GetCpuNum())),
)
dh.updateSegmentsDistribution(ctx, resp)
dh.updateChannelsDistribution(ctx, resp)

View File

@ -176,6 +176,12 @@ func (n *NodeInfo) MemCapacity() float64 {
return n.stats.getMemCapacity()
}
func (n *NodeInfo) CPUNum() int64 {
n.mu.RLock()
defer n.mu.RUnlock()
return n.stats.getCPUNum()
}
func (n *NodeInfo) SetLastHeartbeat(time time.Time) {
n.lastHeartbeat.Store(time.UnixNano())
}
@ -241,3 +247,9 @@ func WithMemCapacity(capacity float64) StatsOption {
n.setMemCapacity(capacity)
}
}
func WithCPUNum(num int64) StatsOption {
return func(n *NodeInfo) {
n.setCPUNum(num)
}
}

View File

@ -88,6 +88,58 @@ func (s *NodeManagerSuite) TestNodeInfo() {
s.NotNil(node.LastHeartbeat())
}
// TestCPUNumFunctionality tests the newly added CPU core number functionality
func (s *NodeManagerSuite) TestCPUNumFunctionality() {
node := NewNodeInfo(ImmutableNodeInfo{
NodeID: 1,
Address: "localhost:19530",
Hostname: "test-host",
})
// Test initial CPU core number
s.Equal(int64(0), node.CPUNum())
// Test WithCPUNum option
node.UpdateStats(WithCPUNum(8))
s.Equal(int64(8), node.CPUNum())
// Test updating CPU core number
node.UpdateStats(WithCPUNum(16))
s.Equal(int64(16), node.CPUNum())
// Test multiple stats update including CPU core number
node.UpdateStats(
WithSegmentCnt(100),
WithChannelCnt(5),
WithMemCapacity(4096.0),
WithCPUNum(32),
)
s.Equal(int64(32), node.CPUNum())
s.Equal(100, node.SegmentCnt())
s.Equal(5, node.ChannelCnt())
s.Equal(4096.0, node.MemCapacity())
}
// TestMemCapacityFunctionality tests memory capacity related methods
func (s *NodeManagerSuite) TestMemCapacityFunctionality() {
node := NewNodeInfo(ImmutableNodeInfo{
NodeID: 1,
Address: "localhost:19530",
Hostname: "test-host",
})
// Test initial memory capacity
s.Equal(float64(0), node.MemCapacity())
// Test WithMemCapacity option
node.UpdateStats(WithMemCapacity(1024.5))
s.Equal(1024.5, node.MemCapacity())
// Test updating memory capacity
node.UpdateStats(WithMemCapacity(2048.75))
s.Equal(2048.75, node.MemCapacity())
}
func TestNodeManagerSuite(t *testing.T) {
suite.Run(t, new(NodeManagerSuite))
}

View File

@ -20,6 +20,7 @@ type stats struct {
segmentCnt int
channelCnt int
memCapacityInMB float64
CPUNum int64
}
func (s *stats) setSegmentCnt(cnt int) {
@ -46,6 +47,14 @@ func (s *stats) getMemCapacity() float64 {
return s.memCapacityInMB
}
func (s *stats) setCPUNum(num int64) {
s.CPUNum = num
}
func (s *stats) getCPUNum() int64 {
return s.CPUNum
}
func newStats() stats {
return stats{}
}

View File

@ -19,6 +19,7 @@ package task
import (
"context"
"fmt"
"math"
"sync"
"time"
@ -54,6 +55,7 @@ var segmentsVersion = semver.Version{
}
type Executor struct {
nodeID int64
doneCh chan struct{}
wg sync.WaitGroup
meta *meta.Meta
@ -96,6 +98,17 @@ func (ex *Executor) Stop() {
ex.wg.Wait()
}
func (ex *Executor) GetTaskExecutionCap() int32 {
nodeInfo := ex.nodeMgr.Get(ex.nodeID)
if nodeInfo == nil || nodeInfo.CPUNum() == 0 {
return Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32()
}
ret := int32(math.Ceil(float64(nodeInfo.CPUNum()) * Params.QueryCoordCfg.QueryNodeTaskParallelismFactor.GetAsFloat()))
return ret
}
// Execute executes the given action,
// does nothing and returns false if the action is already committed,
// returns true otherwise.
@ -104,7 +117,7 @@ func (ex *Executor) Execute(task Task, step int) bool {
if exist {
return false
}
if ex.executingTaskNum.Inc() > Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32() {
if ex.executingTaskNum.Inc() > ex.GetTaskExecutionCap() {
ex.executingTasks.Remove(task.Index())
ex.executingTaskNum.Dec()
return false

View File

@ -1305,6 +1305,7 @@ func (node *QueryNode) GetDataDistribution(ctx context.Context, req *querypb.Get
LeaderViews: leaderViews,
LastModifyTs: lastModifyTs,
MemCapacityInMB: float64(hardware.GetMemoryCount() / 1024 / 1024),
CpuNum: int64(hardware.GetCPUNum()),
}, nil
}

View File

@ -616,6 +616,7 @@ message GetDataDistributionResponse {
repeated LeaderView leader_views = 5;
int64 lastModifyTs = 6;
double memCapacityInMB = 7;
int64 cpu_num = 8;
}
message LeaderView {

File diff suppressed because it is too large Load Diff

View File

@ -2120,6 +2120,9 @@ type queryCoordConfig struct {
BalanceSegmentBatchSize ParamItem `refreshable:"true"`
BalanceChannelBatchSize ParamItem `refreshable:"true"`
EnableBalanceOnMultipleCollections ParamItem `refreshable:"true"`
// query node task parallelism factor
QueryNodeTaskParallelismFactor ParamItem `refreshable:"true"`
}
func (p *queryCoordConfig) init(base *BaseTable) {
@ -2743,6 +2746,15 @@ If this parameter is set false, Milvus simply searches the growing segments with
Export: false,
}
p.EnableBalanceOnMultipleCollections.Init(base.mgr)
p.QueryNodeTaskParallelismFactor = ParamItem{
Key: "queryCoord.queryNodeTaskParallelismFactor",
Version: "2.5.14",
DefaultValue: "1",
Doc: "the parallelism factor for query node task, which permit query node execute cpuNum * parallelismFactor tasks in parallel",
Export: false,
}
p.QueryNodeTaskParallelismFactor.Init(base.mgr)
}
// /////////////////////////////////////////////////////////////////////////////

View File

@ -389,6 +389,10 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, 5, Params.BalanceSegmentBatchSize.GetAsInt())
assert.Equal(t, 1, Params.BalanceChannelBatchSize.GetAsInt())
assert.Equal(t, true, Params.EnableBalanceOnMultipleCollections.GetAsBool())
assert.Equal(t, 1, Params.QueryNodeTaskParallelismFactor.GetAsInt())
params.Save("queryCoord.queryNodeTaskParallelismFactor", "2")
assert.Equal(t, 2, Params.QueryNodeTaskParallelismFactor.GetAsInt())
})
t.Run("test queryNodeConfig", func(t *testing.T) {