enhance: Refine QueryNode task parallelism based on CPU core count (#42166)
issue: #42165

Implement dynamic task execution capacity calculation based on the QueryNode's CPU core count instead of a static configuration, for better resource utilization.

Changes include:
- Add CPUNum() method and WithCPUNum() option to NodeInfo
- Implement GetTaskExecutionCap() for dynamic capacity calculation
- Add QueryNodeTaskParallelismFactor parameter for tuning
- Update the proto definition to include a cpu_num field
- Add unit tests for the new functionality

This allows QueryCoord to automatically adjust task parallelism based on the actual hardware resources of each QueryNode.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
parent 499e9a0a73
commit e7c0a6ffbb

internal/querycoordv2/dist/dist_handler.go
@@ -140,6 +140,7 @@ func (dh *distHandler) handleDistResp(ctx context.Context, resp *querypb.GetData
 		session.WithSegmentCnt(len(resp.GetSegments())),
 		session.WithChannelCnt(len(resp.GetChannels())),
 		session.WithMemCapacity(resp.GetMemCapacityInMB()),
+		session.WithCPUNum(resp.GetCpuNum()),
 	)
 	dh.updateSegmentsDistribution(ctx, resp)
 	dh.updateChannelsDistribution(ctx, resp)
@@ -176,6 +176,12 @@ func (n *NodeInfo) MemCapacity() float64 {
 	return n.stats.getMemCapacity()
 }
 
+func (n *NodeInfo) CPUNum() int64 {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+	return n.stats.getCPUNum()
+}
+
 func (n *NodeInfo) SetLastHeartbeat(time time.Time) {
 	n.lastHeartbeat.Store(time.UnixNano())
 }
@@ -241,3 +247,9 @@ func WithMemCapacity(capacity float64) StatsOption {
 		n.setMemCapacity(capacity)
 	}
 }
+
+func WithCPUNum(num int64) StatsOption {
+	return func(n *NodeInfo) {
+		n.setCPUNum(num)
+	}
+}
@@ -88,6 +88,58 @@ func (s *NodeManagerSuite) TestNodeInfo() {
 	s.NotNil(node.LastHeartbeat())
 }
 
+// TestCPUNumFunctionality tests the newly added CPU core number functionality
+func (s *NodeManagerSuite) TestCPUNumFunctionality() {
+	node := NewNodeInfo(ImmutableNodeInfo{
+		NodeID:   1,
+		Address:  "localhost:19530",
+		Hostname: "test-host",
+	})
+
+	// Test initial CPU core number
+	s.Equal(int64(0), node.CPUNum())
+
+	// Test WithCPUNum option
+	node.UpdateStats(WithCPUNum(8))
+	s.Equal(int64(8), node.CPUNum())
+
+	// Test updating CPU core number
+	node.UpdateStats(WithCPUNum(16))
+	s.Equal(int64(16), node.CPUNum())
+
+	// Test multiple stats update including CPU core number
+	node.UpdateStats(
+		WithSegmentCnt(100),
+		WithChannelCnt(5),
+		WithMemCapacity(4096.0),
+		WithCPUNum(32),
+	)
+	s.Equal(int64(32), node.CPUNum())
+	s.Equal(100, node.SegmentCnt())
+	s.Equal(5, node.ChannelCnt())
+	s.Equal(4096.0, node.MemCapacity())
+}
+
+// TestMemCapacityFunctionality tests memory capacity related methods
+func (s *NodeManagerSuite) TestMemCapacityFunctionality() {
+	node := NewNodeInfo(ImmutableNodeInfo{
+		NodeID:   1,
+		Address:  "localhost:19530",
+		Hostname: "test-host",
+	})
+
+	// Test initial memory capacity
+	s.Equal(float64(0), node.MemCapacity())
+
+	// Test WithMemCapacity option
+	node.UpdateStats(WithMemCapacity(1024.5))
+	s.Equal(1024.5, node.MemCapacity())
+
+	// Test updating memory capacity
+	node.UpdateStats(WithMemCapacity(2048.75))
+	s.Equal(2048.75, node.MemCapacity())
+}
+
 func TestNodeManagerSuite(t *testing.T) {
 	suite.Run(t, new(NodeManagerSuite))
 }
@@ -20,6 +20,7 @@ type stats struct {
 	segmentCnt      int
 	channelCnt      int
 	memCapacityInMB float64
+	CPUNum          int64
 }
 
 func (s *stats) setSegmentCnt(cnt int) {
@@ -46,6 +47,14 @@ func (s *stats) getMemCapacity() float64 {
 	return s.memCapacityInMB
 }
 
+func (s *stats) setCPUNum(num int64) {
+	s.CPUNum = num
+}
+
+func (s *stats) getCPUNum() int64 {
+	return s.CPUNum
+}
+
 func newStats() stats {
 	return stats{}
 }
@@ -19,6 +19,7 @@ package task
 import (
 	"context"
 	"fmt"
+	"math"
 	"sync"
 	"time"
 
@@ -54,6 +55,7 @@ var segmentsVersion = semver.Version{
 }
 
 type Executor struct {
+	nodeID int64
 	doneCh chan struct{}
 	wg     sync.WaitGroup
 	meta   *meta.Meta
@@ -96,6 +98,17 @@ func (ex *Executor) Stop() {
 	ex.wg.Wait()
 }
 
+func (ex *Executor) GetTaskExecutionCap() int32 {
+	nodeInfo := ex.nodeMgr.Get(ex.nodeID)
+	if nodeInfo == nil || nodeInfo.CPUNum() == 0 {
+		return Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32()
+	}
+
+	ret := int32(math.Ceil(float64(nodeInfo.CPUNum()) * Params.QueryCoordCfg.QueryNodeTaskParallelismFactor.GetAsFloat()))
+
+	return ret
+}
+
 // Execute executes the given action,
 // does nothing and returns false if the action is already committed,
 // returns true otherwise.
@@ -104,7 +117,7 @@ func (ex *Executor) Execute(task Task, step int) bool {
 	if exist {
 		return false
 	}
-	if ex.executingTaskNum.Inc() > Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32() {
+	if ex.executingTaskNum.Inc() > ex.GetTaskExecutionCap() {
 		ex.executingTasks.Remove(task.Index())
 		ex.executingTaskNum.Dec()
 		return false
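The Execute change above keeps the existing admit-then-rollback pattern and only makes the bound dynamic: the in-flight counter is incremented first, and if the new value exceeds GetTaskExecutionCap() the task is removed from the executing set and the counter is decremented again. A standalone sketch of that pattern, using the standard library's sync/atomic (Milvus itself uses go.uber.org/atomic); all names here are illustrative placeholders:

// Illustrative sketch of the admit-then-rollback admission check shown in the
// Execute hunk above, written against the standard library only.
package main

import (
	"fmt"
	"sync/atomic"
)

type executor struct {
	executing atomic.Int32
	cap       int32 // would come from GetTaskExecutionCap() in the real code
}

// tryAdmit optimistically takes a slot and rolls back when over the cap.
func (e *executor) tryAdmit() bool {
	if e.executing.Add(1) > e.cap {
		e.executing.Add(-1) // rollback; the caller retries in a later round
		return false
	}
	return true
}

func main() {
	e := &executor{cap: 2}
	fmt.Println(e.tryAdmit(), e.tryAdmit(), e.tryAdmit()) // true true false
}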
@@ -1305,6 +1305,7 @@ func (node *QueryNode) GetDataDistribution(ctx context.Context, req *querypb.Get
 		LeaderViews:     leaderViews,
 		LastModifyTs:    lastModifyTs,
 		MemCapacityInMB: float64(hardware.GetMemoryCount() / 1024 / 1024),
+		CpuNum:          int64(hardware.GetCPUNum()),
 	}, nil
 }
@@ -616,6 +616,7 @@ message GetDataDistributionResponse {
   repeated LeaderView leader_views = 5;
   int64 lastModifyTs = 6;
   double memCapacityInMB = 7;
+  int64 cpu_num = 8;
 }
 
 message LeaderView {
File diff suppressed because it is too large.
@@ -2120,6 +2120,9 @@ type queryCoordConfig struct {
 	BalanceSegmentBatchSize            ParamItem `refreshable:"true"`
 	BalanceChannelBatchSize            ParamItem `refreshable:"true"`
 	EnableBalanceOnMultipleCollections ParamItem `refreshable:"true"`
+
+	// query node task parallelism factor
+	QueryNodeTaskParallelismFactor ParamItem `refreshable:"true"`
 }
 
 func (p *queryCoordConfig) init(base *BaseTable) {
@@ -2743,6 +2746,15 @@ If this parameter is set false, Milvus simply searches the growing segments with
 		Export:       false,
 	}
 	p.EnableBalanceOnMultipleCollections.Init(base.mgr)
+
+	p.QueryNodeTaskParallelismFactor = ParamItem{
+		Key:          "queryCoord.queryNodeTaskParallelismFactor",
+		Version:      "2.5.14",
+		DefaultValue: "1",
+		Doc:          "the parallelism factor for query node tasks, which permits a query node to execute cpuNum * parallelismFactor tasks in parallel",
+		Export:       false,
+	}
+	p.QueryNodeTaskParallelismFactor.Init(base.mgr)
 }
 
 // /////////////////////////////////////////////////////////////////////////////
@@ -389,6 +389,10 @@ func TestComponentParam(t *testing.T) {
 		assert.Equal(t, 5, Params.BalanceSegmentBatchSize.GetAsInt())
 		assert.Equal(t, 1, Params.BalanceChannelBatchSize.GetAsInt())
 		assert.Equal(t, true, Params.EnableBalanceOnMultipleCollections.GetAsBool())
+
+		assert.Equal(t, 1, Params.QueryNodeTaskParallelismFactor.GetAsInt())
+		params.Save("queryCoord.queryNodeTaskParallelismFactor", "2")
+		assert.Equal(t, 2, Params.QueryNodeTaskParallelismFactor.GetAsInt())
 	})
 
 	t.Run("test queryNodeConfig", func(t *testing.T) {