mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 09:38:39 +08:00
enhance: Refine QueryNode task parallelism based on CPU core count (#42166)
issue: #42165 Implement dynamic task execution capacity calculation based on QueryNode CPU core count instead of static configuration for better resource utilization. Changes include: - Add CpuCoreNum() method and WithCpuCoreNum() option to NodeInfo - Implement GetTaskExecutionCap() for dynamic capacity calculation - Add QueryNodeTaskParallelismFactor parameter for tuning - Update proto definition to include cpu_core_num field - Add unit tests for new functionality This allows QueryCoord to automatically adjust task parallelism based on actual hardware resources. Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
parent
499e9a0a73
commit
e7c0a6ffbb
1
internal/querycoordv2/dist/dist_handler.go
vendored
1
internal/querycoordv2/dist/dist_handler.go
vendored
@ -140,6 +140,7 @@ func (dh *distHandler) handleDistResp(ctx context.Context, resp *querypb.GetData
|
|||||||
session.WithSegmentCnt(len(resp.GetSegments())),
|
session.WithSegmentCnt(len(resp.GetSegments())),
|
||||||
session.WithChannelCnt(len(resp.GetChannels())),
|
session.WithChannelCnt(len(resp.GetChannels())),
|
||||||
session.WithMemCapacity(resp.GetMemCapacityInMB()),
|
session.WithMemCapacity(resp.GetMemCapacityInMB()),
|
||||||
|
session.WithChannelCnt(int(resp.GetCpuNum())),
|
||||||
)
|
)
|
||||||
dh.updateSegmentsDistribution(ctx, resp)
|
dh.updateSegmentsDistribution(ctx, resp)
|
||||||
dh.updateChannelsDistribution(ctx, resp)
|
dh.updateChannelsDistribution(ctx, resp)
|
||||||
|
|||||||
@ -176,6 +176,12 @@ func (n *NodeInfo) MemCapacity() float64 {
|
|||||||
return n.stats.getMemCapacity()
|
return n.stats.getMemCapacity()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (n *NodeInfo) CPUNum() int64 {
|
||||||
|
n.mu.RLock()
|
||||||
|
defer n.mu.RUnlock()
|
||||||
|
return n.stats.getCPUNum()
|
||||||
|
}
|
||||||
|
|
||||||
func (n *NodeInfo) SetLastHeartbeat(time time.Time) {
|
func (n *NodeInfo) SetLastHeartbeat(time time.Time) {
|
||||||
n.lastHeartbeat.Store(time.UnixNano())
|
n.lastHeartbeat.Store(time.UnixNano())
|
||||||
}
|
}
|
||||||
@ -241,3 +247,9 @@ func WithMemCapacity(capacity float64) StatsOption {
|
|||||||
n.setMemCapacity(capacity)
|
n.setMemCapacity(capacity)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func WithCPUNum(num int64) StatsOption {
|
||||||
|
return func(n *NodeInfo) {
|
||||||
|
n.setCPUNum(num)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -88,6 +88,58 @@ func (s *NodeManagerSuite) TestNodeInfo() {
|
|||||||
s.NotNil(node.LastHeartbeat())
|
s.NotNil(node.LastHeartbeat())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestCPUNumFunctionality tests the newly added CPU core number functionality
|
||||||
|
func (s *NodeManagerSuite) TestCPUNumFunctionality() {
|
||||||
|
node := NewNodeInfo(ImmutableNodeInfo{
|
||||||
|
NodeID: 1,
|
||||||
|
Address: "localhost:19530",
|
||||||
|
Hostname: "test-host",
|
||||||
|
})
|
||||||
|
|
||||||
|
// Test initial CPU core number
|
||||||
|
s.Equal(int64(0), node.CPUNum())
|
||||||
|
|
||||||
|
// Test WithCPUNum option
|
||||||
|
node.UpdateStats(WithCPUNum(8))
|
||||||
|
s.Equal(int64(8), node.CPUNum())
|
||||||
|
|
||||||
|
// Test updating CPU core number
|
||||||
|
node.UpdateStats(WithCPUNum(16))
|
||||||
|
s.Equal(int64(16), node.CPUNum())
|
||||||
|
|
||||||
|
// Test multiple stats update including CPU core number
|
||||||
|
node.UpdateStats(
|
||||||
|
WithSegmentCnt(100),
|
||||||
|
WithChannelCnt(5),
|
||||||
|
WithMemCapacity(4096.0),
|
||||||
|
WithCPUNum(32),
|
||||||
|
)
|
||||||
|
s.Equal(int64(32), node.CPUNum())
|
||||||
|
s.Equal(100, node.SegmentCnt())
|
||||||
|
s.Equal(5, node.ChannelCnt())
|
||||||
|
s.Equal(4096.0, node.MemCapacity())
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestMemCapacityFunctionality tests memory capacity related methods
|
||||||
|
func (s *NodeManagerSuite) TestMemCapacityFunctionality() {
|
||||||
|
node := NewNodeInfo(ImmutableNodeInfo{
|
||||||
|
NodeID: 1,
|
||||||
|
Address: "localhost:19530",
|
||||||
|
Hostname: "test-host",
|
||||||
|
})
|
||||||
|
|
||||||
|
// Test initial memory capacity
|
||||||
|
s.Equal(float64(0), node.MemCapacity())
|
||||||
|
|
||||||
|
// Test WithMemCapacity option
|
||||||
|
node.UpdateStats(WithMemCapacity(1024.5))
|
||||||
|
s.Equal(1024.5, node.MemCapacity())
|
||||||
|
|
||||||
|
// Test updating memory capacity
|
||||||
|
node.UpdateStats(WithMemCapacity(2048.75))
|
||||||
|
s.Equal(2048.75, node.MemCapacity())
|
||||||
|
}
|
||||||
|
|
||||||
func TestNodeManagerSuite(t *testing.T) {
|
func TestNodeManagerSuite(t *testing.T) {
|
||||||
suite.Run(t, new(NodeManagerSuite))
|
suite.Run(t, new(NodeManagerSuite))
|
||||||
}
|
}
|
||||||
|
|||||||
@ -20,6 +20,7 @@ type stats struct {
|
|||||||
segmentCnt int
|
segmentCnt int
|
||||||
channelCnt int
|
channelCnt int
|
||||||
memCapacityInMB float64
|
memCapacityInMB float64
|
||||||
|
CPUNum int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *stats) setSegmentCnt(cnt int) {
|
func (s *stats) setSegmentCnt(cnt int) {
|
||||||
@ -46,6 +47,14 @@ func (s *stats) getMemCapacity() float64 {
|
|||||||
return s.memCapacityInMB
|
return s.memCapacityInMB
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *stats) setCPUNum(num int64) {
|
||||||
|
s.CPUNum = num
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *stats) getCPUNum() int64 {
|
||||||
|
return s.CPUNum
|
||||||
|
}
|
||||||
|
|
||||||
func newStats() stats {
|
func newStats() stats {
|
||||||
return stats{}
|
return stats{}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -19,6 +19,7 @@ package task
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"math"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -54,6 +55,7 @@ var segmentsVersion = semver.Version{
|
|||||||
}
|
}
|
||||||
|
|
||||||
type Executor struct {
|
type Executor struct {
|
||||||
|
nodeID int64
|
||||||
doneCh chan struct{}
|
doneCh chan struct{}
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
meta *meta.Meta
|
meta *meta.Meta
|
||||||
@ -96,6 +98,17 @@ func (ex *Executor) Stop() {
|
|||||||
ex.wg.Wait()
|
ex.wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (ex *Executor) GetTaskExecutionCap() int32 {
|
||||||
|
nodeInfo := ex.nodeMgr.Get(ex.nodeID)
|
||||||
|
if nodeInfo == nil || nodeInfo.CPUNum() == 0 {
|
||||||
|
return Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32()
|
||||||
|
}
|
||||||
|
|
||||||
|
ret := int32(math.Ceil(float64(nodeInfo.CPUNum()) * Params.QueryCoordCfg.QueryNodeTaskParallelismFactor.GetAsFloat()))
|
||||||
|
|
||||||
|
return ret
|
||||||
|
}
|
||||||
|
|
||||||
// Execute executes the given action,
|
// Execute executes the given action,
|
||||||
// does nothing and returns false if the action is already committed,
|
// does nothing and returns false if the action is already committed,
|
||||||
// returns true otherwise.
|
// returns true otherwise.
|
||||||
@ -104,7 +117,7 @@ func (ex *Executor) Execute(task Task, step int) bool {
|
|||||||
if exist {
|
if exist {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
if ex.executingTaskNum.Inc() > Params.QueryCoordCfg.TaskExecutionCap.GetAsInt32() {
|
if ex.executingTaskNum.Inc() > ex.GetTaskExecutionCap() {
|
||||||
ex.executingTasks.Remove(task.Index())
|
ex.executingTasks.Remove(task.Index())
|
||||||
ex.executingTaskNum.Dec()
|
ex.executingTaskNum.Dec()
|
||||||
return false
|
return false
|
||||||
|
|||||||
@ -1305,6 +1305,7 @@ func (node *QueryNode) GetDataDistribution(ctx context.Context, req *querypb.Get
|
|||||||
LeaderViews: leaderViews,
|
LeaderViews: leaderViews,
|
||||||
LastModifyTs: lastModifyTs,
|
LastModifyTs: lastModifyTs,
|
||||||
MemCapacityInMB: float64(hardware.GetMemoryCount() / 1024 / 1024),
|
MemCapacityInMB: float64(hardware.GetMemoryCount() / 1024 / 1024),
|
||||||
|
CpuNum: int64(hardware.GetCPUNum()),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -616,6 +616,7 @@ message GetDataDistributionResponse {
|
|||||||
repeated LeaderView leader_views = 5;
|
repeated LeaderView leader_views = 5;
|
||||||
int64 lastModifyTs = 6;
|
int64 lastModifyTs = 6;
|
||||||
double memCapacityInMB = 7;
|
double memCapacityInMB = 7;
|
||||||
|
int64 cpu_num = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
message LeaderView {
|
message LeaderView {
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -2120,6 +2120,9 @@ type queryCoordConfig struct {
|
|||||||
BalanceSegmentBatchSize ParamItem `refreshable:"true"`
|
BalanceSegmentBatchSize ParamItem `refreshable:"true"`
|
||||||
BalanceChannelBatchSize ParamItem `refreshable:"true"`
|
BalanceChannelBatchSize ParamItem `refreshable:"true"`
|
||||||
EnableBalanceOnMultipleCollections ParamItem `refreshable:"true"`
|
EnableBalanceOnMultipleCollections ParamItem `refreshable:"true"`
|
||||||
|
|
||||||
|
// query node task parallelism factor
|
||||||
|
QueryNodeTaskParallelismFactor ParamItem `refreshable:"true"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *queryCoordConfig) init(base *BaseTable) {
|
func (p *queryCoordConfig) init(base *BaseTable) {
|
||||||
@ -2743,6 +2746,15 @@ If this parameter is set false, Milvus simply searches the growing segments with
|
|||||||
Export: false,
|
Export: false,
|
||||||
}
|
}
|
||||||
p.EnableBalanceOnMultipleCollections.Init(base.mgr)
|
p.EnableBalanceOnMultipleCollections.Init(base.mgr)
|
||||||
|
|
||||||
|
p.QueryNodeTaskParallelismFactor = ParamItem{
|
||||||
|
Key: "queryCoord.queryNodeTaskParallelismFactor",
|
||||||
|
Version: "2.5.14",
|
||||||
|
DefaultValue: "1",
|
||||||
|
Doc: "the parallelism factor for query node task, which permit query node execute cpuNum * parallelismFactor tasks in parallel",
|
||||||
|
Export: false,
|
||||||
|
}
|
||||||
|
p.QueryNodeTaskParallelismFactor.Init(base.mgr)
|
||||||
}
|
}
|
||||||
|
|
||||||
// /////////////////////////////////////////////////////////////////////////////
|
// /////////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
@ -389,6 +389,10 @@ func TestComponentParam(t *testing.T) {
|
|||||||
assert.Equal(t, 5, Params.BalanceSegmentBatchSize.GetAsInt())
|
assert.Equal(t, 5, Params.BalanceSegmentBatchSize.GetAsInt())
|
||||||
assert.Equal(t, 1, Params.BalanceChannelBatchSize.GetAsInt())
|
assert.Equal(t, 1, Params.BalanceChannelBatchSize.GetAsInt())
|
||||||
assert.Equal(t, true, Params.EnableBalanceOnMultipleCollections.GetAsBool())
|
assert.Equal(t, true, Params.EnableBalanceOnMultipleCollections.GetAsBool())
|
||||||
|
|
||||||
|
assert.Equal(t, 1, Params.QueryNodeTaskParallelismFactor.GetAsInt())
|
||||||
|
params.Save("queryCoord.queryNodeTaskParallelismFactor", "2")
|
||||||
|
assert.Equal(t, 2, Params.QueryNodeTaskParallelismFactor.GetAsInt())
|
||||||
})
|
})
|
||||||
|
|
||||||
t.Run("test queryNodeConfig", func(t *testing.T) {
|
t.Run("test queryNodeConfig", func(t *testing.T) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user