enhance: support new server label rule for milvus and MILVUS_SERVER_LABEL_RESOURCE_GROUP (#46401)

issue: #46400

- add new server label rule.
- add `MILVUS_SERVER_LABEL_RESOURCE_GROUP` to determine the resource
group of querynode.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Automatic creation of resource groups when nodes with resource-group
labels join.
* Expanded server-label system supporting role-specific and global
labels.

* **Bug Fixes**
* Node acceptance now enforces resource-group name compatibility,
preventing cross-group assignment.

* **Refactor**
* Node handling flows updated to use richer node information for
assignment and validation.

* **Tests**
* Added tests validating resource-group labeling and node acceptance
behavior.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
Zhen Ye 2025-12-24 14:23:18 +08:00 committed by GitHub
parent 5b97cb70a0
commit 1cae4a6194
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 294 additions and 46 deletions

View File

@ -13,7 +13,6 @@ import (
var (
DefaultResourceGroupName = "__default_resource_group"
defaultResourceGroupCapacity int32 = 1000000
resourceGroupTransferBoost = 10000
)
func NewResourceGroupConfig(request int32, limit int32) *rgpb.ResourceGroupConfig {
@ -197,15 +196,19 @@ func (rg *ResourceGroup) SelectNodeForRG(targetRG *ResourceGroup) int64 {
// return node and priority.
func (rg *ResourceGroup) AcceptNode(nodeID int64) bool {
if rg.GetName() == DefaultResourceGroupName {
return true
}
nodeInfo := rg.nodeMgr.Get(nodeID)
if nodeInfo == nil {
return false
}
if nodeInfo.ResourceGroupName() != "" && nodeInfo.ResourceGroupName() != rg.GetName() {
return false
}
if rg.GetName() == DefaultResourceGroupName {
return true
}
requiredNodeLabels := rg.GetConfig().GetNodeFilter().GetNodeLabels()
if len(requiredNodeLabels) == 0 {
return true

View File

@ -8,6 +8,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/rgpb"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/sessionutil"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
@ -451,3 +452,33 @@ func TestRGNodeFilter(t *testing.T) {
}, nodeMgr)
assert.Equal(t, rg.SelectNodeForRG(rg3), int64(-1))
}
func TestRGAcceptNode(t *testing.T) {
nodeMgr := session.NewNodeManager()
nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 1,
Labels: map[string]string{
sessionutil.LabelResourceGroup: "rg1",
},
}))
nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 2,
Labels: map[string]string{
sessionutil.LabelResourceGroup: "rg2",
},
}))
rg := NewResourceGroup("rg1", &rgpb.ResourceGroupConfig{
Requests: &rgpb.ResourceGroupLimit{
NodeNum: 1,
},
Limits: &rgpb.ResourceGroupLimit{
NodeNum: 1,
},
}, nodeMgr)
assert.True(t, rg.AcceptNode(1))
assert.False(t, rg.AcceptNode(2))
}

View File

@ -797,16 +797,18 @@ func (rm *ResourceManager) selectNodeForRedundantRecover(sourceRG *ResourceGroup
// assignIncomingNodeWithNodeCheck assign node to resource group with node status check.
func (rm *ResourceManager) assignIncomingNodeWithNodeCheck(ctx context.Context, node int64) (string, error) {
// node is on stopping or stopped, remove it from incoming node set.
if rm.nodeMgr.Get(node) == nil {
nodeInfo := rm.nodeMgr.Get(node)
if nodeInfo == nil {
rm.incomingNode.Remove(node)
return "", errors.New("node is not online")
}
if ok, _ := rm.nodeMgr.IsStoppingNode(node); ok {
if nodeInfo.IsStoppingState() {
rm.incomingNode.Remove(node)
return "", errors.New("node has been stopped")
}
rgName, err := rm.assignIncomingNode(ctx, node)
rgName, err := rm.assignIncomingNode(ctx, nodeInfo)
if err != nil {
return "", err
}
@ -816,7 +818,9 @@ func (rm *ResourceManager) assignIncomingNodeWithNodeCheck(ctx context.Context,
}
// assignIncomingNode assign node to resource group.
func (rm *ResourceManager) assignIncomingNode(ctx context.Context, node int64) (string, error) {
func (rm *ResourceManager) assignIncomingNode(ctx context.Context, nodeInfo *session.NodeInfo) (string, error) {
node := nodeInfo.ID()
// If node already assign to rg.
rg := rm.getResourceGroupByNodeID(node)
if rg != nil {
@ -827,16 +831,53 @@ func (rm *ResourceManager) assignIncomingNode(ctx context.Context, node int64) (
return rg.GetName(), nil
}
if err := rm.createResourceGroupIfNotExists(ctx, nodeInfo); err != nil {
return "", err
}
// select a resource group to assign incoming node.
rg = rm.mustSelectAssignIncomingNodeTargetRG(node)
rg = rm.mustSelectAssignIncomingNodeTargetRG(nodeInfo)
if err := rm.transferNode(ctx, rg.GetName(), node); err != nil {
return "", errors.Wrap(err, "at finally assign to default resource group")
}
return rg.GetName(), nil
}
// createResourceGroupIfNotExists create resource group if not exists.
func (rm *ResourceManager) createResourceGroupIfNotExists(ctx context.Context, nodeInfo *session.NodeInfo) error {
rgName := nodeInfo.ResourceGroupName()
nodeID := nodeInfo.ID()
if rgName == "" {
return nil
}
if _, ok := rm.groups[rgName]; ok {
return nil
}
if err := rm.updateResourceGroups(ctx, map[string]*rgpb.ResourceGroupConfig{
rgName: {
Requests: &rgpb.ResourceGroupLimit{
NodeNum: 0,
},
Limits: &rgpb.ResourceGroupLimit{
NodeNum: defaultResourceGroupCapacity,
},
},
}); err != nil {
log.Warn("failed to create resource group from session of new incoming node", zap.String("rgName", rgName), zap.Int64("nodeID", nodeID), zap.Error(err))
return err
}
log.Info("create resource group from session of new incoming node", zap.String("rgName", rgName), zap.Int64("nodeID", nodeID))
return nil
}
// mustSelectAssignIncomingNodeTargetRG select resource group for assign incoming node.
func (rm *ResourceManager) mustSelectAssignIncomingNodeTargetRG(nodeID int64) *ResourceGroup {
func (rm *ResourceManager) mustSelectAssignIncomingNodeTargetRG(nodeInfo *session.NodeInfo) *ResourceGroup {
if nodeInfo.ResourceGroupName() != "" {
// rg will be created if not exists by createResourceGroupIfNotExists
return rm.groups[nodeInfo.ResourceGroupName()]
}
nodeID := nodeInfo.ID()
// First, Assign it to rg with the most missing nodes at high priority.
if rg := rm.findMaxRGWithGivenFilter(
func(rg *ResourceGroup) bool {
@ -1020,6 +1061,12 @@ func (rm *ResourceManager) validateResourceGroupIsDeletable(rgName string) error
return merr.WrapErrParameterInvalid("not empty resource group", rgName, "resource group's limits node num is not 0")
}
for _, nodeInfo := range rm.nodeMgr.GetAll() {
if nodeInfo.ResourceGroupName() == rgName {
return merr.WrapErrParameterInvalid("not empty resource group", fmt.Sprintf("node %d is still in the resource group", nodeInfo.ID()))
}
}
// If rg is used by other rg, it's not deletable.
for _, rg := range rm.groups {
for _, transferCfg := range rg.GetConfig().GetTransferFrom() {

View File

@ -232,6 +232,17 @@ func (suite *ResourceManagerSuite) TestManipulateResourceGroup() {
},
}))
suite.manager.HandleNodeUp(ctx, 10)
suite.manager.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 11,
Address: "localhost",
Hostname: "localhost",
Labels: map[string]string{
sessionutil.LabelResourceGroup: "rg11",
},
}))
suite.manager.HandleNodeUp(ctx, 11)
suite.Equal(1, suite.manager.GetResourceGroup(ctx, "rg11").NodeNum())
}
func (suite *ResourceManagerSuite) TestNodeUpAndDown() {

View File

@ -171,7 +171,15 @@ func (n *NodeInfo) IsInStandalone() bool {
}
func (n *NodeInfo) IsEmbeddedQueryNodeInStreamingNode() bool {
return n.immutableInfo.Labels[sessionutil.LabelStreamingNodeEmbeddedQueryNode] == "1"
// Since 2.6.8, we introduce a new label rule for session,
// the LegacyLabelStreamingNodeEmbeddedQueryNode is used before 2.6.8, so we need to check both key to keep compatibility.
return n.immutableInfo.Labels[sessionutil.LabelStreamingNodeEmbeddedQueryNode] == "1" ||
n.immutableInfo.Labels[sessionutil.LegacyLabelStreamingNodeEmbeddedQueryNode] == "1"
}
// ResourceGroupName returns the resource group name of the current node.
func (n *NodeInfo) ResourceGroupName() string {
return n.immutableInfo.Labels[sessionutil.LabelResourceGroup]
}
func (n *NodeInfo) SegmentCnt() int {

View File

@ -21,6 +21,8 @@ import (
"time"
"github.com/stretchr/testify/suite"
"github.com/milvus-io/milvus/internal/util/sessionutil"
)
type NodeManagerSuite struct {
@ -230,6 +232,29 @@ func (s *NodeManagerSuite) TestMemCapacityFunctionality() {
s.Equal(2048.75, node.MemCapacity())
}
func (s *NodeManagerSuite) TestNodeInfoLabels() {
info := ImmutableNodeInfo{
NodeID: 1,
Address: "localhost",
Hostname: "localhost",
Labels: map[string]string{
sessionutil.LegacyLabelStreamingNodeEmbeddedQueryNode: "1",
},
}
info2 := NodeInfo{
immutableInfo: info,
}
s.True(info2.IsEmbeddedQueryNodeInStreamingNode())
info2.immutableInfo.Labels[sessionutil.LabelStreamingNodeEmbeddedQueryNode] = "1"
delete(info2.immutableInfo.Labels, sessionutil.LegacyLabelStreamingNodeEmbeddedQueryNode)
s.True(info2.IsEmbeddedQueryNodeInStreamingNode())
info.Labels[sessionutil.LabelResourceGroup] = "rg1"
s.Equal("rg1", info2.ResourceGroupName())
}
func TestNodeManagerSuite(t *testing.T) {
suite.Run(t, new(NodeManagerSuite))
}

View File

@ -0,0 +1,125 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package sessionutil
import (
"os"
"strings"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// milvus server label rules:
// The server label of milvus will be injected into the session of milvus server,
// which can be found at SessionRaw.ServerLabels.
// The label is a envionment variable, the key must be prefixed with "MILVUS_SERVER_LABEL_".
// There are two types of labels:
//
// role-specific labels and non-role-specific labels.
// The key of role-specific labels have the format of "MILVUS_SERVER_LABEL_<ROLE>_<LABEL>",
// only the session of the role can see the label.
// The key of non-role-specific labels have the format of "MILVUS_SERVER_LABEL_<LABEL>",
// all sessions can see the label.
// e.g.
// export MILVUS_SERVER_LABEL_label1=value1, all roles can see the `label1:value1` in the server label of session.
// export MILVUS_SERVER_LABEL_querynode_label2=value2, only the session of querynode can see the `label2:value2` in the server label of session.
//
// the key of non-role-specific labels will be overwritten by the key of role-specific labels.
// e.g.
// export MILVUS_SERVER_LABEL_label1=value1, export MILVUS_SERVER_LABEL_querynode_label1=value2,
// the session of querynode will see the `label1:value2` in the server label of session because of the overwrite.
// the session of other roles will see the `label1:value1` in the server label of session.
const (
SupportedLabelPrefix = "MILVUS_SERVER_LABEL_"
// QueryNode
LabelStreamingNodeEmbeddedQueryNode = "STREAMING-EMBEDDED"
LegacyLabelStreamingNodeEmbeddedQueryNode = "QUERYNODE_" + LabelStreamingNodeEmbeddedQueryNode
// All Roles
LabelStandalone = "STANDALONE"
LabelResourceGroup = "RESOURCE_GROUP"
)
// NewServerLabel creates a new server label with the given role and label.
func NewServerLabel(role string, label string) string {
if role == "" {
return SupportedLabelPrefix + strings.ToUpper(label)
}
return SupportedLabelPrefix + strings.ToUpper(role) + "_" + strings.ToUpper(label)
}
func getServerLabelsFromEnv(role string) map[string]string {
labelsSpecifiedByRole := make(map[string]string)
labels := make(map[string]string)
for _, value := range os.Environ() {
rs := []rune(value)
in := strings.Index(value, "=")
key := string(rs[0:in])
value := string(rs[in+1:])
roleFromEnv, label, ok := getRoleFromEnvKey(key)
if !ok || (roleFromEnv != "" && roleFromEnv != role) {
continue
}
if roleFromEnv == "" {
labels[label] = value
} else {
labelsSpecifiedByRole[label] = value
}
}
// use role specified labels to overwrite labels not specified by role.
for label, value := range labelsSpecifiedByRole {
labels[label] = value
}
return labels
}
func getRoleFromEnvKey(key string) (string, string, bool) {
if !strings.HasPrefix(key, SupportedLabelPrefix) {
return "", "", false
}
labelWithRole := strings.TrimPrefix(key, SupportedLabelPrefix)
if len(labelWithRole) == 0 {
return "", "", false
}
splits := strings.Split(labelWithRole, "_")
var role string
switch strings.ToLower(splits[0]) {
case typeutil.MixCoordRole, "coord":
role = typeutil.MixCoordRole
case typeutil.QueryNodeRole, "qn":
role = typeutil.QueryNodeRole
case typeutil.DataNodeRole, "dn":
role = typeutil.DataNodeRole
case typeutil.StreamingNodeRole, "sn":
role = typeutil.StreamingNodeRole
case typeutil.ProxyRole:
role = typeutil.ProxyRole
default:
}
// if role is not found, the label can be seen session in all roles.
if role == "" {
return "", labelWithRole, true
}
if len(splits) <= 1 {
return "", "", false
}
// else, it's a selected role label, only can be seen in the selected role.
return role, strings.Join(splits[1:], "_"), true
}

View File

@ -51,12 +51,9 @@ const (
// DefaultServiceRoot default root path used in kv by Session
DefaultServiceRoot = "session/"
// DefaultIDKey default id key for Session
DefaultIDKey = "id"
SupportedLabelPrefix = "MILVUS_SERVER_LABEL_"
LabelStreamingNodeEmbeddedQueryNode = "QUERYNODE_STREAMING-EMBEDDED"
LabelStandalone = "STANDALONE"
MilvusNodeIDForTesting = "MILVUS_NODE_ID_FOR_TESTING"
exitCodeSessionLeaseExpired = 1
DefaultIDKey = "id"
MilvusNodeIDForTesting = "MILVUS_NODE_ID_FOR_TESTING"
exitCodeSessionLeaseExpired = 1
serverVersionKey = "version"
)
@ -70,12 +67,12 @@ func isNotSessionVersionCheckFailure(err error) bool {
// EnableEmbededQueryNodeLabel set server labels for embedded query node.
func EnableEmbededQueryNodeLabel() {
os.Setenv(SupportedLabelPrefix+LabelStreamingNodeEmbeddedQueryNode, "1")
os.Setenv(NewServerLabel(typeutil.QueryNodeRole, LabelStreamingNodeEmbeddedQueryNode), "1")
}
// EnableStandaloneLabel set server labels for standalone.
func EnableStandaloneLabel() {
os.Setenv(SupportedLabelPrefix+LabelStandalone, "1")
os.Setenv(NewServerLabel("", LabelStandalone), "1")
}
// SessionEventType session event type
@ -309,7 +306,7 @@ func (s *Session) Init(serverName, address string, exclusive bool, triggerKill b
panic(err)
}
s.ServerID = serverID
s.ServerLabels = GetServerLabelsFromEnv(serverName)
s.ServerLabels = getServerLabelsFromEnv(serverName)
s.versionKey = path.Join(s.metaRoot, DefaultServiceRoot, serverVersionKey)
s.SetLogger(log.With(
@ -388,25 +385,6 @@ func (s *Session) getServerID() (int64, error) {
return nodeID, nil
}
func GetServerLabelsFromEnv(role string) map[string]string {
ret := make(map[string]string)
switch role {
case "querynode":
for _, value := range os.Environ() {
rs := []rune(value)
in := strings.Index(value, "=")
key := string(rs[0:in])
value := string(rs[in+1:])
if strings.HasPrefix(key, SupportedLabelPrefix) {
label := strings.TrimPrefix(key, SupportedLabelPrefix)
ret[label] = value
}
}
}
return ret
}
func (s *Session) checkIDExist() {
s.etcdCli.Txn(s.ctx).If(
clientv3.Compare(

View File

@ -759,16 +759,36 @@ func (s *SessionSuite) TestKeepAliveRetryChannelClose() {
func (s *SessionSuite) TestGetSessions() {
os.Setenv("MILVUS_SERVER_LABEL_key1", "value1")
os.Setenv("MILVUS_SERVER_LABEL_key2", "value2")
os.Setenv("key3", "value3")
os.Setenv("MILVUS_SERVER_LABEL_key3", "value3")
os.Setenv("MILVUS_SERVER_LABEL_qn_key3", "value33")
os.Setenv("MILVUS_SERVER_LABEL_sn_key3", "value33")
os.Setenv("key4", "value4")
os.Setenv("MILVUS_SERVER_LABEL_", "value5")
os.Setenv("MILVUS_SERVER_LABEL_qn", "value6")
defer os.Unsetenv("MILVUS_SERVER_LABEL_key1")
defer os.Unsetenv("MILVUS_SERVER_LABEL_key2")
defer os.Unsetenv("key3")
defer os.Unsetenv("MILVUS_SERVER_LABEL_qn_key3")
defer os.Unsetenv("MILVUS_SERVER_LABEL_sn_key3")
defer os.Unsetenv("key4")
ret := GetServerLabelsFromEnv("querynode")
assert.Equal(s.T(), 2, len(ret))
assert.Equal(s.T(), "value1", ret["key1"])
assert.Equal(s.T(), "value2", ret["key2"])
roles := []string{typeutil.QueryNodeRole, typeutil.MixCoordRole, typeutil.StreamingNodeRole, typeutil.ProxyRole}
for _, role := range roles {
ret := getServerLabelsFromEnv(role)
switch role {
case typeutil.QueryNodeRole, typeutil.StreamingNodeRole:
assert.Equal(s.T(), 3, len(ret))
assert.Equal(s.T(), "value1", ret["key1"])
assert.Equal(s.T(), "value2", ret["key2"])
assert.Equal(s.T(), "value33", ret["key3"], "role: %s", role)
default:
assert.Equal(s.T(), 3, len(ret))
assert.Equal(s.T(), "value1", ret["key1"])
assert.Equal(s.T(), "value2", ret["key2"])
assert.Equal(s.T(), "value3", ret["key3"])
}
}
}
func (s *SessionSuite) TestVersionKey() {