fix: remove the streamingnode checking when loading segment (#45859)

issue: #43117

If we enable this check when loading segments, all segments must always
be loaded by a streamingnode rather than a 2.5 querynode, which makes some
search and query requests fail when upgrading. Otherwise, some search and
query results will be wrong when upgrading. We choose to disable this
check for now to keep search and query available when upgrading.

also see pr: #43346

Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
Zhen Ye 2025-11-28 10:09:08 +08:00 committed by GitHub
parent 31976d8adb
commit 4f080bd3a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 34 additions and 90 deletions

View File

@ -365,45 +365,23 @@ func (m *ChannelDistManager) GetShardLeader(channelName string, replica *Replica
candidatesServiceable := candidates.IsServiceable() candidatesServiceable := candidates.IsServiceable()
channelServiceable := channel.IsServiceable() channelServiceable := channel.IsServiceable()
candidateIsStreamingNode := m.checkIfStreamingNode(candidates.Node) updateNeeded := false
channelIsStreamingNode := m.checkIfStreamingNode(channel.Node) switch {
logger.Debug("check whether stream node is serviceable", case !candidatesServiceable && channelServiceable:
zap.Bool("candidatesServiceable", candidatesServiceable), // Current candidate is not serviceable but new channel is
zap.Bool("channelServiceable", channelServiceable), updateNeeded = true
zap.Bool("candidateIsStreamingNode", candidateIsStreamingNode), logger.Debug("set serviceable delegator to candidate shard leader", zap.Int64("node", channel.Node),
zap.Bool("channelIsStreamingNode", channelIsStreamingNode))
if channelIsStreamingNode && !candidateIsStreamingNode {
// When upgrading from 2.5 to 2.6, the delegator leader may not locate at streaming node.
// We always use the streaming node as the delegator leader to avoid the delete data lost when loading segment.
logger.Debug("set delegator on stream node to candidate shard leader", zap.Int64("node", channel.Node),
zap.Int64("channel version", channel.Version)) zap.Int64("channel version", channel.Version))
case candidatesServiceable == channelServiceable && channel.Version > candidates.Version:
// Same service status but higher version
updateNeeded = true
logger.Debug("set serviceable delegator with larger version to candidate shard leader", zap.Int64("node", channel.Node),
zap.Int64("channel version", channel.Version), zap.Int64("candidate version", candidates.Version))
}
if updateNeeded {
candidates = channel candidates = channel
} else if !channelIsStreamingNode && candidateIsStreamingNode {
// When downgrading from 2.6 to 2.5, the delegator leader may locate at non-streaming node.
// We always use the non-streaming node as the delegator leader to avoid the delete data lost when loading segment.
logger.Debug("found delegator which is not on stream node", zap.Int64("node", channel.Node),
zap.Int64("channel version", channel.Version))
continue
} else { } else {
updateNeeded := false logger.Debug("not set any channel to candidates in this round")
switch {
case !candidatesServiceable && channelServiceable:
// Current candidate is not serviceable but new channel is
updateNeeded = true
logger.Debug("set serviceable delegator to candidate shard leader", zap.Int64("node", channel.Node),
zap.Int64("channel version", channel.Version))
case candidatesServiceable == channelServiceable && channel.Version > candidates.Version:
// Same service status but higher version
updateNeeded = true
logger.Debug("set serviceable delegator with larger version to candidate shard leader", zap.Int64("node", channel.Node),
zap.Int64("channel version", channel.Version), zap.Int64("candidate version", candidates.Version))
}
if updateNeeded {
candidates = channel
} else {
logger.Debug("not set any channel to candidates in this round")
}
} }
} }
} }
@ -416,17 +394,6 @@ func (m *ChannelDistManager) GetShardLeader(channelName string, replica *Replica
return candidates return candidates
} }
// checkIfStreamingNode checks if the node is a streaming node.
// Because the session of streaming node and embedded query node are different,
// So we need to check if the node is a streaming node from the query node session but not streaming node session to avoid the wrong check result.
func (m *ChannelDistManager) checkIfStreamingNode(nodeID int64) bool {
node := m.nodeManager.Get(nodeID)
if node == nil {
return false
}
return node.IsEmbeddedQueryNodeInStreamingNode() || node.IsInStandalone()
}
func (m *ChannelDistManager) GetChannelDist(collectionID int64) []*metricsinfo.DmChannel { func (m *ChannelDistManager) GetChannelDist(collectionID int64) []*metricsinfo.DmChannel {
m.rwmutex.RLock() m.rwmutex.RLock()
defer m.rwmutex.RUnlock() defer m.rwmutex.RUnlock()

View File

@ -24,7 +24,6 @@ import (
"github.com/milvus-io/milvus/internal/coordinator/snmanager" "github.com/milvus-io/milvus/internal/coordinator/snmanager"
"github.com/milvus-io/milvus/internal/querycoordv2/session" "github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/sessionutil"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb" "github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb" "github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/metricsinfo" "github.com/milvus-io/milvus/pkg/v2/util/metricsinfo"
@ -344,25 +343,6 @@ func (suite *ChannelDistManagerSuite) TestGetShardLeader() {
// Test nonexistent channel // Test nonexistent channel
leader = dist.GetShardLeader("nonexistent", replica) leader = dist.GetShardLeader("nonexistent", replica)
suite.Nil(leader) suite.Nil(leader)
// Test streaming node
nodeManager.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
NodeID: 4,
Address: "localhost:1",
Hostname: "localhost",
Labels: map[string]string{sessionutil.LabelStreamingNodeEmbeddedQueryNode: "1"},
}))
channel1Node4 := suite.channels["dmc0"].Clone()
channel1Node4.Node = 4
channel1Node4.Version = 3
channel1Node4.View.Status.Serviceable = false
dist.Update(4, channel1Node4)
leader = dist.GetShardLeader("dmc0", replica)
suite.NotNil(leader)
suite.Equal(int64(4), leader.Node)
suite.Equal(int64(3), leader.Version)
suite.False(leader.IsServiceable())
} }
func TestGetChannelDistJSON(t *testing.T) { func TestGetChannelDistJSON(t *testing.T) {

View File

@ -31,12 +31,10 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus/internal/coordinator/snmanager"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params" . "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session" "github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/streamingutil"
"github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb" "github.com/milvus-io/milvus/pkg/v2/proto/datapb"
@ -243,11 +241,6 @@ func (ex *Executor) loadSegment(task *SegmentTask, step int) error {
return err return err
} }
if err := ex.checkIfShardLeaderIsStreamingNode(view); err != nil {
log.Warn("shard leader is not a streamingnode, skip load segment", zap.Error(err))
return err
}
log = log.With(zap.Int64("shardLeader", view.Node)) log = log.With(zap.Int64("shardLeader", view.Node))
// NOTE: for balance segment task, expected load and release execution on the same shard leader // NOTE: for balance segment task, expected load and release execution on the same shard leader
@ -270,25 +263,29 @@ func (ex *Executor) loadSegment(task *SegmentTask, step int) error {
return nil return nil
} }
// checkIfShardLeaderIsStreamingNode checks if the shard leader is a streamingnode. // If we enable following checking when loading segments,
// 1. all segment should always be loaded by streamingnode but not 2.5 querynode, make some search and query failure when upgrading.
// Otherwise, some search and query result will be wrong when upgrading.
// We choose to disable this checking for now to promise available search and query when upgrading.
//
// Because the L0 management at 2.6 and 2.5 is different, so when upgrading mixcoord, // Because the L0 management at 2.6 and 2.5 is different, so when upgrading mixcoord,
// the new mixcoord will make a wrong plan when balancing a segment from one query node to another by 2.5 delegator. // the new mixcoord will make a wrong plan when balancing a segment from one query node to another by 2.5 delegator.
// We need to balance the 2.5 delegator to 2.6 delegator before balancing any segment by 2.6 mixcoord. // We need to balance the 2.5 delegator to 2.6 delegator before balancing any segment by 2.6 mixcoord.
func (ex *Executor) checkIfShardLeaderIsStreamingNode(view *meta.DmChannel) error { // func (ex *Executor) checkIfShardLeaderIsStreamingNode(view *meta.DmChannel) error {
if !streamingutil.IsStreamingServiceEnabled() { // if !streamingutil.IsStreamingServiceEnabled() {
return nil // return nil
} // }
//
node := ex.nodeMgr.Get(view.Node) // node := ex.nodeMgr.Get(view.Node)
if node == nil { // if node == nil {
return merr.WrapErrServiceInternal(fmt.Sprintf("node %d is not found", view.Node)) // return merr.WrapErrServiceInternal(fmt.Sprintf("node %d is not found", view.Node))
} // }
nodes := snmanager.StaticStreamingNodeManager.GetStreamingQueryNodeIDs() // nodes := snmanager.StaticStreamingNodeManager.GetStreamingQueryNodeIDs()
if !nodes.Contain(view.Node) { // if !nodes.Contain(view.Node) {
return merr.WrapErrServiceInternal(fmt.Sprintf("channel %s at node %d is not working at streamingnode, skip load segment", view.GetChannelName(), view.Node)) // return merr.WrapErrServiceInternal(fmt.Sprintf("channel %s at node %d is not working at streamingnode, skip load segment", view.GetChannelName(), view.Node))
} // }
return nil // return nil
} // }
func (ex *Executor) releaseSegment(task *SegmentTask, step int) { func (ex *Executor) releaseSegment(task *SegmentTask, step int) {
defer ex.removeTask(task, step) defer ex.removeTask(task, step)