mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
Fix QueryCoord panic when a query node goes down (#21400)
Signed-off-by: SimFG <bang.fu@zilliz.com>
This commit is contained in:
parent
d0d0db8c64
commit
6a29a964df
@ -19,11 +19,13 @@ package balance
|
|||||||
import (
|
import (
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
|
"github.com/milvus-io/milvus/internal/log"
|
||||||
"github.com/milvus-io/milvus/internal/proto/querypb"
|
"github.com/milvus-io/milvus/internal/proto/querypb"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
||||||
"github.com/samber/lo"
|
"github.com/samber/lo"
|
||||||
|
"go.uber.org/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
type RowCountBasedBalancer struct {
|
type RowCountBasedBalancer struct {
|
||||||
@ -117,17 +119,21 @@ func (b *RowCountBasedBalancer) balanceReplica(replica *meta.Replica) ([]Segment
|
|||||||
segments = lo.Filter(segments, func(segment *meta.Segment, _ int) bool {
|
segments = lo.Filter(segments, func(segment *meta.Segment, _ int) bool {
|
||||||
return b.targetMgr.GetHistoricalSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil
|
return b.targetMgr.GetHistoricalSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil
|
||||||
})
|
})
|
||||||
|
|
||||||
|
if isStopping, err := b.nodeManager.IsStoppingNode(nid); err != nil {
|
||||||
|
log.Info("not existed node", zap.Int64("nid", nid), zap.Any("segments", segments), zap.Error(err))
|
||||||
|
continue
|
||||||
|
} else if isStopping {
|
||||||
|
stoppingNodesSegments[nid] = segments
|
||||||
|
} else {
|
||||||
|
nodesSegments[nid] = segments
|
||||||
|
}
|
||||||
|
|
||||||
cnt := 0
|
cnt := 0
|
||||||
for _, s := range segments {
|
for _, s := range segments {
|
||||||
cnt += int(s.GetNumOfRows())
|
cnt += int(s.GetNumOfRows())
|
||||||
}
|
}
|
||||||
nodesRowCnt[nid] = cnt
|
nodesRowCnt[nid] = cnt
|
||||||
|
|
||||||
if nodeInfo := b.nodeManager.Get(nid); nodeInfo.IsStoppingState() {
|
|
||||||
stoppingNodesSegments[nid] = segments
|
|
||||||
} else {
|
|
||||||
nodesSegments[nid] = segments
|
|
||||||
}
|
|
||||||
totalCnt += cnt
|
totalCnt += cnt
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -134,6 +134,7 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
cases := []struct {
|
cases := []struct {
|
||||||
name string
|
name string
|
||||||
nodes []int64
|
nodes []int64
|
||||||
|
notExistedNodes []int64
|
||||||
segmentCnts []int
|
segmentCnts []int
|
||||||
states []session.State
|
states []session.State
|
||||||
shouldMock bool
|
shouldMock bool
|
||||||
@ -212,16 +213,18 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "already balanced",
|
name: "already balanced",
|
||||||
nodes: []int64{1, 2},
|
nodes: []int64{1, 2},
|
||||||
segmentCnts: []int{1, 2},
|
notExistedNodes: []int64{10},
|
||||||
states: []session.State{session.NodeStateNormal, session.NodeStateNormal},
|
segmentCnts: []int{1, 2},
|
||||||
|
states: []session.State{session.NodeStateNormal, session.NodeStateNormal},
|
||||||
distributions: map[int64][]*meta.Segment{
|
distributions: map[int64][]*meta.Segment{
|
||||||
1: {{SegmentInfo: &datapb.SegmentInfo{ID: 1, CollectionID: 1, NumOfRows: 30}, Node: 1}},
|
1: {{SegmentInfo: &datapb.SegmentInfo{ID: 1, CollectionID: 1, NumOfRows: 30}, Node: 1}},
|
||||||
2: {
|
2: {
|
||||||
{SegmentInfo: &datapb.SegmentInfo{ID: 2, CollectionID: 1, NumOfRows: 20}, Node: 2},
|
{SegmentInfo: &datapb.SegmentInfo{ID: 2, CollectionID: 1, NumOfRows: 20}, Node: 2},
|
||||||
{SegmentInfo: &datapb.SegmentInfo{ID: 3, CollectionID: 1, NumOfRows: 30}, Node: 2},
|
{SegmentInfo: &datapb.SegmentInfo{ID: 3, CollectionID: 1, NumOfRows: 30}, Node: 2},
|
||||||
},
|
},
|
||||||
|
10: {{SegmentInfo: &datapb.SegmentInfo{ID: 4, CollectionID: 1, NumOfRows: 30}, Node: 10}},
|
||||||
},
|
},
|
||||||
expectPlans: []SegmentAssignPlan{},
|
expectPlans: []SegmentAssignPlan{},
|
||||||
expectChannelPlans: []ChannelAssignPlan{},
|
expectChannelPlans: []ChannelAssignPlan{},
|
||||||
@ -259,7 +262,7 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
collection.LoadPercentage = 100
|
collection.LoadPercentage = 100
|
||||||
collection.Status = querypb.LoadStatus_Loaded
|
collection.Status = querypb.LoadStatus_Loaded
|
||||||
balancer.meta.CollectionManager.PutCollection(collection)
|
balancer.meta.CollectionManager.PutCollection(collection)
|
||||||
balancer.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, c.nodes))
|
balancer.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, append(c.nodes, c.notExistedNodes...)))
|
||||||
for node, s := range c.distributions {
|
for node, s := range c.distributions {
|
||||||
balancer.dist.SegmentDistManager.Update(node, s...)
|
balancer.dist.SegmentDistManager.Update(node, s...)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -17,6 +17,7 @@
|
|||||||
package session
|
package session
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -59,6 +60,17 @@ func (m *NodeManager) Stopping(nodeID int64) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *NodeManager) IsStoppingNode(nodeID int64) (bool, error) {
|
||||||
|
m.mu.RLock()
|
||||||
|
defer m.mu.RUnlock()
|
||||||
|
|
||||||
|
node := m.nodes[nodeID]
|
||||||
|
if node == nil {
|
||||||
|
return false, fmt.Errorf("nodeID[%d] isn't existed", nodeID)
|
||||||
|
}
|
||||||
|
return node.IsStoppingState(), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *NodeManager) Get(nodeID int64) *NodeInfo {
|
func (m *NodeManager) Get(nodeID int64) *NodeInfo {
|
||||||
m.mu.RLock()
|
m.mu.RLock()
|
||||||
defer m.mu.RUnlock()
|
defer m.mu.RUnlock()
|
||||||
|
|||||||
@ -328,7 +328,6 @@ func (node *QueryNode) Stop() error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("session fail to go stopping state", zap.Error(err))
|
log.Warn("session fail to go stopping state", zap.Error(err))
|
||||||
} else {
|
} else {
|
||||||
node.UpdateStateCode(commonpb.StateCode_Stopping)
|
|
||||||
noSegmentChan := node.metaReplica.getNoSegmentChan()
|
noSegmentChan := node.metaReplica.getNoSegmentChan()
|
||||||
select {
|
select {
|
||||||
case <-noSegmentChan:
|
case <-noSegmentChan:
|
||||||
|
|||||||
@ -15,7 +15,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
# get_milvus_process prints, on a single line, the first `ps -e` column (the
# PID) of every process whose `ps` line contains "milvus".
#
# The result is written to stdout so callers can capture it with
# $(get_milvus_process). It must not be passed to `return`: `return` only
# accepts a numeric exit status, so a PID list there is an error and nothing
# would reach the caller's command substitution.
function get_milvus_process() {
  # The unquoted substitution is deliberate: word splitting joins the PIDs
  # with single spaces.
  echo $(ps -e | grep milvus | grep -v grep | awk '{print $1}')
}
|
||||||
|
|
||||||
echo "Stopping milvus..."
|
echo "Stopping milvus..."
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user