mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 09:38:39 +08:00
fix: query node may get stuck in stopping progress (#33104)
issue: #33103 When trying to perform a stopping balance for a stopping query node, the balancer gets the node list from replica.GetNodes and then checks whether each node is stopping; if so, a stopping balance is triggered for that replica. After the replica refactor, replica.GetNodes only returns rwNodes, while stopping nodes are maintained in roNodes, so the balancer could not find the replica containing the stopping node and the stopping balance for that replica was never triggered — the query node then got stuck forever because its segments/channels were never moved out. --------- Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
parent
c6e2dd05fc
commit
a7f6193bfc
@ -77,68 +77,51 @@ func (b *ChannelLevelScoreBalancer) BalanceReplica(replica *meta.Replica) ([]Seg
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
onlineNodes := make([]int64, 0)
|
rwNodes := replica.GetChannelRWNodes(channelName)
|
||||||
offlineNodes := make([]int64, 0)
|
roNodes := replica.GetRONodes()
|
||||||
// read only nodes is offline in current replica.
|
|
||||||
if replica.RONodesCount() > 0 {
|
|
||||||
// if node is stop or transfer to other rg
|
|
||||||
log.RatedInfo(10, "meet read only node, try to move out all segment/channel", zap.Int64s("node", replica.GetRONodes()))
|
|
||||||
offlineNodes = append(offlineNodes, replica.GetRONodes()...)
|
|
||||||
}
|
|
||||||
|
|
||||||
// mark channel's outbound access node as offline
|
// mark channel's outbound access node as offline
|
||||||
channelRWNode := typeutil.NewUniqueSet(replica.GetChannelRWNodes(channelName)...)
|
channelRWNode := typeutil.NewUniqueSet(rwNodes...)
|
||||||
channelDist := b.dist.ChannelDistManager.GetByFilter(meta.WithChannelName2Channel(channelName), meta.WithReplica2Channel(replica))
|
channelDist := b.dist.ChannelDistManager.GetByFilter(meta.WithChannelName2Channel(channelName), meta.WithReplica2Channel(replica))
|
||||||
for _, channel := range channelDist {
|
for _, channel := range channelDist {
|
||||||
if !channelRWNode.Contain(channel.Node) {
|
if !channelRWNode.Contain(channel.Node) {
|
||||||
offlineNodes = append(offlineNodes, channel.Node)
|
roNodes = append(roNodes, channel.Node)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
segmentDist := b.dist.SegmentDistManager.GetByFilter(meta.WithChannel(channelName), meta.WithReplica(replica))
|
segmentDist := b.dist.SegmentDistManager.GetByFilter(meta.WithChannel(channelName), meta.WithReplica(replica))
|
||||||
for _, segment := range segmentDist {
|
for _, segment := range segmentDist {
|
||||||
if !channelRWNode.Contain(segment.Node) {
|
if !channelRWNode.Contain(segment.Node) {
|
||||||
offlineNodes = append(offlineNodes, segment.Node)
|
roNodes = append(roNodes, segment.Node)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for nid := range channelRWNode {
|
if len(rwNodes) == 0 {
|
||||||
if isStopping, err := b.nodeManager.IsStoppingNode(nid); err != nil {
|
|
||||||
log.Info("not existed node", zap.Int64("nid", nid), zap.Error(err))
|
|
||||||
continue
|
|
||||||
} else if isStopping {
|
|
||||||
offlineNodes = append(offlineNodes, nid)
|
|
||||||
} else {
|
|
||||||
onlineNodes = append(onlineNodes, nid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(onlineNodes) == 0 {
|
|
||||||
// no available nodes to balance
|
// no available nodes to balance
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(offlineNodes) != 0 {
|
if len(roNodes) != 0 {
|
||||||
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
||||||
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", offlineNodes))
|
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", roNodes))
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("Handle stopping nodes",
|
log.Info("Handle stopping nodes",
|
||||||
zap.Any("stopping nodes", offlineNodes),
|
zap.Any("stopping nodes", roNodes),
|
||||||
zap.Any("available nodes", onlineNodes),
|
zap.Any("available nodes", rwNodes),
|
||||||
)
|
)
|
||||||
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
||||||
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, channelName, onlineNodes, offlineNodes)...)
|
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, channelName, rwNodes, roNodes)...)
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, channelName, onlineNodes, offlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, channelName, rwNodes, roNodes)...)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
||||||
channelPlans = append(channelPlans, b.genChannelPlan(replica, channelName, onlineNodes)...)
|
channelPlans = append(channelPlans, b.genChannelPlan(replica, channelName, rwNodes)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genSegmentPlan(replica, channelName, onlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genSegmentPlan(replica, channelName, rwNodes)...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1162,8 +1162,13 @@ func (suite *ChannelLevelScoreBalancerTestSuite) TestExclusiveChannelBalance_Nod
|
|||||||
},
|
},
|
||||||
}...)
|
}...)
|
||||||
|
|
||||||
suite.balancer.nodeManager.Stopping(ch1Nodes[0])
|
balancer.nodeManager.Stopping(ch1Nodes[0])
|
||||||
suite.balancer.nodeManager.Stopping(ch2Nodes[0])
|
balancer.nodeManager.Stopping(ch2Nodes[0])
|
||||||
|
suite.balancer.meta.ResourceManager.HandleNodeStopping(ch1Nodes[0])
|
||||||
|
suite.balancer.meta.ResourceManager.HandleNodeStopping(ch2Nodes[0])
|
||||||
|
utils.RecoverAllCollection(balancer.meta)
|
||||||
|
|
||||||
|
replica = balancer.meta.ReplicaManager.Get(replica.GetID())
|
||||||
sPlans, cPlans := balancer.BalanceReplica(replica)
|
sPlans, cPlans := balancer.BalanceReplica(replica)
|
||||||
suite.Len(sPlans, 0)
|
suite.Len(sPlans, 0)
|
||||||
suite.Len(cPlans, 2)
|
suite.Len(cPlans, 2)
|
||||||
|
|||||||
@ -466,67 +466,49 @@ func (b *MultiTargetBalancer) BalanceReplica(replica *meta.Replica) ([]SegmentAs
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
onlineNodes := make([]int64, 0)
|
rwNodes := replica.GetRWNodes()
|
||||||
offlineNodes := make([]int64, 0)
|
roNodes := replica.GetRONodes()
|
||||||
|
|
||||||
// read only nodes is offline in current replica.
|
if len(rwNodes) == 0 {
|
||||||
if replica.RONodesCount() > 0 {
|
|
||||||
// if node is stop or transfer to other rg
|
|
||||||
log.RatedInfo(10, "meet read only node, try to move out all segment/channel", zap.Int64s("node", replica.GetRONodes()))
|
|
||||||
offlineNodes = append(offlineNodes, replica.GetRONodes()...)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, nid := range replica.GetNodes() {
|
|
||||||
if isStopping, err := b.nodeManager.IsStoppingNode(nid); err != nil {
|
|
||||||
log.Info("not existed node", zap.Int64("nid", nid), zap.Error(err))
|
|
||||||
continue
|
|
||||||
} else if isStopping {
|
|
||||||
offlineNodes = append(offlineNodes, nid)
|
|
||||||
} else {
|
|
||||||
onlineNodes = append(onlineNodes, nid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(onlineNodes) == 0 {
|
|
||||||
// no available nodes to balance
|
// no available nodes to balance
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// print current distribution before generating plans
|
// print current distribution before generating plans
|
||||||
segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
|
segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
|
||||||
if len(offlineNodes) != 0 {
|
if len(roNodes) != 0 {
|
||||||
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
||||||
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", offlineNodes))
|
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", roNodes))
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("Handle stopping nodes",
|
log.Info("Handle stopping nodes",
|
||||||
zap.Any("stopping nodes", offlineNodes),
|
zap.Any("stopping nodes", roNodes),
|
||||||
zap.Any("available nodes", onlineNodes),
|
zap.Any("available nodes", rwNodes),
|
||||||
)
|
)
|
||||||
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
||||||
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, onlineNodes, offlineNodes)...)
|
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, rwNodes, roNodes)...)
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, onlineNodes, offlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, rwNodes, roNodes)...)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
||||||
channelPlans = append(channelPlans, b.genChannelPlan(replica, onlineNodes)...)
|
channelPlans = append(channelPlans, b.genChannelPlan(replica, rwNodes)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = b.genSegmentPlan(replica)
|
segmentPlans = b.genSegmentPlan(replica, rwNodes)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return segmentPlans, channelPlans
|
return segmentPlans, channelPlans
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *MultiTargetBalancer) genSegmentPlan(replica *meta.Replica) []SegmentAssignPlan {
|
func (b *MultiTargetBalancer) genSegmentPlan(replica *meta.Replica, rwNodes []int64) []SegmentAssignPlan {
|
||||||
// get segments distribution on replica level and global level
|
// get segments distribution on replica level and global level
|
||||||
nodeSegments := make(map[int64][]*meta.Segment)
|
nodeSegments := make(map[int64][]*meta.Segment)
|
||||||
globalNodeSegments := make(map[int64][]*meta.Segment)
|
globalNodeSegments := make(map[int64][]*meta.Segment)
|
||||||
for _, node := range replica.GetNodes() {
|
for _, node := range rwNodes {
|
||||||
dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))
|
dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))
|
||||||
segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
|
segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
|
||||||
return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
|
return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
|
||||||
|
|||||||
@ -126,9 +126,7 @@ func (b *RowCountBasedBalancer) AssignChannel(channels []*meta.DmChannel, nodes
|
|||||||
|
|
||||||
func (b *RowCountBasedBalancer) convertToNodeItemsBySegment(nodeIDs []int64) []*nodeItem {
|
func (b *RowCountBasedBalancer) convertToNodeItemsBySegment(nodeIDs []int64) []*nodeItem {
|
||||||
ret := make([]*nodeItem, 0, len(nodeIDs))
|
ret := make([]*nodeItem, 0, len(nodeIDs))
|
||||||
for _, nodeInfo := range b.getNodes(nodeIDs) {
|
for _, node := range nodeIDs {
|
||||||
node := nodeInfo.ID()
|
|
||||||
|
|
||||||
// calculate sealed segment row count on node
|
// calculate sealed segment row count on node
|
||||||
segments := b.dist.SegmentDistManager.GetByFilter(meta.WithNodeID(node))
|
segments := b.dist.SegmentDistManager.GetByFilter(meta.WithNodeID(node))
|
||||||
rowcnt := 0
|
rowcnt := 0
|
||||||
@ -151,8 +149,7 @@ func (b *RowCountBasedBalancer) convertToNodeItemsBySegment(nodeIDs []int64) []*
|
|||||||
|
|
||||||
func (b *RowCountBasedBalancer) convertToNodeItemsByChannel(nodeIDs []int64) []*nodeItem {
|
func (b *RowCountBasedBalancer) convertToNodeItemsByChannel(nodeIDs []int64) []*nodeItem {
|
||||||
ret := make([]*nodeItem, 0, len(nodeIDs))
|
ret := make([]*nodeItem, 0, len(nodeIDs))
|
||||||
for _, nodeInfo := range b.getNodes(nodeIDs) {
|
for _, node := range nodeIDs {
|
||||||
node := nodeInfo.ID()
|
|
||||||
channels := b.dist.ChannelDistManager.GetByFilter(meta.WithNodeID2Channel(node))
|
channels := b.dist.ChannelDistManager.GetByFilter(meta.WithNodeID2Channel(node))
|
||||||
|
|
||||||
// more channel num, less priority
|
// more channel num, less priority
|
||||||
@ -172,71 +169,52 @@ func (b *RowCountBasedBalancer) BalanceReplica(replica *meta.Replica) ([]Segment
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
onlineNodes := make([]int64, 0)
|
rwNodes := replica.GetRWNodes()
|
||||||
offlineNodes := make([]int64, 0)
|
roNodes := replica.GetRONodes()
|
||||||
|
if len(rwNodes) == 0 {
|
||||||
// read only nodes is offline in current replica.
|
|
||||||
if replica.RONodesCount() > 0 {
|
|
||||||
// if node is stop or transfer to other rg
|
|
||||||
log.RatedInfo(10, "meet read only node, try to move out all segment/channel", zap.Int64s("node", replica.GetRONodes()))
|
|
||||||
offlineNodes = append(offlineNodes, replica.GetRONodes()...)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, nid := range replica.GetNodes() {
|
|
||||||
if isStopping, err := b.nodeManager.IsStoppingNode(nid); err != nil {
|
|
||||||
log.Info("not existed node", zap.Int64("nid", nid), zap.Error(err))
|
|
||||||
continue
|
|
||||||
} else if isStopping {
|
|
||||||
offlineNodes = append(offlineNodes, nid)
|
|
||||||
} else {
|
|
||||||
onlineNodes = append(onlineNodes, nid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(onlineNodes) == 0 {
|
|
||||||
// no available nodes to balance
|
// no available nodes to balance
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
|
segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
|
||||||
if len(offlineNodes) != 0 {
|
if len(roNodes) != 0 {
|
||||||
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
||||||
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", offlineNodes))
|
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", roNodes))
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("Handle stopping nodes",
|
log.Info("Handle stopping nodes",
|
||||||
zap.Any("stopping nodes", offlineNodes),
|
zap.Any("stopping nodes", roNodes),
|
||||||
zap.Any("available nodes", onlineNodes),
|
zap.Any("available nodes", rwNodes),
|
||||||
)
|
)
|
||||||
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
||||||
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, onlineNodes, offlineNodes)...)
|
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, rwNodes, roNodes)...)
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, onlineNodes, offlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, rwNodes, roNodes)...)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
||||||
channelPlans = append(channelPlans, b.genChannelPlan(replica, onlineNodes)...)
|
channelPlans = append(channelPlans, b.genChannelPlan(replica, rwNodes)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genSegmentPlan(replica, onlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genSegmentPlan(replica, rwNodes)...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return segmentPlans, channelPlans
|
return segmentPlans, channelPlans
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *RowCountBasedBalancer) genStoppingSegmentPlan(replica *meta.Replica, onlineNodes []int64, offlineNodes []int64) []SegmentAssignPlan {
|
func (b *RowCountBasedBalancer) genStoppingSegmentPlan(replica *meta.Replica, rwNodes []int64, roNodes []int64) []SegmentAssignPlan {
|
||||||
segmentPlans := make([]SegmentAssignPlan, 0)
|
segmentPlans := make([]SegmentAssignPlan, 0)
|
||||||
for _, nodeID := range offlineNodes {
|
for _, nodeID := range roNodes {
|
||||||
dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(nodeID))
|
dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(nodeID))
|
||||||
segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
|
segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
|
||||||
return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
|
return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
|
||||||
b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.NextTarget) != nil &&
|
b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.NextTarget) != nil &&
|
||||||
segment.GetLevel() != datapb.SegmentLevel_L0
|
segment.GetLevel() != datapb.SegmentLevel_L0
|
||||||
})
|
})
|
||||||
plans := b.AssignSegment(replica.GetCollectionID(), segments, onlineNodes, false)
|
plans := b.AssignSegment(replica.GetCollectionID(), segments, rwNodes, false)
|
||||||
for i := range plans {
|
for i := range plans {
|
||||||
plans[i].From = nodeID
|
plans[i].From = nodeID
|
||||||
plans[i].Replica = replica
|
plans[i].Replica = replica
|
||||||
@ -246,13 +224,13 @@ func (b *RowCountBasedBalancer) genStoppingSegmentPlan(replica *meta.Replica, on
|
|||||||
return segmentPlans
|
return segmentPlans
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *RowCountBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNodes []int64) []SegmentAssignPlan {
|
func (b *RowCountBasedBalancer) genSegmentPlan(replica *meta.Replica, rwNodes []int64) []SegmentAssignPlan {
|
||||||
segmentsToMove := make([]*meta.Segment, 0)
|
segmentsToMove := make([]*meta.Segment, 0)
|
||||||
|
|
||||||
nodeRowCount := make(map[int64]int, 0)
|
nodeRowCount := make(map[int64]int, 0)
|
||||||
segmentDist := make(map[int64][]*meta.Segment)
|
segmentDist := make(map[int64][]*meta.Segment)
|
||||||
totalRowCount := 0
|
totalRowCount := 0
|
||||||
for _, node := range onlineNodes {
|
for _, node := range rwNodes {
|
||||||
dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))
|
dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))
|
||||||
segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
|
segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
|
||||||
return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
|
return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
|
||||||
@ -273,7 +251,7 @@ func (b *RowCountBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNode
|
|||||||
}
|
}
|
||||||
|
|
||||||
// find nodes with less row count than average
|
// find nodes with less row count than average
|
||||||
average := totalRowCount / len(onlineNodes)
|
average := totalRowCount / len(rwNodes)
|
||||||
nodesWithLessRow := make([]int64, 0)
|
nodesWithLessRow := make([]int64, 0)
|
||||||
for node, segments := range segmentDist {
|
for node, segments := range segmentDist {
|
||||||
sort.Slice(segments, func(i, j int) bool {
|
sort.Slice(segments, func(i, j int) bool {
|
||||||
@ -313,11 +291,11 @@ func (b *RowCountBasedBalancer) genSegmentPlan(replica *meta.Replica, onlineNode
|
|||||||
return segmentPlans
|
return segmentPlans
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *RowCountBasedBalancer) genStoppingChannelPlan(replica *meta.Replica, onlineNodes []int64, offlineNodes []int64) []ChannelAssignPlan {
|
func (b *RowCountBasedBalancer) genStoppingChannelPlan(replica *meta.Replica, rwNodes []int64, roNodes []int64) []ChannelAssignPlan {
|
||||||
channelPlans := make([]ChannelAssignPlan, 0)
|
channelPlans := make([]ChannelAssignPlan, 0)
|
||||||
for _, nodeID := range offlineNodes {
|
for _, nodeID := range roNodes {
|
||||||
dmChannels := b.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(nodeID))
|
dmChannels := b.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(nodeID))
|
||||||
plans := b.AssignChannel(dmChannels, onlineNodes, false)
|
plans := b.AssignChannel(dmChannels, rwNodes, false)
|
||||||
for i := range plans {
|
for i := range plans {
|
||||||
plans[i].From = nodeID
|
plans[i].From = nodeID
|
||||||
plans[i].Replica = replica
|
plans[i].Replica = replica
|
||||||
@ -327,20 +305,20 @@ func (b *RowCountBasedBalancer) genStoppingChannelPlan(replica *meta.Replica, on
|
|||||||
return channelPlans
|
return channelPlans
|
||||||
}
|
}
|
||||||
|
|
||||||
func (b *RowCountBasedBalancer) genChannelPlan(replica *meta.Replica, onlineNodes []int64) []ChannelAssignPlan {
|
func (b *RowCountBasedBalancer) genChannelPlan(replica *meta.Replica, rwNodes []int64) []ChannelAssignPlan {
|
||||||
channelPlans := make([]ChannelAssignPlan, 0)
|
channelPlans := make([]ChannelAssignPlan, 0)
|
||||||
if len(onlineNodes) > 1 {
|
if len(rwNodes) > 1 {
|
||||||
// start to balance channels on all available nodes
|
// start to balance channels on all available nodes
|
||||||
channelDist := b.dist.ChannelDistManager.GetByFilter(meta.WithReplica2Channel(replica))
|
channelDist := b.dist.ChannelDistManager.GetByFilter(meta.WithReplica2Channel(replica))
|
||||||
if len(channelDist) == 0 {
|
if len(channelDist) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
average := int(math.Ceil(float64(len(channelDist)) / float64(len(onlineNodes))))
|
average := int(math.Ceil(float64(len(channelDist)) / float64(len(rwNodes))))
|
||||||
|
|
||||||
// find nodes with less channel count than average
|
// find nodes with less channel count than average
|
||||||
nodeWithLessChannel := make([]int64, 0)
|
nodeWithLessChannel := make([]int64, 0)
|
||||||
channelsToMove := make([]*meta.DmChannel, 0)
|
channelsToMove := make([]*meta.DmChannel, 0)
|
||||||
for _, node := range onlineNodes {
|
for _, node := range rwNodes {
|
||||||
channels := b.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(node))
|
channels := b.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(node))
|
||||||
|
|
||||||
if len(channels) <= average {
|
if len(channels) <= average {
|
||||||
|
|||||||
@ -409,8 +409,8 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
segmentCnts: []int{1, 2},
|
segmentCnts: []int{1, 2},
|
||||||
states: []session.State{session.NodeStateNormal, session.NodeStateNormal},
|
states: []session.State{session.NodeStateNormal, session.NodeStateNormal},
|
||||||
distributions: map[int64][]*meta.Segment{
|
distributions: map[int64][]*meta.Segment{
|
||||||
1: {{SegmentInfo: &datapb.SegmentInfo{ID: 1, CollectionID: 1, NumOfRows: 30}, Node: 11}},
|
11: {{SegmentInfo: &datapb.SegmentInfo{ID: 1, CollectionID: 1, NumOfRows: 30}, Node: 11}},
|
||||||
2: {
|
22: {
|
||||||
{SegmentInfo: &datapb.SegmentInfo{ID: 2, CollectionID: 1, NumOfRows: 20}, Node: 22},
|
{SegmentInfo: &datapb.SegmentInfo{ID: 2, CollectionID: 1, NumOfRows: 20}, Node: 22},
|
||||||
{SegmentInfo: &datapb.SegmentInfo{ID: 3, CollectionID: 1, NumOfRows: 30}, Node: 22},
|
{SegmentInfo: &datapb.SegmentInfo{ID: 3, CollectionID: 1, NumOfRows: 30}, Node: 22},
|
||||||
},
|
},
|
||||||
@ -455,7 +455,7 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
collection.LoadType = querypb.LoadType_LoadCollection
|
collection.LoadType = querypb.LoadType_LoadCollection
|
||||||
balancer.meta.CollectionManager.PutCollection(collection)
|
balancer.meta.CollectionManager.PutCollection(collection)
|
||||||
balancer.meta.CollectionManager.PutPartition(utils.CreateTestPartition(1, 1))
|
balancer.meta.CollectionManager.PutPartition(utils.CreateTestPartition(1, 1))
|
||||||
balancer.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, append(c.nodes, c.notExistedNodes...)))
|
balancer.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, c.nodes))
|
||||||
suite.broker.ExpectedCalls = nil
|
suite.broker.ExpectedCalls = nil
|
||||||
suite.broker.EXPECT().GetRecoveryInfoV2(mock.Anything, int64(1)).Return(nil, segments, nil)
|
suite.broker.EXPECT().GetRecoveryInfoV2(mock.Anything, int64(1)).Return(nil, segments, nil)
|
||||||
balancer.targetMgr.UpdateCollectionNextTarget(int64(1))
|
balancer.targetMgr.UpdateCollectionNextTarget(int64(1))
|
||||||
@ -481,6 +481,7 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
suite.balancer.nodeManager.Add(nodeInfo)
|
suite.balancer.nodeManager.Add(nodeInfo)
|
||||||
suite.balancer.meta.ResourceManager.HandleNodeUp(c.nodes[i])
|
suite.balancer.meta.ResourceManager.HandleNodeUp(c.nodes[i])
|
||||||
}
|
}
|
||||||
|
utils.RecoverAllCollection(balancer.meta)
|
||||||
|
|
||||||
segmentPlans, channelPlans := suite.getCollectionBalancePlans(balancer, 1)
|
segmentPlans, channelPlans := suite.getCollectionBalancePlans(balancer, 1)
|
||||||
if !c.multiple {
|
if !c.multiple {
|
||||||
@ -492,10 +493,11 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// clear distribution
|
// clear distribution
|
||||||
for node := range c.distributions {
|
|
||||||
|
for _, node := range c.nodes {
|
||||||
|
balancer.meta.ResourceManager.HandleNodeDown(node)
|
||||||
|
balancer.nodeManager.Remove(node)
|
||||||
balancer.dist.SegmentDistManager.Update(node)
|
balancer.dist.SegmentDistManager.Update(node)
|
||||||
}
|
|
||||||
for node := range c.distributionChannels {
|
|
||||||
balancer.dist.ChannelDistManager.Update(node)
|
balancer.dist.ChannelDistManager.Update(node)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
@ -693,6 +695,8 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalanceOnPartStopping() {
|
|||||||
suite.balancer.nodeManager.Add(nodeInfo)
|
suite.balancer.nodeManager.Add(nodeInfo)
|
||||||
suite.balancer.meta.ResourceManager.HandleNodeUp(c.nodes[i])
|
suite.balancer.meta.ResourceManager.HandleNodeUp(c.nodes[i])
|
||||||
}
|
}
|
||||||
|
utils.RecoverAllCollection(balancer.meta)
|
||||||
|
|
||||||
segmentPlans, channelPlans := suite.getCollectionBalancePlans(balancer, 1)
|
segmentPlans, channelPlans := suite.getCollectionBalancePlans(balancer, 1)
|
||||||
assertSegmentAssignPlanElementMatch(&suite.Suite, c.expectPlans, segmentPlans)
|
assertSegmentAssignPlanElementMatch(&suite.Suite, c.expectPlans, segmentPlans)
|
||||||
assertChannelAssignPlanElementMatch(&suite.Suite, c.expectChannelPlans, channelPlans)
|
assertChannelAssignPlanElementMatch(&suite.Suite, c.expectChannelPlans, channelPlans)
|
||||||
|
|||||||
@ -141,8 +141,7 @@ func (b *ScoreBasedBalancer) hasEnoughBenefit(sourceNode *nodeItem, targetNode *
|
|||||||
|
|
||||||
func (b *ScoreBasedBalancer) convertToNodeItems(collectionID int64, nodeIDs []int64) []*nodeItem {
|
func (b *ScoreBasedBalancer) convertToNodeItems(collectionID int64, nodeIDs []int64) []*nodeItem {
|
||||||
ret := make([]*nodeItem, 0, len(nodeIDs))
|
ret := make([]*nodeItem, 0, len(nodeIDs))
|
||||||
for _, nodeInfo := range b.getNodes(nodeIDs) {
|
for _, node := range nodeIDs {
|
||||||
node := nodeInfo.ID()
|
|
||||||
priority := b.calculateScore(collectionID, node)
|
priority := b.calculateScore(collectionID, node)
|
||||||
nodeItem := newNodeItem(priority, node)
|
nodeItem := newNodeItem(priority, node)
|
||||||
ret = append(ret, &nodeItem)
|
ret = append(ret, &nodeItem)
|
||||||
@ -195,56 +194,38 @@ func (b *ScoreBasedBalancer) BalanceReplica(replica *meta.Replica) ([]SegmentAss
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
onlineNodes := make([]int64, 0)
|
rwNodes := replica.GetRWNodes()
|
||||||
offlineNodes := make([]int64, 0)
|
roNodes := replica.GetRONodes()
|
||||||
|
|
||||||
// read only nodes is offline in current replica.
|
if len(rwNodes) == 0 {
|
||||||
if replica.RONodesCount() > 0 {
|
|
||||||
// if node is stop or transfer to other rg
|
|
||||||
log.RatedInfo(10, "meet read only node, try to move out all segment/channel", zap.Int64s("node", replica.GetRONodes()))
|
|
||||||
offlineNodes = append(offlineNodes, replica.GetRONodes()...)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, nid := range replica.GetNodes() {
|
|
||||||
if isStopping, err := b.nodeManager.IsStoppingNode(nid); err != nil {
|
|
||||||
log.Info("not existed node", zap.Int64("nid", nid), zap.Error(err))
|
|
||||||
continue
|
|
||||||
} else if isStopping {
|
|
||||||
offlineNodes = append(offlineNodes, nid)
|
|
||||||
} else {
|
|
||||||
onlineNodes = append(onlineNodes, nid)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(onlineNodes) == 0 {
|
|
||||||
// no available nodes to balance
|
// no available nodes to balance
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// print current distribution before generating plans
|
// print current distribution before generating plans
|
||||||
segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
|
segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
|
||||||
if len(offlineNodes) != 0 {
|
if len(roNodes) != 0 {
|
||||||
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
|
||||||
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", offlineNodes))
|
log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", roNodes))
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("Handle stopping nodes",
|
log.Info("Handle stopping nodes",
|
||||||
zap.Any("stopping nodes", offlineNodes),
|
zap.Any("stopping nodes", roNodes),
|
||||||
zap.Any("available nodes", onlineNodes),
|
zap.Any("available nodes", rwNodes),
|
||||||
)
|
)
|
||||||
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
// handle stopped nodes here, have to assign segments on stopping nodes to nodes with the smallest score
|
||||||
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, onlineNodes, offlineNodes)...)
|
channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, rwNodes, roNodes)...)
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, onlineNodes, offlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, rwNodes, roNodes)...)
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
|
||||||
channelPlans = append(channelPlans, b.genChannelPlan(replica, onlineNodes)...)
|
channelPlans = append(channelPlans, b.genChannelPlan(replica, rwNodes)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(channelPlans) == 0 {
|
if len(channelPlans) == 0 {
|
||||||
segmentPlans = append(segmentPlans, b.genSegmentPlan(replica, onlineNodes)...)
|
segmentPlans = append(segmentPlans, b.genSegmentPlan(replica, rwNodes)...)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -439,6 +439,7 @@ func (suite *ScoreBasedBalancerTestSuite) TestBalanceOneRound() {
|
|||||||
suite.balancer.nodeManager.Add(nodeInfo)
|
suite.balancer.nodeManager.Add(nodeInfo)
|
||||||
suite.balancer.meta.ResourceManager.HandleNodeUp(c.nodes[i])
|
suite.balancer.meta.ResourceManager.HandleNodeUp(c.nodes[i])
|
||||||
}
|
}
|
||||||
|
utils.RecoverAllCollection(balancer.meta)
|
||||||
|
|
||||||
// 4. balance and verify result
|
// 4. balance and verify result
|
||||||
segmentPlans, channelPlans := suite.getCollectionBalancePlans(balancer, c.collectionID)
|
segmentPlans, channelPlans := suite.getCollectionBalancePlans(balancer, c.collectionID)
|
||||||
|
|||||||
@ -101,12 +101,8 @@ func (b *BalanceChecker) replicasToBalance() []int64 {
|
|||||||
}
|
}
|
||||||
replicas := b.meta.ReplicaManager.GetByCollection(cid)
|
replicas := b.meta.ReplicaManager.GetByCollection(cid)
|
||||||
for _, replica := range replicas {
|
for _, replica := range replicas {
|
||||||
for _, nodeID := range replica.GetNodes() {
|
if replica.RONodesCount() > 0 {
|
||||||
isStopping, _ := b.nodeManager.IsStoppingNode(nodeID)
|
stoppingReplicas = append(stoppingReplicas, replica.GetID())
|
||||||
if isStopping {
|
|
||||||
stoppingReplicas = append(stoppingReplicas, replica.GetID())
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -278,6 +278,14 @@ func (suite *BalanceCheckerTestSuite) TestStoppingBalance() {
|
|||||||
suite.targetMgr.UpdateCollectionNextTarget(int64(cid2))
|
suite.targetMgr.UpdateCollectionNextTarget(int64(cid2))
|
||||||
suite.targetMgr.UpdateCollectionCurrentTarget(int64(cid2))
|
suite.targetMgr.UpdateCollectionCurrentTarget(int64(cid2))
|
||||||
|
|
||||||
|
mr1 := replica1.CopyForWrite()
|
||||||
|
mr1.AddRONode(1)
|
||||||
|
suite.checker.meta.ReplicaManager.Put(mr1.IntoReplica())
|
||||||
|
|
||||||
|
mr2 := replica2.CopyForWrite()
|
||||||
|
mr2.AddRONode(1)
|
||||||
|
suite.checker.meta.ReplicaManager.Put(mr2.IntoReplica())
|
||||||
|
|
||||||
// test stopping balance
|
// test stopping balance
|
||||||
idsToBalance := []int64{int64(replicaID1), int64(replicaID2)}
|
idsToBalance := []int64{int64(replicaID1), int64(replicaID2)}
|
||||||
replicasToBalance := suite.checker.replicasToBalance()
|
replicasToBalance := suite.checker.replicasToBalance()
|
||||||
@ -348,6 +356,14 @@ func (suite *BalanceCheckerTestSuite) TestTargetNotReady() {
|
|||||||
suite.checker.meta.CollectionManager.PutCollection(collection2, partition2)
|
suite.checker.meta.CollectionManager.PutCollection(collection2, partition2)
|
||||||
suite.checker.meta.ReplicaManager.Put(replica2)
|
suite.checker.meta.ReplicaManager.Put(replica2)
|
||||||
|
|
||||||
|
mr1 := replica1.CopyForWrite()
|
||||||
|
mr1.AddRONode(1)
|
||||||
|
suite.checker.meta.ReplicaManager.Put(mr1.IntoReplica())
|
||||||
|
|
||||||
|
mr2 := replica2.CopyForWrite()
|
||||||
|
mr2.AddRONode(1)
|
||||||
|
suite.checker.meta.ReplicaManager.Put(mr2.IntoReplica())
|
||||||
|
|
||||||
// test stopping balance
|
// test stopping balance
|
||||||
idsToBalance := []int64{int64(replicaID1)}
|
idsToBalance := []int64{int64(replicaID1)}
|
||||||
replicasToBalance := suite.checker.replicasToBalance()
|
replicasToBalance := suite.checker.replicasToBalance()
|
||||||
|
|||||||
@ -130,7 +130,7 @@ func (c *ChannelChecker) getDmChannelDiff(collectionID int64,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
dist := c.getChannelDist(replica)
|
dist := c.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithReplica2Channel(replica))
|
||||||
distMap := typeutil.NewSet[string]()
|
distMap := typeutil.NewSet[string]()
|
||||||
for _, ch := range dist {
|
for _, ch := range dist {
|
||||||
distMap.Insert(ch.GetChannelName())
|
distMap.Insert(ch.GetChannelName())
|
||||||
@ -159,14 +159,6 @@ func (c *ChannelChecker) getDmChannelDiff(collectionID int64,
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *ChannelChecker) getChannelDist(replica *meta.Replica) []*meta.DmChannel {
|
|
||||||
dist := make([]*meta.DmChannel, 0)
|
|
||||||
for _, nodeID := range replica.GetNodes() {
|
|
||||||
dist = append(dist, c.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithNodeID2Channel(nodeID))...)
|
|
||||||
}
|
|
||||||
return dist
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *ChannelChecker) findRepeatedChannels(ctx context.Context, replicaID int64) []*meta.DmChannel {
|
func (c *ChannelChecker) findRepeatedChannels(ctx context.Context, replicaID int64) []*meta.DmChannel {
|
||||||
log := log.Ctx(ctx).WithRateGroup("ChannelChecker.findRepeatedChannels", 1, 60)
|
log := log.Ctx(ctx).WithRateGroup("ChannelChecker.findRepeatedChannels", 1, 60)
|
||||||
replica := c.meta.Get(replicaID)
|
replica := c.meta.Get(replicaID)
|
||||||
@ -176,7 +168,7 @@ func (c *ChannelChecker) findRepeatedChannels(ctx context.Context, replicaID int
|
|||||||
log.Info("replica does not exist, skip it")
|
log.Info("replica does not exist, skip it")
|
||||||
return ret
|
return ret
|
||||||
}
|
}
|
||||||
dist := c.getChannelDist(replica)
|
dist := c.dist.ChannelDistManager.GetByCollectionAndFilter(replica.GetCollectionID(), meta.WithReplica2Channel(replica))
|
||||||
|
|
||||||
targets := c.targetMgr.GetSealedSegmentsByCollection(replica.GetCollectionID(), meta.CurrentTarget)
|
targets := c.targetMgr.GetSealedSegmentsByCollection(replica.GetCollectionID(), meta.CurrentTarget)
|
||||||
versionsMap := make(map[string]*meta.DmChannel)
|
versionsMap := make(map[string]*meta.DmChannel)
|
||||||
@ -221,7 +213,7 @@ func (c *ChannelChecker) createChannelLoadTask(ctx context.Context, channels []*
|
|||||||
for _, ch := range channels {
|
for _, ch := range channels {
|
||||||
rwNodes := replica.GetChannelRWNodes(ch.GetChannelName())
|
rwNodes := replica.GetChannelRWNodes(ch.GetChannelName())
|
||||||
if len(rwNodes) == 0 {
|
if len(rwNodes) == 0 {
|
||||||
rwNodes = replica.GetNodes()
|
rwNodes = replica.GetRWNodes()
|
||||||
}
|
}
|
||||||
plan := c.balancer.AssignChannel([]*meta.DmChannel{ch}, rwNodes, false)
|
plan := c.balancer.AssignChannel([]*meta.DmChannel{ch}, rwNodes, false)
|
||||||
plans = append(plans, plan...)
|
plans = append(plans, plan...)
|
||||||
|
|||||||
@ -102,16 +102,17 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
|
|||||||
)
|
)
|
||||||
var tasks []task.Task
|
var tasks []task.Task
|
||||||
|
|
||||||
segments := c.getSealedSegmentsDist(replica)
|
segments := c.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithReplica(replica))
|
||||||
idSegments := make(map[int64]*meta.Segment)
|
idSegments := make(map[int64]*meta.Segment)
|
||||||
|
|
||||||
|
roNodeSet := typeutil.NewUniqueSet(replica.GetRONodes()...)
|
||||||
targets := make(map[int64][]int64) // segmentID => FieldID
|
targets := make(map[int64][]int64) // segmentID => FieldID
|
||||||
for _, segment := range segments {
|
for _, segment := range segments {
|
||||||
// skip update index in stopping node
|
// skip update index in read only node
|
||||||
if ok, _ := c.nodeMgr.IsStoppingNode(segment.Node); ok {
|
if roNodeSet.Contain(segment.Node) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
missing := c.checkSegment(ctx, segment, indexInfos)
|
missing := c.checkSegment(segment, indexInfos)
|
||||||
if len(missing) > 0 {
|
if len(missing) > 0 {
|
||||||
targets[segment.GetID()] = missing
|
targets[segment.GetID()] = missing
|
||||||
idSegments[segment.GetID()] = segment
|
idSegments[segment.GetID()] = segment
|
||||||
@ -142,7 +143,7 @@ func (c *IndexChecker) checkReplica(ctx context.Context, collection *meta.Collec
|
|||||||
return tasks
|
return tasks
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *IndexChecker) checkSegment(ctx context.Context, segment *meta.Segment, indexInfos []*indexpb.IndexInfo) (fieldIDs []int64) {
|
func (c *IndexChecker) checkSegment(segment *meta.Segment, indexInfos []*indexpb.IndexInfo) (fieldIDs []int64) {
|
||||||
var result []int64
|
var result []int64
|
||||||
for _, indexInfo := range indexInfos {
|
for _, indexInfo := range indexInfos {
|
||||||
fieldID, indexID := indexInfo.FieldID, indexInfo.IndexID
|
fieldID, indexID := indexInfo.FieldID, indexInfo.IndexID
|
||||||
@ -158,14 +159,6 @@ func (c *IndexChecker) checkSegment(ctx context.Context, segment *meta.Segment,
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *IndexChecker) getSealedSegmentsDist(replica *meta.Replica) []*meta.Segment {
|
|
||||||
var ret []*meta.Segment
|
|
||||||
for _, node := range replica.GetNodes() {
|
|
||||||
ret = append(ret, c.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))...)
|
|
||||||
}
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *IndexChecker) createSegmentUpdateTask(ctx context.Context, segment *meta.Segment, replica *meta.Replica) (task.Task, bool) {
|
func (c *IndexChecker) createSegmentUpdateTask(ctx context.Context, segment *meta.Segment, replica *meta.Replica) (task.Task, bool) {
|
||||||
action := task.NewSegmentActionWithScope(segment.Node, task.ActionTypeUpdate, segment.GetInsertChannel(), segment.GetID(), querypb.DataScope_Historical)
|
action := task.NewSegmentActionWithScope(segment.Node, task.ActionTypeUpdate, segment.GetInsertChannel(), segment.GetID(), querypb.DataScope_Historical)
|
||||||
t, err := task.NewSegmentTask(
|
t, err := task.NewSegmentTask(
|
||||||
|
|||||||
@ -134,9 +134,12 @@ func (suite *IndexCheckerSuite) TestLoadIndex() {
|
|||||||
suite.Equal(task.ActionTypeUpdate, action.Type())
|
suite.Equal(task.ActionTypeUpdate, action.Type())
|
||||||
suite.EqualValues(2, action.SegmentID())
|
suite.EqualValues(2, action.SegmentID())
|
||||||
|
|
||||||
// test skip load index for stopping node
|
// test skip load index for read only node
|
||||||
suite.nodeMgr.Stopping(1)
|
suite.nodeMgr.Stopping(1)
|
||||||
suite.nodeMgr.Stopping(2)
|
suite.nodeMgr.Stopping(2)
|
||||||
|
suite.meta.ResourceManager.HandleNodeStopping(1)
|
||||||
|
suite.meta.ResourceManager.HandleNodeStopping(2)
|
||||||
|
utils.RecoverAllCollection(suite.meta)
|
||||||
tasks = checker.Check(context.Background())
|
tasks = checker.Check(context.Background())
|
||||||
suite.Require().Len(tasks, 0)
|
suite.Require().Len(tasks, 0)
|
||||||
}
|
}
|
||||||
|
|||||||
@ -93,12 +93,7 @@ func (c *LeaderChecker) Check(ctx context.Context) []task.Task {
|
|||||||
|
|
||||||
replicas := c.meta.ReplicaManager.GetByCollection(collectionID)
|
replicas := c.meta.ReplicaManager.GetByCollection(collectionID)
|
||||||
for _, replica := range replicas {
|
for _, replica := range replicas {
|
||||||
for _, node := range replica.GetNodes() {
|
for _, node := range replica.GetRWNodes() {
|
||||||
if ok, _ := c.nodeMgr.IsStoppingNode(node); ok {
|
|
||||||
// no need to correct leader's view which is loaded on stopping node
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
leaderViews := c.dist.LeaderViewManager.GetByFilter(meta.WithCollectionID2LeaderView(replica.GetCollectionID()), meta.WithNodeID2LeaderView(node))
|
leaderViews := c.dist.LeaderViewManager.GetByFilter(meta.WithCollectionID2LeaderView(replica.GetCollectionID()), meta.WithNodeID2LeaderView(node))
|
||||||
for _, leaderView := range leaderViews {
|
for _, leaderView := range leaderViews {
|
||||||
dist := c.dist.SegmentDistManager.GetByFilter(meta.WithChannel(leaderView.Channel), meta.WithReplica(replica))
|
dist := c.dist.SegmentDistManager.GetByFilter(meta.WithChannel(leaderView.Channel), meta.WithReplica(replica))
|
||||||
|
|||||||
@ -237,7 +237,8 @@ func (suite *LeaderCheckerTestSuite) TestStoppingNode() {
|
|||||||
observer := suite.checker
|
observer := suite.checker
|
||||||
observer.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1))
|
observer.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1))
|
||||||
observer.meta.CollectionManager.PutPartition(utils.CreateTestPartition(1, 1))
|
observer.meta.CollectionManager.PutPartition(utils.CreateTestPartition(1, 1))
|
||||||
observer.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, []int64{1, 2}))
|
replica := utils.CreateTestReplica(1, 1, []int64{1, 2})
|
||||||
|
observer.meta.ReplicaManager.Put(replica)
|
||||||
segments := []*datapb.SegmentInfo{
|
segments := []*datapb.SegmentInfo{
|
||||||
{
|
{
|
||||||
ID: 1,
|
ID: 1,
|
||||||
@ -261,12 +262,9 @@ func (suite *LeaderCheckerTestSuite) TestStoppingNode() {
|
|||||||
view.TargetVersion = observer.target.GetCollectionTargetVersion(1, meta.CurrentTarget)
|
view.TargetVersion = observer.target.GetCollectionTargetVersion(1, meta.CurrentTarget)
|
||||||
observer.dist.LeaderViewManager.Update(2, view)
|
observer.dist.LeaderViewManager.Update(2, view)
|
||||||
|
|
||||||
suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{
|
mutableReplica := replica.CopyForWrite()
|
||||||
NodeID: 2,
|
mutableReplica.AddRONode(2)
|
||||||
Address: "localhost",
|
observer.meta.ReplicaManager.Put(mutableReplica.IntoReplica())
|
||||||
Hostname: "localhost",
|
|
||||||
}))
|
|
||||||
suite.nodeMgr.Stopping(2)
|
|
||||||
|
|
||||||
tasks := suite.checker.Check(context.TODO())
|
tasks := suite.checker.Check(context.TODO())
|
||||||
suite.Len(tasks, 0)
|
suite.Len(tasks, 0)
|
||||||
|
|||||||
@ -204,7 +204,7 @@ func (c *SegmentChecker) getSealedSegmentDiff(
|
|||||||
log.Info("replica does not exist, skip it")
|
log.Info("replica does not exist, skip it")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
dist := c.getSealedSegmentsDist(replica)
|
dist := c.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithReplica(replica))
|
||||||
sort.Slice(dist, func(i, j int) bool {
|
sort.Slice(dist, func(i, j int) bool {
|
||||||
return dist[i].Version < dist[j].Version
|
return dist[i].Version < dist[j].Version
|
||||||
})
|
})
|
||||||
@ -293,14 +293,6 @@ func (c *SegmentChecker) getSealedSegmentDiff(
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *SegmentChecker) getSealedSegmentsDist(replica *meta.Replica) []*meta.Segment {
|
|
||||||
ret := make([]*meta.Segment, 0)
|
|
||||||
for _, node := range replica.GetNodes() {
|
|
||||||
ret = append(ret, c.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))...)
|
|
||||||
}
|
|
||||||
return ret
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c *SegmentChecker) findRepeatedSealedSegments(replicaID int64) []*meta.Segment {
|
func (c *SegmentChecker) findRepeatedSealedSegments(replicaID int64) []*meta.Segment {
|
||||||
segments := make([]*meta.Segment, 0)
|
segments := make([]*meta.Segment, 0)
|
||||||
replica := c.meta.Get(replicaID)
|
replica := c.meta.Get(replicaID)
|
||||||
@ -308,7 +300,7 @@ func (c *SegmentChecker) findRepeatedSealedSegments(replicaID int64) []*meta.Seg
|
|||||||
log.Info("replica does not exist, skip it")
|
log.Info("replica does not exist, skip it")
|
||||||
return segments
|
return segments
|
||||||
}
|
}
|
||||||
dist := c.getSealedSegmentsDist(replica)
|
dist := c.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithReplica(replica))
|
||||||
versions := make(map[int64]*meta.Segment)
|
versions := make(map[int64]*meta.Segment)
|
||||||
for _, s := range dist {
|
for _, s := range dist {
|
||||||
// l0 segment should be release with channel together
|
// l0 segment should be release with channel together
|
||||||
@ -398,25 +390,12 @@ func (c *SegmentChecker) createSegmentLoadTasks(ctx context.Context, segments []
|
|||||||
|
|
||||||
rwNodes := replica.GetChannelRWNodes(shard)
|
rwNodes := replica.GetChannelRWNodes(shard)
|
||||||
if len(rwNodes) == 0 {
|
if len(rwNodes) == 0 {
|
||||||
rwNodes = replica.GetNodes()
|
rwNodes = replica.GetRWNodes()
|
||||||
}
|
|
||||||
|
|
||||||
// filter out stopping nodes.
|
|
||||||
availableNodes := lo.Filter(rwNodes, func(node int64, _ int) bool {
|
|
||||||
stop, err := c.nodeMgr.IsStoppingNode(node)
|
|
||||||
if err != nil {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return !stop
|
|
||||||
})
|
|
||||||
|
|
||||||
if len(availableNodes) == 0 {
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// L0 segment can only be assign to shard leader's node
|
// L0 segment can only be assign to shard leader's node
|
||||||
if isLevel0 {
|
if isLevel0 {
|
||||||
availableNodes = []int64{leader.ID}
|
rwNodes = []int64{leader.ID}
|
||||||
}
|
}
|
||||||
|
|
||||||
segmentInfos := lo.Map(segments, func(s *datapb.SegmentInfo, _ int) *meta.Segment {
|
segmentInfos := lo.Map(segments, func(s *datapb.SegmentInfo, _ int) *meta.Segment {
|
||||||
@ -424,7 +403,7 @@ func (c *SegmentChecker) createSegmentLoadTasks(ctx context.Context, segments []
|
|||||||
SegmentInfo: s,
|
SegmentInfo: s,
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
shardPlans := c.balancer.AssignSegment(replica.GetCollectionID(), segmentInfos, availableNodes, false)
|
shardPlans := c.balancer.AssignSegment(replica.GetCollectionID(), segmentInfos, rwNodes, false)
|
||||||
for i := range shardPlans {
|
for i := range shardPlans {
|
||||||
shardPlans[i].Replica = replica
|
shardPlans[i].Replica = replica
|
||||||
}
|
}
|
||||||
|
|||||||
@ -46,7 +46,7 @@ import (
|
|||||||
func (s *Server) checkAnyReplicaAvailable(collectionID int64) bool {
|
func (s *Server) checkAnyReplicaAvailable(collectionID int64) bool {
|
||||||
for _, replica := range s.meta.ReplicaManager.GetByCollection(collectionID) {
|
for _, replica := range s.meta.ReplicaManager.GetByCollection(collectionID) {
|
||||||
isAvailable := true
|
isAvailable := true
|
||||||
for _, node := range replica.GetNodes() {
|
for _, node := range replica.GetRONodes() {
|
||||||
if s.nodeMgr.Get(node) == nil {
|
if s.nodeMgr.Get(node) == nil {
|
||||||
isAvailable = false
|
isAvailable = false
|
||||||
break
|
break
|
||||||
|
|||||||
@ -159,16 +159,12 @@ func (job *LoadCollectionJob) Execute() error {
|
|||||||
|
|
||||||
// API of LoadCollection is wired, we should use map[resourceGroupNames]replicaNumber as input, to keep consistency with `TransferReplica` API.
|
// API of LoadCollection is wired, we should use map[resourceGroupNames]replicaNumber as input, to keep consistency with `TransferReplica` API.
|
||||||
// Then we can implement dynamic replica changed in different resource group independently.
|
// Then we can implement dynamic replica changed in different resource group independently.
|
||||||
replicas, err = utils.SpawnReplicasWithRG(job.meta, req.GetCollectionID(), req.GetResourceGroups(), req.GetReplicaNumber(), collectionInfo.GetVirtualChannelNames())
|
_, err = utils.SpawnReplicasWithRG(job.meta, req.GetCollectionID(), req.GetResourceGroups(), req.GetReplicaNumber(), collectionInfo.GetVirtualChannelNames())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
msg := "failed to spawn replica for collection"
|
msg := "failed to spawn replica for collection"
|
||||||
log.Warn(msg, zap.Error(err))
|
log.Warn(msg, zap.Error(err))
|
||||||
return errors.Wrap(err, msg)
|
return errors.Wrap(err, msg)
|
||||||
}
|
}
|
||||||
for _, replica := range replicas {
|
|
||||||
log.Info("replica created", zap.Int64("replicaID", replica.GetID()),
|
|
||||||
zap.Int64s("nodes", replica.GetNodes()), zap.String("resourceGroup", replica.GetResourceGroup()))
|
|
||||||
}
|
|
||||||
job.undo.IsReplicaCreated = true
|
job.undo.IsReplicaCreated = true
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -346,16 +342,12 @@ func (job *LoadPartitionJob) Execute() error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
replicas, err = utils.SpawnReplicasWithRG(job.meta, req.GetCollectionID(), req.GetResourceGroups(), req.GetReplicaNumber(), collectionInfo.GetVirtualChannelNames())
|
_, err = utils.SpawnReplicasWithRG(job.meta, req.GetCollectionID(), req.GetResourceGroups(), req.GetReplicaNumber(), collectionInfo.GetVirtualChannelNames())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
msg := "failed to spawn replica for collection"
|
msg := "failed to spawn replica for collection"
|
||||||
log.Warn(msg, zap.Error(err))
|
log.Warn(msg, zap.Error(err))
|
||||||
return errors.Wrap(err, msg)
|
return errors.Wrap(err, msg)
|
||||||
}
|
}
|
||||||
for _, replica := range replicas {
|
|
||||||
log.Info("replica created", zap.Int64("replicaID", replica.GetID()),
|
|
||||||
zap.Int64s("nodes", replica.GetNodes()), zap.String("resourceGroup", replica.GetResourceGroup()))
|
|
||||||
}
|
|
||||||
job.undo.IsReplicaCreated = true
|
job.undo.IsReplicaCreated = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -39,7 +39,7 @@ func NewReplica(replica *querypb.Replica, nodes ...typeutil.UniqueSet) *Replica
|
|||||||
}
|
}
|
||||||
|
|
||||||
// newReplica creates a new replica from pb.
|
// newReplica creates a new replica from pb.
|
||||||
func newReplica(replica *querypb.Replica, channels ...string) *Replica {
|
func newReplica(replica *querypb.Replica) *Replica {
|
||||||
return &Replica{
|
return &Replica{
|
||||||
replicaPB: proto.Clone(replica).(*querypb.Replica),
|
replicaPB: proto.Clone(replica).(*querypb.Replica),
|
||||||
rwNodes: typeutil.NewUniqueSet(replica.Nodes...),
|
rwNodes: typeutil.NewUniqueSet(replica.Nodes...),
|
||||||
@ -65,7 +65,10 @@ func (replica *Replica) GetResourceGroup() string {
|
|||||||
// GetNodes returns the rw nodes of the replica.
|
// GetNodes returns the rw nodes of the replica.
|
||||||
// readonly, don't modify the returned slice.
|
// readonly, don't modify the returned slice.
|
||||||
func (replica *Replica) GetNodes() []int64 {
|
func (replica *Replica) GetNodes() []int64 {
|
||||||
return replica.replicaPB.GetNodes()
|
nodes := make([]int64, 0)
|
||||||
|
nodes = append(nodes, replica.replicaPB.GetRoNodes()...)
|
||||||
|
nodes = append(nodes, replica.replicaPB.GetNodes()...)
|
||||||
|
return nodes
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetRONodes returns the ro nodes of the replica.
|
// GetRONodes returns the ro nodes of the replica.
|
||||||
@ -74,6 +77,12 @@ func (replica *Replica) GetRONodes() []int64 {
|
|||||||
return replica.replicaPB.GetRoNodes()
|
return replica.replicaPB.GetRoNodes()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetRONodes returns the rw nodes of the replica.
|
||||||
|
// readonly, don't modify the returned slice.
|
||||||
|
func (replica *Replica) GetRWNodes() []int64 {
|
||||||
|
return replica.replicaPB.GetNodes()
|
||||||
|
}
|
||||||
|
|
||||||
// RangeOverRWNodes iterates over the read and write nodes of the replica.
|
// RangeOverRWNodes iterates over the read and write nodes of the replica.
|
||||||
func (replica *Replica) RangeOverRWNodes(f func(node int64) bool) {
|
func (replica *Replica) RangeOverRWNodes(f func(node int64) bool) {
|
||||||
replica.rwNodes.Range(f)
|
replica.rwNodes.Range(f)
|
||||||
@ -131,8 +140,8 @@ func (replica *Replica) GetChannelRWNodes(channelName string) []int64 {
|
|||||||
return replica.replicaPB.ChannelNodeInfos[channelName].GetRwNodes()
|
return replica.replicaPB.ChannelNodeInfos[channelName].GetRwNodes()
|
||||||
}
|
}
|
||||||
|
|
||||||
// copyForWrite returns a mutable replica for write operations.
|
// CopyForWrite returns a mutable replica for write operations.
|
||||||
func (replica *Replica) copyForWrite() *mutableReplica {
|
func (replica *Replica) CopyForWrite() *mutableReplica {
|
||||||
exclusiveRWNodeToChannel := make(map[int64]string)
|
exclusiveRWNodeToChannel := make(map[int64]string)
|
||||||
for name, channelNodeInfo := range replica.replicaPB.GetChannelNodeInfos() {
|
for name, channelNodeInfo := range replica.replicaPB.GetChannelNodeInfos() {
|
||||||
for _, nodeID := range channelNodeInfo.GetRwNodes() {
|
for _, nodeID := range channelNodeInfo.GetRwNodes() {
|
||||||
|
|||||||
@ -195,7 +195,7 @@ func (m *ReplicaManager) TransferReplica(collectionID typeutil.UniqueID, srcRGNa
|
|||||||
// Node Change will be executed by replica_observer in background.
|
// Node Change will be executed by replica_observer in background.
|
||||||
replicas := make([]*Replica, 0, replicaNum)
|
replicas := make([]*Replica, 0, replicaNum)
|
||||||
for i := 0; i < replicaNum; i++ {
|
for i := 0; i < replicaNum; i++ {
|
||||||
mutableReplica := srcReplicas[i].copyForWrite()
|
mutableReplica := srcReplicas[i].CopyForWrite()
|
||||||
mutableReplica.SetResourceGroup(dstRGName)
|
mutableReplica.SetResourceGroup(dstRGName)
|
||||||
replicas = append(replicas, mutableReplica.IntoReplica())
|
replicas = append(replicas, mutableReplica.IntoReplica())
|
||||||
}
|
}
|
||||||
@ -350,7 +350,7 @@ func (m *ReplicaManager) RecoverNodesInCollection(collectionID typeutil.UniqueID
|
|||||||
// nothing to do.
|
// nothing to do.
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
mutableReplica := m.replicas[assignment.GetReplicaID()].copyForWrite()
|
mutableReplica := m.replicas[assignment.GetReplicaID()].CopyForWrite()
|
||||||
mutableReplica.AddRONode(roNodes...) // rw -> ro
|
mutableReplica.AddRONode(roNodes...) // rw -> ro
|
||||||
mutableReplica.AddRWNode(recoverableNodes...) // ro -> rw
|
mutableReplica.AddRWNode(recoverableNodes...) // ro -> rw
|
||||||
mutableReplica.AddRWNode(incomingNode...) // unused -> rw
|
mutableReplica.AddRWNode(incomingNode...) // unused -> rw
|
||||||
@ -414,7 +414,7 @@ func (m *ReplicaManager) RemoveNode(replicaID typeutil.UniqueID, nodes ...typeut
|
|||||||
return merr.WrapErrReplicaNotFound(replicaID)
|
return merr.WrapErrReplicaNotFound(replicaID)
|
||||||
}
|
}
|
||||||
|
|
||||||
mutableReplica := replica.copyForWrite()
|
mutableReplica := replica.CopyForWrite()
|
||||||
mutableReplica.RemoveNode(nodes...) // ro -> unused
|
mutableReplica.RemoveNode(nodes...) // ro -> unused
|
||||||
return m.put(mutableReplica.IntoReplica())
|
return m.put(mutableReplica.IntoReplica())
|
||||||
}
|
}
|
||||||
|
|||||||
@ -30,13 +30,13 @@ func (suite *ReplicaSuite) TestReadOperations() {
|
|||||||
r := newReplica(suite.replicaPB)
|
r := newReplica(suite.replicaPB)
|
||||||
suite.testRead(r)
|
suite.testRead(r)
|
||||||
// keep same after clone.
|
// keep same after clone.
|
||||||
mutableReplica := r.copyForWrite()
|
mutableReplica := r.CopyForWrite()
|
||||||
suite.testRead(mutableReplica.IntoReplica())
|
suite.testRead(mutableReplica.IntoReplica())
|
||||||
}
|
}
|
||||||
|
|
||||||
func (suite *ReplicaSuite) TestClone() {
|
func (suite *ReplicaSuite) TestClone() {
|
||||||
r := newReplica(suite.replicaPB)
|
r := newReplica(suite.replicaPB)
|
||||||
r2 := r.copyForWrite()
|
r2 := r.CopyForWrite()
|
||||||
suite.testRead(r)
|
suite.testRead(r)
|
||||||
|
|
||||||
// after apply write operation on copy, the original should not be affected.
|
// after apply write operation on copy, the original should not be affected.
|
||||||
@ -68,7 +68,7 @@ func (suite *ReplicaSuite) TestRange() {
|
|||||||
})
|
})
|
||||||
suite.Equal(1, count)
|
suite.Equal(1, count)
|
||||||
|
|
||||||
mr := r.copyForWrite()
|
mr := r.CopyForWrite()
|
||||||
mr.AddRONode(1)
|
mr.AddRONode(1)
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
@ -81,7 +81,7 @@ func (suite *ReplicaSuite) TestRange() {
|
|||||||
|
|
||||||
func (suite *ReplicaSuite) TestWriteOperation() {
|
func (suite *ReplicaSuite) TestWriteOperation() {
|
||||||
r := newReplica(suite.replicaPB)
|
r := newReplica(suite.replicaPB)
|
||||||
mr := r.copyForWrite()
|
mr := r.CopyForWrite()
|
||||||
|
|
||||||
// test add available node.
|
// test add available node.
|
||||||
suite.False(mr.Contains(5))
|
suite.False(mr.Contains(5))
|
||||||
@ -158,7 +158,7 @@ func (suite *ReplicaSuite) testRead(r *Replica) {
|
|||||||
suite.Equal(suite.replicaPB.GetResourceGroup(), r.GetResourceGroup())
|
suite.Equal(suite.replicaPB.GetResourceGroup(), r.GetResourceGroup())
|
||||||
|
|
||||||
// Test GetNodes()
|
// Test GetNodes()
|
||||||
suite.ElementsMatch(suite.replicaPB.GetNodes(), r.GetNodes())
|
suite.ElementsMatch(suite.replicaPB.GetNodes(), r.GetRWNodes())
|
||||||
|
|
||||||
// Test GetRONodes()
|
// Test GetRONodes()
|
||||||
suite.ElementsMatch(suite.replicaPB.GetRoNodes(), r.GetRONodes())
|
suite.ElementsMatch(suite.replicaPB.GetRoNodes(), r.GetRONodes())
|
||||||
@ -195,7 +195,7 @@ func (suite *ReplicaSuite) TestChannelExclusiveMode() {
|
|||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
mutableReplica := r.copyForWrite()
|
mutableReplica := r.CopyForWrite()
|
||||||
// add 10 rw nodes, exclusive mode is false.
|
// add 10 rw nodes, exclusive mode is false.
|
||||||
for i := 0; i < 10; i++ {
|
for i := 0; i < 10; i++ {
|
||||||
mutableReplica.AddRWNode(int64(i))
|
mutableReplica.AddRWNode(int64(i))
|
||||||
@ -205,7 +205,7 @@ func (suite *ReplicaSuite) TestChannelExclusiveMode() {
|
|||||||
suite.Equal(0, len(channelNodeInfo.GetRwNodes()))
|
suite.Equal(0, len(channelNodeInfo.GetRwNodes()))
|
||||||
}
|
}
|
||||||
|
|
||||||
mutableReplica = r.copyForWrite()
|
mutableReplica = r.CopyForWrite()
|
||||||
// add 10 rw nodes, exclusive mode is true.
|
// add 10 rw nodes, exclusive mode is true.
|
||||||
for i := 10; i < 20; i++ {
|
for i := 10; i < 20; i++ {
|
||||||
mutableReplica.AddRWNode(int64(i))
|
mutableReplica.AddRWNode(int64(i))
|
||||||
@ -216,7 +216,7 @@ func (suite *ReplicaSuite) TestChannelExclusiveMode() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 4 node become read only, exclusive mode still be true
|
// 4 node become read only, exclusive mode still be true
|
||||||
mutableReplica = r.copyForWrite()
|
mutableReplica = r.CopyForWrite()
|
||||||
for i := 0; i < 4; i++ {
|
for i := 0; i < 4; i++ {
|
||||||
mutableReplica.AddRONode(int64(i))
|
mutableReplica.AddRONode(int64(i))
|
||||||
}
|
}
|
||||||
@ -226,7 +226,7 @@ func (suite *ReplicaSuite) TestChannelExclusiveMode() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// 4 node has been removed, exclusive mode back to false
|
// 4 node has been removed, exclusive mode back to false
|
||||||
mutableReplica = r.copyForWrite()
|
mutableReplica = r.CopyForWrite()
|
||||||
for i := 4; i < 8; i++ {
|
for i := 4; i < 8; i++ {
|
||||||
mutableReplica.RemoveNode(int64(i))
|
mutableReplica.RemoveNode(int64(i))
|
||||||
}
|
}
|
||||||
|
|||||||
@ -453,7 +453,6 @@ func (rm *ResourceManager) HandleNodeDown(node int64) {
|
|||||||
rm.rwmutex.Lock()
|
rm.rwmutex.Lock()
|
||||||
defer rm.rwmutex.Unlock()
|
defer rm.rwmutex.Unlock()
|
||||||
|
|
||||||
// failure of node down can be ignored, node down can be done by `RemoveAllDownNode`.
|
|
||||||
rm.incomingNode.Remove(node)
|
rm.incomingNode.Remove(node)
|
||||||
|
|
||||||
// for stopping query node becomes offline, node change won't be triggered,
|
// for stopping query node becomes offline, node change won't be triggered,
|
||||||
@ -470,6 +469,19 @@ func (rm *ResourceManager) HandleNodeDown(node int64) {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (rm *ResourceManager) HandleNodeStopping(node int64) {
|
||||||
|
rm.rwmutex.Lock()
|
||||||
|
defer rm.rwmutex.Unlock()
|
||||||
|
|
||||||
|
rm.incomingNode.Remove(node)
|
||||||
|
rgName, err := rm.unassignNode(node)
|
||||||
|
log.Info("HandleNodeStopping: remove node from resource group",
|
||||||
|
zap.String("rgName", rgName),
|
||||||
|
zap.Int64("node", node),
|
||||||
|
zap.Error(err),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// ListenResourceGroupChanged return a listener for resource group changed.
|
// ListenResourceGroupChanged return a listener for resource group changed.
|
||||||
func (rm *ResourceManager) ListenResourceGroupChanged() *syncutil.VersionedListener {
|
func (rm *ResourceManager) ListenResourceGroupChanged() *syncutil.VersionedListener {
|
||||||
return rm.rgChangedNotifier.Listen(syncutil.VersionedListenAtEarliest)
|
return rm.rgChangedNotifier.Listen(syncutil.VersionedListenAtEarliest)
|
||||||
@ -495,25 +507,6 @@ func (rm *ResourceManager) AssignPendingIncomingNode() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// RemoveAllDownNode remove all down node from resource group.
|
|
||||||
func (rm *ResourceManager) RemoveAllDownNode() {
|
|
||||||
rm.rwmutex.Lock()
|
|
||||||
defer rm.rwmutex.Unlock()
|
|
||||||
|
|
||||||
for nodeID := range rm.nodeIDMap {
|
|
||||||
if node := rm.nodeMgr.Get(nodeID); node == nil || node.IsStoppingState() {
|
|
||||||
// unassignNode failure can be skip.
|
|
||||||
rgName, err := rm.unassignNode(nodeID)
|
|
||||||
log.Info("remove down node from resource group",
|
|
||||||
zap.Bool("nodeExist", node != nil),
|
|
||||||
zap.Int64("nodeID", nodeID),
|
|
||||||
zap.String("rgName", rgName),
|
|
||||||
zap.Error(err),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// AutoRecoverResourceGroup auto recover rg, return recover used node num
|
// AutoRecoverResourceGroup auto recover rg, return recover used node num
|
||||||
func (rm *ResourceManager) AutoRecoverResourceGroup(rgName string) error {
|
func (rm *ResourceManager) AutoRecoverResourceGroup(rgName string) error {
|
||||||
rm.rwmutex.Lock()
|
rm.rwmutex.Lock()
|
||||||
@ -847,7 +840,8 @@ func (rm *ResourceManager) unassignNode(node int64) (string, error) {
|
|||||||
rm.nodeChangedNotifier.NotifyAll()
|
rm.nodeChangedNotifier.NotifyAll()
|
||||||
return rg.GetName(), nil
|
return rg.GetName(), nil
|
||||||
}
|
}
|
||||||
return "", nil
|
|
||||||
|
return "", errors.Errorf("node %d not found in any resource group", node)
|
||||||
}
|
}
|
||||||
|
|
||||||
// validateResourceGroupConfig validate resource group config.
|
// validateResourceGroupConfig validate resource group config.
|
||||||
|
|||||||
@ -524,16 +524,6 @@ func (suite *ResourceManagerSuite) TestAutoRecover() {
|
|||||||
suite.Equal(80, suite.manager.GetResourceGroup("rg2").NodeNum())
|
suite.Equal(80, suite.manager.GetResourceGroup("rg2").NodeNum())
|
||||||
suite.Equal(5, suite.manager.GetResourceGroup("rg3").NodeNum())
|
suite.Equal(5, suite.manager.GetResourceGroup("rg3").NodeNum())
|
||||||
suite.Equal(5, suite.manager.GetResourceGroup(DefaultResourceGroupName).NodeNum())
|
suite.Equal(5, suite.manager.GetResourceGroup(DefaultResourceGroupName).NodeNum())
|
||||||
|
|
||||||
// Test down all nodes.
|
|
||||||
for i := 1; i <= 100; i++ {
|
|
||||||
suite.manager.nodeMgr.Remove(int64(i))
|
|
||||||
}
|
|
||||||
suite.manager.RemoveAllDownNode()
|
|
||||||
suite.Zero(suite.manager.GetResourceGroup("rg1").NodeNum())
|
|
||||||
suite.Zero(suite.manager.GetResourceGroup("rg2").NodeNum())
|
|
||||||
suite.Zero(suite.manager.GetResourceGroup("rg3").NodeNum())
|
|
||||||
suite.Zero(suite.manager.GetResourceGroup(DefaultResourceGroupName).NodeNum())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (suite *ResourceManagerSuite) testTransferNode() {
|
func (suite *ResourceManagerSuite) testTransferNode() {
|
||||||
|
|||||||
@ -100,6 +100,7 @@ func (ob *ReplicaObserver) checkNodesInReplica() {
|
|||||||
replicas := ob.meta.ReplicaManager.GetByCollection(collectionID)
|
replicas := ob.meta.ReplicaManager.GetByCollection(collectionID)
|
||||||
for _, replica := range replicas {
|
for _, replica := range replicas {
|
||||||
roNodes := replica.GetRONodes()
|
roNodes := replica.GetRONodes()
|
||||||
|
rwNodes := replica.GetRWNodes()
|
||||||
if len(roNodes) == 0 {
|
if len(roNodes) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -124,7 +125,7 @@ func (ob *ReplicaObserver) checkNodesInReplica() {
|
|||||||
zap.Int64("replicaID", replica.GetID()),
|
zap.Int64("replicaID", replica.GetID()),
|
||||||
zap.Int64s("removedNodes", removeNodes),
|
zap.Int64s("removedNodes", removeNodes),
|
||||||
zap.Int64s("roNodes", roNodes),
|
zap.Int64s("roNodes", roNodes),
|
||||||
zap.Int64s("availableNodes", replica.GetNodes()),
|
zap.Int64s("rwNodes", rwNodes),
|
||||||
)
|
)
|
||||||
if err := ob.meta.ReplicaManager.RemoveNode(replica.GetID(), removeNodes...); err != nil {
|
if err := ob.meta.ReplicaManager.RemoveNode(replica.GetID(), removeNodes...); err != nil {
|
||||||
logger.Warn("fail to remove node from replica", zap.Error(err))
|
logger.Warn("fail to remove node from replica", zap.Error(err))
|
||||||
|
|||||||
@ -98,10 +98,6 @@ func (ob *ResourceObserver) checkAndRecoverResourceGroup() {
|
|||||||
manager.AssignPendingIncomingNode()
|
manager.AssignPendingIncomingNode()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove all down nodes in resource group manager.
|
|
||||||
log.Debug("remove all down nodes in resource group manager...")
|
|
||||||
ob.meta.RemoveAllDownNode()
|
|
||||||
|
|
||||||
log.Debug("recover resource groups...")
|
log.Debug("recover resource groups...")
|
||||||
// Recover all resource group into expected configuration.
|
// Recover all resource group into expected configuration.
|
||||||
for _, rgName := range rgNames {
|
for _, rgName := range rgNames {
|
||||||
|
|||||||
@ -136,6 +136,7 @@ func (suite *ResourceObserverSuite) TestObserverRecoverOperation() {
|
|||||||
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg2"))
|
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg2"))
|
||||||
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg3"))
|
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg3"))
|
||||||
// new node is down, rg3 cannot use that node anymore.
|
// new node is down, rg3 cannot use that node anymore.
|
||||||
|
suite.meta.ResourceManager.HandleNodeDown(10)
|
||||||
suite.observer.checkAndRecoverResourceGroup()
|
suite.observer.checkAndRecoverResourceGroup()
|
||||||
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg1"))
|
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg1"))
|
||||||
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg2"))
|
suite.NoError(suite.meta.ResourceManager.MeetRequirement("rg2"))
|
||||||
|
|||||||
@ -276,7 +276,7 @@ func (s *Server) TransferSegment(ctx context.Context, req *querypb.TransferSegme
|
|||||||
// when no dst node specified, default to use all other nodes in same
|
// when no dst node specified, default to use all other nodes in same
|
||||||
dstNodeSet := typeutil.NewUniqueSet()
|
dstNodeSet := typeutil.NewUniqueSet()
|
||||||
if req.GetToAllNodes() {
|
if req.GetToAllNodes() {
|
||||||
dstNodeSet.Insert(replica.GetNodes()...)
|
dstNodeSet.Insert(replica.GetRWNodes()...)
|
||||||
} else {
|
} else {
|
||||||
// check whether dstNode is healthy
|
// check whether dstNode is healthy
|
||||||
if err := s.isStoppingNode(req.GetTargetNodeID()); err != nil {
|
if err := s.isStoppingNode(req.GetTargetNodeID()); err != nil {
|
||||||
@ -348,7 +348,7 @@ func (s *Server) TransferChannel(ctx context.Context, req *querypb.TransferChann
|
|||||||
// when no dst node specified, default to use all other nodes in same
|
// when no dst node specified, default to use all other nodes in same
|
||||||
dstNodeSet := typeutil.NewUniqueSet()
|
dstNodeSet := typeutil.NewUniqueSet()
|
||||||
if req.GetToAllNodes() {
|
if req.GetToAllNodes() {
|
||||||
dstNodeSet.Insert(replica.GetNodes()...)
|
dstNodeSet.Insert(replica.GetRWNodes()...)
|
||||||
} else {
|
} else {
|
||||||
// check whether dstNode is healthy
|
// check whether dstNode is healthy
|
||||||
if err := s.isStoppingNode(req.GetTargetNodeID()); err != nil {
|
if err := s.isStoppingNode(req.GetTargetNodeID()); err != nil {
|
||||||
|
|||||||
@ -441,7 +441,6 @@ func (s *Server) startQueryCoord() error {
|
|||||||
s.nodeMgr.Stopping(node.ServerID)
|
s.nodeMgr.Stopping(node.ServerID)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
s.checkReplicas()
|
|
||||||
for _, node := range sessions {
|
for _, node := range sessions {
|
||||||
s.handleNodeUp(node.ServerID)
|
s.handleNodeUp(node.ServerID)
|
||||||
}
|
}
|
||||||
@ -685,6 +684,7 @@ func (s *Server) watchNodes(revision int64) {
|
|||||||
)
|
)
|
||||||
s.nodeMgr.Stopping(nodeID)
|
s.nodeMgr.Stopping(nodeID)
|
||||||
s.checkerController.Check()
|
s.checkerController.Check()
|
||||||
|
s.meta.ResourceManager.HandleNodeStopping(nodeID)
|
||||||
|
|
||||||
case sessionutil.SessionDelEvent:
|
case sessionutil.SessionDelEvent:
|
||||||
nodeID := event.Session.ServerID
|
nodeID := event.Session.ServerID
|
||||||
@ -748,7 +748,6 @@ func (s *Server) handleNodeUp(node int64) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (s *Server) handleNodeDown(node int64) {
|
func (s *Server) handleNodeDown(node int64) {
|
||||||
log := log.With(zap.Int64("nodeID", node))
|
|
||||||
s.taskScheduler.RemoveExecutor(node)
|
s.taskScheduler.RemoveExecutor(node)
|
||||||
s.distController.Remove(node)
|
s.distController.Remove(node)
|
||||||
|
|
||||||
@ -757,57 +756,12 @@ func (s *Server) handleNodeDown(node int64) {
|
|||||||
s.dist.ChannelDistManager.Update(node)
|
s.dist.ChannelDistManager.Update(node)
|
||||||
s.dist.SegmentDistManager.Update(node)
|
s.dist.SegmentDistManager.Update(node)
|
||||||
|
|
||||||
// Clear meta
|
|
||||||
for _, collection := range s.meta.CollectionManager.GetAll() {
|
|
||||||
log := log.With(zap.Int64("collectionID", collection))
|
|
||||||
replica := s.meta.ReplicaManager.GetByCollectionAndNode(collection, node)
|
|
||||||
if replica == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
err := s.meta.ReplicaManager.RemoveNode(replica.GetID(), node)
|
|
||||||
if err != nil {
|
|
||||||
log.Warn("failed to remove node from collection's replicas",
|
|
||||||
zap.Int64("replicaID", replica.GetID()),
|
|
||||||
zap.Error(err),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
log.Info("remove node from replica",
|
|
||||||
zap.Int64("replicaID", replica.GetID()))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear tasks
|
// Clear tasks
|
||||||
s.taskScheduler.RemoveByNode(node)
|
s.taskScheduler.RemoveByNode(node)
|
||||||
|
|
||||||
s.meta.ResourceManager.HandleNodeDown(node)
|
s.meta.ResourceManager.HandleNodeDown(node)
|
||||||
}
|
}
|
||||||
|
|
||||||
// checkReplicas checks whether replica contains offline node, and remove those nodes
|
|
||||||
func (s *Server) checkReplicas() {
|
|
||||||
for _, collection := range s.meta.CollectionManager.GetAll() {
|
|
||||||
log := log.With(zap.Int64("collectionID", collection))
|
|
||||||
replicas := s.meta.ReplicaManager.GetByCollection(collection)
|
|
||||||
for _, replica := range replicas {
|
|
||||||
toRemove := make([]int64, 0)
|
|
||||||
for _, node := range replica.GetNodes() {
|
|
||||||
if s.nodeMgr.Get(node) == nil {
|
|
||||||
toRemove = append(toRemove, node)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(toRemove) > 0 {
|
|
||||||
log := log.With(
|
|
||||||
zap.Int64("replicaID", replica.GetID()),
|
|
||||||
zap.Int64s("offlineNodes", toRemove),
|
|
||||||
)
|
|
||||||
log.Info("some nodes are offline, remove them from replica", zap.Any("toRemove", toRemove))
|
|
||||||
if err := s.meta.ReplicaManager.RemoveNode(replica.GetID(), toRemove...); err != nil {
|
|
||||||
log.Warn("failed to remove offline nodes from replica")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Server) updateBalanceConfigLoop(ctx context.Context) {
|
func (s *Server) updateBalanceConfigLoop(ctx context.Context) {
|
||||||
success := s.updateBalanceConfig()
|
success := s.updateBalanceConfig()
|
||||||
if success {
|
if success {
|
||||||
|
|||||||
@ -686,7 +686,7 @@ func (s *Server) LoadBalance(ctx context.Context, req *querypb.LoadBalanceReques
|
|||||||
// when no dst node specified, default to use all other nodes in same
|
// when no dst node specified, default to use all other nodes in same
|
||||||
dstNodeSet := typeutil.NewUniqueSet()
|
dstNodeSet := typeutil.NewUniqueSet()
|
||||||
if len(req.GetDstNodeIDs()) == 0 {
|
if len(req.GetDstNodeIDs()) == 0 {
|
||||||
dstNodeSet.Insert(replica.GetNodes()...)
|
dstNodeSet.Insert(replica.GetRWNodes()...)
|
||||||
} else {
|
} else {
|
||||||
for _, dstNode := range req.GetDstNodeIDs() {
|
for _, dstNode := range req.GetDstNodeIDs() {
|
||||||
if !replica.Contains(dstNode) {
|
if !replica.Contains(dstNode) {
|
||||||
@ -1075,7 +1075,7 @@ func (s *Server) DescribeResourceGroup(ctx context.Context, req *querypb.Describ
|
|||||||
replicasInRG := s.meta.GetByResourceGroup(req.GetResourceGroup())
|
replicasInRG := s.meta.GetByResourceGroup(req.GetResourceGroup())
|
||||||
for _, replica := range replicasInRG {
|
for _, replica := range replicasInRG {
|
||||||
loadedReplicas[replica.GetCollectionID()]++
|
loadedReplicas[replica.GetCollectionID()]++
|
||||||
for _, node := range replica.GetNodes() {
|
for _, node := range replica.GetRONodes() {
|
||||||
if !s.meta.ContainsNode(replica.GetResourceGroup(), node) {
|
if !s.meta.ContainsNode(replica.GetResourceGroup(), node) {
|
||||||
outgoingNodes[replica.GetCollectionID()]++
|
outgoingNodes[replica.GetCollectionID()]++
|
||||||
}
|
}
|
||||||
@ -1090,7 +1090,7 @@ func (s *Server) DescribeResourceGroup(ctx context.Context, req *querypb.Describ
|
|||||||
if replica.GetResourceGroup() == req.GetResourceGroup() {
|
if replica.GetResourceGroup() == req.GetResourceGroup() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
for _, node := range replica.GetNodes() {
|
for _, node := range replica.GetRONodes() {
|
||||||
if s.meta.ContainsNode(req.GetResourceGroup(), node) {
|
if s.meta.ContainsNode(req.GetResourceGroup(), node) {
|
||||||
incomingNodes[collection]++
|
incomingNodes[collection]++
|
||||||
}
|
}
|
||||||
@ -1101,8 +1101,7 @@ func (s *Server) DescribeResourceGroup(ctx context.Context, req *querypb.Describ
|
|||||||
nodes := make([]*commonpb.NodeInfo, 0, len(rg.GetNodes()))
|
nodes := make([]*commonpb.NodeInfo, 0, len(rg.GetNodes()))
|
||||||
for _, nodeID := range rg.GetNodes() {
|
for _, nodeID := range rg.GetNodes() {
|
||||||
nodeSessionInfo := s.nodeMgr.Get(nodeID)
|
nodeSessionInfo := s.nodeMgr.Get(nodeID)
|
||||||
// Filter offline nodes and nodes in stopping state
|
if nodeSessionInfo != nil {
|
||||||
if nodeSessionInfo != nil && !nodeSessionInfo.IsStoppingState() {
|
|
||||||
nodes = append(nodes, &commonpb.NodeInfo{
|
nodes = append(nodes, &commonpb.NodeInfo{
|
||||||
NodeId: nodeSessionInfo.ID(),
|
NodeId: nodeSessionInfo.ID(),
|
||||||
Address: nodeSessionInfo.Addr(),
|
Address: nodeSessionInfo.Addr(),
|
||||||
|
|||||||
@ -432,7 +432,8 @@ func (suite *ServiceSuite) TestResourceGroup() {
|
|||||||
server.meta.ReplicaManager.Put(meta.NewReplica(&querypb.Replica{
|
server.meta.ReplicaManager.Put(meta.NewReplica(&querypb.Replica{
|
||||||
ID: 1,
|
ID: 1,
|
||||||
CollectionID: 1,
|
CollectionID: 1,
|
||||||
Nodes: []int64{1011, 1013},
|
Nodes: []int64{1011},
|
||||||
|
RoNodes: []int64{1013},
|
||||||
ResourceGroup: "rg11",
|
ResourceGroup: "rg11",
|
||||||
},
|
},
|
||||||
typeutil.NewUniqueSet(1011, 1013)),
|
typeutil.NewUniqueSet(1011, 1013)),
|
||||||
@ -440,7 +441,8 @@ func (suite *ServiceSuite) TestResourceGroup() {
|
|||||||
server.meta.ReplicaManager.Put(meta.NewReplica(&querypb.Replica{
|
server.meta.ReplicaManager.Put(meta.NewReplica(&querypb.Replica{
|
||||||
ID: 2,
|
ID: 2,
|
||||||
CollectionID: 2,
|
CollectionID: 2,
|
||||||
Nodes: []int64{1012, 1014},
|
Nodes: []int64{1014},
|
||||||
|
RoNodes: []int64{1012},
|
||||||
ResourceGroup: "rg12",
|
ResourceGroup: "rg12",
|
||||||
},
|
},
|
||||||
typeutil.NewUniqueSet(1012, 1014)),
|
typeutil.NewUniqueSet(1012, 1014)),
|
||||||
|
|||||||
@ -22,7 +22,6 @@ import (
|
|||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
|
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
||||||
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
|
||||||
"github.com/milvus-io/milvus/pkg/log"
|
"github.com/milvus-io/milvus/pkg/log"
|
||||||
"github.com/milvus-io/milvus/pkg/util/merr"
|
"github.com/milvus-io/milvus/pkg/util/merr"
|
||||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||||
@ -35,19 +34,6 @@ var (
|
|||||||
ErrUseWrongNumRG = errors.New("resource group num can only be 0, 1 or same as replica number")
|
ErrUseWrongNumRG = errors.New("resource group num can only be 0, 1 or same as replica number")
|
||||||
)
|
)
|
||||||
|
|
||||||
func GetReplicaNodesInfo(replicaMgr *meta.ReplicaManager, nodeMgr *session.NodeManager, replicaID int64) []*session.NodeInfo {
|
|
||||||
replica := replicaMgr.Get(replicaID)
|
|
||||||
if replica == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
nodes := make([]*session.NodeInfo, 0, len(replica.GetNodes()))
|
|
||||||
for _, node := range replica.GetNodes() {
|
|
||||||
nodes = append(nodes, nodeMgr.Get(node))
|
|
||||||
}
|
|
||||||
return nodes
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetPartitions(collectionMgr *meta.CollectionManager, collectionID int64) ([]int64, error) {
|
func GetPartitions(collectionMgr *meta.CollectionManager, collectionID int64) ([]int64, error) {
|
||||||
collection := collectionMgr.GetCollection(collectionID)
|
collection := collectionMgr.GetCollection(collectionID)
|
||||||
if collection != nil {
|
if collection != nil {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user