fix: streamingnode get stucked when stop (#42501)
issue: #42498

- fix: sealed segment cannot be flushed after upgrading
- fix: get mvcc panic when upgrading
- ignore the L0 segment when graceful stop of querynode.

Signed-off-by: chyezh <chyezh@outlook.com>
This commit is contained in:
  parent 35c17523de
  commit 0567f512b3
@@ -1204,6 +1204,9 @@ streaming:
     # It's ok to set it into duration string, such as 30s or 1m30s, see time.ParseDuration
     backoffInitialInterval: 50ms
     backoffMultiplier: 2 # The multiplier of balance task trigger backoff, 2 by default
+    # The timeout of wal balancer operation, 10s by default.
+    # If the operation exceeds this timeout, it will be canceled.
+    operationTimeout: 10s
     balancePolicy:
       name: vchannelFair # The multiplier of balance task trigger backoff, 2 by default
       vchannelFair:
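The new operationTimeout entry accepts the same duration-string format as the neighboring walBalancer settings. A minimal standalone sketch of how such strings are parsed with Go's time.ParseDuration, using only the sample values already mentioned in the config comments:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Duration strings as accepted by the walBalancer settings above,
	// e.g. operationTimeout: 10s or backoffInitialInterval: 50ms.
	for _, s := range []string{"10s", "50ms", "1m30s"} {
		d, err := time.ParseDuration(s)
		if err != nil {
			fmt.Println("invalid duration:", s, err)
			continue
		}
		fmt.Printf("%s -> %v\n", s, d)
	}
}
```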
@@ -79,8 +79,8 @@ func (h *ServerHandler) GetDataVChanPositions(channel RWChannel, partitionID Uni
 	)
 	for _, s := range segments {
 		if (partitionID > allPartitionID && s.PartitionID != partitionID) ||
-			(s.GetState() != commonpb.SegmentState_Growing && s.GetStartPosition() == nil && s.GetDmlPosition() == nil) {
-			// empty growing segment don't have dml position and start position
+			((s.GetState() != commonpb.SegmentState_Growing && s.GetState() != commonpb.SegmentState_Sealed) && s.GetStartPosition() == nil && s.GetDmlPosition() == nil) {
+			// empty growing and sealed segment don't have dml position and start position
 			// and it should be recovered for streamingnode, so we add the state-filter here.
 			continue
 		}
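The widened condition keeps empty Sealed segments, not just Growing ones, in the recovery info returned for the streamingnode, which matches the "sealed segment cannot be flushed after upgrading" item in the commit message. A simplified restatement of the position-related half of the predicate with hypothetical local types, only to make the boolean logic easier to read; the real code uses commonpb.SegmentState and the SegmentInfo getters shown in the hunk:

```go
package main

import "fmt"

// Hypothetical stand-ins for the real segment state/info types.
type segmentState string

const (
	stateGrowing segmentState = "Growing"
	stateSealed  segmentState = "Sealed"
	stateFlushed segmentState = "Flushed"
)

type segmentInfo struct {
	state            segmentState
	hasStartPosition bool
	hasDmlPosition   bool
}

// shouldSkip mirrors the updated filter: a segment without any position is
// skipped unless it is Growing or Sealed, because empty growing and sealed
// segments still have to be recovered by the streamingnode.
func shouldSkip(s segmentInfo) bool {
	return s.state != stateGrowing && s.state != stateSealed &&
		!s.hasStartPosition && !s.hasDmlPosition
}

func main() {
	fmt.Println(shouldSkip(segmentInfo{state: stateSealed}))  // false: empty sealed segments are now kept
	fmt.Println(shouldSkip(segmentInfo{state: stateFlushed})) // true: no positions and not Growing/Sealed
}
```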
@@ -562,6 +562,9 @@ func (node *QueryNode) Stop() error {
 			if node.pipelineManager != nil {
 				channelNum = node.pipelineManager.Num()
 			}
+			if len(sealedSegments) == 0 && len(growingSegments) == 0 && channelNum == 0 {
+				break outer
+			}
 
 			select {
 			case <-timeoutCh:
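The added check lets Stop leave its wait loop as soon as no sealed segments, growing segments, or channels remain to release. A generic, self-contained sketch of that labeled-loop wait-until-drained shape; countRemaining and the polling interval are illustrative names, not the actual QueryNode fields:

```go
package main

import (
	"fmt"
	"time"
)

// waitForDrain keeps polling the remaining work, breaks out of the labeled
// loop once everything is gone, and stops waiting when the graceful-stop
// timeout fires.
func waitForDrain(countRemaining func() int, gracefulTimeout time.Duration) {
	timeoutCh := time.After(gracefulTimeout)
outer:
	for {
		if countRemaining() == 0 {
			break outer
		}
		select {
		case <-timeoutCh:
			fmt.Println("graceful stop timed out, shutting down anyway")
			break outer
		case <-time.After(100 * time.Millisecond):
			// re-check on the next tick
		}
	}
}

func main() {
	remaining := 3
	waitForDrain(func() int { remaining--; return remaining }, time.Second)
	fmt.Println("drained")
}
```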
@@ -315,13 +315,16 @@ func (b *balancerImpl) applyBalanceResultToStreamingNode(ctx context.Context, mo
 
 	// different channel can be execute concurrently.
 	g, _ := errgroup.WithContext(ctx)
+	opTimeout := paramtable.Get().StreamingCfg.WALBalancerOperationTimeout.GetAsDurationByParse()
 	// generate balance operations and applied them.
 	for _, channel := range modifiedChannels {
 		channel := channel
 		g.Go(func() error {
 			// all history channels should be remove from related nodes.
 			for _, assignment := range channel.AssignHistories() {
-				if err := resource.Resource().StreamingNodeManagerClient().Remove(ctx, assignment); err != nil {
+				opCtx, cancel := context.WithTimeout(ctx, opTimeout)
+				defer cancel()
+				if err := resource.Resource().StreamingNodeManagerClient().Remove(opCtx, assignment); err != nil {
 					b.Logger().Warn("fail to remove channel", zap.String("assignment", assignment.String()), zap.Error(err))
 					return err
 				}
@@ -329,7 +332,9 @@ func (b *balancerImpl) applyBalanceResultToStreamingNode(ctx context.Context, mo
 			}
 
 			// assign the channel to the target node.
-			if err := resource.Resource().StreamingNodeManagerClient().Assign(ctx, channel.CurrentAssignment()); err != nil {
+			opCtx, cancel := context.WithTimeout(ctx, opTimeout)
+			defer cancel()
+			if err := resource.Resource().StreamingNodeManagerClient().Assign(opCtx, channel.CurrentAssignment()); err != nil {
 				b.Logger().Warn("fail to assign channel", zap.String("assignment", channel.CurrentAssignment().String()), zap.Error(err))
 				return err
 			}
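These two hunks bound every Remove and Assign RPC with its own child context, so a single unresponsive streamingnode can hold the balance round for at most opTimeout (the new streaming.walBalancer.operationTimeout setting) instead of blocking it indefinitely. A self-contained sketch of that per-operation timeout pattern with errgroup, where applyOne stands in for the real Remove/Assign calls:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// applyAll runs one operation per channel concurrently; each operation gets
// a context.WithTimeout derived from the parent, so a hung call is canceled
// after opTimeout instead of stalling the whole round.
func applyAll(ctx context.Context, channels []string, opTimeout time.Duration,
	applyOne func(ctx context.Context, ch string) error,
) error {
	g, _ := errgroup.WithContext(ctx)
	for _, ch := range channels {
		ch := ch
		g.Go(func() error {
			opCtx, cancel := context.WithTimeout(ctx, opTimeout)
			defer cancel()
			return applyOne(opCtx, ch)
		})
	}
	return g.Wait()
}

func main() {
	err := applyAll(context.Background(), []string{"vchan-1", "vchan-2"}, 50*time.Millisecond,
		func(ctx context.Context, ch string) error {
			select {
			case <-time.After(time.Second): // simulate an unresponsive node
				return nil
			case <-ctx.Done():
				return fmt.Errorf("%s: %w", ch, ctx.Err())
			}
		})
	fmt.Println(err, errors.Is(err, context.DeadlineExceeded))
}
```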
@@ -31,6 +31,7 @@ var (
 	_ HandlerClient = (*handlerClientImpl)(nil)
 	ErrClientClosed = errors.New("handler client is closed")
 	ErrClientAssignmentNotReady = errors.New("handler client assignment not ready")
+	ErrReadOnlyWAL = errors.New("wal is read only")
 )
 
 type (
@@ -59,6 +59,9 @@ func (hc *handlerClientImpl) GetLatestMVCCTimestampIfLocal(ctx context.Context,
 	if err != nil {
 		return 0, err
 	}
+	if w.Channel().AccessMode != types.AccessModeRW {
+		return 0, ErrReadOnlyWAL
+	}
 	return w.GetLatestMVCCTimestamp(ctx, vchannel)
 }
 
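With the new AccessMode guard, a local read-only WAL now yields the ErrReadOnlyWAL sentinel instead of reaching GetLatestMVCCTimestamp, which matches the "get mvcc panic when upgrading" item in the commit message. A hedged caller-side sketch with local stand-ins for the client interface and sentinel errors (the remote-fallback wording is hypothetical):

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// Local stand-ins mirroring the sentinels in the hunk above; the types here
// are illustrative, not the project's.
var (
	errReadOnlyWAL              = errors.New("wal is read only")
	errClientAssignmentNotReady = errors.New("handler client assignment not ready")
)

type mvccClient interface {
	GetLatestMVCCTimestampIfLocal(ctx context.Context, vchannel string) (uint64, error)
}

// latestMVCC branches on the sentinel: a read-only or not-yet-assigned local
// WAL is not a hard failure, it just means the timestamp cannot be served
// locally and some other path must be used.
func latestMVCC(ctx context.Context, c mvccClient, vchannel string) (uint64, error) {
	ts, err := c.GetLatestMVCCTimestampIfLocal(ctx, vchannel)
	if err == nil {
		return ts, nil
	}
	if errors.Is(err, errReadOnlyWAL) || errors.Is(err, errClientAssignmentNotReady) {
		return 0, fmt.Errorf("mvcc not available locally, use a remote path: %w", err)
	}
	return 0, err
}

type roClient struct{}

func (roClient) GetLatestMVCCTimestampIfLocal(context.Context, string) (uint64, error) {
	return 0, errReadOnlyWAL
}

func main() {
	_, err := latestMVCC(context.Background(), roClient{}, "vchan-1")
	fmt.Println(err)
}
```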
@@ -93,6 +93,7 @@ func (w *walLifetime) Close() {
 	logger := log.With(zap.String("current", toStateString(currentState)))
 	if oldWAL := currentState.GetWAL(); oldWAL != nil {
 		oldWAL.Close()
+		w.statePair.SetCurrentState(newUnavailableCurrentState(currentState.Term(), nil))
 		logger.Info("close current term wal done at wal life time close")
 	}
 	logger.Info("wal lifetime closed")
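The one added line makes Close publish an unavailable current state right after the old WAL is closed, so anything still waiting on the state pair can observe the shutdown rather than hang, which plausibly relates to the "streamingnode get stucked when stop" symptom in the title. A generic, self-contained sketch of that publish-terminal-state-on-close idea using an assumed condition-variable state holder, not the project's statePair implementation:

```go
package main

import (
	"fmt"
	"sync"
)

// statePair here is an illustrative stand-in: a current state guarded by a
// condition variable, where setting a new state wakes up all waiters.
type statePair struct {
	mu      sync.Mutex
	cond    *sync.Cond
	current string
}

func newStatePair(initial string) *statePair {
	p := &statePair{current: initial}
	p.cond = sync.NewCond(&p.mu)
	return p
}

func (p *statePair) SetCurrentState(s string) {
	p.mu.Lock()
	p.current = s
	p.mu.Unlock()
	p.cond.Broadcast()
}

// WaitUntil blocks until the state becomes the target or turns unavailable;
// without the unavailable branch a waiter would block forever once the WAL
// is closed.
func (p *statePair) WaitUntil(target string) string {
	p.mu.Lock()
	defer p.mu.Unlock()
	for p.current != target && p.current != "unavailable" {
		p.cond.Wait()
	}
	return p.current
}

func main() {
	p := newStatePair("available(term=1)")
	done := make(chan string)
	go func() { done <- p.WaitUntil("available(term=2)") }()
	// Close path: shut the WAL down, then publish the terminal state.
	p.SetCurrentState("unavailable")
	fmt.Println("waiter observed:", <-done)
}
```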
@@ -5572,6 +5572,7 @@ type streamingConfig struct {
 	WALBalancerTriggerInterval        ParamItem `refreshable:"true"`
 	WALBalancerBackoffInitialInterval ParamItem `refreshable:"true"`
 	WALBalancerBackoffMultiplier      ParamItem `refreshable:"true"`
+	WALBalancerOperationTimeout       ParamItem `refreshable:"true"`
 
 	// balancer Policy
 	WALBalancerPolicyName ParamItem `refreshable:"true"`
@@ -5637,6 +5638,15 @@ It's ok to set it into duration string, such as 30s or 1m30s, see time.ParseDura
 		Export: true,
 	}
 	p.WALBalancerBackoffMultiplier.Init(base.mgr)
+	p.WALBalancerOperationTimeout = ParamItem{
+		Key:     "streaming.walBalancer.operationTimeout",
+		Version: "2.6.0",
+		Doc: `The timeout of wal balancer operation, 10s by default.
+If the operation exceeds this timeout, it will be canceled.`,
+		DefaultValue: "10s",
+		Export:       true,
+	}
+	p.WALBalancerOperationTimeout.Init(base.mgr)
 
 	p.WALBalancerPolicyName = ParamItem{
 		Key: "streaming.walBalancer.balancePolicy.name",
@@ -628,6 +628,7 @@ func TestComponentParam(t *testing.T) {
 		assert.Equal(t, 0.01, params.StreamingCfg.WALBalancerPolicyVChannelFairAntiAffinityWeight.GetAsFloat())
 		assert.Equal(t, 0.01, params.StreamingCfg.WALBalancerPolicyVChannelFairRebalanceTolerance.GetAsFloat())
 		assert.Equal(t, 3, params.StreamingCfg.WALBalancerPolicyVChannelFairRebalanceMaxStep.GetAsInt())
+		assert.Equal(t, 10*time.Second, params.StreamingCfg.WALBalancerOperationTimeout.GetAsDurationByParse())
 		assert.Equal(t, 1.0, params.StreamingCfg.WALBroadcasterConcurrencyRatio.GetAsFloat())
 		assert.Equal(t, 10*time.Second, params.StreamingCfg.TxnDefaultKeepaliveTimeout.GetAsDurationByParse())
 		assert.Equal(t, 30*time.Second, params.StreamingCfg.WALWriteAheadBufferKeepalive.GetAsDurationByParse())