Fix issue where watchDmChannel may be out of date after compaction, and add context (#18681)

Signed-off-by: wayblink <anyang.wang@zilliz.com>

wayblink 2022-08-23 17:44:56 +08:00 committed by GitHub
parent d5bb377bc7
commit 6fd83464cd
9 changed files with 143 additions and 29 deletions
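
In short: while a collection is loading, the WatchDmChannelsRequest built by QueryCoord can reference segment IDs that DataCoord has already compacted away. This change makes DataCoord's GetSegmentInfo report a missing segment explicitly instead of silently skipping it, threads a context.Context through the broker calls, and lets generateFullWatchDmChannelsRequest retry with refreshed recovery info (retry.Do with 20 attempts) when fetching segment infos fails. A minimal standalone sketch of that refresh-and-retry shape follows; the names in it (coord, buildWatchRequest, and so on) are invented for illustration and are not Milvus APIs.

// Standalone sketch (illustration only, not Milvus code): the real diff refreshes
// vchannels via broker.getRecoveryInfo, refetches segment infos via
// broker.getDataSegmentInfosByIDs, and wraps the step in retry.Do(ctx, retryFunc, retry.Attempts(20)).
package main

import (
	"context"
	"fmt"
	"time"
)

// coord stands in for the meta broker; both calls may fail transiently while a
// compaction is rewriting segment metadata.
type coord interface {
	RecoverChannels(ctx context.Context) (channels []string, segmentIDs []int64, err error)
	SegmentInfos(ctx context.Context, ids []int64) (map[int64]string, error)
}

// buildWatchRequest re-resolves segment IDs from recovery info before every
// attempt, so IDs invalidated by compaction are replaced rather than retried blindly.
func buildWatchRequest(ctx context.Context, c coord, attempts int) (map[int64]string, error) {
	var lastErr error
	for i := 0; i < attempts; i++ {
		if err := ctx.Err(); err != nil {
			return nil, err
		}
		_, ids, err := c.RecoverChannels(ctx)
		if err == nil {
			var infos map[int64]string
			infos, err = c.SegmentInfos(ctx, ids)
			if err == nil {
				return infos, nil
			}
		}
		lastErr = err
		time.Sleep(50 * time.Millisecond) // the real code delegates backoff to util/retry
	}
	return nil, fmt.Errorf("build watch request failed after %d attempts: %w", attempts, lastErr)
}

func main() {
	fmt.Println("sketch only; see generateFullWatchDmChannelsRequest in the diff below")
}

In the actual change, the retry path also replaces cloned.Infos with the refreshed vchannels, so the request and its segment-info map stay consistent.
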


@@ -318,9 +318,12 @@ func (s *Server) GetSegmentInfo(ctx context.Context, req *datapb.GetSegmentInfoR
var info *SegmentInfo
if req.IncludeUnHealthy {
info = s.meta.GetAllSegment(id)
if info != nil {
infos = append(infos, info.SegmentInfo)
if info == nil {
log.Warn("failed to get segment, this may have been cleaned", zap.Int64("segmentID", id))
resp.Status.Reason = fmt.Sprintf("failed to get segment %d", id)
return resp, nil
}
infos = append(infos, info.SegmentInfo)
} else {
info = s.meta.GetSegment(id)
if info == nil {


@@ -534,9 +534,9 @@ func (broker *globalMetaBroker) releaseSegmentReferLock(ctx context.Context, tas
}
// getDataSegmentInfosByIDs returns the SegmentInfo details for the given IDs via RPC to datacoord
func (broker *globalMetaBroker) getDataSegmentInfosByIDs(segmentIds []int64) ([]*datapb.SegmentInfo, error) {
func (broker *globalMetaBroker) getDataSegmentInfosByIDs(ctx context.Context, segmentIds []int64) ([]*datapb.SegmentInfo, error) {
var segmentInfos []*datapb.SegmentInfo
infoResp, err := broker.dataCoord.GetSegmentInfo(broker.ctx, &datapb.GetSegmentInfoRequest{
infoResp, err := broker.dataCoord.GetSegmentInfo(ctx, &datapb.GetSegmentInfoRequest{
Base: &commonpb.MsgBase{
MsgType: commonpb.MsgType_SegmentInfo,
MsgID: 0,


@@ -170,18 +170,18 @@ func TestGetDataSegmentInfosByIDs(t *testing.T) {
handler, err := newGlobalMetaBroker(ctx, nil, dataCoord, nil, cm)
assert.Nil(t, err)
segmentInfos, err := handler.getDataSegmentInfosByIDs([]int64{1})
segmentInfos, err := handler.getDataSegmentInfosByIDs(ctx, []int64{1})
assert.Nil(t, err)
assert.Equal(t, 1, len(segmentInfos))
dataCoord.returnError = true
segmentInfos2, err := handler.getDataSegmentInfosByIDs([]int64{1})
segmentInfos2, err := handler.getDataSegmentInfosByIDs(ctx, []int64{1})
assert.Error(t, err)
assert.Empty(t, segmentInfos2)
dataCoord.returnError = false
dataCoord.returnGrpcError = true
segmentInfos3, err := handler.getDataSegmentInfosByIDs([]int64{1})
segmentInfos3, err := handler.getDataSegmentInfosByIDs(ctx, []int64{1})
assert.Error(t, err)
assert.Empty(t, segmentInfos3)


@@ -22,19 +22,19 @@ import (
"fmt"
"sync"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/etcdpb"
"go.uber.org/atomic"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/proto/rootcoordpb"
"github.com/milvus-io/milvus/internal/common"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/commonpb"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/etcdpb"
"github.com/milvus-io/milvus/internal/proto/indexpb"
"github.com/milvus-io/milvus/internal/proto/internalpb"
"github.com/milvus-io/milvus/internal/proto/milvuspb"
"github.com/milvus-io/milvus/internal/proto/proxypb"
"github.com/milvus-io/milvus/internal/proto/rootcoordpb"
"github.com/milvus-io/milvus/internal/proto/schemapb"
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/internal/types"
@@ -334,6 +334,7 @@ type dataCoordMock struct {
channelNumPerCol int
returnError bool
returnGrpcError bool
returnErrorCount atomic.Int32
segmentState commonpb.SegmentState
errLevel int
@@ -376,14 +377,27 @@ func (data *dataCoordMock) GetRecoveryInfo(ctx context.Context, req *datapb.GetR
}, nil
}
if data.returnErrorCount.Load() > 0 {
data.returnErrorCount.Dec()
return &datapb.GetRecoveryInfoResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: "limited get recovery info failed",
},
}, nil
}
if _, ok := data.col2DmChannels[collectionID]; !ok {
channelInfos := make([]*datapb.VchannelInfo, 0)
data.collections = append(data.collections, collectionID)
for i := int32(0); i < common.DefaultShardsNum; i++ {
vChannel := fmt.Sprintf("%s_%d_%dv%d", Params.CommonCfg.RootCoordDml, i, collectionID, i)
channelInfo := &datapb.VchannelInfo{
CollectionID: collectionID,
ChannelName: vChannel,
CollectionID: collectionID,
ChannelName: vChannel,
UnflushedSegmentIds: []int64{int64(i*1000 + 1)},
FlushedSegmentIds: []int64{int64(i*1000 + 2)},
DroppedSegmentIds: []int64{int64(i*1000 + 3)},
SeekPosition: &internalpb.MsgPosition{
ChannelName: vChannel,
},
@@ -528,6 +542,16 @@ func (data *dataCoordMock) GetSegmentInfo(ctx context.Context, req *datapb.GetSe
}, nil
}
if data.returnErrorCount.Load() > 0 {
data.returnErrorCount.Dec()
return &datapb.GetSegmentInfoResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: "limited mock get segmentInfo failed",
},
}, nil
}
var segmentInfos []*datapb.SegmentInfo
for _, segmentID := range req.SegmentIDs {
segmentInfos = append(segmentInfos, &datapb.SegmentInfo{


@@ -514,7 +514,7 @@ func (lct *loadCollectionTask) execute(ctx context.Context) error {
ReplicaID: replica.GetReplicaID(),
}
fullWatchRequest, err := generateFullWatchDmChannelsRequest(lct.broker, watchRequest)
fullWatchRequest, err := generateFullWatchDmChannelsRequest(ctx, lct.broker, watchRequest)
if err != nil {
lct.setResultInfo(err)
return err
@@ -1008,7 +1008,7 @@ func (lpt *loadPartitionTask) execute(ctx context.Context) error {
ReplicaID: replica.GetReplicaID(),
}
fullWatchRequest, err := generateFullWatchDmChannelsRequest(lpt.broker, watchRequest)
fullWatchRequest, err := generateFullWatchDmChannelsRequest(ctx, lpt.broker, watchRequest)
if err != nil {
lpt.setResultInfo(err)
return err
@@ -2059,7 +2059,7 @@ func (lbt *loadBalanceTask) processNodeDownLoadBalance(ctx context.Context) erro
watchRequest.PartitionIDs = toRecoverPartitionIDs
}
fullWatchRequest, err := generateFullWatchDmChannelsRequest(lbt.broker, watchRequest)
fullWatchRequest, err := generateFullWatchDmChannelsRequest(ctx, lbt.broker, watchRequest)
if err != nil {
lbt.setResultInfo(err)
return err


@@ -393,7 +393,7 @@ func (scheduler *TaskScheduler) unmarshalTask(taskID UniqueID, t string) (task,
if err != nil {
return nil, err
}
fullReq, err := generateFullWatchDmChannelsRequest(scheduler.broker, &req)
fullReq, err := generateFullWatchDmChannelsRequest(baseTask.traceCtx(), scheduler.broker, &req)
if err != nil {
return nil, err
}


@@ -17,15 +17,18 @@
package querycoord
import (
"context"
"github.com/golang/protobuf/proto"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/util/retry"
"go.uber.org/zap"
)
// generateFullWatchDmChannelsRequest fills the WatchDmChannelsRequest by getting segment infos from the meta broker
func generateFullWatchDmChannelsRequest(broker *globalMetaBroker, request *querypb.WatchDmChannelsRequest) (*querypb.WatchDmChannelsRequest, error) {
func generateFullWatchDmChannelsRequest(ctx context.Context, broker *globalMetaBroker, request *querypb.WatchDmChannelsRequest) (*querypb.WatchDmChannelsRequest, error) {
cloned := proto.Clone(request).(*querypb.WatchDmChannelsRequest)
vChannels := cloned.GetInfos()
@@ -34,6 +37,49 @@ func generateFullWatchDmChannelsRequest(broker *globalMetaBroker, request *query
reviseVChannelInfo(vChannel)
}
vChannelDict := make(map[string]bool, len(vChannels))
for _, info := range vChannels {
vChannelDict[info.ChannelName] = true
}
var segmentInfos []*datapb.SegmentInfo
// if fewer segmentInfos are returned than required, this may be because the segments were compacted.
// refresh the vchannels and the segmentInfos needed.
retryFunc := func() error {
newVChannels := make([]*datapb.VchannelInfo, 0)
newSegmentIds := make([]int64, 0)
newVChannelDict := make(map[string]bool)
for _, partitionID := range request.GetLoadMeta().GetPartitionIDs() {
partitionVChannels, _, err := broker.getRecoveryInfo(ctx, request.GetCollectionID(), partitionID)
if err != nil {
log.Error("GetRecoveryInfo failed, retrying...", zap.Error(err))
return err
}
for _, vchannel := range partitionVChannels {
if vChannelDict[vchannel.GetChannelName()] && !newVChannelDict[vchannel.GetChannelName()] {
newVChannels = append(newVChannels, vchannel)
newVChannelDict[vchannel.GetChannelName()] = true
}
}
}
for _, vChannel := range newVChannels {
newSegmentIds = append(newSegmentIds, vChannel.FlushedSegmentIds...)
newSegmentIds = append(newSegmentIds, vChannel.UnflushedSegmentIds...)
newSegmentIds = append(newSegmentIds, vChannel.DroppedSegmentIds...)
}
newSegmentInfos, err := broker.getDataSegmentInfosByIDs(ctx, newSegmentIds)
if err != nil {
log.Error("Get Vchannel SegmentInfos failed, retrying...", zap.Error(err))
return err
}
cloned.Infos = newVChannels
segmentInfos = newSegmentInfos
return nil
}
// fill segmentInfos
segmentIds := make([]int64, 0)
for _, vChannel := range vChannels {
@@ -41,18 +87,24 @@ func generateFullWatchDmChannelsRequest(broker *globalMetaBroker, request *query
segmentIds = append(segmentIds, vChannel.UnflushedSegmentIds...)
segmentIds = append(segmentIds, vChannel.DroppedSegmentIds...)
}
segmentInfos, err := broker.getDataSegmentInfosByIDs(segmentIds)
segmentInfos, err := broker.getDataSegmentInfosByIDs(ctx, segmentIds)
if err != nil {
log.Error("Get Vchannel SegmentInfos failed", zap.Error(err))
return nil, err
retryErr := retry.Do(ctx, retryFunc, retry.Attempts(20))
if retryErr != nil {
log.Error("Get Vchannel SegmentInfos failed after retry", zap.Error(retryErr))
return nil, retryErr
}
}
segmentDict := make(map[int64]*datapb.SegmentInfo)
for _, info := range segmentInfos {
segmentDict[info.ID] = info
}
cloned.SegmentInfos = segmentDict
return cloned, err
return cloned, nil
}
// thinWatchDmChannelsRequest will return a thin version of WatchDmChannelsRequest


@@ -27,8 +27,8 @@ import (
)
func TestGenerateFullWatchDmChannelsRequest(t *testing.T) {
dataCoord := &dataCoordMock{}
ctx, cancel := context.WithCancel(context.Background())
dataCoord := newDataCoordMock(ctx)
handler, err := newGlobalMetaBroker(ctx, nil, dataCoord, nil, nil)
assert.Nil(t, err)
@@ -46,12 +46,12 @@ func TestGenerateFullWatchDmChannelsRequest(t *testing.T) {
NodeID: 1,
}
fullWatchDmChannelsRequest, err := generateFullWatchDmChannelsRequest(handler, watchDmChannelsRequest)
fullWatchDmChannelsRequest, err := generateFullWatchDmChannelsRequest(ctx, handler, watchDmChannelsRequest)
assert.Nil(t, err)
assert.NotEmpty(t, fullWatchDmChannelsRequest.GetSegmentInfos())
dataCoord.returnError = true
fullWatchDmChannelsRequest2, err := generateFullWatchDmChannelsRequest(handler, watchDmChannelsRequest)
fullWatchDmChannelsRequest2, err := generateFullWatchDmChannelsRequest(ctx, handler, watchDmChannelsRequest)
assert.Error(t, err)
assert.Empty(t, fullWatchDmChannelsRequest2.GetSegmentInfos())
@@ -88,8 +88,8 @@ func TestThinWatchDmChannelsRequest(t *testing.T) {
}
func TestUpgradeCompatibility(t *testing.T) {
dataCoord := &dataCoordMock{}
ctx, cancel := context.WithCancel(context.Background())
dataCoord := newDataCoordMock(ctx)
handler, err := newGlobalMetaBroker(ctx, nil, dataCoord, nil, nil)
assert.Nil(t, err)
@@ -110,7 +110,7 @@ func TestUpgradeCompatibility(t *testing.T) {
NodeID: 1,
}
fullWatchDmChannelsRequest, err := generateFullWatchDmChannelsRequest(handler, watchDmChannelsRequest)
fullWatchDmChannelsRequest, err := generateFullWatchDmChannelsRequest(ctx, handler, watchDmChannelsRequest)
assert.Nil(t, err)
assert.NotEmpty(t, fullWatchDmChannelsRequest.GetSegmentInfos())
vChannel := fullWatchDmChannelsRequest.GetInfos()[0]
@@ -124,3 +124,35 @@ func TestUpgradeCompatibility(t *testing.T) {
assert.Equal(t, 1, len(vChannel.GetUnflushedSegmentIds()))
cancel()
}
func TestGetMissSegment(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
dataCoord := newDataCoordMock(ctx)
broker, err := newGlobalMetaBroker(ctx, nil, dataCoord, nil, nil)
assert.Nil(t, err)
vChannels, _, err := broker.getRecoveryInfo(ctx, defaultCollectionID, 0)
assert.Nil(t, err)
watchDmChannelsRequest := &querypb.WatchDmChannelsRequest{
Base: &commonpb.MsgBase{
MsgType: commonpb.MsgType_WatchDmChannels,
},
CollectionID: defaultCollectionID,
PartitionIDs: []int64{1},
Infos: vChannels,
NodeID: 1,
LoadMeta: &querypb.LoadMetaInfo{
LoadType: querypb.LoadType_LoadCollection,
CollectionID: defaultCollectionID,
PartitionIDs: []int64{1},
},
}
// inject a certain number of errors
dataCoord.returnErrorCount.Store(3)
_, err = generateFullWatchDmChannelsRequest(ctx, broker, watchDmChannelsRequest)
assert.NoError(t, err)
cancel()
}


@@ -170,8 +170,11 @@ func (w *watchDmChannelsTask) Execute(ctx context.Context) (err error) {
for _, info := range w.req.Infos {
for _, ufInfoID := range info.GetUnflushedSegmentIds() {
// an unflushed segment may not have binlogs; skip loading it in that case
ufInfo := w.req.SegmentInfos[ufInfoID]
if len(ufInfo.Binlogs) > 0 {
ufInfo := w.req.GetSegmentInfos()[ufInfoID]
if ufInfo == nil {
log.Warn("an unflushed segment is not found in segment infos", zap.Int64("segment ID", ufInfoID))
}
if len(ufInfo.GetBinlogs()) > 0 {
unFlushedSegments = append(unFlushedSegments, &queryPb.SegmentLoadInfo{
SegmentID: ufInfo.ID,
PartitionID: ufInfo.PartitionID,
@@ -182,7 +185,7 @@ func (w *watchDmChannelsTask) Execute(ctx context.Context) (err error) {
Deltalogs: ufInfo.Deltalogs,
InsertChannel: ufInfo.InsertChannel,
})
unFlushedSegmentIDs = append(unFlushedSegmentIDs, ufInfo.ID)
unFlushedSegmentIDs = append(unFlushedSegmentIDs, ufInfo.GetID())
}
}
}