package recovery

import (
	"context"

	"github.com/cockroachdb/errors"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/streamingnode/server/resource"
	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/etcdpb"
	"github.com/milvus-io/milvus/pkg/v2/proto/messagespb"
	"github.com/milvus-io/milvus/pkg/v2/proto/rootcoordpb"
	"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
	"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
	"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
	"github.com/milvus-io/milvus/pkg/v2/util/commonpbutil"
	"github.com/milvus-io/milvus/pkg/v2/util/conc"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"
	"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)

// recoverRecoveryInfoFromMeta retrieves the recovery info for the given channel.
func (r *recoveryStorageImpl) recoverRecoveryInfoFromMeta(ctx context.Context, walName string, channelInfo types.PChannelInfo, lastTimeTickMessage message.ImmutableMessage) error {
	r.metrics.ObserveStateChange(recoveryStorageStatePersistRecovering)
	r.SetLogger(resource.Resource().Logger().With(
		log.FieldComponent(componentRecoveryStorage),
		zap.String("channel", channelInfo.String()),
		zap.String("state", recoveryStorageStatePersistRecovering),
	))

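	// Load the persisted consume checkpoint for this pchannel; if none exists yet,
	// the recovery info is bootstrapped from the coordinator's metadata below.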
	catalog := resource.Resource().StreamingNodeCatalog()
	cpProto, err := catalog.GetConsumeCheckpoint(ctx, channelInfo.Name)
	if err != nil {
		return errors.Wrap(err, "failed to get checkpoint from catalog")
	}
	if cpProto == nil {
		// There's no checkpoint for current pchannel, so we need to initialize the recover info.
		if cpProto, err = r.initializeRecoverInfo(ctx, channelInfo, lastTimeTickMessage); err != nil {
			return errors.Wrap(err, "failed to initialize checkpoint")
		}
	}
	r.checkpoint = newWALCheckpointFromProto(walName, cpProto)
	r.Logger().Info("recover checkpoint done",
		zap.String("checkpoint", r.checkpoint.MessageID.String()),
		zap.Uint64("timetick", r.checkpoint.TimeTick),
		zap.Int64("magic", r.checkpoint.Magic),
	)

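	// Recover the vchannel metadata and the segment assignments concurrently;
	// the two catalog listings are independent of each other.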
	fVChannel := conc.Go(func() (struct{}, error) {
		var err error
		vchannels, err := catalog.ListVChannel(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get vchannel from catalog")
		}
		r.vchannels = newVChannelRecoveryInfoFromVChannelMeta(vchannels)
		r.Logger().Info("recovery vchannel info done", zap.Int("vchannels", len(r.vchannels)))
		return struct{}{}, nil
	})

	fSegment := conc.Go(func() (struct{}, error) {
		var err error
		segmentAssign, err := catalog.ListSegmentAssignment(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get segment assignment from catalog")
		}
		r.segments = newSegmentRecoveryInfoFromSegmentAssignmentMeta(segmentAssign)
		r.Logger().Info("recover segment info done", zap.Int("segments", len(r.segments)))
		return struct{}{}, nil
	})
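	// Block until both recovery futures complete and propagate any error.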
	return conc.BlockOnAll(fVChannel, fSegment)
}

// initializeRecoverInfo initializes the recovery info for the given channel.
// Before the streaming service is enabled for the first time, there is no recovery info for the channel,
// so it has to be initialized here.
// !!! This function is called only once for each channel, when the streaming service is enabled.
func (r *recoveryStorageImpl) initializeRecoverInfo(ctx context.Context, channelInfo types.PChannelInfo, untilMessage message.ImmutableMessage) (*streamingpb.WALCheckpoint, error) {
	// Messages that were not generated by the streaming service are not managed by the recovery storage at streamingnode.
	// So we ignore them and initialize the recovery storage from the global milvus meta info instead.
	// !!! This is not a strong guarantee of consistency between the old arch and the new arch.
	r.Logger().Info("checkpoint not found in catalog, may be upgrading from old arch, initializing it...", log.FieldMessage(untilMessage))

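	// Ask the coordinator which collections are currently bound to this pchannel;
	// their vchannels seed the initial recovery info.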
	coord, err := resource.Resource().MixCoordClient().GetWithContext(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "when waiting for rootcoord client ready")
	}
	resp, err := coord.GetPChannelInfo(ctx, &rootcoordpb.GetPChannelInfoRequest{
		Pchannel: channelInfo.Name,
	})
	if err = merr.CheckRPCCall(resp, err); err != nil {
		return nil, errors.Wrap(err, "failed to get pchannel info from rootcoord")
	}

	// save the vchannel recovery info into the catalog
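	// Every collection that is still alive on this pchannel contributes one VChannelMeta entry,
	// keyed by its vchannel name; collections already in dropping state are finalized below instead.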
	vchannels := make(map[string]*streamingpb.VChannelMeta, len(resp.GetCollections()))
	for _, collection := range resp.GetCollections() {
		if collection.State == etcdpb.CollectionState_CollectionDropping {
			// Drop the collection that was already dropping before the streaming arch was enabled.
			// Otherwise the drop-collection message would be lost,
			// and the data of the collection could never be dropped.
			coordClient, err := resource.Resource().MixCoordClient().GetWithContext(ctx)
			if err != nil {
				return nil, err
			}
			resp, err := coordClient.DropVirtualChannel(ctx, &datapb.DropVirtualChannelRequest{
				Base:        commonpbutil.NewMsgBase(commonpbutil.WithSourceID(paramtable.GetNodeID())),
				ChannelName: collection.Vchannel,
			})
			if err = merr.CheckRPCCall(resp, err); err != nil {
				return nil, errors.Wrap(err, "failed to drop virtual channel")
			}
			continue
		}
		partitions := make([]*streamingpb.PartitionInfoOfVChannel, 0, len(collection.Partitions))
		for _, partition := range collection.Partitions {
			partitions = append(partitions, &streamingpb.PartitionInfoOfVChannel{PartitionId: partition.PartitionId})
		}
		vchannels[collection.Vchannel] = &streamingpb.VChannelMeta{
			Vchannel: collection.Vchannel,
			State:    streamingpb.VChannelState_VCHANNEL_STATE_NORMAL,
			CollectionInfo: &streamingpb.CollectionInfoOfVChannel{
				CollectionId: collection.CollectionId,
				Partitions:   partitions,
			},
		}
	}

	// Save the vchannels into the catalog.
	if err := resource.Resource().StreamingNodeCatalog().SaveVChannels(ctx, channelInfo.Name, vchannels); err != nil {
		return nil, errors.Wrap(err, "failed to save vchannels to catalog")
	}

	// Use the first timesync message as the initial checkpoint.
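	// Its last-confirmed message id is used as the replay start position for this channel, and
	// RecoveryMagicStreamingInitialized tags the checkpoint as produced by this initialization path.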
	checkpoint := &streamingpb.WALCheckpoint{
		MessageId: &messagespb.MessageID{
			Id: untilMessage.LastConfirmedMessageID().Marshal(),
		},
		TimeTick:      untilMessage.TimeTick(),
		RecoveryMagic: RecoveryMagicStreamingInitialized,
	}
	if err := resource.Resource().StreamingNodeCatalog().SaveConsumeCheckpoint(ctx, channelInfo.Name, checkpoint); err != nil {
		return nil, errors.Wrap(err, "failed to save checkpoint to catalog")
	}
	r.Logger().Info("initialize checkpoint done",
		zap.Int("vchannels", len(vchannels)),
		zap.String("checkpoint", checkpoint.MessageId.String()),
		zap.Uint64("timetick", checkpoint.TimeTick),
		zap.Int64("magic", checkpoint.RecoveryMagic),
	)
	return checkpoint, nil
}