package recovery

import (
	"context"

	"github.com/cockroachdb/errors"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/streamingnode/server/resource"
	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/proto/messagespb"
	"github.com/milvus-io/milvus/pkg/v2/proto/rootcoordpb"
	"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
	"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
	"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
	"github.com/milvus-io/milvus/pkg/v2/util/conc"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"
)

// recoverRecoveryInfoFromMeta retrieves the recovery info for the given channel
// from the streaming node catalog and stores it on r.
//
// The consume checkpoint is read first; when the catalog holds none, it is
// created through initializeRecoverInfo using lastTimeTickMessage. Once
// r.checkpoint is set, the vchannel metas and the segment assignments of the
// channel are listed concurrently and stored into r.vchannels and r.segments.
//
// It returns an error when the checkpoint cannot be read or initialized, or
// the error reported by conc.BlockOnAll for the two listing tasks.
func (r *recoveryStorageImpl) recoverRecoveryInfoFromMeta(ctx context.Context, walName string, channelInfo types.PChannelInfo, lastTimeTickMessage message.ImmutableMessage) error {
	r.metrics.ObserveStateChange(recoveryStorageStatePersistRecovering)
	r.SetLogger(resource.Resource().Logger().With(
		log.FieldComponent(componentRecoveryStorage),
		zap.String("channel", channelInfo.String()),
		zap.String("state", recoveryStorageStatePersistRecovering),
	))

	catalog := resource.Resource().StreamingNodeCatalog()
	cpProto, err := catalog.GetConsumeCheckpoint(ctx, channelInfo.Name)
	if err != nil {
		return errors.Wrap(err, "failed to get checkpoint from catalog")
	}
	if cpProto == nil {
		// There's no checkpoint for the current pchannel, so the recovery info must be initialized first.
		if cpProto, err = r.initializeRecoverInfo(ctx, channelInfo, lastTimeTickMessage); err != nil {
			return errors.Wrap(err, "failed to initialize checkpoint")
		}
	}
	r.checkpoint = newWALCheckpointFromProto(walName, cpProto)
	r.Logger().Info("recover checkpoint done",
		zap.String("checkpoint", r.checkpoint.MessageID.String()),
		zap.Uint64("timetick", r.checkpoint.TimeTick),
		zap.Int64("magic", r.checkpoint.Magic),
	)

	// The two listings below are independent of each other and write to
	// different fields of r, so they are issued concurrently.
	fVChannel := conc.Go(func() (struct{}, error) {
		vchannels, err := catalog.ListVChannel(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get vchannel from catalog")
		}
		r.vchannels = newVChannelRecoveryInfoFromVChannelMeta(vchannels)
		r.Logger().Info("recovery vchannel info done", zap.Int("vchannels", len(r.vchannels)))
		return struct{}{}, nil
	})
	fSegment := conc.Go(func() (struct{}, error) {
		segmentAssign, err := catalog.ListSegmentAssignment(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get segment assignment from catalog")
		}
		r.segments = newSegmentRecoveryInfoFromSegmentAssignmentMeta(segmentAssign)
		r.Logger().Info("recover segment info done", zap.Int("segments", len(r.segments)))
		return struct{}{}, nil
	})
	// Wait exactly once for both tasks; a second wait on the same futures would be redundant.
	return conc.BlockOnAll(fVChannel, fSegment)
}

// initializeRecoverInfo initializes the recovery info for the given channel.
// Before the streaming service is enabled for the first time, there's no recovery info for the channel,
// so it has to be initialized here: the vchannel metas reported by the coordinator are saved into the
// catalog, followed by an initial consume checkpoint derived from untilMessage.
// !!! This function is only called once for each channel when the streaming service is enabled.
//
// It returns the checkpoint that was saved, or an error if the coordinator client is not available,
// the pchannel info cannot be fetched, or either catalog write fails.
func (r *recoveryStorageImpl) initializeRecoverInfo(ctx context.Context, channelInfo types.PChannelInfo, untilMessage message.ImmutableMessage) (*streamingpb.WALCheckpoint, error) {
	// Messages that were not generated by the streaming service are not managed by the recovery storage at streamingnode.
	// So they are ignored, and the global milvus meta info is used to initialize the recovery storage.
	// !!! This is not a strong guarantee of consistency between the old arch and the new arch.
	r.Logger().Info("checkpoint not found in catalog, may upgrading from old arch, initializing it...", log.FieldMessage(untilMessage))

	coord, err := resource.Resource().MixCoordClient().GetWithContext(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "when wait for rootcoord client ready")
	}
	resp, err := coord.GetPChannelInfo(ctx, &rootcoordpb.GetPChannelInfoRequest{
		Pchannel: channelInfo.Name,
	})
	if err = merr.CheckRPCCall(resp, err); err != nil {
		return nil, errors.Wrap(err, "failed to get pchannel info from rootcoord")
	}

	// Build one vchannel meta per collection reported for this pchannel.
	vchannels := make(map[string]*streamingpb.VChannelMeta, len(resp.GetCollections()))
	for _, collection := range resp.GetCollections() {
		partitions := make([]*streamingpb.PartitionInfoOfVChannel, 0, len(collection.Partitions))
		for _, partition := range collection.Partitions {
			partitions = append(partitions, &streamingpb.PartitionInfoOfVChannel{PartitionId: partition.PartitionId})
		}
		vchannels[collection.Vchannel] = &streamingpb.VChannelMeta{
			Vchannel: collection.Vchannel,
			State:    streamingpb.VChannelState_VCHANNEL_STATE_NORMAL,
			CollectionInfo: &streamingpb.CollectionInfoOfVChannel{
				CollectionId: collection.CollectionId,
				Partitions:   partitions,
			},
		}
	}

	// Save the vchannel recovery info into the catalog.
	// The vchannels are written before the checkpoint.
	if err := resource.Resource().StreamingNodeCatalog().SaveVChannels(ctx, channelInfo.Name, vchannels); err != nil {
		return nil, errors.Wrap(err, "failed to save vchannels to catalog")
	}

	// Use the last confirmed message id and the time tick of untilMessage as the initial checkpoint.
	checkpoint := &streamingpb.WALCheckpoint{
		MessageId: &messagespb.MessageID{
			Id: untilMessage.LastConfirmedMessageID().Marshal(),
		},
		TimeTick:      untilMessage.TimeTick(),
		RecoveryMagic: recoveryMagicStreamingInitialized,
	}
	if err := resource.Resource().StreamingNodeCatalog().SaveConsumeCheckpoint(ctx, channelInfo.Name, checkpoint); err != nil {
		return nil, errors.Wrap(err, "failed to save checkpoint to catalog")
	}
	r.Logger().Info("initialize checkpoint done",
		zap.Int("vchannels", len(vchannels)),
		zap.String("checkpoint", checkpoint.MessageId.String()),
		zap.Uint64("timetick", checkpoint.TimeTick),
		zap.Int64("magic", checkpoint.RecoveryMagic),
	)
	return checkpoint, nil
}