Zhen Ye 46b6f1b9e2
fix: panic when logging an old message that should be skipped (#43076)
issue: #43074

- fix: panic when logging an old message that should be skipped, #43074
- fix: make the ack of broadcaster idempotent, #43026
- fix: lost dropping collection when upgrading, #43092
- fix: panic when DropPartition happens after DropCollection, #43027,
#43078

---------

Signed-off-by: chyezh <chyezh@outlook.com>
2025-07-04 16:04:44 +08:00

156 lines
6.8 KiB
Go

package recovery
import (
"context"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/streamingnode/server/resource"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
"github.com/milvus-io/milvus/pkg/v2/proto/etcdpb"
"github.com/milvus-io/milvus/pkg/v2/proto/messagespb"
"github.com/milvus-io/milvus/pkg/v2/proto/rootcoordpb"
"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
"github.com/milvus-io/milvus/pkg/v2/util/commonpbutil"
"github.com/milvus-io/milvus/pkg/v2/util/conc"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
)
// recoverRecoveryInfoFromMeta loads the persisted recovery state of the given
// pchannel from the streaming node catalog into r.
//
// Steps:
//  1. Switch the metrics state and the logger to the "persist recovering" state.
//  2. Read the consume checkpoint of the pchannel. When the catalog holds none,
//     create it through initializeRecoverInfo using lastTimeTickMessage.
//  3. Store the checkpoint in r.checkpoint (bound to walName).
//  4. List the vchannel metas and the segment assignments concurrently and
//     store them in r.vchannels and r.segments.
//
// It returns a wrapped error if reading the checkpoint, initializing it, or
// listing the vchannels / segment assignments fails.
func (r *recoveryStorageImpl) recoverRecoveryInfoFromMeta(ctx context.Context, walName string, channelInfo types.PChannelInfo, lastTimeTickMessage message.ImmutableMessage) error {
	r.metrics.ObserveStateChange(recoveryStorageStatePersistRecovering)
	r.SetLogger(resource.Resource().Logger().With(
		log.FieldComponent(componentRecoveryStorage),
		zap.String("channel", channelInfo.String()),
		zap.String("state", recoveryStorageStatePersistRecovering),
	))

	catalog := resource.Resource().StreamingNodeCatalog()
	cpProto, err := catalog.GetConsumeCheckpoint(ctx, channelInfo.Name)
	if err != nil {
		return errors.Wrap(err, "failed to get checkpoint from catalog")
	}
	if cpProto == nil {
		// There's no checkpoint for current pchannel, so we need to initialize the recover info.
		if cpProto, err = r.initializeRecoverInfo(ctx, channelInfo, lastTimeTickMessage); err != nil {
			return errors.Wrap(err, "failed to initialize checkpoint")
		}
	}
	r.checkpoint = newWALCheckpointFromProto(walName, cpProto)
	r.Logger().Info("recover checkpoint done",
		zap.String("checkpoint", r.checkpoint.MessageID.String()),
		zap.Uint64("timetick", r.checkpoint.TimeTick),
		zap.Int64("magic", r.checkpoint.Magic),
	)

	// The two loaders below run concurrently; each one writes a distinct
	// field of r (r.vchannels vs. r.segments).
	fVChannel := conc.Go(func() (struct{}, error) {
		vchannels, err := catalog.ListVChannel(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get vchannel from catalog")
		}
		r.vchannels = newVChannelRecoveryInfoFromVChannelMeta(vchannels)
		r.Logger().Info("recovery vchannel info done", zap.Int("vchannels", len(r.vchannels)))
		return struct{}{}, nil
	})
	fSegment := conc.Go(func() (struct{}, error) {
		segmentAssign, err := catalog.ListSegmentAssignment(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get segment assignment from catalog")
		}
		r.segments = newSegmentRecoveryInfoFromSegmentAssignmentMeta(segmentAssign)
		r.Logger().Info("recover segment info done", zap.Int("segments", len(r.segments)))
		return struct{}{}, nil
	})

	// Waiting once is sufficient: the result of this single call already
	// carries the error (if any) produced by the loaders.
	return conc.BlockOnAll(fVChannel, fSegment)
}
// initializeRecoverInfo initializes the recover info for the given channel.
// Before the streaming service is enabled for the first time, there's no recovery info for the channel,
// so it is built from the coordinator's view of the pchannel and persisted into the catalog.
// !!! This function is expected to be called only once for each channel when the streaming service is enabled.
//
// For every collection the coordinator reports on the pchannel:
//   - a collection in CollectionDropping state gets its virtual channel dropped
//     through the coordinator and is not recorded as a vchannel;
//   - any other collection is recorded as a VCHANNEL_STATE_NORMAL vchannel
//     together with its partition ids.
//
// The vchannel metas are saved into the catalog first; afterwards a checkpoint
// built from untilMessage (its last confirmed message id and its time tick) is
// saved and returned. Any failure is returned as a wrapped error and no
// checkpoint is returned in that case.
func (r *recoveryStorageImpl) initializeRecoverInfo(ctx context.Context, channelInfo types.PChannelInfo, untilMessage message.ImmutableMessage) (*streamingpb.WALCheckpoint, error) {
	// The message that is not generated by the streaming service is not managed by the recovery storage at streamingnode.
	// So we ignore it, just use the global milvus metainfo to initialize the recovery storage.
	// !!! It's not a strong guarantee that keep the consistency of old arch and new arch.
	r.Logger().Info("checkpoint not found in catalog, may upgrading from old arch, initializing it...", log.FieldMessage(untilMessage))

	coord, err := resource.Resource().MixCoordClient().GetWithContext(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "when wait for rootcoord client ready")
	}
	resp, err := coord.GetPChannelInfo(ctx, &rootcoordpb.GetPChannelInfoRequest{
		Pchannel: channelInfo.Name,
	})
	if err = merr.CheckRPCCall(resp, err); err != nil {
		return nil, errors.Wrap(err, "failed to get pchannel info from rootcoord")
	}

	// save the vchannel recovery info into the catalog
	vchannels := make(map[string]*streamingpb.VChannelMeta, len(resp.GetCollections()))
	for _, collection := range resp.GetCollections() {
		if collection.State == etcdpb.CollectionState_CollectionDropping {
			// Drop the already dropping collection before streaming arch enabled.
			// Otherwise, the dropping collection message will be lost,
			// and the data of collection can not be dropped.
			// The coordinator client obtained above is reused; distinct
			// local names keep the outer resp/err from being shadowed.
			dropResp, dropErr := coord.DropVirtualChannel(ctx, &datapb.DropVirtualChannelRequest{
				Base:        commonpbutil.NewMsgBase(commonpbutil.WithSourceID(paramtable.GetNodeID())),
				ChannelName: collection.Vchannel,
			})
			if err := merr.CheckRPCCall(dropResp, dropErr); err != nil {
				return nil, errors.Wrap(err, "failed to drop virtual channel")
			}
			continue
		}

		partitions := make([]*streamingpb.PartitionInfoOfVChannel, 0, len(collection.Partitions))
		for _, partition := range collection.Partitions {
			partitions = append(partitions, &streamingpb.PartitionInfoOfVChannel{PartitionId: partition.PartitionId})
		}
		vchannels[collection.Vchannel] = &streamingpb.VChannelMeta{
			Vchannel: collection.Vchannel,
			State:    streamingpb.VChannelState_VCHANNEL_STATE_NORMAL,
			CollectionInfo: &streamingpb.CollectionInfoOfVChannel{
				CollectionId: collection.CollectionId,
				Partitions:   partitions,
			},
		}
	}

	// SaveVChannels saves the vchannels into the catalog.
	catalog := resource.Resource().StreamingNodeCatalog()
	if err := catalog.SaveVChannels(ctx, channelInfo.Name, vchannels); err != nil {
		return nil, errors.Wrap(err, "failed to save vchannels to catalog")
	}

	// Use the first timesync message as the initial checkpoint.
	checkpoint := &streamingpb.WALCheckpoint{
		MessageId: &messagespb.MessageID{
			Id: untilMessage.LastConfirmedMessageID().Marshal(),
		},
		TimeTick:      untilMessage.TimeTick(),
		RecoveryMagic: RecoveryMagicStreamingInitialized,
	}
	// The checkpoint is persisted only after the vchannels were saved successfully.
	if err := catalog.SaveConsumeCheckpoint(ctx, channelInfo.Name, checkpoint); err != nil {
		return nil, errors.Wrap(err, "failed to save checkpoint to catalog")
	}
	r.Logger().Info("initialize checkpoint done",
		zap.Int("vchannels", len(vchannels)),
		zap.String("checkpoint", checkpoint.MessageId.String()),
		zap.Uint64("timetick", checkpoint.TimeTick),
		zap.Int64("magic", checkpoint.RecoveryMagic),
	)
	return checkpoint, nil
}