Zhen Ye 0a465bb5b7
enhance: use recovery+shardmanager, remove segment assignment interceptor (#41824)
issue: #41544

- add lock interceptor into wal.
- use recovery and shardmanager to replace the original implementation
of segment assignment.
- remove redundant implementation and unittest.
- remove redundant proto definition.
- use 2 streamingnode in e2e.

---------

Signed-off-by: chyezh <chyezh@outlook.com>
2025-05-14 23:00:23 +08:00

135 lines
5.8 KiB
Go

package recovery
import (
"context"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/streamingnode/server/resource"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/messagespb"
"github.com/milvus-io/milvus/pkg/v2/proto/rootcoordpb"
"github.com/milvus-io/milvus/pkg/v2/proto/streamingpb"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/message"
"github.com/milvus-io/milvus/pkg/v2/streaming/util/types"
"github.com/milvus-io/milvus/pkg/v2/util/conc"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
)
// recoverRecoveryInfoFromMeta loads the persisted recovery state of the given
// pchannel from the streaming node catalog into r.
//
// The consume checkpoint is resolved first; when the catalog holds none, it is
// created through initializeRecoverInfo using lastTimeTickMessage. After that
// the vchannel metas and the segment assignments are listed concurrently and
// stored into r.vchannels and r.segments.
//
// It returns an error if the checkpoint cannot be read or initialized, or the
// error reported by conc.BlockOnAll for the two listing tasks.
func (r *recoveryStorageImpl) recoverRecoveryInfoFromMeta(ctx context.Context, walName string, channelInfo types.PChannelInfo, lastTimeTickMessage message.ImmutableMessage) error {
	r.metrics.ObserveStateChange(recoveryStorageStatePersistRecovering)
	r.SetLogger(resource.Resource().Logger().With(
		log.FieldComponent(componentRecoveryStorage),
		zap.String("channel", channelInfo.String()),
		zap.String("state", recoveryStorageStatePersistRecovering),
	))

	catalog := resource.Resource().StreamingNodeCatalog()
	cpProto, err := catalog.GetConsumeCheckpoint(ctx, channelInfo.Name)
	if err != nil {
		return errors.Wrap(err, "failed to get checkpoint from catalog")
	}
	if cpProto == nil {
		// There's no checkpoint for current pchannel, so we need to initialize the recover info.
		if cpProto, err = r.initializeRecoverInfo(ctx, channelInfo, lastTimeTickMessage); err != nil {
			return errors.Wrap(err, "failed to initialize checkpoint")
		}
	}
	r.checkpoint = newWALCheckpointFromProto(walName, cpProto)
	r.Logger().Info("recover checkpoint done",
		zap.String("checkpoint", r.checkpoint.MessageID.String()),
		zap.Uint64("timetick", r.checkpoint.TimeTick),
		zap.Int64("magic", r.checkpoint.Magic),
	)

	// The two listings write to different fields of r, so they run in parallel.
	// Each closure declares its own err and never touches the outer one.
	fVChannel := conc.Go(func() (struct{}, error) {
		vchannels, err := catalog.ListVChannel(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get vchannel from catalog")
		}
		r.vchannels = newVChannelRecoveryInfoFromVChannelMeta(vchannels)
		r.Logger().Info("recovery vchannel info done", zap.Int("vchannels", len(r.vchannels)))
		return struct{}{}, nil
	})
	fSegment := conc.Go(func() (struct{}, error) {
		segmentAssign, err := catalog.ListSegmentAssignment(ctx, channelInfo.Name)
		if err != nil {
			return struct{}{}, errors.Wrap(err, "failed to get segment assignment from catalog")
		}
		r.segments = newSegmentRecoveryInfoFromSegmentAssignmentMeta(segmentAssign)
		r.Logger().Info("recover segment info done", zap.Int("segments", len(r.segments)))
		return struct{}{}, nil
	})

	// A single wait is enough: it covers both tasks and yields their error, if any.
	return conc.BlockOnAll(fVChannel, fSegment)
}
// initializeRecoverInfo builds and persists the first recovery info of a channel.
// Until the streaming service has been enabled, the catalog holds no recovery
// info for the channel, so it is created here from the coordinator's view of
// the pchannel and from untilMessage.
// !!! This function is expected to be called only once per channel, when the streaming service is enabled.
//
// On success the vchannel metas and the checkpoint have both been saved to the
// catalog and the saved checkpoint is returned.
func (r *recoveryStorageImpl) initializeRecoverInfo(ctx context.Context, channelInfo types.PChannelInfo, untilMessage message.ImmutableMessage) (*streamingpb.WALCheckpoint, error) {
	// Messages that were not produced by the streaming service are not managed
	// by the recovery storage at streamingnode; they are skipped and the global
	// milvus metainfo is used to seed the recovery storage instead.
	// !!! Consistency between the old arch and the new arch is not strongly guaranteed.
	r.Logger().Info("checkpoint not found in catalog, may upgrading from old arch, initializing it...", log.FieldMessage(untilMessage))

	mixCoord, err := resource.Resource().MixCoordClient().GetWithContext(ctx)
	if err != nil {
		return nil, errors.Wrap(err, "when wait for rootcoord client ready")
	}

	pchannelReq := &rootcoordpb.GetPChannelInfoRequest{
		Pchannel: channelInfo.Name,
	}
	pchannelResp, err := mixCoord.GetPChannelInfo(ctx, pchannelReq)
	if err = merr.CheckRPCCall(pchannelResp, err); err != nil {
		return nil, errors.Wrap(err, "failed to get pchannel info from rootcoord")
	}

	// Translate every collection on the pchannel into a vchannel meta keyed by vchannel name.
	collections := pchannelResp.GetCollections()
	vchannelMetas := make(map[string]*streamingpb.VChannelMeta, len(collections))
	for _, coll := range collections {
		partitionInfos := make([]*streamingpb.PartitionInfoOfVChannel, len(coll.Partitions))
		for i, part := range coll.Partitions {
			partitionInfos[i] = &streamingpb.PartitionInfoOfVChannel{PartitionId: part.PartitionId}
		}
		collectionInfo := &streamingpb.CollectionInfoOfVChannel{
			CollectionId: coll.CollectionId,
			Partitions:   partitionInfos,
		}
		vchannelMetas[coll.Vchannel] = &streamingpb.VChannelMeta{
			Vchannel:       coll.Vchannel,
			State:          streamingpb.VChannelState_VCHANNEL_STATE_NORMAL,
			CollectionInfo: collectionInfo,
		}
	}

	// Persist the vchannel metas before the checkpoint.
	catalog := resource.Resource().StreamingNodeCatalog()
	err = catalog.SaveVChannels(ctx, channelInfo.Name, vchannelMetas)
	if err != nil {
		return nil, errors.Wrap(err, "failed to save vchannels to catalog")
	}

	// The initial checkpoint is taken from untilMessage: its last confirmed
	// message id and its time tick.
	lastConfirmedID := untilMessage.LastConfirmedMessageID().Marshal()
	timeTick := untilMessage.TimeTick()
	initialCP := &streamingpb.WALCheckpoint{
		MessageId:     &messagespb.MessageID{Id: lastConfirmedID},
		TimeTick:      timeTick,
		RecoveryMagic: recoveryMagicStreamingInitialized,
	}
	err = catalog.SaveConsumeCheckpoint(ctx, channelInfo.Name, initialCP)
	if err != nil {
		return nil, errors.Wrap(err, "failed to save checkpoint to catalog")
	}

	r.Logger().Info("initialize checkpoint done",
		zap.Int("vchannels", len(vchannelMetas)),
		zap.String("checkpoint", initialCP.MessageId.String()),
		zap.Uint64("timetick", initialCP.TimeTick),
		zap.Int64("magic", initialCP.RecoveryMagic),
	)
	return initialCP, nil
}