enhance: make Load process traceable in querycoord (#29806)

See also #29803.

This PR:
- Adds a trace span for collection/partition load
- Uses the stored trace span when generating Segment/ChannelTasks during load
- Refines the BaseTask trace tag usage

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
parent 2f702ad316, commit c4ddfff2a7
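The change threads OpenTelemetry spans through the whole load path: the load job opens a root span, the span is stored on the collection's meta, the checkers re-attach it to the contexts they use to build tasks, and the span is closed once load reaches 100%. A minimal runnable sketch of that pattern, using stand-in span names and the global (no-op by default) tracer rather than the PR's actual wiring:

package main

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/trace"
)

func main() {
	tracer := otel.Tracer("QueryCoord")

	// 1. When a load job starts, open a root span that lives for the whole
	//    load, not just for the triggering RPC.
	_, loadSpan := tracer.Start(context.Background(), "LoadCollection", trace.WithNewRoot())

	// 2. Checkers later rebuild a context around the stored span, so the
	//    segment/channel tasks they generate trace as children of the load.
	taskCtx := trace.ContextWithSpan(context.Background(), loadSpan)
	_, taskSpan := tracer.Start(taskCtx, "SegmentTask-Grow-1001")
	taskSpan.End()

	// 3. When load progress reaches 100%, the root span is ended.
	loadSpan.End()
}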
internal/querycoordv2/checkers/channel_checker.go

@@ -21,6 +21,7 @@ import (
     "time"

     "github.com/samber/lo"
+    "go.opentelemetry.io/otel/trace"
     "go.uber.org/zap"

     "github.com/milvus-io/milvus/internal/querycoordv2/balance"

@@ -98,16 +99,16 @@ func (c *ChannelChecker) checkReplica(ctx context.Context, replica *meta.Replica
     ret := make([]task.Task, 0)

     lacks, redundancies := c.getDmChannelDiff(replica.GetCollectionID(), replica.GetID())
-    tasks := c.createChannelLoadTask(ctx, lacks, replica)
+    tasks := c.createChannelLoadTask(c.getTraceCtx(ctx, replica.CollectionID), lacks, replica)
     task.SetReason("lacks of channel", tasks...)
     ret = append(ret, tasks...)

-    tasks = c.createChannelReduceTasks(ctx, redundancies, replica.GetID())
+    tasks = c.createChannelReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), redundancies, replica.GetID())
     task.SetReason("collection released", tasks...)
     ret = append(ret, tasks...)

     repeated := c.findRepeatedChannels(replica.GetID())
-    tasks = c.createChannelReduceTasks(ctx, repeated, replica.GetID())
+    tasks = c.createChannelReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), repeated, replica.GetID())
     task.SetReason("redundancies of channel")
     ret = append(ret, tasks...)

@@ -222,3 +223,12 @@ func (c *ChannelChecker) createChannelReduceTasks(ctx context.Context, channels
     }
     return ret
 }
+
+func (c *ChannelChecker) getTraceCtx(ctx context.Context, collectionID int64) context.Context {
+    coll := c.meta.GetCollection(collectionID)
+    if coll == nil || coll.LoadSpan == nil {
+        return ctx
+    }
+
+    return trace.ContextWithSpan(ctx, coll.LoadSpan)
+}
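The getTraceCtx helper added above degrades gracefully: if the collection is unknown, or its load has already finished and LoadSpan was cleared, the caller's context comes back untouched. A standalone sketch of that fallback logic, with a hypothetical collMeta type standing in for *meta.Collection:

package main

import (
	"context"
	"fmt"

	"go.opentelemetry.io/otel/trace"
)

// collMeta stands in for *meta.Collection here.
type collMeta struct {
	LoadSpan trace.Span
}

func traceCtx(ctx context.Context, coll *collMeta) context.Context {
	if coll == nil || coll.LoadSpan == nil {
		return ctx // no load in flight: keep the caller's context
	}
	return trace.ContextWithSpan(ctx, coll.LoadSpan)
}

func main() {
	ctx := context.Background()
	fmt.Println(traceCtx(ctx, nil) == ctx)         // true: unknown collection
	fmt.Println(traceCtx(ctx, &collMeta{}) == ctx) // true: span already cleared
}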
internal/querycoordv2/checkers/segment_checker.go

@@ -22,6 +22,7 @@ import (
     "time"

     "github.com/samber/lo"
+    "go.opentelemetry.io/otel/trace"
     "go.uber.org/zap"

     "github.com/milvus-io/milvus/internal/proto/datapb"

@@ -120,25 +121,26 @@ func (c *SegmentChecker) checkReplica(ctx context.Context, replica *meta.Replica

     // compare with targets to find the lack and redundancy of segments
     lacks, redundancies := c.getSealedSegmentDiff(replica.GetCollectionID(), replica.GetID())
-    tasks := c.createSegmentLoadTasks(ctx, lacks, replica)
+    // loadCtx := trace.ContextWithSpan(context.Background(), c.meta.GetCollection(replica.CollectionID).LoadSpan)
+    tasks := c.createSegmentLoadTasks(c.getTraceCtx(ctx, replica.CollectionID), lacks, replica)
     task.SetReason("lacks of segment", tasks...)
     ret = append(ret, tasks...)

     redundancies = c.filterSegmentInUse(replica, redundancies)
-    tasks = c.createSegmentReduceTasks(ctx, redundancies, replica.GetID(), querypb.DataScope_Historical)
+    tasks = c.createSegmentReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), redundancies, replica.GetID(), querypb.DataScope_Historical)
     task.SetReason("segment not exists in target", tasks...)
     ret = append(ret, tasks...)

     // compare inner dists to find repeated loaded segments
     redundancies = c.findRepeatedSealedSegments(replica.GetID())
     redundancies = c.filterExistedOnLeader(replica, redundancies)
-    tasks = c.createSegmentReduceTasks(ctx, redundancies, replica.GetID(), querypb.DataScope_Historical)
+    tasks = c.createSegmentReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), redundancies, replica.GetID(), querypb.DataScope_Historical)
     task.SetReason("redundancies of segment", tasks...)
     ret = append(ret, tasks...)

     // compare with target to find the lack and redundancy of segments
     _, redundancies = c.getGrowingSegmentDiff(replica.GetCollectionID(), replica.GetID())
-    tasks = c.createSegmentReduceTasks(ctx, redundancies, replica.GetID(), querypb.DataScope_Streaming)
+    tasks = c.createSegmentReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), redundancies, replica.GetID(), querypb.DataScope_Streaming)
     task.SetReason("streaming segment not exists in target", tasks...)
     ret = append(ret, tasks...)

@@ -411,3 +413,12 @@ func (c *SegmentChecker) createSegmentReduceTasks(ctx context.Context, segments
     }
     return ret
 }
+
+func (c *SegmentChecker) getTraceCtx(ctx context.Context, collectionID int64) context.Context {
+    coll := c.meta.GetCollection(collectionID)
+    if coll == nil || coll.LoadSpan == nil {
+        return ctx
+    }
+
+    return trace.ContextWithSpan(ctx, coll.LoadSpan)
+}
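SegmentChecker now carries a getTraceCtx helper identical to ChannelChecker's. Purely as an illustration (not something this PR does), the duplicated logic could be hoisted into a single shared function, assuming both checkers keep the *meta.Meta handle they already hold:

package checkers

import (
	"context"

	"go.opentelemetry.io/otel/trace"

	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
)

// loadTraceCtx is a hypothetical shared replacement for the two helpers above.
func loadTraceCtx(ctx context.Context, m *meta.Meta, collectionID int64) context.Context {
	coll := m.GetCollection(collectionID)
	if coll == nil || coll.LoadSpan == nil {
		return ctx
	}
	return trace.ContextWithSpan(ctx, coll.LoadSpan)
}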
internal/querycoordv2/job/job_load.go

@@ -23,6 +23,8 @@ import (

     "github.com/cockroachdb/errors"
     "github.com/samber/lo"
+    "go.opentelemetry.io/otel"
+    "go.opentelemetry.io/otel/trace"
     "go.uber.org/zap"

     "github.com/milvus-io/milvus/internal/proto/querypb"

@@ -179,6 +181,8 @@ func (job *LoadCollectionJob) Execute() error {
             CreatedAt: time.Now(),
         }
     })
+
+    _, sp := otel.Tracer(typeutil.QueryCoordRole).Start(job.ctx, "LoadCollection", trace.WithNewRoot())
     collection := &meta.Collection{
         CollectionLoadInfo: &querypb.CollectionLoadInfo{
             CollectionID: req.GetCollectionID(),

@@ -188,6 +192,7 @@ func (job *LoadCollectionJob) Execute() error {
             LoadType: querypb.LoadType_LoadCollection,
         },
         CreatedAt: time.Now(),
+        LoadSpan:  sp,
     }
     job.undo.IsNewCollection = true
     err = job.meta.CollectionManager.PutCollection(collection, partitions...)

@@ -355,6 +360,8 @@ func (job *LoadPartitionJob) Execute() error {
     })
     if !job.meta.CollectionManager.Exist(req.GetCollectionID()) {
         job.undo.IsNewCollection = true
+
+        _, sp := otel.Tracer(typeutil.QueryCoordRole).Start(job.ctx, "LoadPartition", trace.WithNewRoot())
         collection := &meta.Collection{
             CollectionLoadInfo: &querypb.CollectionLoadInfo{
                 CollectionID: req.GetCollectionID(),

@@ -364,6 +371,7 @@ func (job *LoadPartitionJob) Execute() error {
                 LoadType: querypb.LoadType_LoadPartition,
             },
             CreatedAt: time.Now(),
+            LoadSpan:  sp,
         }
         err = job.meta.CollectionManager.PutCollection(collection, partitions...)
         if err != nil {
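Both jobs start the load span with trace.WithNewRoot(), so the load gets a trace of its own instead of hanging off the RPC that requested it; a load usually outlives that RPC. A runnable sketch (stand-in span names, plain SDK tracer) showing what the option changes:

package main

import (
	"context"
	"fmt"

	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
)

func main() {
	tracer := sdktrace.NewTracerProvider().Tracer("QueryCoord")

	reqCtx, reqSpan := tracer.Start(context.Background(), "LoadCollectionRPC")

	_, child := tracer.Start(reqCtx, "child")                              // joins the RPC's trace
	_, load := tracer.Start(reqCtx, "LoadCollection", trace.WithNewRoot()) // starts a fresh trace

	fmt.Println(child.SpanContext().TraceID() == reqSpan.SpanContext().TraceID()) // true
	fmt.Println(load.SpanContext().TraceID() == reqSpan.SpanContext().TraceID())  // false

	child.End()
	load.End()
	reqSpan.End()
}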
internal/querycoordv2/meta/collection_manager.go

@@ -25,6 +25,7 @@ import (

     "github.com/golang/protobuf/proto"
     "github.com/samber/lo"
+    "go.opentelemetry.io/otel/trace"
     "go.uber.org/zap"

     "github.com/milvus-io/milvus/internal/metastore"

@@ -45,6 +46,7 @@ type Collection struct {

     mut             sync.RWMutex
     refreshNotifier chan struct{}
+    LoadSpan        trace.Span
 }

 func (collection *Collection) SetRefreshNotifier(notifier chan struct{}) {

@@ -79,6 +81,7 @@ func (collection *Collection) Clone() *Collection {
         CreatedAt:       collection.CreatedAt,
         UpdatedAt:       collection.UpdatedAt,
         refreshNotifier: collection.refreshNotifier,
+        LoadSpan:        collection.LoadSpan,
     }
 }

@@ -502,6 +505,10 @@ func (m *CollectionManager) UpdateLoadPercent(partitionID int64, loadPercent int
     saveCollection := false
     if collectionPercent == 100 {
         saveCollection = true
+        if newCollection.LoadSpan != nil {
+            newCollection.LoadSpan.End()
+            newCollection.LoadSpan = nil
+        }
         newCollection.Status = querypb.LoadStatus_Loaded

         // if collection becomes loaded, clear it's recoverTimes in load info
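Ending the stored span exactly when collectionPercent reaches 100 is what turns the whole load into one reportable trace, and clearing the field keeps getTraceCtx from parenting new task spans to a finished load. A test-style sketch of that lifecycle, assuming an in-memory exporter just to make the End observable:

package main

import (
	"context"
	"fmt"

	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/sdk/trace/tracetest"
	"go.opentelemetry.io/otel/trace"
)

func main() {
	exporter := tracetest.NewInMemoryExporter()
	tp := sdktrace.NewTracerProvider(sdktrace.WithSyncer(exporter))

	_, loadSpan := tp.Tracer("QueryCoord").Start(context.Background(), "LoadCollection", trace.WithNewRoot())
	fmt.Println(len(exporter.GetSpans())) // 0: span still open while the load runs

	loadSpan.End() // what UpdateLoadPercent does once collectionPercent == 100
	fmt.Println(len(exporter.GetSpans())) // 1: the finished load is now one span
}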
internal/querycoordv2/task/task.go

@@ -29,7 +29,7 @@ import (
     "github.com/milvus-io/milvus/internal/proto/querypb"
     "github.com/milvus-io/milvus/internal/querycoordv2/meta"
     "github.com/milvus-io/milvus/pkg/util/merr"
-    . "github.com/milvus-io/milvus/pkg/util/typeutil"
+    "github.com/milvus-io/milvus/pkg/util/typeutil"
 )

 type (
@@ -69,10 +69,10 @@ type Source fmt.Stringer
 type Task interface {
     Context() context.Context
     Source() Source
-    ID() UniqueID
-    CollectionID() UniqueID
-    ReplicaID() UniqueID
-    SetID(id UniqueID)
+    ID() typeutil.UniqueID
+    CollectionID() typeutil.UniqueID
+    ReplicaID() typeutil.UniqueID
+    SetID(id typeutil.UniqueID)
     Status() Status
     SetStatus(status Status)
     Err() error

@@ -100,9 +100,9 @@ type baseTask struct {
     doneCh   chan struct{}
     canceled *atomic.Bool

-    id           UniqueID // Set by scheduler
-    collectionID UniqueID
-    replicaID    UniqueID
+    id           typeutil.UniqueID // Set by scheduler
+    collectionID typeutil.UniqueID
+    replicaID    typeutil.UniqueID
     shard        string
     loadType     querypb.LoadType

@@ -118,9 +118,9 @@ type baseTask struct {
     span trace.Span
 }

-func newBaseTask(ctx context.Context, source Source, collectionID, replicaID UniqueID, shard string) *baseTask {
+func newBaseTask(ctx context.Context, source Source, collectionID, replicaID typeutil.UniqueID, shard string, taskTag string) *baseTask {
     ctx, cancel := context.WithCancel(ctx)
-    ctx, span := otel.Tracer("QueryCoord").Start(ctx, "QueryCoord-BaseTask")
+    ctx, span := otel.Tracer(typeutil.QueryCoordRole).Start(ctx, taskTag)

     return &baseTask{
         source: source,
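With the new taskTag parameter, every task span gets a descriptive name instead of the former catch-all "QueryCoord-BaseTask". A tiny sketch of the names the constructors below produce; the segment ID and channel name are made up:

package main

import "fmt"

func main() {
	// Mirrors the fmt.Sprintf format strings passed to newBaseTask below.
	fmt.Printf("SegmentTask-%s-%d\n", "Grow", int64(1001))       // SegmentTask-Grow-1001
	fmt.Printf("ChannelTask-%s-%s\n", "Reduce", "dml-channel_0") // ChannelTask-Reduce-dml-channel_0
}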
@@ -146,19 +146,19 @@ func (task *baseTask) Source() Source {
     return task.source
 }

-func (task *baseTask) ID() UniqueID {
+func (task *baseTask) ID() typeutil.UniqueID {
     return task.id
 }

-func (task *baseTask) SetID(id UniqueID) {
+func (task *baseTask) SetID(id typeutil.UniqueID) {
     task.id = id
 }

-func (task *baseTask) CollectionID() UniqueID {
+func (task *baseTask) CollectionID() typeutil.UniqueID {
     return task.collectionID
 }

-func (task *baseTask) ReplicaID() UniqueID {
+func (task *baseTask) ReplicaID() typeutil.UniqueID {
     return task.replicaID
 }

@@ -278,7 +278,7 @@ func (task *baseTask) String() string {
 type SegmentTask struct {
     *baseTask

-    segmentID UniqueID
+    segmentID typeutil.UniqueID
 }

 // NewSegmentTask creates a SegmentTask with actions,

@@ -288,7 +288,7 @@ func NewSegmentTask(ctx context.Context,
     timeout time.Duration,
     source Source,
     collectionID,
-    replicaID UniqueID,
+    replicaID typeutil.UniqueID,
     actions ...Action,
 ) (*SegmentTask, error) {
     if len(actions) == 0 {

@@ -310,7 +310,7 @@ func NewSegmentTask(ctx context.Context,
         }
     }

-    base := newBaseTask(ctx, source, collectionID, replicaID, shard)
+    base := newBaseTask(ctx, source, collectionID, replicaID, shard, fmt.Sprintf("SegmentTask-%s-%d", actions[0].Type().String(), segmentID))
     base.actions = actions
     return &SegmentTask{
         baseTask: base,

@@ -322,7 +322,7 @@ func (task *SegmentTask) Shard() string {
     return task.shard
 }

-func (task *SegmentTask) SegmentID() UniqueID {
+func (task *SegmentTask) SegmentID() typeutil.UniqueID {
     return task.segmentID
 }

@@ -345,7 +345,7 @@ func NewChannelTask(ctx context.Context,
     timeout time.Duration,
     source Source,
     collectionID,
-    replicaID UniqueID,
+    replicaID typeutil.UniqueID,
     actions ...Action,
 ) (*ChannelTask, error) {
     if len(actions) == 0 {

@@ -365,7 +365,7 @@ func NewChannelTask(ctx context.Context,
         }
     }

-    base := newBaseTask(ctx, source, collectionID, replicaID, channel)
+    base := newBaseTask(ctx, source, collectionID, replicaID, channel, fmt.Sprintf("ChannelTask-%s-%s", actions[0].Type().String(), channel))
     base.actions = actions
     return &ChannelTask{
         baseTask: base,
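Putting the pieces together: the checkers pass getTraceCtx's result into these constructors, so the span newBaseTask opens becomes part of the collection's load trace. A runnable sketch (stand-in names) verifying that linkage:

package main

import (
	"context"
	"fmt"

	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
)

func main() {
	tracer := sdktrace.NewTracerProvider().Tracer("QueryCoord")

	// job side: root span opened at load time and stored as LoadSpan
	_, loadSpan := tracer.Start(context.Background(), "LoadCollection", trace.WithNewRoot())

	// checker side: getTraceCtx re-attaches the stored span ...
	taskCtx := trace.ContextWithSpan(context.Background(), loadSpan)
	// ... and newBaseTask starts the task span from that context
	_, taskSpan := tracer.Start(taskCtx, "ChannelTask-Grow-dml-channel_0")

	fmt.Println(taskSpan.SpanContext().TraceID() == loadSpan.SpanContext().TraceID()) // true: same trace

	taskSpan.End()
	loadSpan.End()
}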
|||||||
Loading…
x
Reference in New Issue
Block a user