mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 22:45:26 +08:00
### **User description** issue: #46466 ___ ### **PR Type** Bug fix ___ ### **Description** - Extract finished task state check into reusable helper function - Skip finished tasks during compaction recovery to prevent reprocessing - Add backward compatibility check for pre-allocated segment IDs ___ ### Diagram Walkthrough ```mermaid flowchart LR A["Compaction Task States"] -->|"Check with helper"| B["isCompactionTaskFinished()"] B -->|"Used in"| C["compactionInspector.loadMeta()"] B -->|"Used in"| D["compactionTaskMeta.reloadFromKV()"] C -->|"Skip finished tasks"| E["Recovery Process"] D -->|"Backward compatibility"| E ``` <details><summary><h3>File Walkthrough</h3></summary> <table><thead><tr><th></th><th align="left">Relevant files</th></tr></thead><tbody><tr><td><strong>Enhancement</strong></td><td><table> <tr> <td> <details> <summary><strong>compaction_util.go</strong><dd><code>Add isCompactionTaskFinished helper function</code> </dd></summary> <hr> internal/datacoord/compaction_util.go <ul><li>Added new helper function <code>isCompactionTaskFinished()</code> to check if a <br>compaction task is in a terminal state<br> <li> Function checks for failed, timeout, completed, cleaned, or unknown <br>states<br> <li> Centralizes task state validation logic for reuse across multiple <br>components</ul> </details> </td> <td><a href="https://github.com/milvus-io/milvus/pull/46515/files#diff-8f2cb8d0fef37617202c5a2290ad2bdbf2df5b5983604b5b505bc73a65c7eb43">+8/-0</a> </td> </tr> </table></td></tr><tr><td><strong>Bug fix</strong></td><td><table> <tr> <td> <details> <summary><strong>compaction_inspector.go</strong><dd><code>Refactor to use finished task helper function</code> </dd></summary> <hr> internal/datacoord/compaction_inspector.go <ul><li>Replaced inline state checks with call to <code>isCompactionTaskFinished()</code> <br>helper<br> <li> Simplifies code by removing repetitive state comparison logic<br> <li> Maintains same behavior of skipping finished tasks during 
recovery</ul> </details> </td> <td><a href="https://github.com/milvus-io/milvus/pull/46515/files#diff-1c884001f2e84de177fea22b584f3de70a6e73695dbffa34031be9890d17da6d">+1/-5</a> </td> </tr> <tr> <td> <details> <summary><strong>compaction_task_meta.go</strong><dd><code>Add finished task check for backward compatibility</code> </dd></summary> <hr> internal/datacoord/compaction_task_meta.go <ul><li>Added check to skip finished tasks before processing pre-allocated <br>segment IDs<br> <li> Ensures backward compatibility for tasks without pre-allocated segment <br>IDs<br> <li> Prevents marking already-finished tasks as failed during reload</ul> </details> </td> <td><a href="https://github.com/milvus-io/milvus/pull/46515/files#diff-0dae7214c4c79ddf5106bd51d375b5fb2f41239d5d433798afa90708e443eca8">+1/-1</a> </td> </tr> </table></td></tr></tbody></table> </details> ___ <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Bug Fixes** * Improved detection of finished compaction tasks to reduce false failures. * Prevented finished tasks with missing pre-allocations from being incorrectly marked as failed. * Simplified abandonment logic for completed/timeout/cleaned tasks to reduce erroneous retries and noisy logs. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
200 lines
6.9 KiB
Go
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datacoord
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hashicorp/golang-lru/v2/expirable"
|
|
"github.com/samber/lo"
|
|
"go.uber.org/zap"
|
|
"google.golang.org/protobuf/proto"
|
|
|
|
"github.com/milvus-io/milvus/internal/json"
|
|
"github.com/milvus-io/milvus/internal/metastore"
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/metricsinfo"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/timerecord"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
func newCompactionTaskStats(task *datapb.CompactionTask) *metricsinfo.CompactionTask {
|
|
return &metricsinfo.CompactionTask{
|
|
PlanID: task.PlanID,
|
|
CollectionID: task.CollectionID,
|
|
Type: task.Type.String(),
|
|
State: task.State.String(),
|
|
FailReason: task.FailReason,
|
|
StartTime: typeutil.TimestampToString(uint64(task.StartTime) * 1000),
|
|
EndTime: typeutil.TimestampToString(uint64(task.EndTime) * 1000),
|
|
TotalRows: task.TotalRows,
|
|
InputSegments: lo.Map(task.InputSegments, func(t int64, i int) string {
|
|
return strconv.FormatInt(t, 10)
|
|
}),
|
|
ResultSegments: lo.Map(task.ResultSegments, func(t int64, i int) string {
|
|
return strconv.FormatInt(t, 10)
|
|
}),
|
|
NodeID: task.NodeID,
|
|
}
|
|
}
|
|
|
|
// compactionTaskMeta maintains the in-memory and persisted view of
// compaction tasks, indexed by trigger ID and then plan ID. All access to
// compactionTasks is guarded by the embedded RWMutex.
type compactionTaskMeta struct {
	sync.RWMutex
	// ctx is used for catalog calls issued by this meta store.
	ctx     context.Context
	catalog metastore.DataCoordCatalog
	// currently only clustering compaction task is stored in persist meta
	compactionTasks map[int64]map[int64]*datapb.CompactionTask // triggerID -> planID
	// taskStats is a bounded, expiring cache of per-plan stats snapshots
	// (capacity/TTL are set in newCompactionTaskMeta) used for TaskStatsJSON.
	taskStats *expirable.LRU[UniqueID, *metricsinfo.CompactionTask]
}
|
|
|
|
func newCompactionTaskMeta(ctx context.Context, catalog metastore.DataCoordCatalog) (*compactionTaskMeta, error) {
|
|
csm := &compactionTaskMeta{
|
|
RWMutex: sync.RWMutex{},
|
|
ctx: ctx,
|
|
catalog: catalog,
|
|
compactionTasks: make(map[int64]map[int64]*datapb.CompactionTask, 0),
|
|
taskStats: expirable.NewLRU[UniqueID, *metricsinfo.CompactionTask](512, nil, time.Minute*15),
|
|
}
|
|
if err := csm.reloadFromKV(); err != nil {
|
|
return nil, err
|
|
}
|
|
return csm, nil
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) reloadFromKV() error {
|
|
record := timerecord.NewTimeRecorder("compactionTaskMeta-reloadFromKV")
|
|
compactionTasks, err := csm.catalog.ListCompactionTask(csm.ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, task := range compactionTasks {
|
|
// Compatibility handling: for milvus ≤v2.4, since compaction task has no PreAllocatedSegmentIDs field,
|
|
// here we just mark the task as failed and wait for the compaction trigger to generate a new one.
|
|
if !isCompactionTaskFinished(task) && task.PreAllocatedSegmentIDs == nil {
|
|
log.Warn("PreAllocatedSegmentIDs is nil, mark the task as failed",
|
|
zap.Int64("taskID", task.GetPlanID()),
|
|
zap.String("type", task.GetType().String()),
|
|
zap.String("originalState", task.State.String()),
|
|
)
|
|
task.State = datapb.CompactionTaskState_failed
|
|
task.FailReason = fmt.Sprintf("PreAllocatedSegmentIDs is nil, taskID: %v", task.GetPlanID())
|
|
}
|
|
csm.saveCompactionTaskMemory(task)
|
|
}
|
|
log.Info("DataCoord compactionTaskMeta reloadFromKV done", zap.Duration("duration", record.ElapseSpan()))
|
|
return nil
|
|
}
|
|
|
|
// GetCompactionTasks returns clustering compaction tasks from local cache
|
|
func (csm *compactionTaskMeta) GetCompactionTasks() map[int64][]*datapb.CompactionTask {
|
|
csm.RLock()
|
|
defer csm.RUnlock()
|
|
res := make(map[int64][]*datapb.CompactionTask, 0)
|
|
for triggerID, tasks := range csm.compactionTasks {
|
|
triggerTasks := make([]*datapb.CompactionTask, 0)
|
|
for _, task := range tasks {
|
|
triggerTasks = append(triggerTasks, proto.Clone(task).(*datapb.CompactionTask))
|
|
}
|
|
res[triggerID] = triggerTasks
|
|
}
|
|
return res
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) GetCompactionTasksByCollection(collectionID int64) map[int64][]*datapb.CompactionTask {
|
|
csm.RLock()
|
|
defer csm.RUnlock()
|
|
res := make(map[int64][]*datapb.CompactionTask, 0)
|
|
for _, tasks := range csm.compactionTasks {
|
|
for _, task := range tasks {
|
|
if task.CollectionID == collectionID {
|
|
_, exist := res[task.TriggerID]
|
|
if !exist {
|
|
res[task.TriggerID] = make([]*datapb.CompactionTask, 0)
|
|
}
|
|
res[task.TriggerID] = append(res[task.TriggerID], proto.Clone(task).(*datapb.CompactionTask))
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return res
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) GetCompactionTasksByTriggerID(triggerID int64) []*datapb.CompactionTask {
|
|
csm.RLock()
|
|
defer csm.RUnlock()
|
|
res := make([]*datapb.CompactionTask, 0)
|
|
tasks, triggerIDExist := csm.compactionTasks[triggerID]
|
|
if triggerIDExist {
|
|
for _, task := range tasks {
|
|
res = append(res, proto.Clone(task).(*datapb.CompactionTask))
|
|
}
|
|
}
|
|
return res
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) SaveCompactionTask(ctx context.Context, task *datapb.CompactionTask) error {
|
|
csm.Lock()
|
|
defer csm.Unlock()
|
|
if err := csm.catalog.SaveCompactionTask(ctx, task); err != nil {
|
|
log.Error("meta update: update compaction task fail", zap.Error(err))
|
|
return err
|
|
}
|
|
csm.saveCompactionTaskMemory(task)
|
|
return nil
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) saveCompactionTaskMemory(task *datapb.CompactionTask) {
|
|
_, triggerIDExist := csm.compactionTasks[task.TriggerID]
|
|
if !triggerIDExist {
|
|
csm.compactionTasks[task.TriggerID] = make(map[int64]*datapb.CompactionTask, 0)
|
|
}
|
|
csm.compactionTasks[task.TriggerID][task.PlanID] = task
|
|
csm.taskStats.Add(task.PlanID, newCompactionTaskStats(task))
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) DropCompactionTask(ctx context.Context, task *datapb.CompactionTask) error {
|
|
csm.Lock()
|
|
defer csm.Unlock()
|
|
if err := csm.catalog.DropCompactionTask(ctx, task); err != nil {
|
|
log.Error("meta update: drop compaction task fail", zap.Int64("triggerID", task.TriggerID), zap.Int64("planID", task.PlanID), zap.Int64("collectionID", task.CollectionID), zap.Error(err))
|
|
return err
|
|
}
|
|
_, triggerIDExist := csm.compactionTasks[task.TriggerID]
|
|
if triggerIDExist {
|
|
delete(csm.compactionTasks[task.TriggerID], task.PlanID)
|
|
}
|
|
if len(csm.compactionTasks[task.TriggerID]) == 0 {
|
|
delete(csm.compactionTasks, task.TriggerID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (csm *compactionTaskMeta) TaskStatsJSON() string {
|
|
tasks := csm.taskStats.Values()
|
|
ret, err := json.Marshal(tasks)
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
return string(ret)
|
|
}
|