enhance: Limit number of segments restored and promptly terminate the job (#39344)

1. Limit the maximum number of restored segments to 1024.
2. Fail the import job if saving binlog fails.
3. Fail the import job if saving the import task fails to prevent
repeatedly generating dirty importing segments.

issue: https://github.com/milvus-io/milvus/issues/39331

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
yihao.dai 2025-01-21 15:13:13 +08:00 committed by GitHub
parent 7261128df0
commit 89eaf92984
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 20 additions and 4 deletions

View File

@@ -242,6 +242,10 @@ func (c *importChecker) checkPreImportingJob(job ImportJob) {
err = c.imeta.AddTask(context.TODO(), t) err = c.imeta.AddTask(context.TODO(), t)
if err != nil { if err != nil {
log.Warn("add new import task failed", WrapTaskLog(t, zap.Error(err))...) log.Warn("add new import task failed", WrapTaskLog(t, zap.Error(err))...)
updateErr := c.imeta.UpdateJob(context.TODO(), job.GetJobID(), UpdateJobState(internalpb.ImportJobState_Failed), UpdateJobReason(err.Error()))
if updateErr != nil {
log.Warn("failed to update job state to Failed", zap.Error(updateErr))
}
return return
} }
log.Info("add new import task", WrapTaskLog(t)...) log.Info("add new import task", WrapTaskLog(t)...)

View File

@@ -288,13 +288,17 @@ func (s *ImportCheckerSuite) TestCheckJob_Failed() {
catalog.ExpectedCalls = nil catalog.ExpectedCalls = nil
catalog.EXPECT().SaveImportTask(mock.Anything, mock.Anything).Return(mockErr) catalog.EXPECT().SaveImportTask(mock.Anything, mock.Anything).Return(mockErr)
catalog.EXPECT().SaveImportJob(mock.Anything, mock.Anything).Return(nil)
s.checker.checkPreImportingJob(job) s.checker.checkPreImportingJob(job)
importTasks := s.imeta.GetTaskBy(context.TODO(), WithJob(job.GetJobID()), WithType(ImportTaskType)) importTasks := s.imeta.GetTaskBy(context.TODO(), WithJob(job.GetJobID()), WithType(ImportTaskType))
s.Equal(0, len(importTasks)) s.Equal(0, len(importTasks))
s.Equal(internalpb.ImportJobState_PreImporting, s.imeta.GetJob(context.TODO(), job.GetJobID()).GetState()) s.Equal(internalpb.ImportJobState_Failed, s.imeta.GetJob(context.TODO(), job.GetJobID()).GetState())
alloc.ExpectedCalls = nil alloc.ExpectedCalls = nil
alloc.EXPECT().AllocN(mock.Anything).Return(0, 0, mockErr) alloc.EXPECT().AllocN(mock.Anything).Return(0, 0, mockErr)
err := s.imeta.UpdateJob(context.TODO(), job.GetJobID(), UpdateJobState(internalpb.ImportJobState_PreImporting))
s.NoError(err)
s.checker.checkPreImportingJob(job)
importTasks = s.imeta.GetTaskBy(context.TODO(), WithJob(job.GetJobID()), WithType(ImportTaskType)) importTasks = s.imeta.GetTaskBy(context.TODO(), WithJob(job.GetJobID()), WithType(ImportTaskType))
s.Equal(0, len(importTasks)) s.Equal(0, len(importTasks))
s.Equal(internalpb.ImportJobState_PreImporting, s.imeta.GetJob(context.TODO(), job.GetJobID()).GetState()) s.Equal(internalpb.ImportJobState_PreImporting, s.imeta.GetJob(context.TODO(), job.GetJobID()).GetState())

View File

@@ -278,8 +278,7 @@ func (s *importScheduler) processInProgressImport(task ImportTask) {
return return
} }
if resp.GetState() == datapb.ImportTaskStateV2_Failed { if resp.GetState() == datapb.ImportTaskStateV2_Failed {
err = s.imeta.UpdateJob(context.TODO(), task.GetJobID(), UpdateJobState(internalpb.ImportJobState_Failed), err = s.imeta.UpdateJob(context.TODO(), task.GetJobID(), UpdateJobState(internalpb.ImportJobState_Failed), UpdateJobReason(resp.GetReason()))
UpdateJobReason(resp.GetReason()))
if err != nil { if err != nil {
log.Warn("failed to update job state to Failed", zap.Int64("jobID", task.GetJobID()), zap.Error(err)) log.Warn("failed to update job state to Failed", zap.Int64("jobID", task.GetJobID()), zap.Error(err))
} }
@@ -324,7 +323,11 @@ func (s *importScheduler) processInProgressImport(task ImportTask) {
op2 := UpdateStatusOperator(info.GetSegmentID(), commonpb.SegmentState_Flushed) op2 := UpdateStatusOperator(info.GetSegmentID(), commonpb.SegmentState_Flushed)
err = s.meta.UpdateSegmentsInfo(context.TODO(), op1, op2) err = s.meta.UpdateSegmentsInfo(context.TODO(), op1, op2)
if err != nil { if err != nil {
log.Warn("update import segment binlogs failed", WrapTaskLog(task, zap.Error(err))...) updateErr := s.imeta.UpdateJob(context.TODO(), task.GetJobID(), UpdateJobState(internalpb.ImportJobState_Failed), UpdateJobReason(err.Error()))
if updateErr != nil {
log.Warn("failed to update job state to Failed", zap.Int64("jobID", task.GetJobID()), zap.Error(updateErr))
}
log.Warn("update import segment binlogs failed", WrapTaskLog(task, zap.String("err", err.Error()))...)
return return
} }
} }

View File

@@ -1715,6 +1715,11 @@ func (s *Server) ImportV2(ctx context.Context, in *internalpb.ImportRequestInternal)
resp.Status = merr.Status(merr.WrapErrImportFailed(fmt.Sprintf("no binlog to import, input=%s", in.GetFiles()))) resp.Status = merr.Status(merr.WrapErrImportFailed(fmt.Sprintf("no binlog to import, input=%s", in.GetFiles())))
return resp, nil return resp, nil
} }
if len(files) > paramtable.Get().DataCoordCfg.MaxFilesPerImportReq.GetAsInt() {
resp.Status = merr.Status(merr.WrapErrImportFailed(fmt.Sprintf("The max number of import files should not exceed %d, but got %d",
paramtable.Get().DataCoordCfg.MaxFilesPerImportReq.GetAsInt(), len(files))))
return resp, nil
}
log.Info("list binlogs prefixes for import", zap.Any("binlog_prefixes", files)) log.Info("list binlogs prefixes for import", zap.Any("binlog_prefixes", files))
} }