From b13aac5164caa6409ccea8033048f99a7d988d09 Mon Sep 17 00:00:00 2001 From: "cai.zhang" Date: Tue, 30 Dec 2025 21:13:21 +0800 Subject: [PATCH] fix: Include fieldID in raw data cleanup to prevent delete other fields (#46688) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit issue: #46687 - Core invariant: raw-data cleanup must be scoped to (segment_id, field_id) so deleting temporary raw files for one field never removes raw files for other fields in the same segment (prevents cross-field deletion during index builds). - Root cause and fix (bug): VectorDiskAnnIndex::Build() and VectorDiskAnnIndex::BuildWithDataset() called RemoveDir on the segment-level path returned by storage::GetSegmentRawDataPathPrefix; this removed rawdata/{segment_id}/, which also contained the per-field subdirectories rawdata/{segment_id}/{field_id}/ of every other field. The fix changes both calls to remove storage::GenFieldRawDataPathPrefix(local_chunk_manager, segment_id, field_id) instead, and changes the layout that helper produces from rawdata/{segment_id}/{field_id}/ to rawdata/{segment_id}_{field_id}/, limiting cleanup to rawdata/{segment_id}_{field_id}/ (field-scoped). - Logic removed/simplified: the old helper GetSegmentRawDataPathPrefix was removed and callers were switched to GenFieldRawDataPathPrefix; cleanup logic is simplified from segment-level to field-level path generation and removal, eliminating redundant broad deletions. - Why this does NOT cause data loss or regress behavior: the change narrows RemoveDir() to the exact field path used when caching raw data and offsets earlier in Build (offsets_path and CacheRawDataToDisk produce field-scoped local paths). Build still writes/reads offsets and raw data from GenFieldRawDataPathPrefix(...) and then removes that same prefix after successful index.Build(); therefore only temporary files for the built field are deleted and other fields’ raw files under the same segment are preserved. This fixes issue #46687 by preventing accidental deletion of other fields’ raw data. 
Signed-off-by: Cai Zhang --- internal/core/src/index/VectorDiskIndex.cpp | 8 ++++---- internal/core/src/storage/Util.cpp | 10 +--------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/internal/core/src/index/VectorDiskIndex.cpp b/internal/core/src/index/VectorDiskIndex.cpp index 9f3cfabe37..339bef33e7 100644 --- a/internal/core/src/index/VectorDiskIndex.cpp +++ b/internal/core/src/index/VectorDiskIndex.cpp @@ -227,8 +227,8 @@ VectorDiskAnnIndex::Build(const Config& config) { ThrowInfo(ErrorCode::IndexBuildError, "failed to build disk index, " + KnowhereStatusString(stat)); - local_chunk_manager->RemoveDir( - storage::GetSegmentRawDataPathPrefix(local_chunk_manager, segment_id)); + local_chunk_manager->RemoveDir(storage::GenFieldRawDataPathPrefix( + local_chunk_manager, segment_id, field_id)); } template @@ -335,8 +335,8 @@ VectorDiskAnnIndex::BuildWithDataset(const DatasetPtr& dataset, file_manager_->AddFile(valid_data_path); } - local_chunk_manager->RemoveDir( - storage::GetSegmentRawDataPathPrefix(local_chunk_manager, segment_id)); + local_chunk_manager->RemoveDir(storage::GenFieldRawDataPathPrefix( + local_chunk_manager, segment_id, field_id)); // TODO :: // SetDim(index_->Dim()); diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index 8d83e36df3..785516d7af 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -942,15 +942,7 @@ GenFieldRawDataPathPrefix(ChunkManagerPtr cm, boost::filesystem::path prefix = cm->GetRootPath(); boost::filesystem::path path = std::string(RAWDATA_ROOT_PATH); boost::filesystem::path path1 = - std::to_string(segment_id) + "/" + std::to_string(field_id) + "/"; - return NormalizePath(prefix / path / path1); -} - -std::string -GetSegmentRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id) { - boost::filesystem::path prefix = cm->GetRootPath(); - boost::filesystem::path path = std::string(RAWDATA_ROOT_PATH); - boost::filesystem::path path1 = 
std::to_string(segment_id); + std::to_string(segment_id) + "_" + std::to_string(field_id) + "/"; return NormalizePath(prefix / path / path1); }