fix: [2.5] Do not delete the centroids file when sampling fails; wait for GC to remove it instead (#40702)

issue: #40700 

pr: #40701

---------

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2025-03-18 22:00:20 +08:00 committed by GitHub
parent aca9dd9105
commit d4e2f581f9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -105,7 +105,8 @@ KmeansClustering::SampleTrainData(
}
}
// shuffle files
std::shuffle(files.begin(), files.end(), std::mt19937());
std::mt19937 rng(static_cast<unsigned int>(std::time(nullptr)));
std::shuffle(files.begin(), files.end(), rng);
FetchDataFiles<T>(
buf, expected_train_size, expected_train_size, files, dim, offset);
return;
@@ -325,11 +326,6 @@ KmeansClustering::StreamingAssignandUpload(
}
if (IsDataSkew<T>(config, dim, num_vectors_each_centroid)) {
LOG_INFO(msg_header_ + "data skew! skip clustering");
// remove uploaded files
remote_paths_to_size[cluster_result_.centroid_path] =
cluster_result_.centroid_file_size;
RemoveClusteringResultFiles(file_manager_->GetChunkManager().get(),
remote_paths_to_size);
// skip clustering, nothing takes effect
throw SegcoreError(ErrorCode::ClusterSkip,
"data skew! skip clustering");