mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
fix: Handle all-null data in StringIndexSort to prevent load timeout (#45100)
Related to #45081

StringIndexSort now properly handles collections with all-null string fields by:
- Removing the error thrown when unique_count is 0 in ParseBinaryData
- Adding alignment and padding support in mmap serialization (similar to ScalarIndexSort)
- Separating data_size_ from mmap_size_ to correctly parse data without reading padding

This fixes load collection timeout failures when all string field data is null, particularly affecting STL_SORT and TRIE index types.

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
parent
36a887b38b
commit
fd0ef09e97
@ -56,8 +56,13 @@ StringIndexSortImpl::ParseBinaryData(const uint8_t* data, size_t data_size) {
|
|||||||
memcpy(&result.unique_count, ptr, sizeof(uint32_t));
|
memcpy(&result.unique_count, ptr, sizeof(uint32_t));
|
||||||
ptr += sizeof(uint32_t);
|
ptr += sizeof(uint32_t);
|
||||||
|
|
||||||
|
// Handle all-null case where unique_count is 0
|
||||||
if (result.unique_count == 0) {
|
if (result.unique_count == 0) {
|
||||||
ThrowInfo(DataFormatBroken, "Unique count is 0");
|
result.string_offsets = nullptr;
|
||||||
|
result.string_data_start = ptr;
|
||||||
|
result.post_list_offsets = nullptr;
|
||||||
|
result.post_list_data_start = ptr;
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Read string offsets
|
// Read string offsets
|
||||||
@ -88,6 +93,10 @@ StringIndexSortImpl::ParseBinaryData(const uint8_t* data, size_t data_size) {
|
|||||||
|
|
||||||
const std::string STRING_INDEX_SORT_FILE = "string_index_sort";
|
const std::string STRING_INDEX_SORT_FILE = "string_index_sort";
|
||||||
|
|
||||||
|
constexpr size_t ALIGNMENT = 32; // Byte boundary: LoadFromBinary rounds the index data size up to a multiple of this and zero-fills the gap in the mmap file
|
||||||
|
|
||||||
|
const uint64_t MMAP_INDEX_PADDING = 1; // Number of extra zero bytes LoadFromBinary appends after the aligned data; counted in mmap_size_ but not in data_size_
|
||||||
|
|
||||||
StringIndexSort::StringIndexSort(
|
StringIndexSort::StringIndexSort(
|
||||||
const storage::FileManagerContext& file_manager_context)
|
const storage::FileManagerContext& file_manager_context)
|
||||||
: StringIndex(ASCENDING_SORT), is_built_(false) {
|
: StringIndex(ASCENDING_SORT), is_built_(false) {
|
||||||
@ -867,9 +876,19 @@ StringIndexSortMmapImpl::LoadFromBinary(const BinarySet& binary_set,
|
|||||||
std::filesystem::create_directories(
|
std::filesystem::create_directories(
|
||||||
std::filesystem::path(mmap_filepath_).parent_path());
|
std::filesystem::path(mmap_filepath_).parent_path());
|
||||||
|
|
||||||
|
auto aligned_size =
|
||||||
|
((index_data->size + ALIGNMENT - 1) / ALIGNMENT) * ALIGNMENT;
|
||||||
{
|
{
|
||||||
auto file_writer = storage::FileWriter(mmap_filepath_);
|
auto file_writer = storage::FileWriter(mmap_filepath_);
|
||||||
file_writer.Write(index_data->data.get(), index_data->size);
|
file_writer.Write(index_data->data.get(), index_data->size);
|
||||||
|
|
||||||
|
if (aligned_size > index_data->size) {
|
||||||
|
std::vector<uint8_t> padding(aligned_size - index_data->size, 0);
|
||||||
|
file_writer.Write(padding.data(), padding.size());
|
||||||
|
}
|
||||||
|
// write padding in case of all null values
|
||||||
|
std::vector<uint8_t> padding(MMAP_INDEX_PADDING, 0);
|
||||||
|
file_writer.Write(padding.data(), padding.size());
|
||||||
file_writer.Finish();
|
file_writer.Finish();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -878,7 +897,8 @@ StringIndexSortMmapImpl::LoadFromBinary(const BinarySet& binary_set,
|
|||||||
ThrowInfo(DataFormatBroken, "Failed to open mmap file");
|
ThrowInfo(DataFormatBroken, "Failed to open mmap file");
|
||||||
}
|
}
|
||||||
|
|
||||||
mmap_size_ = index_data->size;
|
mmap_size_ = aligned_size + MMAP_INDEX_PADDING;
|
||||||
|
data_size_ = index_data->size;
|
||||||
mmap_data_ = static_cast<char*>(
|
mmap_data_ = static_cast<char*>(
|
||||||
mmap(nullptr, mmap_size_, PROT_READ, MAP_PRIVATE, fd, 0));
|
mmap(nullptr, mmap_size_, PROT_READ, MAP_PRIVATE, fd, 0));
|
||||||
close(fd);
|
close(fd);
|
||||||
@ -889,7 +909,7 @@ StringIndexSortMmapImpl::LoadFromBinary(const BinarySet& binary_set,
|
|||||||
|
|
||||||
const uint8_t* data_start = reinterpret_cast<const uint8_t*>(mmap_data_);
|
const uint8_t* data_start = reinterpret_cast<const uint8_t*>(mmap_data_);
|
||||||
|
|
||||||
auto parsed = ParseBinaryData(data_start, mmap_size_);
|
auto parsed = ParseBinaryData(data_start, data_size_);
|
||||||
unique_count_ = parsed.unique_count;
|
unique_count_ = parsed.unique_count;
|
||||||
string_offsets_ = parsed.string_offsets;
|
string_offsets_ = parsed.string_offsets;
|
||||||
string_data_start_ = parsed.string_data_start;
|
string_data_start_ = parsed.string_data_start;
|
||||||
|
|||||||
@ -410,6 +410,7 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
|
|||||||
private:
|
private:
|
||||||
char* mmap_data_ = nullptr;
|
char* mmap_data_ = nullptr;
|
||||||
size_t mmap_size_ = 0;
|
size_t mmap_size_ = 0;
|
||||||
|
size_t data_size_ = 0; // Actual data size without padding
|
||||||
std::string mmap_filepath_;
|
std::string mmap_filepath_;
|
||||||
size_t unique_count_ = 0;
|
size_t unique_count_ = 0;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user