From fe8de016d517ab6593fb87727d08f425334e9edf Mon Sep 17 00:00:00 2001 From: congqixia Date: Tue, 15 Jul 2025 19:22:50 +0800 Subject: [PATCH] fix: [StorageV2] Align null bitmap offset when loading multi-chunk (#43321) Related to #43262 This patch fixes following logic bug: - When multiple chunks are loaded and size cannot be divided by 8, just appending uint8_t as bitmap will cause null bitmap dislocation - `null_bitmap_data()` points to start of whole row group, which may not stand for current `arrow::Array` The current solutions is: - Reorganize the null_bitmap with currect size & offset - Pass `array->offset()` in tuple to info the current offset Signed-off-by: Congqi Xia --- internal/core/src/common/ChunkWriter.cpp | 67 ++++++++++++++++-------- internal/core/src/common/ChunkWriter.h | 37 ++++++++++--- internal/core/src/common/File.h | 3 +- 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/internal/core/src/common/ChunkWriter.cpp b/internal/core/src/common/ChunkWriter.cpp index de094e1f4b..a97d080d80 100644 --- a/internal/core/src/common/ChunkWriter.cpp +++ b/internal/core/src/common/ChunkWriter.cpp @@ -12,6 +12,7 @@ #include "common/ChunkWriter.h" #include #include +#include #include #include #include @@ -34,7 +35,8 @@ void StringChunkWriter::write(const arrow::ArrayVector& array_vec) { auto size = 0; std::vector strs; - std::vector> null_bitmaps; + // tuple + std::vector> null_bitmaps; for (const auto& data : array_vec) { auto array = std::dynamic_pointer_cast(data); for (int i = 0; i < array->length(); i++) { @@ -44,7 +46,9 @@ StringChunkWriter::write(const arrow::ArrayVector& array_vec) { } if (nullable_) { auto null_bitmap_n = (data->length() + 7) / 8; - null_bitmaps.emplace_back(data->null_bitmap_data(), null_bitmap_n); + // size, offset all in bits + null_bitmaps.emplace_back( + data->null_bitmap_data(), data->length(), data->offset()); size += null_bitmap_n; } row_nums_ += array->length(); @@ -92,7 +96,8 @@ void JSONChunkWriter::write(const arrow::ArrayVector& array_vec) { auto size = 0; std::vector jsons; - std::vector> null_bitmaps; + // tuple + std::vector> null_bitmaps; for (const auto& data : array_vec) { auto array = std::dynamic_pointer_cast(data); for (int i = 0; i < array->length(); i++) { @@ -103,7 +108,9 @@ JSONChunkWriter::write(const arrow::ArrayVector& array_vec) { } if (nullable_) { auto null_bitmap_n = (data->length() + 7) / 8; - null_bitmaps.emplace_back(data->null_bitmap_data(), null_bitmap_n); + // size, offset all in bits + null_bitmaps.emplace_back( + data->null_bitmap_data(), data->length(), data->offset()); size += null_bitmap_n; } row_nums_ += array->length(); @@ -151,7 +158,8 @@ ArrayChunkWriter::write(const arrow::ArrayVector& array_vec) { auto size = 0; auto is_string = IsStringDataType(element_type_); std::vector arrays; - std::vector> null_bitmaps; + // tuple + std::vector> null_bitmaps; for (const auto& data : array_vec) { auto array = std::dynamic_pointer_cast(data); @@ -170,7 +178,9 @@ ArrayChunkWriter::write(const arrow::ArrayVector& array_vec) { row_nums_ += array->length(); if (nullable_) { auto null_bitmap_n = (data->length() + 7) / 8; - null_bitmaps.emplace_back(data->null_bitmap_data(), null_bitmap_n); + // size, offset all in bits + null_bitmaps.emplace_back( + data->null_bitmap_data(), data->length(), data->offset()); size += null_bitmap_n; } } @@ -368,9 +378,9 @@ template std::shared_ptr create_chunk_writer(const FieldMeta& field_meta, Args&&... args) { int dim = IsVectorDataType(field_meta.get_data_type()) && - !IsSparseFloatVectorDataType(field_meta.get_data_type()) - ? field_meta.get_dim() - : 1; + !IsSparseFloatVectorDataType(field_meta.get_data_type()) + ? field_meta.get_dim() + : 1; bool nullable = field_meta.is_nullable(); switch (field_meta.get_data_type()) { case milvus::DataType::BOOL: @@ -395,41 +405,53 @@ create_chunk_writer(const FieldMeta& field_meta, Args&&... args) { return std::make_shared>( dim, std::forward(args)..., nullable); case milvus::DataType::VECTOR_FLOAT: - return std::make_shared>( + return std::make_shared< + ChunkWriter>( dim, std::forward(args)..., nullable); case milvus::DataType::VECTOR_BINARY: - return std::make_shared>( + return std::make_shared< + ChunkWriter>( dim / 8, std::forward(args)..., nullable); case milvus::DataType::VECTOR_FLOAT16: - return std::make_shared>( + return std::make_shared< + ChunkWriter>( dim, std::forward(args)..., nullable); case milvus::DataType::VECTOR_BFLOAT16: - return std::make_shared>( + return std::make_shared< + ChunkWriter>( dim, std::forward(args)..., nullable); case milvus::DataType::VECTOR_INT8: - return std::make_shared>( + return std::make_shared< + ChunkWriter>( dim, std::forward(args)..., nullable); case milvus::DataType::VARCHAR: case milvus::DataType::STRING: case milvus::DataType::TEXT: - return std::make_shared(std::forward(args)..., nullable); + return std::make_shared( + std::forward(args)..., nullable); case milvus::DataType::JSON: - return std::make_shared(std::forward(args)..., nullable); + return std::make_shared( + std::forward(args)..., nullable); case milvus::DataType::ARRAY: return std::make_shared( - field_meta.get_element_type(), std::forward(args)..., nullable); + field_meta.get_element_type(), + std::forward(args)..., + nullable); case milvus::DataType::VECTOR_SPARSE_FLOAT: - return std::make_shared(std::forward(args)..., nullable); + return std::make_shared( + std::forward(args)..., nullable); case milvus::DataType::VECTOR_ARRAY: - return std::make_shared(dim, field_meta.get_element_type(), std::forward(args)...); + return std::make_shared( + dim, + field_meta.get_element_type(), + std::forward(args)...); default: PanicInfo(Unsupported, "Unsupported data type"); } } std::unique_ptr -create_chunk(const FieldMeta& field_meta, - const arrow::ArrayVector& array_vec) { +create_chunk(const FieldMeta& field_meta, const arrow::ArrayVector& array_vec) { auto cw = create_chunk_writer(field_meta); cw->write(array_vec); return cw->finish(); @@ -437,7 +459,8 @@ create_chunk(const FieldMeta& field_meta, std::unique_ptr create_chunk(const FieldMeta& field_meta, - const arrow::ArrayVector& array_vec, const std::string& file_path) { + const arrow::ArrayVector& array_vec, + const std::string& file_path) { auto cw = create_chunk_writer(field_meta, file_path); cw->write(array_vec); return cw->finish(); diff --git a/internal/core/src/common/ChunkWriter.h b/internal/core/src/common/ChunkWriter.h index f70fbef9cf..23dd3609c1 100644 --- a/internal/core/src/common/ChunkWriter.h +++ b/internal/core/src/common/ChunkWriter.h @@ -49,17 +49,38 @@ class ChunkWriterBase { void write_null_bit_maps( - const std::vector>& null_bitmaps) { + const std::vector>& + null_bitmaps) { if (nullable_) { - for (auto [data, size] : null_bitmaps) { + // merge all null bitmaps in case of multiple chunk null bitmap dislocation + // say [0xFF, 0x00] with size [7, 8] cannot be treated as [0xFF, 0x00] after merged but + // [0x7F, 0x00], othersize the null index will be dislocated + std::vector merged_null_bitmap; + int64_t size_total_bit = 0; + for (auto [data, size_bits, offset_bits] : null_bitmaps) { + // resize in byte + merged_null_bitmap.resize((size_total_bit + size_bits + 7) / 8, + 0xFF); if (data != nullptr) { - target_->write(data, size); + bitset::detail::ElementWiseBitsetPolicy::op_copy( + data, + offset_bits, + merged_null_bitmap.data(), + size_total_bit, + size_bits); } else { // have to append always-true bitmap due to arrow optimize this - std::vector null_bitmap(size, 0xff); - target_->write(null_bitmap.data(), size); + std::vector null_bitmap(size_bits, 0xff); + bitset::detail::ElementWiseBitsetPolicy::op_copy( + null_bitmap.data(), + 0, + merged_null_bitmap.data(), + size_total_bit, + size_bits); } + size_total_bit += size_bits; } + target_->write(merged_null_bitmap.data(), (size_total_bit + 7) / 8); } } @@ -208,7 +229,8 @@ class ArrayChunkWriter : public ChunkWriterBase { ArrayChunkWriter(const milvus::DataType element_type, std::string file_path, bool nullable) - : ChunkWriterBase(std::move(file_path), nullable), element_type_(element_type) { + : ChunkWriterBase(std::move(file_path), nullable), + element_type_(element_type) { } void @@ -257,8 +279,7 @@ class SparseFloatVectorChunkWriter : public ChunkWriterBase { }; std::unique_ptr -create_chunk(const FieldMeta& field_meta, - const arrow::ArrayVector& array_vec); +create_chunk(const FieldMeta& field_meta, const arrow::ArrayVector& array_vec); std::unique_ptr create_chunk(const FieldMeta& field_meta, diff --git a/internal/core/src/common/File.h b/internal/core/src/common/File.h index 3b0a4415a7..af658d487e 100644 --- a/internal/core/src/common/File.h +++ b/internal/core/src/common/File.h @@ -115,7 +115,8 @@ class File { } private: - static inline const char* get_mode_from_flags(int flags) { + static inline const char* + get_mode_from_flags(int flags) { switch (flags) { case O_RDONLY: { return "rb";