From f01ff57f3f9312b186e7a2b9dcdc28dfb48a709a Mon Sep 17 00:00:00 2001 From: congqixia Date: Tue, 17 Jun 2025 10:08:38 +0800 Subject: [PATCH] fix: [StorageV2] Use correct offset filling null bitmap (#42774) Related to #39173 `null_bitmap_data()` returns raw pointer of null bitmap of Array. While after slicing, this bitmap is not rewritten due to zero copy implementation, so the current start pos maybe non-zero while FillFieldData generating column `valid_data` array. This PR add `offset` param for `FillFieldData` method, and force all invocation pass correct offset of `null_bitmap_data` ptr. Also update milvus-storage commit fixing reader failed to return data when buffer size smaller than row group size problem. --------- Signed-off-by: Congqi Xia --- internal/core/src/common/FieldData.cpp | 100 +++++++++++------- internal/core/src/common/FieldDataInterface.h | 22 +++- internal/core/src/storage/Util.cpp | 3 +- .../thirdparty/milvus-storage/CMakeLists.txt | 2 +- .../core/unittest/test_array_bitmap_index.cpp | 2 +- internal/core/unittest/test_bitmap_index.cpp | 2 +- internal/core/unittest/test_chunk.cpp | 6 +- internal/core/unittest/test_data_codec.cpp | 20 ++-- internal/core/unittest/test_hybrid_index.cpp | 2 +- .../core/unittest/test_inverted_index.cpp | 4 +- .../unittest/test_json_key_stats_index.cpp | 3 +- internal/core/unittest/test_sealed.cpp | 4 +- internal/core/unittest/test_utils/DataGen.h | 2 +- internal/storage/rw.go | 12 ++- internal/storagev2/packed/constant.go | 2 + internal/storagev2/packed/packed_test.go | 2 +- 16 files changed, 115 insertions(+), 73 deletions(-) diff --git a/internal/core/src/common/FieldData.cpp b/internal/core/src/common/FieldData.cpp index e37957c4b9..d4eeedf981 100644 --- a/internal/core/src/common/FieldData.cpp +++ b/internal/core/src/common/FieldData.cpp @@ -53,7 +53,10 @@ FieldDataImpl::FillFieldData(const void* source, template void FieldDataImpl::FillFieldData( - const void* field_data, const uint8_t* valid_data, ssize_t element_count) { + const void* field_data, + const uint8_t* valid_data, + ssize_t element_count, + ssize_t offset) { AssertInfo( nullable_, "no need to fill valid_data, use the 2-argument version instead"); @@ -73,7 +76,7 @@ FieldDataImpl::FillFieldData( // means null_count == 0, will fill it with 0xFF if (valid_data != nullptr) { bitset::detail::ElementWiseBitsetPolicy::op_copy( - valid_data, 0, valid_data_.data(), length_, element_count); + valid_data, offset, valid_data_.data(), length_, element_count); } length_ += element_count; @@ -129,7 +132,8 @@ FieldDataImpl::FillFieldData( if (nullable_) { return FillFieldData(values.data(), bool_array->null_bitmap_data(), - element_count); + element_count, + bool_array->offset()); } return FillFieldData(values.data(), element_count); } @@ -138,8 +142,10 @@ FieldDataImpl::FillFieldData( GetDataInfoFromArray( array); if (nullable_) { - return FillFieldData( - array_info.first, array->null_bitmap_data(), element_count); + return FillFieldData(array_info.first, + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(array_info.first, array_info.second); } @@ -148,8 +154,10 @@ FieldDataImpl::FillFieldData( GetDataInfoFromArray(array); if (nullable_) { - return FillFieldData( - array_info.first, array->null_bitmap_data(), element_count); + return FillFieldData(array_info.first, + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(array_info.first, array_info.second); } @@ -158,8 +166,10 @@ FieldDataImpl::FillFieldData( GetDataInfoFromArray(array); if (nullable_) { - return FillFieldData( - array_info.first, array->null_bitmap_data(), element_count); + return FillFieldData(array_info.first, + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(array_info.first, array_info.second); } @@ -168,8 +178,10 @@ FieldDataImpl::FillFieldData( GetDataInfoFromArray(array); if (nullable_) { - return FillFieldData( - array_info.first, array->null_bitmap_data(), element_count); + return FillFieldData(array_info.first, + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(array_info.first, array_info.second); } @@ -178,8 +190,10 @@ FieldDataImpl::FillFieldData( GetDataInfoFromArray(array); if (nullable_) { - return FillFieldData( - array_info.first, array->null_bitmap_data(), element_count); + return FillFieldData(array_info.first, + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(array_info.first, array_info.second); } @@ -188,8 +202,10 @@ FieldDataImpl::FillFieldData( GetDataInfoFromArray(array); if (nullable_) { - return FillFieldData( - array_info.first, array->null_bitmap_data(), element_count); + return FillFieldData(array_info.first, + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(array_info.first, array_info.second); } @@ -205,8 +221,10 @@ FieldDataImpl::FillFieldData( values[index] = string_array->GetString(index); } if (nullable_) { - return FillFieldData( - values.data(), array->null_bitmap_data(), element_count); + return FillFieldData(values.data(), + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(values.data(), element_count); } @@ -223,8 +241,10 @@ FieldDataImpl::FillFieldData( Json(simdjson::padded_string(json_array->GetString(index))); } if (nullable_) { - return FillFieldData( - values.data(), array->null_bitmap_data(), element_count); + return FillFieldData(values.data(), + array->null_bitmap_data(), + element_count, + array->offset()); } return FillFieldData(values.data(), element_count); } @@ -245,8 +265,10 @@ FieldDataImpl::FillFieldData( values[index] = Array(field_data); } if (nullable_) { - return FillFieldData( - values.data(), array->null_bitmap_data(), element_count); + return FillFieldData(values.data(), + array->null_bitmap_data(), + element_count, + array->offset()); } AssertInfo(null_number == 0, "get empty string when not nullable"); return FillFieldData(values.data(), element_count); @@ -322,70 +344,70 @@ FieldDataImpl::FillFieldData( if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->bool_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::INT8: { FixedVector values(element_count); if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->int_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::INT16: { FixedVector values(element_count); if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->int_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::INT32: { FixedVector values(element_count); if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->int_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::INT64: { FixedVector values(element_count); if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->long_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::FLOAT: { FixedVector values(element_count); if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->float_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::DOUBLE: { FixedVector values(element_count); if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->double_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::STRING: case DataType::VARCHAR: { @@ -393,23 +415,23 @@ FieldDataImpl::FillFieldData( if (default_value.has_value()) { std::fill( values.begin(), values.end(), default_value->string_data()); - return FillFieldData(values.data(), nullptr, element_count); + return FillFieldData(values.data(), nullptr, element_count, 0); } return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::JSON: { // The code here is not referenced. // A subclass named FieldDataJsonImpl is implemented, which overloads this function. FixedVector values(element_count); return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } case DataType::ARRAY: { // todo: add array default_value FixedVector values(element_count); return FillFieldData( - values.data(), valid_data_ptr.get(), element_count); + values.data(), valid_data_ptr.get(), element_count, 0); } default: { PanicInfo(DataTypeInvalid, diff --git a/internal/core/src/common/FieldDataInterface.h b/internal/core/src/common/FieldDataInterface.h index 7b6e4524cb..31085cca30 100644 --- a/internal/core/src/common/FieldDataInterface.h +++ b/internal/core/src/common/FieldDataInterface.h @@ -25,6 +25,7 @@ #include #include +#include "log/Log.h" #include "Types.h" #include "arrow/api.h" #include "arrow/array/array_binary.h" @@ -57,7 +58,8 @@ class FieldDataBase { virtual void FillFieldData(const void* field_data, const uint8_t* valid_data, - ssize_t element_count) = 0; + ssize_t element_count, + ssize_t offset) = 0; virtual void FillFieldData(const std::shared_ptr arrays) = 0; @@ -164,7 +166,8 @@ class FieldBitsetImpl : public FieldDataBase { void FillFieldData(const void* field_data, const uint8_t* valid_data, - ssize_t element_count) override { + ssize_t element_count, + ssize_t offset) override { PanicInfo(NotImplemented, "FillFieldData(const void* field_data, " "const uint8_t* valid_data, ssize_t element_count)" @@ -378,7 +381,8 @@ class FieldDataImpl : public FieldDataBase { void FillFieldData(const void* field_data, const uint8_t* valid_data, - ssize_t element_count) override; + ssize_t element_count, + ssize_t offset) override; void FillFieldData(const std::shared_ptr arrays) override; @@ -598,7 +602,11 @@ class FieldDataStringImpl : public FieldDataImpl { auto valid_data = array->null_bitmap_data(); if (valid_data != nullptr) { bitset::detail::ElementWiseBitsetPolicy::op_copy( - valid_data, 0, valid_data_.data(), length_, n); + valid_data, + array->offset(), + valid_data_.data(), + length_, + n); } } length_ += n; @@ -689,7 +697,11 @@ class FieldDataJsonImpl : public FieldDataImpl { auto valid_data = array->null_bitmap_data(); if (valid_data != nullptr) { bitset::detail::ElementWiseBitsetPolicy::op_copy( - valid_data, 0, valid_data_.data(), length_, n); + valid_data, + array->offset(), + valid_data_.data(), + length_, + n); } } length_ += n; diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index a002178d8f..94fd10ef3a 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -1,4 +1,3 @@ - // Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information @@ -1006,7 +1005,7 @@ MergeFieldData(std::vector& data_array) { for (const auto& data : data_array) { if (merged_data->IsNullable()) { merged_data->FillFieldData( - data->Data(), data->ValidData(), data->Length()); + data->Data(), data->ValidData(), data->Length(), 0); } else { merged_data->FillFieldData(data->Data(), data->Length()); } diff --git a/internal/core/thirdparty/milvus-storage/CMakeLists.txt b/internal/core/thirdparty/milvus-storage/CMakeLists.txt index 9989bae9b9..88476b9459 100644 --- a/internal/core/thirdparty/milvus-storage/CMakeLists.txt +++ b/internal/core/thirdparty/milvus-storage/CMakeLists.txt @@ -14,7 +14,7 @@ # Update milvus-storage_VERSION for the first occurrence milvus_add_pkg_config("milvus-storage") set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES "") -set( milvus-storage_VERSION 1238e21 ) +set( milvus-storage_VERSION fa304fa ) set( GIT_REPOSITORY "https://github.com/milvus-io/milvus-storage.git") message(STATUS "milvus-storage repo: ${GIT_REPOSITORY}") message(STATUS "milvus-storage version: ${milvus-storage_VERSION}") diff --git a/internal/core/unittest/test_array_bitmap_index.cpp b/internal/core/unittest/test_array_bitmap_index.cpp index 748acac025..b817911278 100644 --- a/internal/core/unittest/test_array_bitmap_index.cpp +++ b/internal/core/unittest/test_array_bitmap_index.cpp @@ -201,7 +201,7 @@ class ArrayBitmapIndexTest : public testing::Test { ptr[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(data_.data(), ptr, data_.size()); + field_data->FillFieldData(data_.data(), ptr, data_.size(), 0); delete[] ptr; } else { field_data->FillFieldData(data_.data(), data_.size()); diff --git a/internal/core/unittest/test_bitmap_index.cpp b/internal/core/unittest/test_bitmap_index.cpp index 13c1c0d0ed..4c6c131baf 100644 --- a/internal/core/unittest/test_bitmap_index.cpp +++ b/internal/core/unittest/test_bitmap_index.cpp @@ -132,7 +132,7 @@ class BitmapIndexTest : public testing::Test { ptr[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(data_.data(), ptr, data_.size()); + field_data->FillFieldData(data_.data(), ptr, data_.size(), 0); delete[] ptr; } else { field_data->FillFieldData(data_.data(), data_.size()); diff --git a/internal/core/unittest/test_chunk.cpp b/internal/core/unittest/test_chunk.cpp index 8b3042ecfa..d9a421a0b3 100644 --- a/internal/core/unittest/test_chunk.cpp +++ b/internal/core/unittest/test_chunk.cpp @@ -125,7 +125,7 @@ TEST(chunk, test_variable_field_nullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::VARCHAR, true); uint8_t* valid_data = new uint8_t[1]{0x15}; // 10101 in binary - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); delete[] valid_data; storage::InsertEventData event_data; @@ -290,7 +290,7 @@ TEST(chunk, test_null_int64) { // Set up validity bitmap: 10011 (1st, 4th, and 5th are valid) uint8_t* valid_data = new uint8_t[1]{0x13}; // 10011 in binary - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); delete[] valid_data; storage::InsertEventData event_data; @@ -413,7 +413,7 @@ TEST(chunk, test_null_array) { // Set up validity bitmap: 10101 (1st, 3rd, and 5th are valid) uint8_t* valid_data = new uint8_t[1]{0x15}; // 10101 in binary - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); delete[] valid_data; storage::InsertEventData event_data; diff --git a/internal/core/unittest/test_data_codec.cpp b/internal/core/unittest/test_data_codec.cpp index b07ae9261e..1bff0f7619 100644 --- a/internal/core/unittest/test_data_codec.cpp +++ b/internal/core/unittest/test_data_codec.cpp @@ -66,7 +66,7 @@ TEST(storage, InsertDataBoolNullable) { milvus::storage::CreateFieldData(storage::DataType::BOOL, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -132,7 +132,7 @@ TEST(storage, InsertDataInt8Nullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT8, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -196,7 +196,7 @@ TEST(storage, InsertDataInt16Nullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT16, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -260,7 +260,7 @@ TEST(storage, InsertDataInt32Nullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT32, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -324,7 +324,7 @@ TEST(storage, InsertDataInt64Nullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT64, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -394,7 +394,7 @@ TEST(storage, InsertDataStringNullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::STRING, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -461,7 +461,7 @@ TEST(storage, InsertDataFloatNullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::FLOAT, true); std::array valid_data = {0xF3}; - field_data->FillFieldData(data.data(), valid_data.data(), data.size()); + field_data->FillFieldData(data.data(), valid_data.data(), data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -524,7 +524,7 @@ TEST(storage, InsertDataDoubleNullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::DOUBLE, true); uint8_t* valid_data = new uint8_t[1]{0xF3}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -775,7 +775,7 @@ TEST(storage, InsertDataStringArrayNullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::ARRAY, true); uint8_t* valid_data = new uint8_t[1]{0xFD}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); @@ -813,7 +813,7 @@ TEST(storage, InsertDataJsonNullable) { auto field_data = milvus::storage::CreateFieldData(storage::DataType::JSON, true); uint8_t* valid_data = new uint8_t[1]{0xFC}; - field_data->FillFieldData(data.data(), valid_data, data.size()); + field_data->FillFieldData(data.data(), valid_data, data.size(), 0); auto payload_reader = std::make_shared(field_data); diff --git a/internal/core/unittest/test_hybrid_index.cpp b/internal/core/unittest/test_hybrid_index.cpp index 2ed1d5cc00..f490ddaea0 100644 --- a/internal/core/unittest/test_hybrid_index.cpp +++ b/internal/core/unittest/test_hybrid_index.cpp @@ -130,7 +130,7 @@ class HybridIndexTestV1 : public testing::Test { ptr[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(data_.data(), ptr, data_.size()); + field_data->FillFieldData(data_.data(), ptr, data_.size(), 0); delete[] ptr; } else { field_data->FillFieldData(data_.data(), data_.size()); diff --git a/internal/core/unittest/test_inverted_index.cpp b/internal/core/unittest/test_inverted_index.cpp index 2cc6513628..a05518b035 100644 --- a/internal/core/unittest/test_inverted_index.cpp +++ b/internal/core/unittest/test_inverted_index.cpp @@ -132,7 +132,7 @@ test_run() { valid_data_[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(data.data(), valid_data_, data.size()); + field_data->FillFieldData(data.data(), valid_data_, data.size(), 0); delete[] valid_data_; } else { field_data->FillFieldData(data.data(), data.size()); @@ -525,7 +525,7 @@ test_string() { valid_data_[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(data.data(), valid_data_, data.size()); + field_data->FillFieldData(data.data(), valid_data_, data.size(), 0); delete[] valid_data_; } else { field_data->FillFieldData(data.data(), data.size()); diff --git a/internal/core/unittest/test_json_key_stats_index.cpp b/internal/core/unittest/test_json_key_stats_index.cpp index 2b0c1e595f..2485f7a924 100644 --- a/internal/core/unittest/test_json_key_stats_index.cpp +++ b/internal/core/unittest/test_json_key_stats_index.cpp @@ -85,7 +85,8 @@ class JsonKeyStatsIndexTest : public ::testing::TestWithParam { valid_data_[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(data_.data(), valid_data_, data_.size()); + field_data->FillFieldData( + data_.data(), valid_data_, data_.size(), 0); delete[] valid_data_; } else { field_data->FillFieldData(data_.data(), data_.size()); diff --git a/internal/core/unittest/test_sealed.cpp b/internal/core/unittest/test_sealed.cpp index 17baa1208a..56866d7eaf 100644 --- a/internal/core/unittest/test_sealed.cpp +++ b/internal/core/unittest/test_sealed.cpp @@ -1771,7 +1771,7 @@ TEST(Sealed, SkipIndexSkipUnaryRangeNullable) { auto int64s_field_data = storage::CreateFieldData(DataType::INT64, true, 1, 5); - int64s_field_data->FillFieldData(int64s.data(), valid_data.data(), 5); + int64s_field_data->FillFieldData(int64s.data(), valid_data.data(), 5, 0); auto load_info = PrepareSingleFieldInsertBinlog(kCollectionID, kPartitionID, kSegmentID, @@ -1842,7 +1842,7 @@ TEST(Sealed, SkipIndexSkipBinaryRangeNullable) { auto int64s_field_data = storage::CreateFieldData(DataType::INT64, true, 1, 5); - int64s_field_data->FillFieldData(int64s.data(), valid_data.data(), 5); + int64s_field_data->FillFieldData(int64s.data(), valid_data.data(), 5, 0); auto load_info = PrepareSingleFieldInsertBinlog(kCollectionID, kPartitionID, kSegmentID, diff --git a/internal/core/unittest/test_utils/DataGen.h b/internal/core/unittest/test_utils/DataGen.h index e6e7ea324d..c46d99f019 100644 --- a/internal/core/unittest/test_utils/DataGen.h +++ b/internal/core/unittest/test_utils/DataGen.h @@ -1128,7 +1128,7 @@ CreateFieldDataFromDataArray(ssize_t raw_count, valid_data[byteIndex] &= ~(1 << bitIndex); } } - field_data->FillFieldData(raw_data, valid_data.data(), raw_count); + field_data->FillFieldData(raw_data, valid_data.data(), raw_count, 0); }; if (field_meta.is_vector()) { diff --git a/internal/storage/rw.go b/internal/storage/rw.go index 76671b78cd..bf86c5b880 100644 --- a/internal/storage/rw.go +++ b/internal/storage/rw.go @@ -58,13 +58,19 @@ type rwOptions struct { type RwOption func(*rwOptions) -func DefaultRwOptions() *rwOptions { +func DefaultWriterOptions() *rwOptions { return &rwOptions{ bufferSize: packed.DefaultWriteBufferSize, multiPartUploadSize: packed.DefaultMultiPartUploadSize, } } +func DefaultReaderOptions() *rwOptions { + return &rwOptions{ + bufferSize: packed.DefaultReadBufferSize, + } +} + func WithVersion(version int64) RwOption { return func(options *rwOptions) { options.version = version @@ -192,7 +198,7 @@ func makeBlobsReader(ctx context.Context, binlogs []*datapb.FieldBinlog, downloa } func NewBinlogRecordReader(ctx context.Context, binlogs []*datapb.FieldBinlog, schema *schemapb.CollectionSchema, option ...RwOption) (RecordReader, error) { - rwOptions := DefaultRwOptions() + rwOptions := DefaultReaderOptions() for _, opt := range option { opt(rwOptions) } @@ -229,7 +235,7 @@ func NewBinlogRecordWriter(ctx context.Context, collectionID, partitionID, segme schema *schemapb.CollectionSchema, allocator allocator.Interface, chunkSize uint64, bucketName, rootPath string, maxRowNum int64, option ...RwOption, ) (BinlogRecordWriter, error) { - rwOptions := DefaultRwOptions() + rwOptions := DefaultWriterOptions() for _, opt := range option { opt(rwOptions) } diff --git a/internal/storagev2/packed/constant.go b/internal/storagev2/packed/constant.go index 4cf71d8b6d..a30e8f8e59 100644 --- a/internal/storagev2/packed/constant.go +++ b/internal/storagev2/packed/constant.go @@ -19,6 +19,8 @@ const ( ColumnGroupSizeThreshold = 1024 // 1KB // DefaultBufferSize is the default buffer size for writing data to storage. DefaultWriteBufferSize = 32 * 1024 * 1024 // 32MB + // DefaultBufferSize is the default buffer size for reading data from storage. + DefaultReadBufferSize = -1 // use -1 for unlimited // DefaultMultiPartUploadSize is the default size of each part of a multipart upload. DefaultMultiPartUploadSize = 10 * 1024 * 1024 // 10MB // Arrow will convert these field IDs to a metadata key named PARQUET:field_id on the appropriate field. diff --git a/internal/storagev2/packed/packed_test.go b/internal/storagev2/packed/packed_test.go index 2603c53b1f..e5127b7902 100644 --- a/internal/storagev2/packed/packed_test.go +++ b/internal/storagev2/packed/packed_test.go @@ -132,7 +132,7 @@ func (suite *PackedTestSuite) TestPackedMultiFiles() { defer rec.Release() paths := []string{"/tmp/100", "/tmp/101"} columnGroups := []storagecommon.ColumnGroup{{Columns: []int{2}}, {Columns: []int{0, 1}}} - bufferSize := int64(10 * 1024 * 1024) // 10MB + bufferSize := int64(-1) // unlimited multiPartUploadSize := int64(0) pw, err := NewPackedWriter(paths, suite.schema, bufferSize, multiPartUploadSize, columnGroups, nil) suite.NoError(err)