From 7ebe3d7038c25bf420cdef2bbbf0eb8f1117bdf0 Mon Sep 17 00:00:00 2001 From: zhagnlu <1542303831@qq.com> Date: Sun, 16 Mar 2025 22:20:08 +0800 Subject: [PATCH] enhance: refine chunk access logic and add some comment on data (#40618) #40367 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/common/Chunk.h | 68 ++++++-- internal/core/src/common/ChunkWriter.h | 61 ++++--- internal/core/src/mmap/ChunkedColumn.h | 35 ++-- .../unittest/test_cached_search_iterator.cpp | 6 +- internal/core/unittest/test_chunk.cpp | 152 ++++++++++++++++-- .../core/unittest/test_chunked_segment.cpp | 5 +- 6 files changed, 251 insertions(+), 76 deletions(-) diff --git a/internal/core/src/common/Chunk.h b/internal/core/src/common/Chunk.h index 94c87ee36f..d6c60b5a12 100644 --- a/internal/core/src/common/Chunk.h +++ b/internal/core/src/common/Chunk.h @@ -88,7 +88,6 @@ class Chunk { FixedVector valid_; // parse null bitmap to valid_ to be compatible with SpanBase }; - // for fixed size data, includes fixed size array class FixedWidthChunk : public Chunk { public: @@ -100,12 +99,14 @@ class FixedWidthChunk : public Chunk { bool nullable) : Chunk(row_nums, data, size, nullable), dim_(dim), - element_size_(element_size){}; + element_size_(element_size) { + auto null_bitmap_bytes_num = nullable_ ? (row_nums_ + 7) / 8 : 0; + data_start_ = data_ + null_bitmap_bytes_num; + }; milvus::SpanBase Span() const { - auto null_bitmap_bytes_num = (row_nums_ + 7) / 8; - return milvus::SpanBase(data_ + null_bitmap_bytes_num, + return milvus::SpanBase(data_start_, nullable_ ? valid_.data() : nullptr, row_nums_, element_size_ * dim_); @@ -113,30 +114,47 @@ class FixedWidthChunk : public Chunk { const char* ValueAt(int64_t idx) const override { - auto null_bitmap_bytes_num = (row_nums_ + 7) / 8; - return data_ + null_bitmap_bytes_num + idx * element_size_ * dim_; + return data_start_ + idx * element_size_ * dim_; } const char* Data() const override { - auto null_bitmap_bytes_num = (row_nums_ + 7) / 8; - return data_ + null_bitmap_bytes_num; + return data_start_; } private: int dim_; int element_size_; + const char* data_start_; }; +// A StringChunk is a class that represents a collection of strings stored in a contiguous memory block. +// It is initialized with the number of rows, a pointer to the data, the size of the data, and a boolean +// indicating whether the data can contain null values. The data is accessed using offsets, which are +// stored after an optional null bitmap. Each string is represented by a range in the data block, defined +// by these offsets. +// +// Example of a valid StringChunk: +// +// Suppose we have a data block containing the strings "apple", "banana", and "cherry", and we want to +// create a StringChunk for these strings. The data block might look like this: +// +// [null_bitmap][offsets][string_data] +// [00000000] [17, 22, 28, 34] ["apple", "banana", "cherry"] +// +// Here, the null_bitmap is empty (indicating no nulls), the offsets array indicates the start of each +// string in the data block, and the string_data contains the actual string content. +// +// StringChunk exampleChunk(3, dataPointer, dataSize, false); +// +// In this example, 'exampleChunk' is a StringChunk with 3 rows, a pointer to the data stored in 'dataPointer', +// a total data size of 'dataSize', and it does not support nullability. class StringChunk : public Chunk { public: StringChunk() = default; StringChunk(int32_t row_nums, char* data, uint64_t size, bool nullable) : Chunk(row_nums, data, size, nullable) { - auto null_bitmap_bytes_num = 0; - if (nullable) { - null_bitmap_bytes_num = (row_nums + 7) / 8; - } + auto null_bitmap_bytes_num = nullable_ ? (row_nums_ + 7) / 8 : 0; offsets_ = reinterpret_cast(data + null_bitmap_bytes_num); } @@ -196,6 +214,32 @@ class StringChunk : public Chunk { using JSONChunk = StringChunk; + +// An ArrayChunk is a class that represents a collection of arrays stored in a contiguous memory block. +// It is initialized with the number of rows, a pointer to the data, the size of the data, the element type, +// and a boolean indicating whether the data can contain null values. The data is accessed using offsets and lengths, +// which are stored after an optional null bitmap. Each array is represented by a range in the data block. +// +// Example of a valid ArrayChunk: +// +// Suppose we have a data block containing arrays of integers [1, 2, 3], [4, 5], and [6, 7, 8, 9], and we want to +// create an ArrayChunk for these arrays. The data block might look like this: +// +// [null_bitmap][offsets_lens][array_data] +// [00000000] [24, 3, 36, 2, 44, 4, 60] [1, 2, 3, 4, 5, 6, 7, 8, 9] +// +// For string arrays, the structure is more complex as each string element needs its own offset: +// [null_bitmap][offsets_lens][array1_offsets][array1_data][array2_offsets][array2_data][array3_offsets][array3_data] +// [00000000] [24, 3, 48, 2, 64, 4, 96] [0, 5, 11, 16] ["hello", "world", "!"] [0, 3, 6] ["foo", "bar"] [0, 6, 12, 18, 24] ["apple", "orange", "banana", "grape"] +// +// Here, the null_bitmap is empty (indicating no nulls), the offsets_lens array contains pairs of (offset, length) +// for each array, and the array_data contains the actual array elements. +// +// ArrayChunk exampleChunk(3, dataPointer, dataSize, DataType::INT32, false); +// +// In this example, 'exampleChunk' is an ArrayChunk with 3 rows, a pointer to the data stored in 'dataPointer', +// a total data size of 'dataSize', element type INT32, and it does not support nullability. + class ArrayChunk : public Chunk { public: ArrayChunk(int32_t row_nums, diff --git a/internal/core/src/common/ChunkWriter.h b/internal/core/src/common/ChunkWriter.h index 5fe4c0ddf0..25ec68f06f 100644 --- a/internal/core/src/common/ChunkWriter.h +++ b/internal/core/src/common/ChunkWriter.h @@ -22,7 +22,6 @@ #include "common/EasyAssert.h" #include "common/FieldDataInterface.h" namespace milvus { - class ChunkWriterBase { public: explicit ChunkWriterBase(bool nullable) : nullable_(nullable) { @@ -86,9 +85,12 @@ class ChunkWriter final : public ChunkWriterBase { for (auto& batch : batch_vec) { row_nums += batch->num_rows(); auto data = batch->column(0); - auto array = std::dynamic_pointer_cast(data); - auto null_bitmap_n = (data->length() + 7) / 8; - size += null_bitmap_n + array->length() * dim_ * sizeof(T); + auto array = std::static_pointer_cast(data); + if (nullable_) { + auto null_bitmap_n = (data->length() + 7) / 8; + size += null_bitmap_n; + } + size += array->length() * dim_ * sizeof(T); } row_nums_ = row_nums; @@ -97,23 +99,27 @@ class ChunkWriter final : public ChunkWriterBase { } else { target_ = std::make_shared(size); } - - // chunk layout: nullbitmap, data1, data2, ..., datan - for (auto& batch : batch_vec) { - auto data = batch->column(0); - auto null_bitmap = data->null_bitmap_data(); - auto null_bitmap_n = (data->length() + 7) / 8; - if (null_bitmap) { - target_->write(null_bitmap, null_bitmap_n); - } else { - std::vector null_bitmap(null_bitmap_n, 0xff); - target_->write(null_bitmap.data(), null_bitmap_n); + // Chunk layout: + // 1. Null bitmap (if nullable_=true): Indicates which values are null + // 2. Data values: Contiguous storage of data elements in the order: + // data1, data2, ..., dataN where each data element has size dim_*sizeof(T) + if (nullable_) { + for (auto& batch : batch_vec) { + auto data = batch->column(0); + auto null_bitmap = data->null_bitmap_data(); + auto null_bitmap_n = (data->length() + 7) / 8; + if (null_bitmap) { + target_->write(null_bitmap, null_bitmap_n); + } else { + std::vector null_bitmap(null_bitmap_n, 0xff); + target_->write(null_bitmap.data(), null_bitmap_n); + } } } for (auto& batch : batch_vec) { auto data = batch->column(0); - auto array = std::dynamic_pointer_cast(data); + auto array = std::static_pointer_cast(data); auto data_ptr = array->raw_values(); target_->write(data_ptr, array->length() * dim_ * sizeof(T)); } @@ -151,16 +157,19 @@ ChunkWriter::write( } else { target_ = std::make_shared(size); } - // chunk layout: nullbitmap, data1, data2, ..., datan - for (auto& batch : batch_vec) { - auto data = batch->column(0); - auto null_bitmap = data->null_bitmap_data(); - auto null_bitmap_n = (data->length() + 7) / 8; - if (null_bitmap) { - target_->write(null_bitmap, null_bitmap_n); - } else { - std::vector null_bitmap(null_bitmap_n, 0xff); - target_->write(null_bitmap.data(), null_bitmap_n); + + if (nullable_) { + // chunk layout: nullbitmap, data1, data2, ..., datan + for (auto& batch : batch_vec) { + auto data = batch->column(0); + auto null_bitmap = data->null_bitmap_data(); + auto null_bitmap_n = (data->length() + 7) / 8; + if (null_bitmap) { + target_->write(null_bitmap, null_bitmap_n); + } else { + std::vector null_bitmap(null_bitmap_n, 0xff); + target_->write(null_bitmap.data(), null_bitmap_n); + } } } diff --git a/internal/core/src/mmap/ChunkedColumn.h b/internal/core/src/mmap/ChunkedColumn.h index 73fbdf2d8c..618d19a7cb 100644 --- a/internal/core/src/mmap/ChunkedColumn.h +++ b/internal/core/src/mmap/ChunkedColumn.h @@ -184,6 +184,11 @@ class ChunkedColumnBase : public ColumnBase { return {chunk_idx, offset_in_chunk}; } + std::shared_ptr + GetChunk(int64_t chunk_id) const { + return chunks_[chunk_id]; + } + int64_t GetNumRowsUntilChunk(int64_t chunk_id) const { return num_rows_until_chunk_[chunk_id]; @@ -258,7 +263,7 @@ class ChunkedColumn : public ChunkedColumnBase { SpanBase Span(int64_t chunk_id) const override { - return std::dynamic_pointer_cast(chunks_[chunk_id]) + return std::static_pointer_cast(chunks_[chunk_id]) ->Span(); } }; @@ -289,7 +294,7 @@ class ChunkedSparseFloatColumn : public ChunkedColumnBase { chunks_.push_back(chunk); dim_ = std::max( dim_, - std::dynamic_pointer_cast(chunk)->Dim()); + std::static_pointer_cast(chunk)->Dim()); } SpanBase @@ -339,19 +344,14 @@ class ChunkedVariableColumn : public ChunkedColumnBase { StringViews(int64_t chunk_id, std::optional> offset_len = std::nullopt) const override { - return std::dynamic_pointer_cast(chunks_[chunk_id]) + return std::static_pointer_cast(chunks_[chunk_id]) ->StringViews(offset_len); } - std::shared_ptr - GetChunk(int64_t chunk_id) const { - return chunks_[chunk_id]; - } - std::pair, FixedVector> ViewsByOffsets(int64_t chunk_id, const FixedVector& offsets) const override { - return std::dynamic_pointer_cast(chunks_[chunk_id]) + return std::static_pointer_cast(chunks_[chunk_id]) ->ViewsByOffsets(offsets); } @@ -363,7 +363,7 @@ class ChunkedVariableColumn : public ChunkedColumnBase { std::vector elements; elements.push_back( {chunks_[chunk_id]->Data(), - std::dynamic_pointer_cast(chunks_[chunk_id]) + std::static_pointer_cast(chunks_[chunk_id]) ->Offsets(), static_cast(start_offset), static_cast(start_offset + length)}); @@ -379,12 +379,8 @@ class ChunkedVariableColumn : public ChunkedColumnBase { } auto [chunk_id, offset_in_chunk] = GetChunkIDByOffset(i); - auto data = chunks_[chunk_id]->Data(); - auto offsets = std::dynamic_pointer_cast(chunks_[chunk_id]) - ->Offsets(); - auto len = offsets[offset_in_chunk + 1] - offsets[offset_in_chunk]; - - return ViewType(data + offsets[offset_in_chunk], len); + std::string_view str_view = std::static_pointer_cast(chunks_[chunk_id])->operator[](offset_in_chunk); + return ViewType(str_view.data(), str_view.size()); } std::string_view @@ -392,7 +388,6 @@ class ChunkedVariableColumn : public ChunkedColumnBase { return std::string_view((*this)[i]); } }; - class ChunkedArrayColumn : public ChunkedColumnBase { public: // memory mode ctor @@ -420,14 +415,14 @@ class ChunkedArrayColumn : public ChunkedColumnBase { ArrayView operator[](const int i) const { auto [chunk_id, offset_in_chunk] = GetChunkIDByOffset(i); - return std::dynamic_pointer_cast(chunks_[chunk_id]) + return std::static_pointer_cast(chunks_[chunk_id]) ->View(offset_in_chunk); } ScalarArray RawAt(const int i) const { auto [chunk_id, offset_in_chunk] = GetChunkIDByOffset(i); - return std::dynamic_pointer_cast(chunks_[chunk_id]) + return std::static_pointer_cast(chunks_[chunk_id]) ->View(offset_in_chunk) .output_data(); } @@ -436,7 +431,7 @@ class ChunkedArrayColumn : public ChunkedColumnBase { ArrayViews(int64_t chunk_id, std::optional> offset_len = std::nullopt) const override { - return std::dynamic_pointer_cast(chunks_[chunk_id]) + return std::static_pointer_cast(chunks_[chunk_id]) ->Views(offset_len); } }; diff --git a/internal/core/unittest/test_cached_search_iterator.cpp b/internal/core/unittest/test_cached_search_iterator.cpp index e6154eae05..b31134a1bb 100644 --- a/internal/core/unittest/test_cached_search_iterator.cpp +++ b/internal/core/unittest/test_cached_search_iterator.cpp @@ -252,12 +252,10 @@ class CachedSearchIteratorTest size_t offset = 0; for (size_t i = 0; i < num_chunks_; ++i) { const size_t rows = std::min(nb_ - offset, kSizePerChunk); - const size_t chunk_bitset_size = (rows + 7) / 8; - const size_t buf_size = - chunk_bitset_size + rows * dim_ * sizeof(float); + const size_t buf_size = rows * dim_ * sizeof(float); auto& chunk_data = column_data_[i]; chunk_data.resize(buf_size); - memcpy(chunk_data.data() + chunk_bitset_size, + memcpy(chunk_data.data(), base_dataset_.cbegin() + offset * dim_, rows * dim_ * sizeof(float)); column_->AddChunk(std::make_shared( diff --git a/internal/core/unittest/test_chunk.cpp b/internal/core/unittest/test_chunk.cpp index 384c3b730e..fdc4a58a6a 100644 --- a/internal/core/unittest/test_chunk.cpp +++ b/internal/core/unittest/test_chunk.cpp @@ -103,6 +103,48 @@ TEST(chunk, test_variable_field) { } } +TEST(chunk, test_variable_field_nullable) { + FixedVector data = { + "test1", "test2", "test3", "test4", "test5"}; + FixedVector validity = {true, false, true, false, true}; + + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::VARCHAR, true); + uint8_t* valid_data = new uint8_t[1]{0x15}; // 10101 in binary + field_data->FillFieldData(data.data(), valid_data, data.size()); + delete[] valid_data; + + storage::InsertEventData event_data; + event_data.field_data = field_data; + auto ser_data = event_data.Serialize(); + auto buffer = std::make_shared( + ser_data.data() + 2 * sizeof(milvus::Timestamp), + ser_data.size() - 2 * sizeof(milvus::Timestamp)); + + parquet::arrow::FileReaderBuilder reader_builder; + auto s = reader_builder.Open(buffer); + EXPECT_TRUE(s.ok()); + std::unique_ptr arrow_reader; + s = reader_builder.Build(&arrow_reader); + EXPECT_TRUE(s.ok()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + s = arrow_reader->GetRecordBatchReader(&rb_reader); + EXPECT_TRUE(s.ok()); + + FieldMeta field_meta( + FieldName("a"), milvus::FieldId(1), DataType::STRING, true); + auto chunk = create_chunk(field_meta, 1, rb_reader); + auto views = std::dynamic_pointer_cast(chunk)->StringViews( + std::nullopt); + for (size_t i = 0; i < data.size(); ++i) { + EXPECT_EQ(views.second[i], validity[i]); + if (validity[i]) { + EXPECT_EQ(views.first[i], data[i]); + } + } +} + TEST(chunk, test_json_field) { auto row_num = 100; FixedVector data; @@ -221,12 +263,16 @@ TEST(chunk, test_json_field) { } } -TEST(chunk, test_null_field) { +TEST(chunk, test_null_int64) { FixedVector data = {1, 2, 3, 4, 5}; auto field_data = milvus::storage::CreateFieldData(storage::DataType::INT64, true); - uint8_t* valid_data_ = new uint8_t[1]{0x13}; - field_data->FillFieldData(data.data(), valid_data_, data.size()); + + // Set up validity bitmap: 10011 (1st, 4th, and 5th are valid) + uint8_t* valid_data = new uint8_t[1]{0x13}; // 10011 in binary + field_data->FillFieldData(data.data(), valid_data, data.size()); + delete[] valid_data; + storage::InsertEventData event_data; event_data.field_data = field_data; auto ser_data = event_data.Serialize(); @@ -248,17 +294,25 @@ TEST(chunk, test_null_field) { FieldMeta field_meta( FieldName("a"), milvus::FieldId(1), DataType::INT64, true); auto chunk = create_chunk(field_meta, 1, rb_reader); - auto span = std::dynamic_pointer_cast(chunk)->Span(); + auto fixed_chunk = std::dynamic_pointer_cast(chunk); + auto span = fixed_chunk->Span(); EXPECT_EQ(span.row_count(), data.size()); - data = {1, 2, 0, 0, 5}; - FixedVector valid_data = {true, true, false, false, true}; + + // Check validity based on our bitmap pattern (10011) + EXPECT_TRUE(fixed_chunk->isValid(0)); + EXPECT_TRUE(fixed_chunk->isValid(1)); + EXPECT_FALSE(fixed_chunk->isValid(2)); + EXPECT_FALSE(fixed_chunk->isValid(3)); + EXPECT_TRUE(fixed_chunk->isValid(4)); + + // Verify data for valid entries for (size_t i = 0; i < data.size(); ++i) { - auto n = *(int64_t*)((char*)span.data() + i * span.element_sizeof()); - EXPECT_EQ(n, data[i]); - auto v = *(bool*)((char*)span.valid_data() + i); - EXPECT_EQ(v, valid_data[i]); + if (fixed_chunk->isValid(i)) { + auto n = + *(int64_t*)((char*)span.data() + i * span.element_sizeof()); + EXPECT_EQ(n, data[i]); + } } - delete[] valid_data_; } TEST(chunk, test_array) { @@ -307,6 +361,82 @@ TEST(chunk, test_array) { } } +TEST(chunk, test_null_array) { + // Create a test with some arrays being null + auto array_count = 5; + FixedVector data; + data.reserve(array_count); + + // Create a string array to use for non-null values + milvus::proto::schema::ScalarField field_string_data; + field_string_data.mutable_string_data()->add_data("test1"); + field_string_data.mutable_string_data()->add_data("test2"); + field_string_data.mutable_string_data()->add_data("test3"); + auto string_array = Array(field_string_data); + + for (int i = 0; i < array_count; i++) { + data.emplace_back(string_array); + } + + auto field_data = + milvus::storage::CreateFieldData(storage::DataType::ARRAY, true); + + // Set up validity bitmap: 10101 (1st, 3rd, and 5th are valid) + uint8_t* valid_data = new uint8_t[1]{0x15}; // 10101 in binary + field_data->FillFieldData(data.data(), valid_data, data.size()); + delete[] valid_data; + + storage::InsertEventData event_data; + event_data.field_data = field_data; + auto ser_data = event_data.Serialize(); + auto buffer = std::make_shared( + ser_data.data() + 2 * sizeof(milvus::Timestamp), + ser_data.size() - 2 * sizeof(milvus::Timestamp)); + + parquet::arrow::FileReaderBuilder reader_builder; + auto s = reader_builder.Open(buffer); + EXPECT_TRUE(s.ok()); + std::unique_ptr arrow_reader; + s = reader_builder.Build(&arrow_reader); + EXPECT_TRUE(s.ok()); + + std::shared_ptr<::arrow::RecordBatchReader> rb_reader; + s = arrow_reader->GetRecordBatchReader(&rb_reader); + EXPECT_TRUE(s.ok()); + + FieldMeta field_meta(FieldName("a"), + milvus::FieldId(1), + DataType::ARRAY, + DataType::STRING, + true); + auto chunk = create_chunk(field_meta, 1, rb_reader); + auto [views, valid] = + std::dynamic_pointer_cast(chunk)->Views(std::nullopt); + + EXPECT_EQ(views.size(), array_count); + EXPECT_EQ(valid.size(), array_count); + + // Check validity based on our bitmap pattern (10101) + EXPECT_TRUE(valid[0]); + EXPECT_FALSE(valid[1]); + EXPECT_TRUE(valid[2]); + EXPECT_FALSE(valid[3]); + EXPECT_TRUE(valid[4]); + + // Verify data for valid arrays + for (size_t i = 0; i < array_count; i++) { + if (valid[i]) { + auto& arr = views[i]; + EXPECT_EQ(arr.length(), + field_string_data.string_data().data_size()); + for (size_t j = 0; j < arr.length(); j++) { + auto str = arr.get_data(j); + EXPECT_EQ(str, field_string_data.string_data().data(j)); + } + } + } +} + TEST(chunk, test_array_views) { milvus::proto::schema::ScalarField field_string_data; field_string_data.mutable_string_data()->add_data("a"); diff --git a/internal/core/unittest/test_chunked_segment.cpp b/internal/core/unittest/test_chunked_segment.cpp index fda85f019e..cf72a1f8a6 100644 --- a/internal/core/unittest/test_chunked_segment.cpp +++ b/internal/core/unittest/test_chunked_segment.cpp @@ -68,7 +68,6 @@ TEST(test_chunk_segment, TestSearchOnSealed) { int chunk_size = 100; int total_row_count = chunk_num * chunk_size; int bitset_size = (total_row_count + 7) / 8; - int chunk_bitset_size = (chunk_size + 7) / 8; auto column = std::make_shared(); auto schema = std::make_shared(); @@ -78,11 +77,11 @@ TEST(test_chunk_segment, TestSearchOnSealed) { for (int i = 0; i < chunk_num; i++) { auto dataset = segcore::DataGen(schema, chunk_size); auto data = dataset.get_col(fakevec_id); - auto buf_size = chunk_bitset_size + 4 * data.size(); + auto buf_size = 4 * data.size(); char* buf = new char[buf_size]; defer.AddDefer([buf]() { delete[] buf; }); - memcpy(buf + chunk_bitset_size, data.data(), 4 * data.size()); + memcpy(buf, data.data(), 4 * data.size()); auto chunk = std::make_shared( chunk_size, dim, buf, buf_size, 4, false);