// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include "arrow/array/array_primitive.h"
#include "arrow/type_fwd.h"
#include "common/ChunkTarget.h"
#include "arrow/record_batch.h"
#include "common/Chunk.h"

namespace milvus {

class ChunkWriterBase {
 public:
    explicit ChunkWriterBase(bool nullable) : nullable_(nullable) {
    }

    virtual std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& data) = 0;

    virtual void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) = 0;

 protected:
    void
    write_null_bit_maps(
        const std::vector<std::tuple<const uint8_t*, int64_t, int64_t>>&
            null_bitmaps,
        const std::shared_ptr<ChunkTarget>& target) {
        if (nullable_) {
            // Merge all null bitmaps at the bit level. A byte-wise concatenation would
            // dislocate null indices across chunks: e.g. bitmaps [0xFF, 0x00] with bit
            // sizes [7, 8] must merge to [0x7F, 0x00], not [0xFF, 0x00]; otherwise every
            // null index after the first bitmap would be shifted.
            std::vector<uint8_t> merged_null_bitmap;
            int64_t size_total_bit = 0;
            for (auto [data, size_bits, offset_bits] : null_bitmaps) {
                // resize is in bytes
                merged_null_bitmap.resize((size_total_bit + size_bits + 7) / 8,
                                          0xFF);
                if (data != nullptr) {
                    bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                        data,
                        offset_bits,
                        merged_null_bitmap.data(),
                        size_total_bit,
                        size_bits);
                } else {
                    // Arrow omits the bitmap entirely when an array has no nulls,
                    // so append an all-valid (all-ones) bitmap in its place.
                    std::vector<uint8_t> null_bitmap(size_bits, 0xff);
                    bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                        null_bitmap.data(),
                        0,
                        merged_null_bitmap.data(),
                        size_total_bit,
                        size_bits);
                }
                size_total_bit += size_bits;
            }
            target->write(merged_null_bitmap.data(), (size_total_bit + 7) / 8);
        }
    }

 protected:
    size_t row_nums_ = 0;
    bool nullable_ = false;
};
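
// Usage note (illustrative sketch, not a contract stated in this header): a writer is
// driven in two passes -- calculate_size() reports how many bytes the serialized chunk
// needs, then write_to_target() streams the same arrays into a ChunkTarget of that size.
// A hypothetical caller might look like the following; MemChunkTarget and its
// constructor are assumptions about the concrete ChunkTarget type:
//
//   auto writer = std::make_unique<ChunkWriter<arrow::Int64Array, int64_t>>(
//       /*dim=*/1, /*nullable=*/true);
//   auto [bytes, rows] = writer->calculate_size(array_vec);
//   auto target = std::make_shared<MemChunkTarget>(bytes);  // assumed concrete target
//   writer->write_to_target(array_vec, target);
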
template <typename ArrowType, typename T>
class ChunkWriter final : public ChunkWriterBase {
 public:
    ChunkWriter(int dim, bool nullable) : ChunkWriterBase(nullable), dim_(dim) {
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override {
        size_t size = 0;
        size_t row_nums = 0;
        for (const auto& data : array_vec) {
            row_nums += data->length();
            auto array = std::static_pointer_cast<ArrowType>(data);
            size += array->length() * dim_ * sizeof(T);
        }
        if (nullable_) {
            size += (row_nums + 7) / 8;
        }
        row_nums_ = row_nums;
        return {size, row_nums};
    }

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override {
        // Chunk layout:
        // 1. Null bitmap (only if nullable_ == true): indicates which values are null
        // 2. Data values: contiguous storage of the data elements in order
        //    data1, data2, ..., dataN, each element occupying dim_ * sizeof(T) bytes
        if (nullable_) {
            // tuple of (bitmap data, length in bits, offset in bits)
            std::vector<std::tuple<const uint8_t*, int64_t, int64_t>>
                null_bitmaps;
            for (const auto& data : array_vec) {
                null_bitmaps.emplace_back(
                    data->null_bitmap_data(), data->length(), data->offset());
            }
            write_null_bit_maps(null_bitmaps, target);
        }

        for (const auto& data : array_vec) {
            auto array = std::static_pointer_cast<ArrowType>(data);
            auto data_ptr = array->raw_values();
            target->write(data_ptr, array->length() * dim_ * sizeof(T));
        }
    }

 private:
    const int64_t dim_;
};

template <typename T>
class NullableVectorChunkWriter final : public ChunkWriterBase {
 public:
    NullableVectorChunkWriter(int64_t dim, bool nullable)
        : ChunkWriterBase(nullable), dim_(dim) {
        Assert(nullable && "NullableVectorChunkWriter requires nullable=true");
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override {
        size_t size = 0;
        size_t row_nums = 0;
        for (const auto& data : array_vec) {
            row_nums += data->length();
            auto binary_array =
                std::static_pointer_cast<arrow::BinaryArray>(data);
            int64_t valid_count = data->length() - binary_array->null_count();
            size += valid_count * dim_ * sizeof(T);
        }
        // null bitmap size
        size += (row_nums + 7) / 8;
        row_nums_ = row_nums;
        return {size, row_nums};
    }

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override {
        std::vector<std::tuple<const uint8_t*, int64_t, int64_t>> null_bitmaps;
        for (const auto& data : array_vec) {
            null_bitmaps.emplace_back(
                data->null_bitmap_data(), data->length(), data->offset());
        }
        write_null_bit_maps(null_bitmaps, target);

        for (const auto& data : array_vec) {
            auto binary_array =
                std::static_pointer_cast<arrow::BinaryArray>(data);
            auto data_offset = binary_array->value_offset(0);
            auto data_ptr = binary_array->value_data()->data() + data_offset;
            int64_t valid_count = data->length() - binary_array->null_count();
            target->write(data_ptr, valid_count * dim_ * sizeof(T));
        }
    }

 private:
    const int64_t dim_;
};

template <>
inline void
ChunkWriter<arrow::BooleanArray, bool>::write_to_target(
    const arrow::ArrayVector& array_vec,
    const std::shared_ptr<ChunkTarget>& target) {
    if (nullable_) {
        // tuple of (bitmap data, length in bits, offset in bits)
        std::vector<std::tuple<const uint8_t*, int64_t, int64_t>> null_bitmaps;
        for (const auto& data : array_vec) {
            null_bitmaps.emplace_back(
                data->null_bitmap_data(), data->length(), data->offset());
        }
        write_null_bit_maps(null_bitmaps, target);
    }
    for (const auto& data : array_vec) {
        auto array = std::dynamic_pointer_cast<arrow::BooleanArray>(data);
        for (int i = 0; i < array->length(); i++) {
            auto value = array->Value(i);
            target->write(&value, sizeof(bool));
        }
    }
}

class StringChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;

 private:
    std::vector<std::string_view> strs_;
};

class JSONChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;
};

class GeometryChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;
};
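
// The writers below cover nested and vector payloads. As suggested by their
// constructors: ArrayChunkWriter serializes ARRAY fields whose elements are of
// element_type_, VectorArrayChunkWriter serializes arrays of fixed-dim vectors
// (hence the extra dim parameter and nullable fixed to false), and
// SparseFloatVectorChunkWriter handles sparse float vector rows. Their
// calculate_size/write_to_target implementations live outside this header.
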
class ArrayChunkWriter : public ChunkWriterBase {
 public:
    ArrayChunkWriter(const milvus::DataType element_type, bool nullable)
        : ChunkWriterBase(nullable), element_type_(element_type) {
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;

 private:
    const milvus::DataType element_type_;
};

class VectorArrayChunkWriter : public ChunkWriterBase {
 public:
    VectorArrayChunkWriter(int64_t dim, const milvus::DataType element_type)
        : ChunkWriterBase(false), element_type_(element_type), dim_(dim) {
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;

 private:
    const milvus::DataType element_type_;
    const int64_t dim_;
};

class SparseFloatVectorChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;
};

std::unique_ptr<Chunk>
create_chunk(const FieldMeta& field_meta,
             const arrow::ArrayVector& array_vec,
             const std::string& file_path = "",
             proto::common::LoadPriority load_priority =
                 proto::common::LoadPriority::HIGH);

std::unordered_map<FieldId, std::unique_ptr<Chunk>>
create_group_chunk(const std::vector<FieldId>& field_ids,
                   const std::vector<FieldMeta>& field_metas,
                   const std::vector<arrow::ArrayVector>& array_vec,
                   const std::string& file_path = "",
                   proto::common::LoadPriority load_priority =
                       proto::common::LoadPriority::HIGH);

arrow::ArrayVector
read_single_column_batches(std::shared_ptr<arrow::RecordBatchReader> reader);

}  // namespace milvus
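
// Illustrative end-to-end sketch (hypothetical caller; the reader and field_meta
// variables are assumptions, not defined by this header):
//
//   std::shared_ptr<arrow::RecordBatchReader> reader = /* opened elsewhere */;
//   arrow::ArrayVector batches = milvus::read_single_column_batches(reader);
//   std::unique_ptr<milvus::Chunk> chunk =
//       milvus::create_chunk(field_meta, batches);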