// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #ifndef ARROW_ARRAY_H #define ARROW_ARRAY_H #include #include #include #include #include #include #include #include "arrow/buffer.h" #include "arrow/compare.h" #include "arrow/result.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" // IWYU pragma: export #include "arrow/util/visibility.h" namespace arrow { class Array; class ArrayVisitor; // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count // to -1 (any negative number will do). When Array::null_count is called the // first time, the null count will be computed. See ARROW-33 constexpr int64_t kUnknownNullCount = -1; class MemoryPool; class Status; // ---------------------------------------------------------------------- // Generic array data container /// \class ArrayData /// \brief Mutable container for generic Arrow array data /// /// This data structure is a self-contained representation of the memory and /// metadata inside an Arrow array data structure (called vectors in Java). The /// classes arrow::Array and its subclasses provide strongly-typed accessors /// with support for the visitor pattern and other affordances. /// /// This class is designed for easy internal data manipulation, analytical data /// processing, and data transport to and from IPC messages. For example, we /// could cast from int64 to float64 like so: /// /// Int64Array arr = GetMyData(); /// auto new_data = arr.data()->Copy(); /// new_data->type = arrow::float64(); /// DoubleArray double_arr(new_data); /// /// This object is also useful in an analytics setting where memory may be /// reused. For example, if we had a group of operations all returning doubles, /// say: /// /// Log(Sqrt(Expr(arr))) /// /// Then the low-level implementations of each of these functions could have /// the signatures /// /// void Log(const ArrayData& values, ArrayData* out); /// /// As another example a function may consume one or more memory buffers in an /// input array and replace them with newly-allocated data, changing the output /// data type as well. struct ARROW_EXPORT ArrayData { ArrayData() : length(0), null_count(0), offset(0) {} ArrayData(const std::shared_ptr& type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0) : type(type), length(length), null_count(null_count), offset(offset) {} ArrayData(const std::shared_ptr& type, int64_t length, const std::vector>& buffers, int64_t null_count = kUnknownNullCount, int64_t offset = 0) : ArrayData(type, length, null_count, offset) { this->buffers = buffers; } ArrayData(const std::shared_ptr& type, int64_t length, const std::vector>& buffers, const std::vector>& child_data, int64_t null_count = kUnknownNullCount, int64_t offset = 0) : ArrayData(type, length, null_count, offset) { this->buffers = buffers; this->child_data = child_data; } ArrayData(const std::shared_ptr& type, int64_t length, std::vector>&& buffers, int64_t null_count = kUnknownNullCount, int64_t offset = 0) : ArrayData(type, length, null_count, offset) { this->buffers = std::move(buffers); } static std::shared_ptr Make(const std::shared_ptr& type, int64_t length, std::vector>&& buffers, int64_t null_count = kUnknownNullCount, int64_t offset = 0); static std::shared_ptr Make( const std::shared_ptr& type, int64_t length, const std::vector>& buffers, int64_t null_count = kUnknownNullCount, int64_t offset = 0); static std::shared_ptr Make( const std::shared_ptr& type, int64_t length, const std::vector>& buffers, const std::vector>& child_data, int64_t null_count = kUnknownNullCount, int64_t offset = 0); static std::shared_ptr Make(const std::shared_ptr& type, int64_t length, int64_t null_count = kUnknownNullCount, int64_t offset = 0); // Move constructor ArrayData(ArrayData&& other) noexcept : type(std::move(other.type)), length(other.length), null_count(other.null_count), offset(other.offset), buffers(std::move(other.buffers)), child_data(std::move(other.child_data)), dictionary(std::move(other.dictionary)) {} // Copy constructor ArrayData(const ArrayData& other) noexcept : type(other.type), length(other.length), null_count(other.null_count), offset(other.offset), buffers(other.buffers), child_data(other.child_data), dictionary(other.dictionary) {} // Move assignment ArrayData& operator=(ArrayData&& other) = default; // Copy assignment ArrayData& operator=(const ArrayData& other) = default; std::shared_ptr Copy() const { return std::make_shared(*this); } // Access a buffer's data as a typed C pointer template inline const T* GetValues(int i, int64_t absolute_offset) const { if (buffers[i]) { return reinterpret_cast(buffers[i]->data()) + absolute_offset; } else { return NULLPTR; } } template inline const T* GetValues(int i) const { return GetValues(i, offset); } // Access a buffer's data as a typed C pointer template inline T* GetMutableValues(int i, int64_t absolute_offset) { if (buffers[i]) { return reinterpret_cast(buffers[i]->mutable_data()) + absolute_offset; } else { return NULLPTR; } } template inline T* GetMutableValues(int i) { return GetMutableValues(i, offset); } // Construct a zero-copy slice of the data with the indicated offset and length ArrayData Slice(int64_t offset, int64_t length) const; /// \brief Return null count, or compute and set it if it's not known int64_t GetNullCount() const; std::shared_ptr type; int64_t length; mutable int64_t null_count; // The logical start point into the physical buffers (in values, not bytes). // Note that, for child data, this must be *added* to the child data's own offset. int64_t offset; std::vector> buffers; std::vector> child_data; // The dictionary for this Array, if any. Only used for dictionary // type std::shared_ptr dictionary; }; /// \brief Create a strongly-typed Array instance from generic ArrayData /// \param[in] data the array contents /// \return the resulting Array instance ARROW_EXPORT std::shared_ptr MakeArray(const std::shared_ptr& data); /// \brief Create a strongly-typed Array instance with all elements null /// \param[in] type the array type /// \param[in] length the array length /// \param[out] out resulting Array instance ARROW_EXPORT Status MakeArrayOfNull(const std::shared_ptr& type, int64_t length, std::shared_ptr* out); // ---------------------------------------------------------------------- // User array accessor types /// \brief Array base type /// Immutable data array with some logical type and some length. /// /// Any memory is owned by the respective Buffer instance (or its parents). /// /// The base class is only required to have a null bitmap buffer if the null /// count is greater than 0 /// /// If known, the null count can be provided in the base Array constructor. If /// the null count is not known, pass -1 to indicate that the null count is to /// be computed on the first call to null_count() class ARROW_EXPORT Array { public: virtual ~Array() = default; /// \brief Return true if value at index is null. Does not boundscheck bool IsNull(int64_t i) const { return null_bitmap_data_ != NULLPTR && !BitUtil::GetBit(null_bitmap_data_, i + data_->offset); } /// \brief Return true if value at index is valid (not null). Does not /// boundscheck bool IsValid(int64_t i) const { return null_bitmap_data_ == NULLPTR || BitUtil::GetBit(null_bitmap_data_, i + data_->offset); } /// Size in the number of elements this array contains. int64_t length() const { return data_->length; } /// A relative position into another array's data, to enable zero-copy /// slicing. This value defaults to zero int64_t offset() const { return data_->offset; } /// The number of null entries in the array. If the null count was not known /// at time of construction (and set to a negative value), then the null /// count will be computed and cached on the first invocation of this /// function int64_t null_count() const; std::shared_ptr type() const { return data_->type; } Type::type type_id() const { return data_->type->id(); } /// Buffer for the null bitmap. /// /// Note that for `null_count == 0`, this can be null. /// This buffer does not account for any slice offset std::shared_ptr null_bitmap() const { return data_->buffers[0]; } /// Raw pointer to the null bitmap. /// /// Note that for `null_count == 0`, this can be null. /// This buffer does not account for any slice offset const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } /// Equality comparison with another array bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; bool Equals(const std::shared_ptr& arr, const EqualOptions& = EqualOptions::Defaults()) const; /// Approximate equality comparison with another array /// /// epsilon is only used if this is FloatArray or DoubleArray bool ApproxEquals(const std::shared_ptr& arr, const EqualOptions& = EqualOptions::Defaults()) const; bool ApproxEquals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const; /// Compare if the range of slots specified are equal for the given array and /// this array. end_idx exclusive. This methods does not bounds check. bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const Array& other) const; bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx, const std::shared_ptr& other) const; bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx, int64_t other_start_idx) const; bool RangeEquals(const std::shared_ptr& other, int64_t start_idx, int64_t end_idx, int64_t other_start_idx) const; Status Accept(ArrayVisitor* visitor) const; /// Construct a zero-copy view of this array with the given type. /// /// This method checks if the types are layout-compatible. /// Nested types are traversed in depth-first order. Data buffers must have /// the same item sizes, even though the logical types may be different. /// An error is returned if the types are not layout-compatible. Status View(const std::shared_ptr& type, std::shared_ptr* out); /// Construct a zero-copy slice of the array with the indicated offset and /// length /// /// \param[in] offset the position of the first element in the constructed /// slice /// \param[in] length the length of the slice. If there are not enough /// elements in the array, the length will be adjusted accordingly /// /// \return a new object wrapped in std::shared_ptr std::shared_ptr Slice(int64_t offset, int64_t length) const; /// Slice from offset until end of the array std::shared_ptr Slice(int64_t offset) const; std::shared_ptr data() const { return data_; } int num_fields() const { return static_cast(data_->child_data.size()); } /// \return PrettyPrint representation of array suitable for debugging std::string ToString() const; protected: Array() : null_bitmap_data_(NULLPTR) {} std::shared_ptr data_; const uint8_t* null_bitmap_data_; /// Protected method for constructors inline void SetData(const std::shared_ptr& data) { if (data->buffers.size() > 0 && data->buffers[0]) { null_bitmap_data_ = data->buffers[0]->data(); } else { null_bitmap_data_ = NULLPTR; } data_ = data; } private: ARROW_DISALLOW_COPY_AND_ASSIGN(Array); }; using ArrayVector = std::vector>; namespace internal { /// Given a number of ArrayVectors, treat each ArrayVector as the /// chunks of a chunked array. Then rechunk each ArrayVector such that /// all ArrayVectors are chunked identically. It is mandatory that /// all ArrayVectors contain the same total number of elements. ARROW_EXPORT std::vector RechunkArraysConsistently(const std::vector&); } // namespace internal static inline std::ostream& operator<<(std::ostream& os, const Array& x) { os << x.ToString(); return os; } /// Base class for non-nested arrays class ARROW_EXPORT FlatArray : public Array { protected: using Array::Array; }; /// Degenerate null type Array class ARROW_EXPORT NullArray : public FlatArray { public: using TypeClass = NullType; explicit NullArray(const std::shared_ptr& data) { SetData(data); } explicit NullArray(int64_t length); private: inline void SetData(const std::shared_ptr& data) { null_bitmap_data_ = NULLPTR; data->null_count = data->length; data_ = data; } }; /// Base class for arrays of fixed-size logical types class ARROW_EXPORT PrimitiveArray : public FlatArray { public: PrimitiveArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); /// Does not account for any slice offset std::shared_ptr values() const { return data_->buffers[1]; } protected: PrimitiveArray() : raw_values_(NULLPTR) {} inline void SetData(const std::shared_ptr& data) { auto values = data->buffers[1]; this->Array::SetData(data); raw_values_ = values == NULLPTR ? NULLPTR : values->data(); } explicit inline PrimitiveArray(const std::shared_ptr& data) { SetData(data); } const uint8_t* raw_values_; }; /// Concrete Array class for numeric data. template class NumericArray : public PrimitiveArray { public: using TypeClass = TYPE; using value_type = typename TypeClass::c_type; explicit NumericArray(const std::shared_ptr& data) : PrimitiveArray(data) {} // Only enable this constructor without a type argument for types without additional // metadata template NumericArray( typename std::enable_if::is_parameter_free, int64_t>::type length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0) : PrimitiveArray(TypeTraits::type_singleton(), length, data, null_bitmap, null_count, offset) {} const value_type* raw_values() const { return reinterpret_cast(raw_values_) + data_->offset; } value_type Value(int64_t i) const { return raw_values()[i]; } // For API compatibility with BinaryArray etc. value_type GetView(int64_t i) const { return Value(i); } protected: using PrimitiveArray::PrimitiveArray; }; /// Concrete Array class for boolean data class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: using TypeClass = BooleanType; explicit BooleanArray(const std::shared_ptr& data); BooleanArray(int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); bool Value(int64_t i) const { return BitUtil::GetBit(reinterpret_cast(raw_values_), i + data_->offset); } bool GetView(int64_t i) const { return Value(i); } protected: using PrimitiveArray::PrimitiveArray; }; // ---------------------------------------------------------------------- // ListArray /// Concrete Array class for list data class ARROW_EXPORT ListArray : public Array { public: using TypeClass = ListType; explicit ListArray(const std::shared_ptr& data); ListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& values, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); /// \brief Construct ListArray from array of offsets and child value array /// /// This function does the bare minimum of validation of the offsets and /// input types, and will allocate a new offsets array if necessary (i.e. if /// the offsets contain any nulls). If the offsets do not have nulls, they /// are assumed to be well-formed /// /// \param[in] offsets Array containing n + 1 offsets encoding length and /// size. Must be of int32 type /// \param[in] values Array containing /// \param[in] pool MemoryPool in case new offsets array needs to be /// allocated because of null values /// \param[out] out Will have length equal to offsets.length() - 1 static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr* out); const ListType* list_type() const { return list_type_; } /// \brief Return array object containing the list's values std::shared_ptr values() const; /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[1]; } std::shared_ptr value_type() const; /// Return pointer to raw value offsets accounting for any slice offset const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } // The following functions will not perform boundschecking int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } int32_t value_length(int64_t i) const { i += data_->offset; return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } protected: // This constructor defers SetData to a derived array class ListArray() = default; void SetData(const std::shared_ptr& data); const int32_t* raw_value_offsets_ = NULLPTR; private: const ListType* list_type_ = NULLPTR; std::shared_ptr values_; }; // ---------------------------------------------------------------------- // MapArray /// Concrete Array class for map data /// /// NB: "value" in this context refers to a pair of a key and the correspondint item class ARROW_EXPORT MapArray : public ListArray { public: using TypeClass = MapType; explicit MapArray(const std::shared_ptr& data); MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& keys, const std::shared_ptr& items, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& values, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); const MapType* map_type() const { return map_type_; } /// \brief Return array object containing all map keys std::shared_ptr keys() const { return keys_; } /// \brief Return array object containing all mapped items std::shared_ptr items() const { return items_; } protected: void SetData(const std::shared_ptr& data); private: const MapType* map_type_; std::shared_ptr keys_, items_; }; // ---------------------------------------------------------------------- // FixedSizeListArray /// Concrete Array class for fixed size list data class ARROW_EXPORT FixedSizeListArray : public Array { public: using TypeClass = FixedSizeListType; explicit FixedSizeListArray(const std::shared_ptr& data); FixedSizeListArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& values, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); const FixedSizeListType* list_type() const; /// \brief Return array object containing the list's values std::shared_ptr values() const; std::shared_ptr value_type() const; // The following functions will not perform boundschecking int32_t value_offset(int64_t i) const { i += data_->offset; return static_cast(list_size_ * i); } int32_t value_length(int64_t i = 0) const { return list_size_; } std::shared_ptr value_slice(int64_t i) const { return values_->Slice(value_offset(i), value_length(i)); } protected: void SetData(const std::shared_ptr& data); int32_t list_size_; private: std::shared_ptr values_; }; // ---------------------------------------------------------------------- // Binary and String /// Concrete Array class for variable-size binary data class ARROW_EXPORT BinaryArray : public FlatArray { public: using TypeClass = BinaryType; explicit BinaryArray(const std::shared_ptr& data); BinaryArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); /// Return the pointer to the given elements bytes // XXX should GetValue(int64_t i) return a string_view? const uint8_t* GetValue(int64_t i, int32_t* out_length) const { // Account for base offset i += data_->offset; const int32_t pos = raw_value_offsets_[i]; *out_length = raw_value_offsets_[i + 1] - pos; return raw_data_ + pos; } /// \brief Get binary value as a string_view /// /// \param i the value index /// \return the view over the selected value util::string_view GetView(int64_t i) const { // Account for base offset i += data_->offset; const int32_t pos = raw_value_offsets_[i]; return util::string_view(reinterpret_cast(raw_data_ + pos), raw_value_offsets_[i + 1] - pos); } /// \brief Get binary value as a std::string /// /// \param i the value index /// \return the value copied into a std::string std::string GetString(int64_t i) const { return std::string(GetView(i)); } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr value_data() const { return data_->buffers[2]; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } // Neither of these functions will perform boundschecking int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } int32_t value_length(int64_t i) const { i += data_->offset; return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; } protected: // For subclasses BinaryArray() : raw_value_offsets_(NULLPTR), raw_data_(NULLPTR) {} /// Protected method for constructors void SetData(const std::shared_ptr& data); // Constructor to allow sub-classes/builders to substitute their own logical type BinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); const int32_t* raw_value_offsets_; const uint8_t* raw_data_; }; /// Concrete Array class for variable-size string (utf-8) data class ARROW_EXPORT StringArray : public BinaryArray { public: using TypeClass = StringType; explicit StringArray(const std::shared_ptr& data); StringArray(int64_t length, const std::shared_ptr& value_offsets, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); }; // ---------------------------------------------------------------------- // Fixed width binary /// Concrete Array class for fixed-size binary data class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { public: using TypeClass = FixedSizeBinaryType; explicit FixedSizeBinaryArray(const std::shared_ptr& data); FixedSizeBinaryArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); const uint8_t* GetValue(int64_t i) const; const uint8_t* Value(int64_t i) const { return GetValue(i); } util::string_view GetView(int64_t i) const { return util::string_view(reinterpret_cast(GetValue(i)), byte_width()); } std::string GetString(int64_t i) const { return std::string(GetView(i)); } int32_t byte_width() const { return byte_width_; } const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; } protected: inline void SetData(const std::shared_ptr& data) { this->PrimitiveArray::SetData(data); byte_width_ = internal::checked_cast(*type()).byte_width(); } int32_t byte_width_; }; /// DayTimeArray /// --------------------- /// \brief Array of Day and Millisecond values. class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray { public: using TypeClass = DayTimeIntervalType; explicit DayTimeIntervalArray(const std::shared_ptr& data); DayTimeIntervalArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& data, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); TypeClass::DayMilliseconds GetValue(int64_t i) const; TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); } // For compatibility with Take kernel. TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); } int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); } const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); } protected: inline void SetData(const std::shared_ptr& data) { this->PrimitiveArray::SetData(data); } }; // ---------------------------------------------------------------------- // Decimal128Array /// Concrete Array class for 128-bit decimal data class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray { public: using TypeClass = Decimal128Type; using FixedSizeBinaryArray::FixedSizeBinaryArray; /// \brief Construct Decimal128Array from ArrayData instance explicit Decimal128Array(const std::shared_ptr& data); std::string FormatValue(int64_t i) const; }; // Backward compatibility using DecimalArray = Decimal128Array; // ---------------------------------------------------------------------- // Struct /// Concrete Array class for struct data class ARROW_EXPORT StructArray : public Array { public: using TypeClass = StructType; explicit StructArray(const std::shared_ptr& data); StructArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); /// \brief Return a StructArray from child arrays and field names. /// /// The length and data type are automatically inferred from the arguments. /// There should be at least one child array. static Result> Make( const std::vector>& children, const std::vector& field_names, std::shared_ptr null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); const StructType* struct_type() const; // Return a shared pointer in case the requestor desires to share ownership // with this array. The returned array has its offset, length and null // count adjusted. std::shared_ptr field(int pos) const; /// Returns null if name not found std::shared_ptr GetFieldByName(const std::string& name) const; /// \brief Flatten this array as a vector of arrays, one for each field /// /// \param[in] pool The pool to allocate null bitmaps from, if necessary /// \param[out] out The resulting vector of arrays Status Flatten(MemoryPool* pool, ArrayVector* out) const; private: // For caching boxed child data // XXX This is not handled in a thread-safe manner. mutable std::vector> boxed_fields_; }; // ---------------------------------------------------------------------- // Union /// Concrete Array class for union data class ARROW_EXPORT UnionArray : public Array { public: using TypeClass = UnionType; using type_id_t = uint8_t; explicit UnionArray(const std::shared_ptr& data); UnionArray(const std::shared_ptr& type, int64_t length, const std::vector>& children, const std::shared_ptr& type_ids, const std::shared_ptr& value_offsets = NULLPTR, const std::shared_ptr& null_bitmap = NULLPTR, int64_t null_count = kUnknownNullCount, int64_t offset = 0); /// \brief Construct Dense UnionArray from types_ids, value_offsets and children /// /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. /// \param[out] out Will have length equal to value_offsets.length() static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, const std::vector& field_names, const std::vector& type_codes, std::shared_ptr* out); /// \brief Construct Dense UnionArray from types_ids, value_offsets and children /// /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[out] out Will have length equal to value_offsets.length() static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, const std::vector& field_names, std::shared_ptr* out) { return MakeDense(type_ids, value_offsets, children, field_names, {}, out); } /// \brief Construct Dense UnionArray from types_ids, value_offsets and children /// /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] type_codes Vector of type codes. /// \param[out] out Will have length equal to value_offsets.length() static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, const std::vector& type_codes, std::shared_ptr* out) { return MakeDense(type_ids, value_offsets, children, {}, type_codes, out); } /// \brief Construct Dense UnionArray from types_ids, value_offsets and children /// /// This function does the bare minimum of validation of the offsets and /// input types. The value_offsets are assumed to be well-formed. /// /// The name of each field is filled by the index of the field. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] value_offsets An array of signed int32 values indicating the /// relative offset into the respective child array for the type in a given slot. /// The respective offsets for each child value array must be in order / increasing. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[out] out Will have length equal to value_offsets.length() static Status MakeDense(const Array& type_ids, const Array& value_offsets, const std::vector>& children, std::shared_ptr* out) { return MakeDense(type_ids, value_offsets, children, {}, {}, out); } /// \brief Construct Sparse UnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and /// input types. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[in] type_codes Vector of type codes. /// \param[out] out Will have length equal to type_ids.length() static Status MakeSparse(const Array& type_ids, const std::vector>& children, const std::vector& field_names, const std::vector& type_codes, std::shared_ptr* out); /// \brief Construct Sparse UnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and /// input types. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] field_names Vector of strings containing the name of each field. /// \param[out] out Will have length equal to type_ids.length() static Status MakeSparse(const Array& type_ids, const std::vector>& children, const std::vector& field_names, std::shared_ptr* out) { return MakeSparse(type_ids, children, field_names, {}, out); } /// \brief Construct Sparse UnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and /// input types. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[in] type_codes Vector of type codes. /// \param[out] out Will have length equal to type_ids.length() static Status MakeSparse(const Array& type_ids, const std::vector>& children, const std::vector& type_codes, std::shared_ptr* out) { return MakeSparse(type_ids, children, {}, type_codes, out); } /// \brief Construct Sparse UnionArray from type_ids and children /// /// This function does the bare minimum of validation of the offsets and /// input types. /// /// The name of each field is filled by the index of the field. /// /// \param[in] type_ids An array of 8-bit signed integers, enumerated from /// 0 corresponding to each type. /// \param[in] children Vector of children Arrays containing the data for each type. /// \param[out] out Will have length equal to type_ids.length() static Status MakeSparse(const Array& type_ids, const std::vector>& children, std::shared_ptr* out) { return MakeSparse(type_ids, children, {}, {}, out); } /// Note that this buffer does not account for any slice offset std::shared_ptr type_ids() const { return data_->buffers[1]; } /// Note that this buffer does not account for any slice offset std::shared_ptr value_offsets() const { return data_->buffers[2]; } int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; } const type_id_t* raw_type_ids() const { return raw_type_ids_ + data_->offset; } const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; } UnionMode::type mode() const { return internal::checked_cast(*type()).mode(); } // Return the given field as an individual array. // For sparse unions, the returned array has its offset, length and null // count adjusted. // For dense unions, the returned array is unchanged. std::shared_ptr child(int pos) const; /// Only use this while the UnionArray is in scope const Array* UnsafeChild(int pos) const; protected: void SetData(const std::shared_ptr& data); const type_id_t* raw_type_ids_; const int32_t* raw_value_offsets_; // For caching boxed child data mutable std::vector> boxed_fields_; }; // ---------------------------------------------------------------------- // DictionaryArray /// \brief Array type for dictionary-encoded data with a /// data-dependent dictionary /// /// A dictionary array contains an array of non-negative integers (the /// "dictionary indices") along with a data type containing a "dictionary" /// corresponding to the distinct values represented in the data. /// /// For example, the array /// /// ["foo", "bar", "foo", "bar", "foo", "bar"] /// /// with dictionary ["bar", "foo"], would have dictionary array representation /// /// indices: [1, 0, 1, 0, 1, 0] /// dictionary: ["bar", "foo"] /// /// The indices in principle may have any integer type (signed or unsigned), /// though presently data in IPC exchanges must be signed int32. class ARROW_EXPORT DictionaryArray : public Array { public: using TypeClass = DictionaryType; explicit DictionaryArray(const std::shared_ptr& data); DictionaryArray(const std::shared_ptr& type, const std::shared_ptr& indices, const std::shared_ptr& dictionary); /// \brief Construct DictionaryArray from dictionary and indices /// array and validate /// /// This function does the validation of the indices and input type. It checks if /// all indices are non-negative and smaller than the size of the dictionary /// /// \param[in] type a dictionary type /// \param[in] dictionary the dictionary with same value type as the /// type object /// \param[in] indices an array of non-negative signed /// integers smaller than the size of the dictionary /// \param[out] out the resulting DictionaryArray instance static Status FromArrays(const std::shared_ptr& type, const std::shared_ptr& indices, const std::shared_ptr& dictionary, std::shared_ptr* out); /// \brief Transpose this DictionaryArray /// /// This method constructs a new dictionary array with the given dictionary type, /// transposing indices using the transpose map. /// The type and the transpose map are typically computed using /// DictionaryType::Unify. /// /// \param[in] pool a pool to allocate the array data from /// \param[in] type the new type object /// \param[in] dictionary the new dictionary /// \param[in] transpose_map a vector transposing this array's indices /// into the target array's indices /// \param[out] out the resulting DictionaryArray instance Status Transpose(MemoryPool* pool, const std::shared_ptr& type, const std::shared_ptr& dictionary, const std::vector& transpose_map, std::shared_ptr* out) const; /// \brief Return the dictionary for this array, which is stored as /// a member of the ArrayData internal structure std::shared_ptr dictionary() const; std::shared_ptr indices() const; const DictionaryType* dict_type() const { return dict_type_; } private: void SetData(const std::shared_ptr& data); const DictionaryType* dict_type_; std::shared_ptr indices_; }; /// \brief Perform any validation checks to determine obvious inconsistencies /// with the array's internal data /// /// This can be an expensive check. /// /// \param array an Array instance /// \return Status ARROW_EXPORT Status ValidateArray(const Array& array); } // namespace arrow #endif // ARROW_ARRAY_H