// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

// NOTE(review): the four directives below carry no header name, and further
// down std::shared_ptr / std::unique_ptr / std::pair / std::vector / std::map
// are written without template arguments. It looks as if every
// angle-bracketed span was dropped from this text - TODO restore them from the
// original header before building.
#include
#include
#include
#include

#include "common/FieldData.h"
#include "common/LoadInfo.h"
#include "knowhere/comp/index_param.h"
#include "parquet/schema.h"
#include "storage/Event.h"
#include "storage/PayloadStream.h"
#include "storage/FileManager.h"
#include "storage/BinlogReader.h"
#include "storage/ChunkManager.h"
#include "storage/DataCodec.h"
#include "storage/Types.h"

namespace milvus::storage {

// Everything from here to ReleaseArrowUnused() is a declaration without a
// body. The notes describe what the signatures show; anything about behaviour
// is an assumption to be checked against the definitions.

// Takes a BinlogReaderPtr and returns a StorageType.
StorageType
ReadMediumType(BinlogReaderPtr reader);

// Three helpers that take a std::shared_ptr `builder` plus the data to add:
// a Payload, a character buffer with its size, or a byte buffer with its
// length. Presumably they append to an Arrow array builder - confirm.
void
AddPayloadToArrowBuilder(std::shared_ptr builder, const Payload& payload);

void
AddOneStringToArrowBuilder(std::shared_ptr builder,
                           const char* str,
                           int str_size);

void
AddOneBinaryToArrowBuilder(std::shared_ptr builder,
                           const uint8_t* data,
                           int length);

// Overload pair: by data type alone, or by data type plus `dim`.
std::shared_ptr
CreateArrowBuilder(DataType data_type);

std::shared_ptr
CreateArrowBuilder(DataType data_type, int dim);

// Overload pair mirroring CreateArrowBuilder, with an extra `nullable` flag.
std::shared_ptr
CreateArrowSchema(DataType data_type, bool nullable);

std::shared_ptr
CreateArrowSchema(DataType data_type, int dim, bool nullable);

// Both return int; presumably the vector dimension of the column/array for
// the given data type - confirm.
int
GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema,
                             DataType data_type);

int
GetDimensionFromArrowArray(std::shared_ptr array, DataType data_type);

// Path helpers. All return std::string. The *Identifier functions take only
// numeric ids; the *Prefix functions additionally take a ChunkManagerPtr.
// NOTE(review): the family mixes "Get" and "Gen" name prefixes; whether that
// distinction is meaningful cannot be told from the declarations.
std::string
GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id);

std::string
GenIndexPathIdentifier(int64_t build_id, int64_t index_version);

std::string
GenTextIndexPathIdentifier(int64_t build_id,
                           int64_t index_version,
                           int64_t segment_id,
                           int64_t field_id);

std::string
GenIndexPathPrefix(ChunkManagerPtr cm,
                   int64_t build_id,
                   int64_t index_version);

std::string
GenTextIndexPathPrefix(ChunkManagerPtr cm,
                       int64_t build_id,
                       int64_t index_version,
                       int64_t segment_id,
                       int64_t field_id);

std::string
GenJsonKeyIndexPathIdentifier(int64_t build_id,
                              int64_t index_version,
                              int64_t collection_id,
                              int64_t partition_id,
                              int64_t segment_id,
                              int64_t field_id);

std::string
GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
                          int64_t build_id,
                          int64_t index_version,
                          int64_t collection_id,
                          int64_t partition_id,
                          int64_t segment_id,
                          int64_t field_id);

std::string
GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
                          int64_t segment_id,
                          int64_t field_id);

std::string
GetSegmentRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id);

// Takes a raw ChunkManager pointer and a file name; `is_field_data` defaults
// to true. Returns a std::unique_ptr.
std::unique_ptr
DownloadAndDecodeRemoteFile(ChunkManager* chunk_manager,
                            const std::string& file,
                            bool is_field_data = true);

// Both return a std::pair. The metadata arguments and `object_key` are taken
// by value, except `field_meta` in the field variant (const reference).
std::pair
EncodeAndUploadIndexSlice(ChunkManager* chunk_manager,
                          uint8_t* buf,
                          int64_t batch_size,
                          IndexMeta index_meta,
                          FieldDataMeta field_meta,
                          std::string object_key);

std::pair
EncodeAndUploadFieldSlice(ChunkManager* chunk_manager,
                          void* buf,
                          int64_t element_count,
                          FieldDataMeta field_data_meta,
                          const FieldMeta& field_meta,
                          std::string object_key);

// Takes a list of remote file names; returns a std::vector.
std::vector>>
GetObjectData(ChunkManager* remote_chunk_manager,
              const std::vector& remote_files);

// data_slices, slice_sizes and slice_names are presumably parallel lists with
// one entry per slice - confirm. Both metadata arguments are non-const
// references.
std::map
PutIndexData(ChunkManager* remote_chunk_manager,
             const std::vector& data_slices,
             const std::vector& slice_sizes,
             const std::vector& slice_names,
             FieldDataMeta& field_meta,
             IndexMeta& index_meta);

int64_t
GetTotalNumRowsForFieldDatas(const std::vector& field_datas);

size_t
GetNumRowsForLoadInfo(const LoadFieldDataInfo& load_info);

void
ReleaseArrowUnused();

// Commented-out declaration kept from the original:
// size_t
// getCurrentRSS();
// NOTE(review): in this part of the file std::vector, std::shared_ptr,
// std::make_shared, std::unordered_map and every `template` head appear
// without their angle-bracketed arguments. It looks as if those spans were
// dropped from the text - TODO restore them from the original header. The
// code tokens below are left exactly as found.

// Declaration only: takes a StorageConfig and returns a ChunkManagerPtr.
ChunkManagerPtr
CreateChunkManager(const StorageConfig& storage_config);

// Declaration only. Defaults: nullable = false, dim = 1, total_num_rows = 0.
FieldDataPtr
CreateFieldData(const DataType& type,
                bool nullable = false,
                int64_t dim = 1,
                int64_t total_num_rows = 0);

// Declarations only; behaviour is defined elsewhere.
int64_t
GetByteSizeOfFieldDatas(const std::vector& field_datas);

std::vector
CollectFieldDataChannel(FieldDataChannelPtr& channel);

FieldDataPtr
MergeFieldData(std::vector& data_array);

// Trait pair: the primary template derives from std::false_type and the
// specialization from std::true_type. Presumably it detects whether a type
// has a nested NativeType (the alias below reads TypeTraits::NativeType) -
// confirm once the template arguments are restored.
template
struct has_native_type : std::false_type {};

template
struct has_native_type> : std::true_type {};

// Alias built on std::conditional whose two alternatives are
// TypeTraits::NativeType and void.
template
using DataTypeNativeOrVoid =
    typename std::conditional>::value,
                              typename TypeTraits::NativeType,
                              void>::type;

// Alias for a std::unordered_map whose mapped type is int64_t.
template
using DataTypeToOffsetMap = std::unordered_map, int64_t>;

// Declaration only: takes a ChunkManager pointer and a list of batch files.
std::vector
FetchFieldData(ChunkManager* cm, const std::vector& batch_files);

// Sorts `paths` in place into ascending order of the integer that follows the
// last '/' in each entry.
//
// The key is parsed with std::stol from the substring after the final '/'.
// If an entry has no '/', find_last_of returns npos and npos + 1 wraps to 0,
// so the whole entry is parsed. std::stol throws std::invalid_argument when
// that text does not start with a number and std::out_of_range when the value
// does not fit in a long; nothing here catches either, so they escape from
// std::sort. Both keys are re-parsed on every comparison, and std::sort gives
// no ordering guarantee between entries whose keys are equal.
inline void
SortByPath(std::vector& paths) {
    std::sort(paths.begin(),
              paths.end(),
              [](const std::string& a, const std::string& b) {
                  return std::stol(a.substr(a.find_last_of("/") + 1)) <
                         std::stol(b.substr(b.find_last_of("/") + 1));
              });
}

// Round-trips `field_data` through its serialized event form and returns a
// shared object built from the resulting payload reader.
//
// Steps, as written:
//   1. Put field_data into a BaseEventData and call Serialize().
//   2. Copy the serialized bytes into a newly allocated uint8_t array owned
//      by file_data.
//   3. Create a reader over file_data and rebuild BaseEventData from it,
//      passing field_data's data type and nullability.
//   4. Return an object made from payload_reader->get_reader(),
//      payload_reader->get_file_reader() and file_data.
//
// `field_data` is accessed through -> with no null check.
inline std::shared_ptr
ConvertFieldDataToArrowDataWrapper(const FieldDataPtr& field_data) {
    BaseEventData event_data;
    event_data.field_data = field_data;
    auto event_data_bytes = event_data.Serialize();

    // Private copy of the serialized bytes, sized to match exactly.
    std::shared_ptr file_data(new uint8_t[event_data_bytes.size()]);
    std::memcpy(
        file_data.get(), event_data_bytes.data(), event_data_bytes.size());

    storage::BinlogReaderPtr reader = std::make_shared(
        file_data, event_data_bytes.size());
    // NOTE(review): the meaning of the trailing `false` argument is not
    // visible from this file - check the BaseEventData constructor.
    event_data = storage::BaseEventData(reader,
                                        event_data_bytes.size(),
                                        field_data->get_data_type(),
                                        field_data->IsNullable(),
                                        false);
    // file_data is passed along with the two readers; presumably this keeps
    // the buffer alive for as long as they refer to it - confirm against the
    // returned type.
    return std::make_shared(
        event_data.payload_reader->get_reader(),
        event_data.payload_reader->get_file_reader(),
        file_data);
}

}  // namespace milvus::storage