// Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include #include #include #include #include #include #include #include #include "cachinglayer/CacheSlot.h" #include "cachinglayer/Manager.h" #include "cachinglayer/Translator.h" #include "cachinglayer/Utils.h" #include "common/Array.h" #include "common/Chunk.h" #include "common/EasyAssert.h" #include "common/FieldMeta.h" #include "common/Span.h" #include "segcore/storagev1translator/ChunkTranslator.h" #include "cachinglayer/Translator.h" #include "mmap/ChunkedColumnInterface.h" namespace milvus { using namespace milvus::cachinglayer; std::pair inline GetChunkIDByOffset( int64_t offset, const std::vector& num_rows_until_chunk) { // optimize for single chunk case if (num_rows_until_chunk.size() == 2) { return {0, offset}; } auto iter = std::lower_bound( num_rows_until_chunk.begin(), num_rows_until_chunk.end(), offset + 1); size_t chunk_idx = std::distance(num_rows_until_chunk.begin(), iter) - 1; size_t offset_in_chunk = offset - num_rows_until_chunk[chunk_idx]; return {chunk_idx, offset_in_chunk}; } std::pair, std::vector< int64_t>> inline GetChunkIDsByOffsets(const int64_t* offsets, int64_t count, const std::vector& num_rows_until_chunk) { std::vector cids(count, 1); std::vector offsets_in_chunk(count); int64_t len = num_rows_until_chunk.size() - 1; while (len > 1) { const int64_t half = len / 2; len -= half; for (size_t i = 0; i < count; ++i) { const bool cmp = num_rows_until_chunk[cids[i] + half - 1] < offsets[i] + 1; cids[i] += static_cast(cmp) * half; } } for (size_t i = 0; i < count; ++i) { offsets_in_chunk[i] = offsets[i] - num_rows_until_chunk[--cids[i]]; } return std::make_pair(std::move(cids), std::move(offsets_in_chunk)); } std::pair, std::vector< int64_t>> inline GetChunkIDsByOffsets(const int64_t* offsets, int64_t count, const std::vector& num_rows_until_chunk, int64_t virt_chunk_order, const std::vector& vcid_to_cid_arr) { std::vector cids(count, 0); std::vector offsets_in_chunk(count); // optimize for single chunk case if (num_rows_until_chunk.size() == 2) { for (int64_t i = 0; i < count; i++) { offsets_in_chunk[i] = offsets[i]; } return std::make_pair(std::move(cids), std::move(offsets_in_chunk)); } for (int64_t i = 0; i < count; i++) { auto offset = offsets[i]; auto vcid = offset >> virt_chunk_order; auto scid = vcid_to_cid_arr[vcid]; while (scid < num_rows_until_chunk.size() - 1 && offset >= num_rows_until_chunk[scid + 1]) { scid++; } auto offset_in_chunk = offset - num_rows_until_chunk[scid]; cids[i] = scid; offsets_in_chunk[i] = offset_in_chunk; } return std::make_pair(std::move(cids), std::move(offsets_in_chunk)); } class ChunkedColumnBase : public ChunkedColumnInterface { public: explicit ChunkedColumnBase(std::unique_ptr> translator, const FieldMeta& field_meta) : nullable_(field_meta.is_nullable()), data_type_(field_meta.get_data_type()), num_chunks_(translator->num_cells()), slot_(Manager::GetInstance().CreateCacheSlot(std::move(translator))) { num_rows_ = GetNumRowsUntilChunk().back(); } virtual ~ChunkedColumnBase() = default; void ManualEvictCache() const override { slot_->ManualEvictAll(); } PinWrapper DataOfChunk(milvus::OpContext* op_ctx, int chunk_id) const override { auto ca = SemiInlineGet(slot_->PinCells(op_ctx, {chunk_id})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper(ca, chunk->Data()); } bool IsValid(milvus::OpContext* op_ctx, size_t offset) const override { if (!nullable_) { return true; } auto [chunk_id, offset_in_chunk] = GetChunkIDByOffset(offset); auto ca = SemiInlineGet( slot_->PinCells(op_ctx, {static_cast(chunk_id)})); auto chunk = ca->get_cell_of(chunk_id); return chunk->isValid(offset_in_chunk); } void BulkIsValid(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets = nullptr, int64_t count = 0) const override { if (!nullable_) { if (offsets == nullptr) { for (int64_t i = 0; i < num_rows_; i++) { fn(true, i); } } else { for (int64_t i = 0; i < count; i++) { fn(true, i); } } } // nullable: if (offsets == nullptr) { auto ca = SemiInlineGet(slot_->PinAllCells(op_ctx)); for (int64_t i = 0; i < num_rows_; i++) { auto [cid, offset_in_chunk] = GetChunkIDByOffset(i); auto chunk = ca->get_cell_of(cid); auto valid = chunk->isValid(offset_in_chunk); fn(valid, i); } } else { auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { auto chunk = ca->get_cell_of(cids[i]); auto valid = chunk->isValid(offsets_in_chunk[i]); fn(valid, i); } } } bool IsNullable() const override { return nullable_; } size_t NumRows() const override { return num_rows_; }; int64_t num_chunks() const override { return num_chunks_; } // This returns only memory byte size. size_t DataByteSize() const override { auto size = 0; for (auto i = 0; i < num_chunks_; i++) { size += slot_->size_of_cell(i).memory_bytes; } return size; } int64_t chunk_row_nums(int64_t chunk_id) const override { return GetNumRowsUntilChunk(chunk_id + 1) - GetNumRowsUntilChunk(chunk_id); } PinWrapper Span(milvus::OpContext* op_ctx, int64_t chunk_id) const override { ThrowInfo(ErrorCode::Unsupported, "Span only supported for ChunkedColumn"); } void BulkValueAt(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets, int64_t count) override { ThrowInfo(ErrorCode::Unsupported, "BulkValueAt only supported for ChunkedColumn and " "ProxyChunkColumn"); } void BulkPrimitiveValueAt(milvus::OpContext* op_ctx, void* dst, const int64_t* offsets, int64_t count) override { ThrowInfo(ErrorCode::Unsupported, "BulkPrimitiveValueAt only supported for ChunkedColumn"); } void BulkVectorValueAt(milvus::OpContext* op_ctx, void* dst, const int64_t* offsets, int64_t element_sizeof, int64_t count) override { ThrowInfo(ErrorCode::Unsupported, "BulkVectorValueAt only supported for ChunkedColumn"); } PinWrapper, FixedVector>> StringViews(milvus::OpContext* op_ctx, int64_t chunk_id, std::optional> offset_len = std::nullopt) const override { ThrowInfo(ErrorCode::Unsupported, "StringViews only supported for VariableColumn"); } PinWrapper, FixedVector>> ArrayViews( milvus::OpContext* op_ctx, int64_t chunk_id, std::optional> offset_len) const override { ThrowInfo(ErrorCode::Unsupported, "ArrayViews only supported for ArrayChunkedColumn"); } PinWrapper, FixedVector>> VectorArrayViews( milvus::OpContext* op_ctx, int64_t chunk_id, std::optional> offset_len) const override { ThrowInfo( ErrorCode::Unsupported, "VectorArrayViews only supported for ChunkedVectorArrayColumn"); } PinWrapper VectorArrayOffsets(milvus::OpContext* op_ctx, int64_t chunk_id) const override { ThrowInfo( ErrorCode::Unsupported, "VectorArrayOffsets only supported for ChunkedVectorArrayColumn"); } PinWrapper, FixedVector>> StringViewsByOffsets(milvus::OpContext* op_ctx, int64_t chunk_id, const FixedVector& offsets) const override { ThrowInfo(ErrorCode::Unsupported, "ViewsByOffsets only supported for VariableColumn"); } PinWrapper, FixedVector>> ArrayViewsByOffsets(milvus::OpContext* op_ctx, int64_t chunk_id, const FixedVector& offsets) const override { ThrowInfo(ErrorCode::Unsupported, "viewsbyoffsets only supported for ArrayColumn"); } std::pair GetChunkIDByOffset(int64_t offset) const override { AssertInfo(offset >= 0 && offset < num_rows_, "offset {} is out of range, num_rows: {}", offset, num_rows_); auto& num_rows_until_chunk = GetNumRowsUntilChunk(); return ::milvus::GetChunkIDByOffset(offset, num_rows_until_chunk); } std::pair, std::vector> GetChunkIDsByOffsets(const int64_t* offsets, int64_t count) const override { auto& num_rows_until_chunk = GetNumRowsUntilChunk(); auto meta = static_cast( slot_->meta()); auto& virt_chunk_order = meta->virt_chunk_order_; auto& vcid_to_cid_arr = meta->vcid_to_cid_arr_; return ::milvus::GetChunkIDsByOffsets(offsets, count, num_rows_until_chunk, virt_chunk_order, vcid_to_cid_arr); } PinWrapper GetChunk(milvus::OpContext* op_ctx, int64_t chunk_id) const override { auto ca = SemiInlineGet(slot_->PinCells(op_ctx, {chunk_id})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper(ca, chunk); } std::vector> GetAllChunks(milvus::OpContext* op_ctx) const override { auto ca = SemiInlineGet(slot_->PinAllCells(op_ctx)); std::vector> ret; ret.reserve(num_chunks_); for (size_t i = 0; i < num_chunks_; i++) { auto chunk = ca->get_cell_of(i); ret.emplace_back(ca, chunk); } return ret; } int64_t GetNumRowsUntilChunk(int64_t chunk_id) const override { return GetNumRowsUntilChunk()[chunk_id]; } const std::vector& GetNumRowsUntilChunk() const override { auto meta = static_cast( slot_->meta()); return meta->num_rows_until_chunk_; } protected: bool nullable_{false}; DataType data_type_{DataType::NONE}; size_t num_rows_{0}; size_t num_chunks_{0}; mutable std::shared_ptr> slot_; }; class ChunkedColumn : public ChunkedColumnBase { public: // memory mode ctor explicit ChunkedColumn(std::unique_ptr> translator, const FieldMeta& field_meta) : ChunkedColumnBase(std::move(translator), field_meta) { } // BulkValueAt() is used for custom data type in the future void BulkValueAt(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets, int64_t count) override { auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { fn(ca->get_cell_of(cids[i])->ValueAt(offsets_in_chunk[i]), i); } } template void BulkPrimitiveValueAtImpl(milvus::OpContext* op_ctx, void* dst, const int64_t* offsets, int64_t count) { static_assert(std::is_fundamental_v && std::is_fundamental_v); auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); auto typed_dst = static_cast(dst); for (int64_t i = 0; i < count; i++) { auto chunk = ca->get_cell_of(cids[i]); auto value = chunk->ValueAt(offsets_in_chunk[i]); typed_dst[i] = *static_cast(static_cast(value)); } } void BulkPrimitiveValueAt(milvus::OpContext* op_ctx, void* dst, const int64_t* offsets, int64_t count) override { switch (data_type_) { case DataType::INT8: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } case DataType::INT16: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } case DataType::INT32: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } case DataType::INT64: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } case DataType::FLOAT: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } case DataType::DOUBLE: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } case DataType::BOOL: { BulkPrimitiveValueAtImpl( op_ctx, dst, offsets, count); break; } default: { ThrowInfo( ErrorCode::Unsupported, "BulkScalarValueAt is not supported for unknown scalar " "data type: {}", data_type_); } } } void BulkVectorValueAt(milvus::OpContext* op_ctx, void* dst, const int64_t* offsets, int64_t element_sizeof, int64_t count) override { auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); auto dst_vec = reinterpret_cast(dst); for (int64_t i = 0; i < count; i++) { auto chunk = ca->get_cell_of(cids[i]); auto value = chunk->ValueAt(offsets_in_chunk[i]); memcpy(dst_vec + i * element_sizeof, value, element_sizeof); } } PinWrapper Span(milvus::OpContext* op_ctx, int64_t chunk_id) const override { auto ca = SemiInlineGet(slot_->PinCells(op_ctx, {chunk_id})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper( ca, static_cast(chunk)->Span()); } }; template class ChunkedVariableColumn : public ChunkedColumnBase { public: static_assert( std::is_same_v || std::is_same_v, "ChunkedVariableColumn only supports std::string or Json types"); // memory mode ctor explicit ChunkedVariableColumn( std::unique_ptr> translator, const FieldMeta& field_meta) : ChunkedColumnBase(std::move(translator), field_meta) { } PinWrapper, FixedVector>> StringViews(milvus::OpContext* op_ctx, int64_t chunk_id, std::optional> offset_len = std::nullopt) const override { auto ca = SemiInlineGet(slot_->PinCells(op_ctx, {chunk_id})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper< std::pair, FixedVector>>( ca, static_cast(chunk)->StringViews(offset_len)); } PinWrapper, FixedVector>> StringViewsByOffsets(milvus::OpContext* op_ctx, int64_t chunk_id, const FixedVector& offsets) const override { auto ca = SemiInlineGet(slot_->PinCells(op_ctx, {chunk_id})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper< std::pair, FixedVector>>( ca, static_cast(chunk)->ViewsByOffsets(offsets)); } void BulkRawStringAt(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets, int64_t count) const override { if constexpr (!std::is_same_v) { ThrowInfo(ErrorCode::Unsupported, "BulkRawStringAt only supported for " "ChunkedVariableColumn"); } if (offsets == nullptr) { auto ca = SemiInlineGet(slot_->PinAllCells(op_ctx)); for (int64_t i = 0; i < num_rows_; i++) { auto [cid, offset_in_chunk] = GetChunkIDByOffset(i); auto chunk = ca->get_cell_of(cid); auto valid = nullable_ ? chunk->isValid(offset_in_chunk) : true; fn(static_cast(chunk)->operator[]( offset_in_chunk), i, valid); } } else { auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { auto chunk = ca->get_cell_of(cids[i]); auto valid = nullable_ ? chunk->isValid(offsets_in_chunk[i]) : true; fn(static_cast(chunk)->operator[]( offsets_in_chunk[i]), i, valid); } } } void BulkRawJsonAt(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets, int64_t count) const override { if constexpr (!std::is_same_v) { ThrowInfo( ErrorCode::Unsupported, "RawJsonAt only supported for ChunkedVariableColumn"); } if (count == 0) { return; } auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { auto chunk = ca->get_cell_of(cids[i]); auto valid = nullable_ ? chunk->isValid(offsets_in_chunk[i]) : true; auto str_view = static_cast(chunk)->operator[]( offsets_in_chunk[i]); fn(Json(str_view.data(), str_view.size()), i, valid); } } void BulkRawBsonAt(milvus::OpContext* op_ctx, std::function fn, const uint32_t* row_offsets, const uint32_t* value_offsets, int64_t count) const override { if (count == 0) { return; } AssertInfo(row_offsets != nullptr && value_offsets != nullptr, "row_offsets and value_offsets must be provided"); auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(row_offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { auto chunk = ca->get_cell_of(cids[i]); auto str_view = static_cast(chunk)->operator[]( offsets_in_chunk[i]); fn(BsonView(reinterpret_cast(str_view.data()), str_view.size()), row_offsets[i], value_offsets[i]); } } }; class ChunkedArrayColumn : public ChunkedColumnBase { public: // memory mode ctor explicit ChunkedArrayColumn(std::unique_ptr> translator, const FieldMeta& field_meta) : ChunkedColumnBase(std::move(translator), field_meta) { } void BulkArrayAt(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets, int64_t count) const override { auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { auto array = static_cast(ca->get_cell_of(cids[i])) ->View(offsets_in_chunk[i]) .output_data(); fn(std::move(array), i); } } PinWrapper, FixedVector>> ArrayViews(milvus::OpContext* op_ctx, int64_t chunk_id, std::optional> offset_len = std::nullopt) const override { auto ca = SemiInlineGet( slot_->PinCells(op_ctx, {static_cast(chunk_id)})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper, FixedVector>>( ca, static_cast(chunk)->Views(offset_len)); } PinWrapper, FixedVector>> ArrayViewsByOffsets(milvus::OpContext* op_ctx, int64_t chunk_id, const FixedVector& offsets) const override { auto ca = SemiInlineGet(slot_->PinCells(op_ctx, {chunk_id})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper, FixedVector>>( ca, static_cast(chunk)->ViewsByOffsets(offsets)); } }; class ChunkedVectorArrayColumn : public ChunkedColumnBase { public: explicit ChunkedVectorArrayColumn( std::unique_ptr> translator, const FieldMeta& field_meta) : ChunkedColumnBase(std::move(translator), field_meta) { } void BulkVectorArrayAt(milvus::OpContext* op_ctx, std::function fn, const int64_t* offsets, int64_t count) const override { auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count); auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids)); for (int64_t i = 0; i < count; i++) { auto array = static_cast(ca->get_cell_of(cids[i])) ->View(offsets_in_chunk[i]) .output_data(); fn(std::move(array), i); } } PinWrapper, FixedVector>> VectorArrayViews(milvus::OpContext* op_ctx, int64_t chunk_id, std::optional> offset_len = std::nullopt) const override { auto ca = SemiInlineGet( slot_->PinCells(op_ctx, {static_cast(chunk_id)})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper< std::pair, FixedVector>>( ca, static_cast(chunk)->Views(offset_len)); } PinWrapper VectorArrayOffsets(milvus::OpContext* op_ctx, int64_t chunk_id) const override { auto ca = SemiInlineGet( slot_->PinCells(op_ctx, {static_cast(chunk_id)})); auto chunk = ca->get_cell_of(chunk_id); return PinWrapper( ca, static_cast(chunk)->Offsets()); } }; inline std::shared_ptr MakeChunkedColumnBase(DataType data_type, std::unique_ptr> translator, const FieldMeta& field_meta) { if (ChunkedColumnInterface::IsChunkedVariableColumnDataType(data_type)) { if (data_type == DataType::JSON) { return std::static_pointer_cast( std::make_shared>( std::move(translator), field_meta)); } return std::static_pointer_cast( std::make_shared>( std::move(translator), field_meta)); } if (ChunkedColumnInterface::IsChunkedArrayColumnDataType(data_type)) { return std::static_pointer_cast( std::make_shared(std::move(translator), field_meta)); } if (ChunkedColumnInterface::IsChunkedVectorArrayColumnDataType(data_type)) { return std::static_pointer_cast( std::make_shared(std::move(translator), field_meta)); } return std::static_pointer_cast( std::make_shared(std::move(translator), field_meta)); } } // namespace milvus