// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #pragma once #include #include #include #include #include #include #include #include "DeletedRecord.h" #include "FieldIndexing.h" #include "common/Schema.h" #include "common/Span.h" #include "common/SystemProperty.h" #include "common/Types.h" #include "common/LoadInfo.h" #include "common/BitsetView.h" #include "common/QueryResult.h" #include "common/QueryInfo.h" #include "query/Plan.h" #include "query/PlanNode.h" #include "pb/schema.pb.h" #include "pb/segcore.pb.h" #include "index/IndexInfo.h" #include "index/SkipIndex.h" #include "mmap/Column.h" namespace milvus::segcore { struct SegmentStats { // we stat the memory size used by the segment, // including the insert data and delete data. std::atomic mem_size{}; }; // common interface of SegmentSealed and SegmentGrowing used by C API class SegmentInterface { public: virtual ~SegmentInterface() = default; virtual void FillPrimaryKeys(const query::Plan* plan, SearchResult& results) const = 0; virtual void FillTargetEntry(const query::Plan* plan, SearchResult& results) const = 0; virtual bool Contain(const PkType& pk) const = 0; virtual std::unique_ptr Search(const query::Plan* Plan, const query::PlaceholderGroup* placeholder_group, Timestamp timestamp) const = 0; virtual std::unique_ptr Retrieve(tracer::TraceContext* trace_ctx, const query::RetrievePlan* Plan, Timestamp timestamp, int64_t limit_size, bool ignore_non_pk) const = 0; virtual std::unique_ptr Retrieve(tracer::TraceContext* trace_ctx, const query::RetrievePlan* Plan, const int64_t* offsets, int64_t size) const = 0; virtual size_t GetMemoryUsageInBytes() const = 0; virtual int64_t get_row_count() const = 0; virtual const Schema& get_schema() const = 0; virtual int64_t get_deleted_count() const = 0; virtual int64_t get_real_count() const = 0; virtual int64_t get_field_avg_size(FieldId field_id) const = 0; virtual void set_field_avg_size(FieldId field_id, int64_t num_rows, int64_t field_size) = 0; // virtual int64_t // PreDelete(int64_t size) = 0; virtual SegcoreError Delete(int64_t reserved_offset, int64_t size, const IdArray* pks, const Timestamp* timestamps) = 0; virtual void LoadDeletedRecord(const LoadDeletedRecordInfo& info) = 0; virtual void LoadFieldData(const LoadFieldDataInfo& info) = 0; virtual void RemoveDuplicatePkRecords() = 0; virtual int64_t get_segment_id() const = 0; virtual SegmentType type() const = 0; virtual bool HasRawData(int64_t field_id) const = 0; virtual bool is_nullable(FieldId field_id) const = 0; }; // internal API for DSL calculation // only for implementation class SegmentInternalInterface : public SegmentInterface { public: template Span chunk_data(FieldId field_id, int64_t chunk_id) const { return static_cast>(chunk_data_impl(field_id, chunk_id)); } template std::pair, FixedVector> chunk_view(FieldId field_id, int64_t chunk_id) const { auto chunk_info = chunk_view_impl(field_id, chunk_id); auto string_views = chunk_info.first; auto valid_data = chunk_info.second; if constexpr (std::is_same_v) { return std::make_pair(std::move(string_views), std::move(valid_data)); } else { std::vector res; res.reserve(string_views.size()); for (const auto& view : string_views) { res.emplace_back(view); } return std::make_pair(res, valid_data); } } template std::pair, FixedVector> get_batch_views(FieldId field_id, int64_t chunk_id, int64_t start_offset, int64_t length) const { if (this->type() == SegmentType::Growing) { PanicInfo(ErrorCode::Unsupported, "get chunk views not supported for growing segment"); } auto chunk_info = get_chunk_buffer(field_id, chunk_id, start_offset, length); BufferView buffer = chunk_info.first; std::vector res; res.reserve(length); char* pos = buffer.data_; for (size_t j = 0; j < length; j++) { uint32_t size; size = *reinterpret_cast(pos); pos += sizeof(uint32_t); res.emplace_back(ViewType(pos, size)); pos += size; } return std::make_pair(res, chunk_info.second); } template const index::ScalarIndex& chunk_scalar_index(FieldId field_id, int64_t chunk_id) const { static_assert(IsScalar); using IndexType = index::ScalarIndex; auto base_ptr = chunk_index_impl(field_id, chunk_id); auto ptr = dynamic_cast(base_ptr); AssertInfo(ptr, "entry mismatch"); return *ptr; } std::unique_ptr Search(const query::Plan* Plan, const query::PlaceholderGroup* placeholder_group, Timestamp timestamp) const override; void FillPrimaryKeys(const query::Plan* plan, SearchResult& results) const override; void FillTargetEntry(const query::Plan* plan, SearchResult& results) const override; std::unique_ptr Retrieve(tracer::TraceContext* trace_ctx, const query::RetrievePlan* Plan, Timestamp timestamp, int64_t limit_size, bool ignore_non_pk) const override; std::unique_ptr Retrieve(tracer::TraceContext* trace_ctx, const query::RetrievePlan* Plan, const int64_t* offsets, int64_t size) const override; virtual bool HasIndex(FieldId field_id) const = 0; virtual bool HasFieldData(FieldId field_id) const = 0; virtual std::string debug() const = 0; int64_t get_real_count() const override; int64_t get_field_avg_size(FieldId field_id) const override; void set_field_avg_size(FieldId field_id, int64_t num_rows, int64_t field_size) override; const SkipIndex& GetSkipIndex() const; void LoadPrimitiveSkipIndex(FieldId field_id, int64_t chunk_id, DataType data_type, const void* chunk_data, int64_t count); void LoadStringSkipIndex(FieldId field_id, int64_t chunk_id, const milvus::VariableColumn& var_column); virtual DataType GetFieldDataType(FieldId fieldId) const = 0; public: virtual void vector_search(SearchInfo& search_info, const void* query_data, int64_t query_count, Timestamp timestamp, const BitsetView& bitset, SearchResult& output) const = 0; virtual void mask_with_delete(BitsetType& bitset, int64_t ins_barrier, Timestamp timestamp) const = 0; // count of chunk that has index available virtual int64_t num_chunk_index(FieldId field_id) const = 0; // count of chunk that has raw data virtual int64_t num_chunk_data(FieldId field_id) const = 0; // bitset 1 means not hit. 0 means hit. virtual void mask_with_timestamps(BitsetType& bitset_chunk, Timestamp timestamp) const = 0; // count of chunks virtual int64_t num_chunk() const = 0; // element size in each chunk virtual int64_t size_per_chunk() const = 0; virtual int64_t get_active_count(Timestamp ts) const = 0; virtual std::pair, std::vector> search_ids(const IdArray& id_array, Timestamp timestamp) const = 0; /** * Apply timestamp filtering on bitset, the query can't see an entity whose * timestamp is bigger than the timestamp of query. * * @param bitset The final bitset after scalar filtering and delta filtering, * `false` means that the entity will be filtered out. * @param timestamp The timestamp of query. */ void timestamp_filter(BitsetType& bitset, Timestamp timestamp) const; /** * Apply timestamp filtering on bitset, the query can't see an entity whose * timestamp is bigger than the timestamp of query. The passed offsets are * all candidate entities. * * @param bitset The final bitset after scalar filtering and delta filtering, * `true` means that the entity will be filtered out. * @param offsets The segment offsets of all candidates. * @param timestamp The timestamp of query. */ void timestamp_filter(BitsetType& bitset, const std::vector& offsets, Timestamp timestamp) const; /** * Sort all candidates in ascending order, and then return the limit smallest. * Bitset is used to check if the candidate will be filtered out. `false_filtered_out` * determines how to filter out candidates. If `false_filtered_out` is true, we will * filter all candidates whose related bit is false. * * @param limit * @param bitset * @param false_filtered_out * @return All candidates offsets. */ virtual std::pair, bool> find_first(int64_t limit, const BitsetType& bitset, bool false_filtered_out) const = 0; void FillTargetEntry( tracer::TraceContext* trace_ctx, const query::RetrievePlan* plan, const std::unique_ptr& results, const int64_t* offsets, int64_t size, bool ignore_non_pk, bool fill_ids) const; // return whether field mmap or not virtual bool is_mmap_field(FieldId field_id) const = 0; protected: // todo: use an Unified struct for all type in growing/seal segment to store data and valid_data. // internal API: return chunk_data in span virtual SpanBase chunk_data_impl(FieldId field_id, int64_t chunk_id) const = 0; // internal API: return chunk string views in vector virtual std::pair, FixedVector> chunk_view_impl(FieldId field_id, int64_t chunk_id) const = 0; // internal API: return buffer reference to field chunk data located from start_offset virtual std::pair> get_chunk_buffer(FieldId field_id, int64_t chunk_id, int64_t start_offset, int64_t length) const = 0; // internal API: return chunk_index in span, support scalar index only virtual const index::IndexBase* chunk_index_impl(FieldId field_id, int64_t chunk_id) const = 0; // calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to system_type virtual void bulk_subscript(SystemFieldType system_type, const int64_t* seg_offsets, int64_t count, void* output) const = 0; // calculate output[i] = Vec[seg_offsets[i]}, where Vec binds to field_offset virtual std::unique_ptr bulk_subscript(FieldId field_id, const int64_t* seg_offsets, int64_t count) const = 0; virtual void check_search(const query::Plan* plan) const = 0; virtual void check_retrieve(const query::RetrievePlan* plan) const = 0; virtual const ConcurrentVector& get_timestamps() const = 0; protected: mutable std::shared_mutex mutex_; // fieldID -> std::pair std::unordered_map> variable_fields_avg_size_; // bytes; SkipIndex skip_index_; }; } // namespace milvus::segcore