enhance: fix IsMmapSupported for stl sort (#46472)

issue: https://github.com/milvus-io/milvus/issues/44399

This PR also adds `ByteSize()` methods for scalar indexes. These are currently not
used in Milvus code, but are used in the scalar benchmark, and may be used by
the caching layer in the future.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Refactor**
* Improved and standardized memory-size computation and caching across
index types so reported index footprints are more accurate and
consistent.

* **Chores**
* Ensured byte-size metrics are refreshed immediately after index
build/load operations to keep memory accounting in sync with runtime
state.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
This commit is contained in:
Buqian Zheng 2025-12-23 13:27:18 +08:00 committed by GitHub
parent 99b53316e5
commit 674ac8a006
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 297 additions and 4 deletions

View File

@ -112,6 +112,7 @@ BitmapIndex<T>::Build(size_t n, const T* data, const bool* valid_data) {
}
is_built_ = true;
ComputeByteSize();
}
template <typename T>
@ -168,6 +169,7 @@ BitmapIndex<T>::BuildWithFieldData(
proto::schema::DataType_Name(schema_.data_type())));
}
is_built_ = true;
ComputeByteSize();
}
template <typename T>
@ -568,6 +570,7 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
is_mmap_);
is_built_ = true;
ComputeByteSize();
}
template <typename T>

View File

@ -114,6 +114,69 @@ class BitmapIndex : public ScalarIndex<T> {
return Count();
}
void
ComputeByteSize() override {
    // Recompute and cache this index's O(n) memory footprint.
    // Start from the base-class contribution, then add every bitmap-index
    // structure for the active storage mode (mmap / roaring / flat bitsets).
    ScalarIndex<T>::ComputeByteSize();
    int64_t total = this->cached_byte_size_;
    // valid_bitset_: per-row null/validity bitmap, always memory-resident.
    total += valid_bitset_.size_in_bytes();
    if (is_mmap_) {
        // mmap mode: the serialized bitmaps live in the mapped region.
        total += mmap_size_;
        // bitmap_info_map_ overhead (keys and roaring metadata in memory)
        size_t num_entries = bitmap_info_map_.size();
        if constexpr (std::is_same_v<T, std::string>) {
            // String keys may own heap buffers; count their capacity.
            for (const auto& [key, bitmap] : bitmap_info_map_) {
                total += key.capacity();
            }
        } else {
            total += num_entries * sizeof(T);
        }
        // roaring metadata + map node overhead per entry
        total += num_entries * (sizeof(roaring::Roaring) + 40);
    } else if (build_mode_ == BitmapIndexBuildMode::ROARING) {
        // data_: map<T, roaring::Roaring> — in-memory roaring bitmaps.
        for (const auto& [key, bitmap] : data_) {
            if constexpr (std::is_same_v<T, std::string>) {
                total += key.capacity();
            } else {
                total += sizeof(T);
            }
            total += bitmap.getSizeInBytes();
            // std::map red-black tree node overhead (~40 bytes per node)
            total += 40;
        }
    } else {
        // bitsets_: map<T, TargetBitmap> — one bitset per distinct value.
        size_t num_entries = bitsets_.size();
        if (num_entries > 0) {
            // All bitsets cover the same row count, so a single sample
            // sizes every entry.
            size_t bitset_bytes = bitsets_.begin()->second.size_in_bytes();
            total += num_entries * bitset_bytes;
            if constexpr (std::is_same_v<T, std::string>) {
                for (const auto& [key, bitset] : bitsets_) {
                    total += key.capacity();
                }
            } else {
                total += num_entries * sizeof(T);
            }
            // std::map red-black tree node overhead (~40 bytes per node)
            total += num_entries * 40;
        }
    }
    // Offset caches translating bitmap positions back to row offsets.
    total += data_offsets_cache_.capacity() *
             sizeof(typename decltype(data_offsets_cache_)::value_type);
    total += bitsets_offsets_cache_.capacity() *
             sizeof(typename decltype(bitsets_offsets_cache_)::value_type);
    total += mmap_offsets_cache_.capacity() *
             sizeof(typename decltype(mmap_offsets_cache_)::value_type);
    this->cached_byte_size_ = total;
}
IndexStatsPtr
Upload(const Config& config = {}) override;

View File

@ -351,6 +351,7 @@ HybridScalarIndex<T>::Load(const BinarySet& binary_set, const Config& config) {
index->Load(binary_set, config);
is_built_ = true;
ComputeByteSize();
}
template <typename T>
@ -382,6 +383,7 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
index->Load(ctx, config);
is_built_ = true;
ComputeByteSize();
}
template class HybridScalarIndex<bool>;

View File

@ -153,6 +153,16 @@ class HybridScalarIndex : public ScalarIndex<T> {
return internal_index_->Size();
}
void
ComputeByteSize() override {
    // Refresh the cached byte size: base-class contribution plus whatever
    // the wrapped internal index reports, when one has been created.
    ScalarIndex<T>::ComputeByteSize();
    if (internal_index_) {
        this->cached_byte_size_ += internal_index_->ByteSize();
    }
}
const bool
HasRawData() const override {
if (field_type_ == proto::schema::DataType::Array) {

View File

@ -91,6 +91,29 @@ class IndexBase {
cell_size_ = cell_size;
}
// Returns the memory usage in bytes that scales with data size (O(n)).
// Fixed overhead are minimal and thus not included.
//
// NOTE: This method returns a cached value computed by ComputeByteSize().
// It is designed for SEALED SEGMENTS only, where the index is fully built
// or loaded and no more data will be added. For GROWING SEGMENTS with
// ongoing inserts, the cached value will NOT be updated automatically.
// ComputeByteSize() should only be called after Build() or Load() completes
// on sealed segments.
int64_t
ByteSize() const {
    // Cheap accessor: returns the value last stored by ComputeByteSize();
    // does not recompute anything.
    return cached_byte_size_;
}
// Computes and caches the memory usage in bytes.
// Subclasses should override this method to calculate their specific memory usage
// and store the result in cached_byte_size_.
// This method should be called at the end of Build() or Load() for sealed segments.
virtual void
ComputeByteSize() {
    // Base class tracks no data-dependent memory; reset the tally so
    // subclasses can accumulate on top of it.
    cached_byte_size_ = 0;
}
protected:
explicit IndexBase(IndexType index_type)
: index_type_(std::move(index_type)) {
@ -98,6 +121,7 @@ class IndexBase {
IndexType index_type_ = "";
cachinglayer::ResourceUsage cell_size_ = {0, 0};
mutable int64_t cached_byte_size_ = 0;
std::unique_ptr<MmapFileRAII> mmap_file_raii_;
};

View File

@ -218,6 +218,7 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
// the index is loaded in ram, so we can remove files in advance
disk_file_manager_->RemoveIndexFiles();
}
ComputeByteSize();
}
template <typename T>
@ -564,6 +565,7 @@ InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
wrapper_->create_reader(milvus::index::SetBitsetSealed);
finish();
wrapper_->reload();
ComputeByteSize();
}
template <typename T>

View File

@ -190,9 +190,18 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
return Count();
}
int64_t
ByteSize() const {
return wrapper_->index_size_bytes();
void
ComputeByteSize() override {
    // Recompute and cache the O(n) memory footprint of this inverted index.
    ScalarIndex<T>::ComputeByteSize();
    int64_t total = this->cached_byte_size_;
    // Tantivy index size. Guard against a not-yet-created wrapper so a call
    // before Build()/Load() cannot dereference null — consistent with the
    // null checks in RTreeIndex::ComputeByteSize and
    // HybridScalarIndex::ComputeByteSize.
    if (wrapper_) {
        total += wrapper_->index_size_bytes();
    }
    // null_offset_: vector<size_t> of row offsets holding null values.
    total += null_offset_.capacity() * sizeof(size_t);
    this->cached_byte_size_ = total;
}
virtual const TargetBitmap

View File

@ -221,6 +221,7 @@ RTreeIndex<T>::Load(milvus::tracer::TraceContext ctx, const Config& config) {
total_num_rows_ =
wrapper_->count() + static_cast<int64_t>(null_offset_.size());
is_built_ = true;
ComputeByteSize();
LOG_INFO(
"Loaded R-Tree index from {} with {} rows", path_, total_num_rows_);
@ -238,6 +239,7 @@ RTreeIndex<T>::Build(const Config& config) {
total_num_rows_ =
wrapper_->count() + static_cast<int64_t>(null_offset_.size());
is_built_ = true;
ComputeByteSize();
}
template <typename T>
@ -279,6 +281,7 @@ RTreeIndex<T>::BuildWithFieldData(
wrapper_->bulk_load_from_field_data(field_datas, schema_.nullable());
total_num_rows_ = total_rows;
is_built_ = true;
ComputeByteSize();
return;
}
}

View File

@ -143,6 +143,22 @@ class RTreeIndex : public ScalarIndex<T> {
return Count();
}
void
ComputeByteSize() override {
    // Base-class contribution first, then R-Tree specific structures.
    ScalarIndex<T>::ComputeByteSize();
    int64_t bytes = this->cached_byte_size_;
    // Row offsets of null entries.
    bytes += null_offset_.capacity() * sizeof(size_t);
    // Underlying Boost R-tree wrapper, when one exists.
    if (wrapper_) {
        bytes += wrapper_->ByteSize();
    }
    this->cached_byte_size_ = bytes;
}
// GIS-specific query methods
/**
* @brief Query candidates based on spatial operation

View File

@ -285,5 +285,22 @@ RTreeIndexWrapper::count() const {
return static_cast<int64_t>(rtree_.size());
}
int64_t
RTreeIndexWrapper::ByteSize() const {
    // Estimated memory footprint of the wrapper.
    // values_: vector<Value> where Value = std::pair<Box, int64_t>.
    // Box = bg::model::box<Point> = 2 Points = 2 * 2 * sizeof(double) = 32
    // bytes, so sizeof(Value) = 32 + 8 = 40 bytes.
    const int64_t value_storage =
        static_cast<int64_t>(values_.capacity() * sizeof(Value));
    // rtree_ internal structure (nodes, pointers, MBRs): an R*-tree with at
    // most 16 entries per node costs an estimated ~18 bytes per entry.
    const int64_t tree_overhead = static_cast<int64_t>(rtree_.size()) * 18;
    return value_storage + tree_overhead;
}
// index/leaf capacity setters removed; not applicable for Boost rtree
} // namespace milvus::index

View File

@ -97,6 +97,13 @@ class RTreeIndexWrapper {
int64_t
count() const;
/**
* @brief Get the estimated memory usage of the R-tree index
* @return Memory usage in bytes
*/
int64_t
ByteSize() const;
// Boost rtree does not use index/leaf capacities; keep only fill factor for
// compatibility (no-op currently)

View File

@ -147,7 +147,8 @@ class ScalarIndex : public IndexBase {
index_type_ == milvus::index::HYBRID_INDEX_TYPE ||
index_type_ == milvus::index::INVERTED_INDEX_TYPE ||
index_type_ == milvus::index::MARISA_TRIE ||
index_type_ == milvus::index::MARISA_TRIE_UPPER;
index_type_ == milvus::index::MARISA_TRIE_UPPER ||
index_type_ == milvus::index::ASCENDING_SORT;
}
virtual int64_t

View File

@ -100,6 +100,7 @@ ScalarIndexSort<T>::Build(size_t n, const T* values, const bool* valid_data) {
is_built_ = true;
setup_data_pointers();
ComputeByteSize();
}
template <typename T>
@ -143,6 +144,7 @@ ScalarIndexSort<T>::BuildWithFieldData(
is_built_ = true;
setup_data_pointers();
ComputeByteSize();
}
template <typename T>
@ -281,6 +283,7 @@ ScalarIndexSort<T>::LoadWithoutAssemble(const BinarySet& index_binary,
}
is_built_ = true;
ComputeByteSize();
LOG_INFO("load ScalarIndexSort done, field_id: {}, is_mmap:{}",
field_id_,

View File

@ -116,6 +116,28 @@ class ScalarIndexSort : public ScalarIndex<T> {
return size_ == 0;
}
void
ComputeByteSize() override {
    // Refresh the cached footprint: base class plus sort-index structures.
    ScalarIndex<T>::ComputeByteSize();
    int64_t bytes = this->cached_byte_size_;
    // idx_to_offsets_: per-row index-position -> row-offset table.
    bytes += idx_to_offsets_.capacity() * sizeof(int32_t);
    // valid_bitset_: null/validity bitmap.
    bytes += valid_bitset_.size_in_bytes();
    // Sorted entries live either in the mapped file (mmap mode) or in the
    // in-memory data_ vector.
    if (is_mmap_) {
        bytes += mmap_size_;
    } else {
        bytes += data_.capacity() * sizeof(IndexStructure<T>);
    }
    this->cached_byte_size_ = bytes;
}
IndexStatsPtr
Upload(const Config& config = {}) override;

View File

@ -57,6 +57,29 @@ StringIndexMarisa::Size() {
return total_size_;
}
void
StringIndexMarisa::ComputeByteSize() {
    // Refresh the cached footprint of the marisa-trie string index.
    StringIndex::ComputeByteSize();
    int64_t bytes = cached_byte_size_;
    // Trie structure: marisa reports its serialized/in-memory size via
    // io_size().
    bytes += trie_.io_size();
    // str_ids_: vector<int64_t>, one trie id per row.
    bytes += str_ids_.capacity() * sizeof(int64_t);
    // str_ids_to_offsets_: map<size_t, vector<size_t>> — per entry, count
    // the key, the vector object itself, and its heap buffer.
    for (const auto& [str_id, offsets] : str_ids_to_offsets_) {
        bytes += sizeof(size_t);
        bytes += sizeof(std::vector<size_t>);
        bytes += offsets.capacity() * sizeof(size_t);
    }
    // Map node overhead (rough estimate: ~40 bytes per std::map node).
    bytes += str_ids_to_offsets_.size() * 40;
    cached_byte_size_ = bytes;
}
int64_t
StringIndexMarisa::CalculateTotalSize() const {
int64_t size = 0;
@ -135,6 +158,7 @@ StringIndexMarisa::BuildWithFieldData(
built_ = true;
total_size_ = CalculateTotalSize();
ComputeByteSize();
}
void
@ -161,6 +185,7 @@ StringIndexMarisa::Build(size_t n,
built_ = true;
total_size_ = CalculateTotalSize();
ComputeByteSize();
}
BinarySet
@ -247,6 +272,7 @@ StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set,
fill_offsets();
built_ = true;
total_size_ = CalculateTotalSize();
ComputeByteSize();
}
void

View File

@ -35,6 +35,9 @@ class StringIndexMarisa : public StringIndex {
int64_t
Size() override;
void
ComputeByteSize() override;
BinarySet
Serialize(const Config& config) override;

View File

@ -139,6 +139,7 @@ StringIndexSort::Build(size_t n,
is_built_ = true;
total_size_ = CalculateTotalSize();
ComputeByteSize();
}
void
@ -182,6 +183,7 @@ StringIndexSort::BuildWithFieldData(
is_built_ = true;
total_size_ = CalculateTotalSize();
ComputeByteSize();
}
BinarySet
@ -335,6 +337,7 @@ StringIndexSort::LoadWithoutAssemble(const BinarySet& binary_set,
is_built_ = true;
total_size_ = CalculateTotalSize();
ComputeByteSize();
}
const TargetBitmap
@ -413,6 +416,26 @@ StringIndexSort::CalculateTotalSize() const {
return size;
}
void
StringIndexSort::ComputeByteSize() {
    // Refresh the cached footprint: base class, structures common to both
    // impl modes, then the impl-specific contribution.
    StringIndex::ComputeByteSize();
    int64_t bytes = cached_byte_size_;
    // idx_to_offsets_: per-row index-position -> row-offset table.
    bytes += idx_to_offsets_.capacity() * sizeof(int32_t);
    // valid_bitset_: null/validity bitmap.
    bytes += valid_bitset_.size_in_bytes();
    // Delegate mode-specific accounting (memory impl vs mmap impl).
    if (impl_) {
        bytes += impl_->ByteSize();
    }
    cached_byte_size_ = bytes;
}
void
StringIndexSortMemoryImpl::BuildFromMap(
std::map<std::string, PostingList>&& map,
@ -855,6 +878,36 @@ StringIndexSortMemoryImpl::Size() {
return size;
}
int64_t
StringIndexSortMemoryImpl::ByteSize() const {
    // Memory footprint of the in-memory sorted string index.
    int64_t bytes = 0;
    // unique_values_: vector<string>. sizeof(std::string) already covers the
    // SSO buffer; strings grown past the SSO threshold also own an external
    // heap buffer of `capacity()` bytes.
    const size_t sso_limit = GetStringSSOThreshold();
    bytes += unique_values_.capacity() * sizeof(std::string);
    for (const auto& value : unique_values_) {
        if (value.capacity() > sso_limit) {
            bytes += value.capacity();
        }
    }
    // posting_lists_: vector<PostingList>, where PostingList is
    // folly::small_vector<uint32_t, 4>. sizeof(PostingList) covers the
    // 4-element inline buffer; lists grown past it spill to the heap.
    bytes += posting_lists_.capacity() * sizeof(PostingList);
    for (const auto& posting : posting_lists_) {
        if (posting.capacity() > 4) {
            bytes += posting.capacity() * sizeof(uint32_t);
        }
    }
    return bytes;
}
StringIndexSortMmapImpl::~StringIndexSortMmapImpl() {
if (mmap_data_ != nullptr && mmap_data_ != MAP_FAILED) {
munmap(mmap_data_, mmap_size_);
@ -1147,4 +1200,10 @@ StringIndexSortMmapImpl::Size() {
return mmap_size_;
}
int64_t
StringIndexSortMmapImpl::ByteSize() const {
    // mmap mode: the index payload is the mapped region itself, which is the
    // O(n) portion of this impl's footprint; no extra heap structures are
    // counted here.
    return mmap_size_;
}
} // namespace milvus::index

View File

@ -123,6 +123,11 @@ class StringIndexSort : public StringIndex {
int64_t
Size() override;
// Computes and caches the total memory usage in bytes.
// For mmap mode, this includes both memory-resident structures and mmap size.
void
ComputeByteSize() override;
protected:
int64_t
CalculateTotalSize() const;
@ -199,6 +204,10 @@ class StringIndexSortImpl {
virtual int64_t
Size() = 0;
// Returns the memory usage in bytes for this impl
virtual int64_t
ByteSize() const = 0;
};
class StringIndexSortMemoryImpl : public StringIndexSortImpl {
@ -273,6 +282,9 @@ class StringIndexSortMemoryImpl : public StringIndexSortImpl {
int64_t
Size() override;
int64_t
ByteSize() const override;
private:
// Helper method for binary search
size_t
@ -387,6 +399,9 @@ class StringIndexSortMmapImpl : public StringIndexSortImpl {
int64_t
Size() override;
int64_t
ByteSize() const override;
private:
// Binary search for a value
size_t

View File

@ -234,4 +234,12 @@ void inline SetBitsetGrowing(void* bitset,
}
}
// Get the SSO (Small String Optimization) threshold for std::string.
// Strings with capacity <= this threshold store data inline (no heap allocation).
inline size_t
GetStringSSOThreshold() {
static const size_t threshold = std::string().capacity();
return threshold;
}
} // namespace milvus::index