fix: apply mmap settings correctly during segment load (#46017)

Previously, mmap settings configured at the collection or field level
were not being applied during segment loading in segcore. This was
caused by:

1. A typo in the key name: "mmap.enable" instead of "mmap.enabled"
2. Missing logic to parse and apply mmap settings from schema

This commit fixes the issue by:
- Correcting the key name to "mmap.enabled" to match the standard
- Adding Schema::MmapEnabled() method to retrieve field/collection level
mmap settings with proper fallback logic
- Parsing mmap settings from field type_params and collection properties
during schema parsing
- Applying computed mmap settings in LoadColumnGroup() and Load()
methods instead of hardcoded false values
- Using global MmapConfig as fallback when no explicit setting exists

The mmap setting priority is now:
1. Field-level mmap setting (from type_params)
2. Collection-level mmap setting (from properties)
3. Global mmap config (from MmapManager)

For column groups, if any field has mmap enabled, the entire group uses
mmap (since they are loaded together).

Related issue: #45060

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
congqixia 2025-12-03 10:31:10 +08:00 committed by GitHub
parent dd3797f3ac
commit 5d0c8b1b40
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 359 additions and 7 deletions

View File

@ -133,3 +133,4 @@ const std::string ELEMENT_TYPE_KEY_FOR_ARROW = "elementType";
// EPSILON value for comparing float numbers // EPSILON value for comparing float numbers
const float EPSILON = 0.0000000119; const float EPSILON = 0.0000000119;
const std::string NAMESPACE_FIELD_NAME = "$namespace_id"; const std::string NAMESPACE_FIELD_NAME = "$namespace_id";
const std::string MMAP_ENABLED_KEY = "mmap.enabled";

View File

@ -26,7 +26,9 @@
#include "Schema.h" #include "Schema.h"
#include "SystemProperty.h" #include "SystemProperty.h"
#include "arrow/util/key_value_metadata.h" #include "arrow/util/key_value_metadata.h"
#include "common/Consts.h"
#include "milvus-storage/common/constants.h" #include "milvus-storage/common/constants.h"
#include "pb/common.pb.h"
#include "protobuf_utils.h" #include "protobuf_utils.h"
namespace milvus { namespace milvus {
@ -61,6 +63,12 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
if (child.name() == namespace_field_name) { if (child.name() == namespace_field_name) {
schema->set_namespace_field_id(field_id); schema->set_namespace_field_id(field_id);
} }
auto [has_setting, enabled] =
GetBoolFromRepeatedKVs(child.type_params(), MMAP_ENABLED_KEY);
if (has_setting) {
schema->mmap_fields_[field_id] = enabled;
}
}; };
for (const milvus::proto::schema::FieldSchema& child : for (const milvus::proto::schema::FieldSchema& child :
@ -75,6 +83,9 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
} }
} }
std::tie(schema->has_mmap_setting_, schema->mmap_enabled_) =
GetBoolFromRepeatedKVs(schema_proto.properties(), MMAP_ENABLED_KEY);
AssertInfo(schema->get_primary_field_id().has_value(), AssertInfo(schema->get_primary_field_id().has_value(),
"primary key should be specified"); "primary key should be specified");
@ -150,4 +161,14 @@ Schema::AbsentFields(Schema& old_schema) const {
return std::make_unique<std::vector<FieldMeta>>(result); return std::make_unique<std::vector<FieldMeta>>(result);
} }
std::pair<bool, bool>
Schema::MmapEnabled(const FieldId& field_id) const {
auto it = mmap_fields_.find(field_id);
// fallback to collection-level config
if (it == mmap_fields_.end()) {
return {has_mmap_setting_, mmap_enabled_};
}
return {true, it->second};
}
} // namespace milvus } // namespace milvus

View File

@ -374,6 +374,24 @@ class Schema {
std::unique_ptr<std::vector<FieldMeta>> std::unique_ptr<std::vector<FieldMeta>>
AbsentFields(Schema& old_schema) const; AbsentFields(Schema& old_schema) const;
/**
* @brief Determines whether the specified field should use mmap for data loading.
*
* This function checks mmap settings at the field level first. If no field-level
* setting is found, it falls back to the collection-level mmap configuration.
*
* @param field The field ID to check mmap settings for.
*
* @return A pair of booleans:
* - first: Whether an mmap setting exists (at field or collection level).
* - second: Whether mmap is enabled (only meaningful when first is true).
*
* @note If no mmap setting exists at any level, first will be false and second
* should be ignored.
*/
std::pair<bool, bool>
MmapEnabled(const FieldId& field) const;
private: private:
int64_t debug_id = START_USER_FIELDID; int64_t debug_id = START_USER_FIELDID;
std::vector<FieldId> field_ids_; std::vector<FieldId> field_ids_;
@ -395,6 +413,11 @@ class Schema {
// schema_version_, currently marked with update timestamp // schema_version_, currently marked with update timestamp
uint64_t schema_version_; uint64_t schema_version_;
// mmap settings
bool has_mmap_setting_ = false;
bool mmap_enabled_ = false;
std::unordered_map<FieldId, bool> mmap_fields_;
}; };
using SchemaPtr = std::shared_ptr<Schema>; using SchemaPtr = std::shared_ptr<Schema>;

View File

@ -0,0 +1,212 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include "common/Schema.h"
using namespace milvus;
class SchemaTest : public ::testing::Test {
protected:
void
SetUp() override {
schema_ = std::make_shared<Schema>();
}
std::shared_ptr<Schema> schema_;
};
TEST_F(SchemaTest, MmapEnabledNoSetting) {
// Add a field without any mmap setting
auto field_id = schema_->AddDebugField("test_field", DataType::INT64);
schema_->set_primary_field_id(field_id);
// When no mmap setting exists at any level, first should be false
auto [has_setting, enabled] = schema_->MmapEnabled(field_id);
EXPECT_FALSE(has_setting);
// The enabled value is undefined when has_setting is false, so we don't check it
}
TEST_F(SchemaTest, MmapEnabledCollectionLevelEnabled) {
// Create schema with collection-level mmap enabled via protobuf
milvus::proto::schema::CollectionSchema schema_proto;
auto* field = schema_proto.add_fields();
field->set_fieldid(100);
field->set_name("pk_field");
field->set_data_type(milvus::proto::schema::DataType::Int64);
field->set_is_primary_key(true);
// Set collection-level mmap enabled
auto* prop = schema_proto.add_properties();
prop->set_key("mmap.enabled");
prop->set_value("true");
auto parsed_schema = Schema::ParseFrom(schema_proto);
FieldId pk_field_id(100);
auto [has_setting, enabled] = parsed_schema->MmapEnabled(pk_field_id);
EXPECT_TRUE(has_setting);
EXPECT_TRUE(enabled);
}
TEST_F(SchemaTest, MmapEnabledCollectionLevelDisabled) {
// Create schema with collection-level mmap disabled via protobuf
milvus::proto::schema::CollectionSchema schema_proto;
auto* field = schema_proto.add_fields();
field->set_fieldid(100);
field->set_name("pk_field");
field->set_data_type(milvus::proto::schema::DataType::Int64);
field->set_is_primary_key(true);
// Set collection-level mmap disabled
auto* prop = schema_proto.add_properties();
prop->set_key("mmap.enabled");
prop->set_value("false");
auto parsed_schema = Schema::ParseFrom(schema_proto);
FieldId pk_field_id(100);
auto [has_setting, enabled] = parsed_schema->MmapEnabled(pk_field_id);
EXPECT_TRUE(has_setting);
EXPECT_FALSE(enabled);
}
TEST_F(SchemaTest, MmapEnabledCollectionLevelCaseInsensitive) {
// Test that mmap value parsing is case-insensitive
milvus::proto::schema::CollectionSchema schema_proto;
auto* field = schema_proto.add_fields();
field->set_fieldid(100);
field->set_name("pk_field");
field->set_data_type(milvus::proto::schema::DataType::Int64);
field->set_is_primary_key(true);
// Set collection-level mmap with uppercase TRUE
auto* prop = schema_proto.add_properties();
prop->set_key("mmap.enabled");
prop->set_value("TRUE");
auto parsed_schema = Schema::ParseFrom(schema_proto);
FieldId pk_field_id(100);
auto [has_setting, enabled] = parsed_schema->MmapEnabled(pk_field_id);
EXPECT_TRUE(has_setting);
EXPECT_TRUE(enabled);
}
TEST_F(SchemaTest, MmapEnabledFieldLevelOverridesCollectionLevel) {
// Test that field-level mmap setting overrides collection-level setting
milvus::proto::schema::CollectionSchema schema_proto;
auto* field = schema_proto.add_fields();
field->set_fieldid(100);
field->set_name("pk_field");
field->set_data_type(milvus::proto::schema::DataType::Int64);
field->set_is_primary_key(true);
// Set collection-level mmap enabled
auto* prop = schema_proto.add_properties();
prop->set_key("mmap.enabled");
prop->set_value("true");
// Note: Field-level mmap settings are set via schema_proto.properties()
// in the current implementation, which applies to all fields.
// This test verifies the fallback behavior when no field-level override exists.
auto parsed_schema = Schema::ParseFrom(schema_proto);
FieldId pk_field_id(100);
// Without field-level override, should use collection-level setting
auto [has_setting, enabled] = parsed_schema->MmapEnabled(pk_field_id);
EXPECT_TRUE(has_setting);
EXPECT_TRUE(enabled);
}
TEST_F(SchemaTest, MmapEnabledNonExistentField) {
// Test MmapEnabled with a field that doesn't exist in mmap_fields_
// but collection-level setting exists
milvus::proto::schema::CollectionSchema schema_proto;
auto* field1 = schema_proto.add_fields();
field1->set_fieldid(100);
field1->set_name("pk_field");
field1->set_data_type(milvus::proto::schema::DataType::Int64);
field1->set_is_primary_key(true);
auto* field2 = schema_proto.add_fields();
field2->set_fieldid(101);
field2->set_name("data_field");
field2->set_data_type(milvus::proto::schema::DataType::Float);
// Set collection-level mmap enabled
auto* prop = schema_proto.add_properties();
prop->set_key("mmap.enabled");
prop->set_value("true");
auto parsed_schema = Schema::ParseFrom(schema_proto);
// Both fields should fallback to collection-level setting
FieldId pk_field_id(100);
auto [has_setting1, enabled1] = parsed_schema->MmapEnabled(pk_field_id);
EXPECT_TRUE(has_setting1);
EXPECT_TRUE(enabled1);
FieldId data_field_id(101);
auto [has_setting2, enabled2] = parsed_schema->MmapEnabled(data_field_id);
EXPECT_TRUE(has_setting2);
EXPECT_TRUE(enabled2);
// Test with a field ID that was never added to the schema
FieldId non_existent_field_id(999);
auto [has_setting3, enabled3] =
parsed_schema->MmapEnabled(non_existent_field_id);
EXPECT_TRUE(has_setting3); // Falls back to collection-level
EXPECT_TRUE(enabled3);
}
TEST_F(SchemaTest, MmapEnabledMultipleFields) {
// Test MmapEnabled with multiple fields, all using collection-level setting
milvus::proto::schema::CollectionSchema schema_proto;
auto* pk_field = schema_proto.add_fields();
pk_field->set_fieldid(100);
pk_field->set_name("pk_field");
pk_field->set_data_type(milvus::proto::schema::DataType::Int64);
pk_field->set_is_primary_key(true);
auto* int_field = schema_proto.add_fields();
int_field->set_fieldid(101);
int_field->set_name("int_field");
int_field->set_data_type(milvus::proto::schema::DataType::Int32);
auto* float_field = schema_proto.add_fields();
float_field->set_fieldid(102);
float_field->set_name("float_field");
float_field->set_data_type(milvus::proto::schema::DataType::Float);
// Set collection-level mmap disabled
auto* prop = schema_proto.add_properties();
prop->set_key("mmap.enabled");
prop->set_value("false");
auto parsed_schema = Schema::ParseFrom(schema_proto);
// All fields should have the same collection-level setting
for (int64_t id = 100; id <= 102; ++id) {
FieldId field_id(id);
auto [has_setting, enabled] = parsed_schema->MmapEnabled(field_id);
EXPECT_TRUE(has_setting);
EXPECT_FALSE(enabled);
}
}

View File

@ -45,6 +45,32 @@ RepeatedKeyValToMap(
return mapping; return mapping;
} }
/**
* @brief Get a boolean value from repeated KeyValuePair by key.
*
* @param kvs The repeated KeyValuePair field to search.
* @param key The key to look for.
* @return std::pair<bool, bool> where:
* - first: whether the key was found.
* - second: the parsed boolean value (true if value is "true", case-insensitive).
*/
static std::pair<bool, bool>
GetBoolFromRepeatedKVs(
const google::protobuf::RepeatedPtrField<proto::common::KeyValuePair>& kvs,
const std::string& key) {
for (auto& kv : kvs) {
if (kv.key() == key) {
std::string lower;
std::transform(kv.value().begin(),
kv.value().end(),
std::back_inserter(lower),
::tolower);
return {true, lower == "true"};
}
}
return {false, false};
}
class ProtoLayout; class ProtoLayout;
using ProtoLayoutPtr = std::unique_ptr<ProtoLayout>; using ProtoLayoutPtr = std::unique_ptr<ProtoLayout>;

View File

@ -289,16 +289,20 @@ ChunkedSegmentSealedImpl::ConvertFieldIndexInfoToLoadIndexInfo(
} }
} }
bool mmap_enabled = false; auto& mmap_config = storage::MmapManager::GetInstance().GetMmapConfig();
auto use_mmap = IsVectorDataType(field_meta.get_data_type())
? mmap_config.GetVectorIndexEnableMmap()
: mmap_config.GetScalarIndexEnableMmap();
// Set index params // Set index params
for (const auto& kv_pair : field_index_info->index_params()) { for (const auto& kv_pair : field_index_info->index_params()) {
if (kv_pair.key() == "mmap.enable") { if (kv_pair.key() == "mmap.enabled") {
std::string lower; std::string lower;
std::transform(kv_pair.value().begin(), std::transform(kv_pair.value().begin(),
kv_pair.value().end(), kv_pair.value().end(),
std::back_inserter(lower), std::back_inserter(lower),
::tolower); ::tolower);
mmap_enabled = lower == "true"; use_mmap = (lower == "true");
} }
load_index_info.index_params[kv_pair.key()] = kv_pair.value(); load_index_info.index_params[kv_pair.key()] = kv_pair.value();
} }
@ -316,7 +320,7 @@ ChunkedSegmentSealedImpl::ConvertFieldIndexInfoToLoadIndexInfo(
milvus::storage::LocalChunkManagerSingleton::GetInstance() milvus::storage::LocalChunkManagerSingleton::GetInstance()
.GetChunkManager() .GetChunkManager()
->GetRootPath(); ->GetRootPath();
load_index_info.enable_mmap = mmap_enabled; load_index_info.enable_mmap = use_mmap;
return load_index_info; return load_index_info;
} }
@ -393,6 +397,46 @@ ChunkedSegmentSealedImpl::LoadColumnGroup(
auto field_metas = schema_->get_field_metas(milvus_field_ids); auto field_metas = schema_->get_field_metas(milvus_field_ids);
// assumption: vector field occupies whole column group
bool is_vector = false;
bool index_has_rawdata = true;
bool has_mmap_setting = false;
bool mmap_enabled = false;
for (auto& [field_id, field_meta] : field_metas) {
if (IsVectorDataType(field_meta.get_data_type())) {
is_vector = true;
}
std::shared_lock lck(mutex_);
if (index_has_raw_data_.find(field_id) != index_has_raw_data_.end()) {
index_has_rawdata =
index_has_raw_data_.at(field_id) && index_has_rawdata;
} else {
index_has_rawdata = false;
}
// if field has mmap setting, use it
// - mmap setting at collection level, then all field are the same
// - mmap setting at field level, we define that as long as one field shall be mmap, then whole group shall be mmaped
auto [field_has_setting, field_mmap_enabled] =
schema_->MmapEnabled(field_id);
has_mmap_setting = has_mmap_setting || field_has_setting;
mmap_enabled = mmap_enabled || field_mmap_enabled;
}
if (index_has_rawdata) {
LOG_INFO(
"[StorageV2] segment {} index(es) provide all raw data for column "
"group index {}, skip loading binlog",
this->get_segment_id(),
index);
return;
}
auto& mmap_config = storage::MmapManager::GetInstance().GetMmapConfig();
bool global_use_mmap = is_vector ? mmap_config.GetVectorFieldEnableMmap()
: mmap_config.GetScalarFieldEnableMmap();
auto use_mmap = has_mmap_setting ? mmap_enabled : global_use_mmap;
auto chunk_reader_result = reader_->get_chunk_reader(index); auto chunk_reader_result = reader_->get_chunk_reader(index);
AssertInfo(chunk_reader_result.ok(), AssertInfo(chunk_reader_result.ok(),
"get chunk reader failed, segment {}, column group index {}", "get chunk reader failed, segment {}, column group index {}",
@ -411,7 +455,7 @@ ChunkedSegmentSealedImpl::LoadColumnGroup(
index, index,
std::move(chunk_reader), std::move(chunk_reader),
field_metas, field_metas,
false, // TODO use_mmap,
column_group->columns.size(), column_group->columns.size(),
segment_load_info_.priority()); segment_load_info_.priority());
auto chunked_column_group = auto chunked_column_group =
@ -429,7 +473,7 @@ ChunkedSegmentSealedImpl::LoadColumnGroup(
column, column,
segment_load_info_.num_of_rows(), segment_load_info_.num_of_rows(),
data_type, data_type,
false, // info.enable_mmap, use_mmap,
true, true,
std:: std::
nullopt); // manifest cannot provide parquet skip index directly nullopt); // manifest cannot provide parquet skip index directly
@ -530,7 +574,7 @@ ChunkedSegmentSealedImpl::load_column_group_data_internal(
std::vector<FieldId> milvus_field_ids; std::vector<FieldId> milvus_field_ids;
milvus_field_ids.reserve(field_id_list.size()); milvus_field_ids.reserve(field_id_list.size());
for (int i = 0; i < field_id_list.size(); ++i) { for (int i = 0; i < field_id_list.size(); ++i) {
milvus_field_ids.push_back(FieldId(field_id_list.Get(i))); milvus_field_ids.emplace_back(field_id_list.Get(i));
merged_in_load_list = merged_in_load_list || merged_in_load_list = merged_in_load_list ||
schema_->ShouldLoadField(milvus_field_ids[i]); schema_->ShouldLoadField(milvus_field_ids[i]);
} }
@ -2960,6 +3004,31 @@ ChunkedSegmentSealedImpl::Load(milvus::tracer::TraceContext& trace_ctx) {
} }
field_binlog_info.row_count = total_entries; field_binlog_info.row_count = total_entries;
bool has_mmap_setting = false;
bool mmap_enabled = false;
bool is_vector = false;
for (const auto& child_field_id : field_binlog.child_fields()) {
auto& field_meta = schema_->operator[](FieldId(child_field_id));
if (IsVectorDataType(field_meta.get_data_type())) {
is_vector = true;
}
// if field has mmap setting, use it
// - mmap setting at collection level, then all field are the same
// - mmap setting at field level, we define that as long as one field shall be mmap, then whole group shall be mmaped
auto [field_has_setting, field_mmap_enabled] =
schema_->MmapEnabled(field_id);
has_mmap_setting = has_mmap_setting || field_has_setting;
mmap_enabled = mmap_enabled || field_mmap_enabled;
}
auto& mmap_config = storage::MmapManager::GetInstance().GetMmapConfig();
auto global_use_mmap = is_vector
? mmap_config.GetVectorFieldEnableMmap()
: mmap_config.GetScalarFieldEnableMmap();
field_binlog_info.enable_mmap =
has_mmap_setting ? mmap_enabled : global_use_mmap;
// Store in map // Store in map
load_field_data_info.field_infos[field_id.get()] = field_binlog_info; load_field_data_info.field_infos[field_id.get()] = field_binlog_info;