enhance: [AddField] Add log for segcore segment schema change (#43215)

Related to #39178

This PR add logs for segment schema change operations.

Also fixes the nit comments from PR #42490

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
congqixia 2025-07-10 10:22:47 +08:00 committed by GitHub
parent 7f8c5c9bb8
commit f027eea545
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 62 additions and 19 deletions

View File

@ -302,7 +302,7 @@ class Schema {
} }
bool bool
ShallLoadField(FieldId field_id) { ShouldLoadField(FieldId field_id) {
return load_fields_.empty() || load_fields_.count(field_id) > 0; return load_fields_.empty() || load_fields_.count(field_id) > 0;
} }

View File

@ -32,11 +32,11 @@ struct FieldDataInfo {
FieldDataInfo(int64_t field_id, FieldDataInfo(int64_t field_id,
size_t row_count, size_t row_count,
std::string mmap_dir_path = "", std::string mmap_dir_path = "",
bool in_list = false) bool in_load_list = false)
: field_id(field_id), : field_id(field_id),
row_count(row_count), row_count(row_count),
mmap_dir_path(std::move(mmap_dir_path)), mmap_dir_path(std::move(mmap_dir_path)),
in_load_list(in_list) { in_load_list(in_load_list) {
arrow_reader_channel = std::make_shared<ArrowReaderChannel>(); arrow_reader_channel = std::make_shared<ArrowReaderChannel>();
} }

View File

@ -293,7 +293,7 @@ ChunkedSegmentSealedImpl::load_column_group_data_internal(
for (int i = 0; i < field_id_list.size(); ++i) { for (int i = 0; i < field_id_list.size(); ++i) {
milvus_field_ids.push_back(FieldId(field_id_list.Get(i))); milvus_field_ids.push_back(FieldId(field_id_list.Get(i)));
merged_in_load_list = merged_in_load_list || merged_in_load_list = merged_in_load_list ||
schema_->ShallLoadField(milvus_field_ids[i]); schema_->ShouldLoadField(milvus_field_ids[i]);
} }
auto column_group_info = FieldDataInfo(column_group_id.get(), auto column_group_info = FieldDataInfo(column_group_id.get(),
@ -360,7 +360,7 @@ ChunkedSegmentSealedImpl::load_field_data_internal(
auto field_data_info = FieldDataInfo(field_id.get(), auto field_data_info = FieldDataInfo(field_id.get(),
num_rows, num_rows,
load_info.mmap_dir_path, load_info.mmap_dir_path,
schema_->ShallLoadField(field_id)); schema_->ShouldLoadField(field_id));
LOG_INFO("segment {} loads field {} with num_rows {}, sorted by pk {}", LOG_INFO("segment {} loads field {} with num_rows {}, sorted by pk {}",
this->get_segment_id(), this->get_segment_id(),
field_id.get(), field_id.get(),
@ -1892,6 +1892,12 @@ ChunkedSegmentSealedImpl::RemoveFieldFile(const FieldId field_id) {
void void
ChunkedSegmentSealedImpl::LazyCheckSchema(SchemaPtr sch) { ChunkedSegmentSealedImpl::LazyCheckSchema(SchemaPtr sch) {
if (sch->get_schema_version() > schema_->get_schema_version()) { if (sch->get_schema_version() > schema_->get_schema_version()) {
LOG_INFO(
"lazy check schema segment {} found newer schema version, current "
"schema version {}, new schema version {}",
id_,
schema_->get_schema_version(),
sch->get_schema_version());
Reopen(sch); Reopen(sch);
} }
} }
@ -1917,7 +1923,8 @@ ChunkedSegmentSealedImpl::load_field_data_common(
} }
if (!enable_mmap) { if (!enable_mmap) {
if (!is_proxy_column || is_proxy_column && if (!is_proxy_column ||
is_proxy_column &&
field_id.get() != DEFAULT_SHORT_COLUMN_GROUP_ID) { field_id.get() != DEFAULT_SHORT_COLUMN_GROUP_ID) {
stats_.mem_size += column->DataByteSize(); stats_.mem_size += column->DataByteSize();
} }
@ -2017,9 +2024,14 @@ ChunkedSegmentSealedImpl::FinishLoad() {
void void
ChunkedSegmentSealedImpl::fill_empty_field(const FieldMeta& field_meta) { ChunkedSegmentSealedImpl::fill_empty_field(const FieldMeta& field_meta) {
auto field_id = field_meta.get_id();
LOG_INFO("start fill empty field {} (data type {}) for sealed segment {}",
field_meta.get_data_type(),
field_id.get(),
id_);
int64_t size = num_rows_.value(); int64_t size = num_rows_.value();
AssertInfo(size > 0, "Chunked Sealed segment must have more than 0 row"); AssertInfo(size > 0, "Chunked Sealed segment must have more than 0 row");
auto field_data_info = FieldDataInfo(field_meta.get_id().get(), size, ""); auto field_data_info = FieldDataInfo(field_id.get(), size, "");
std::unique_ptr<Translator<milvus::Chunk>> translator = std::unique_ptr<Translator<milvus::Chunk>> translator =
std::make_unique<storagev1translator::DefaultValueChunkTranslator>( std::make_unique<storagev1translator::DefaultValueChunkTranslator>(
get_segment_id(), field_meta, field_data_info, false); get_segment_id(), field_meta, field_data_info, false);
@ -2053,9 +2065,13 @@ ChunkedSegmentSealedImpl::fill_empty_field(const FieldMeta& field_meta) {
break; break;
} }
} }
auto field_id = field_meta.get_id();
fields_.emplace(field_id, column); fields_.emplace(field_id, column);
set_bit(field_data_ready_bitset_, field_id, true); set_bit(field_data_ready_bitset_, field_id, true);
LOG_INFO("fill empty field {} (data type {}) for growing segment {} done",
field_meta.get_data_type(),
field_id.get(),
id_);
} }
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -479,10 +479,6 @@ CreateSealedSegment(
const SegcoreConfig& segcore_config = SegcoreConfig::default_config(), const SegcoreConfig& segcore_config = SegcoreConfig::default_config(),
bool is_sorted_by_pk = false) { bool is_sorted_by_pk = false) {
return std::make_unique<ChunkedSegmentSealedImpl>( return std::make_unique<ChunkedSegmentSealedImpl>(
schema, schema, index_meta, segcore_config, segment_id, is_sorted_by_pk);
index_meta,
segcore_config,
segment_id,
is_sorted_by_pk);
} }
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -101,6 +101,12 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
field_id_to_offset.emplace(field_id, field_offset++); field_id_to_offset.emplace(field_id, field_offset++);
// may be added field, add the null if has existed data // may be added field, add the null if has existed data
if (exist_rows > 0 && !insert_record_.is_data_exist(field_id)) { if (exist_rows > 0 && !insert_record_.is_data_exist(field_id)) {
LOG_WARN(
"heterogeneous insert data found for segment {}, field id {}, "
"data type {}",
id_,
field_id.get(),
field.type());
schema_->AddField(FieldName(field.field_name()), schema_->AddField(FieldName(field.field_name()),
field_id, field_id,
DataType(field.type()), DataType(field.type()),
@ -124,6 +130,13 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
if (field_id_to_offset.count(field_id) > 0) { if (field_id_to_offset.count(field_id) > 0) {
continue; continue;
} }
LOG_INFO(
"schema newer than insert data found for segment {}, attach empty "
"field data"
"not exist field {}, data type {}",
id_,
field_id.get(),
field_meta.get_data_type());
auto data = bulk_subscript_not_exist_field(field_meta, num_rows); auto data = bulk_subscript_not_exist_field(field_meta, num_rows);
insert_record_proto->add_fields_data()->CopyFrom(*data); insert_record_proto->add_fields_data()->CopyFrom(*data);
field_id_to_offset.emplace(field_id, field_offset++); field_id_to_offset.emplace(field_id, field_offset++);
@ -500,7 +513,9 @@ SegmentGrowingImpl::load_column_group_data_internal(
auto field_data = storage::CreateFieldData( auto field_data = storage::CreateFieldData(
data_type, data_type,
field.second.is_nullable(), field.second.is_nullable(),
IsVectorDataType(data_type) && !IsSparseFloatVectorDataType(data_type) ? field.second.get_dim() IsVectorDataType(data_type) &&
!IsSparseFloatVectorDataType(data_type)
? field.second.get_dim()
: 1, : 1,
batch_num_rows); batch_num_rows);
field_data->FillFieldData(table->column(i)); field_data->FillFieldData(table->column(i));
@ -1236,6 +1251,12 @@ SegmentGrowingImpl::BulkGetJsonData(
void void
SegmentGrowingImpl::LazyCheckSchema(SchemaPtr sch) { SegmentGrowingImpl::LazyCheckSchema(SchemaPtr sch) {
if (sch->get_schema_version() > schema_->get_schema_version()) { if (sch->get_schema_version() > schema_->get_schema_version()) {
LOG_INFO(
"lazy check schema segment {} found newer schema version, current "
"schema version {}, new schema version {}",
id_,
schema_->get_schema_version(),
sch->get_schema_version());
Reopen(sch); Reopen(sch);
} }
} }
@ -1271,6 +1292,10 @@ SegmentGrowingImpl::FinishLoad() {
void void
SegmentGrowingImpl::fill_empty_field(const FieldMeta& field_meta) { SegmentGrowingImpl::fill_empty_field(const FieldMeta& field_meta) {
auto field_id = field_meta.get_id(); auto field_id = field_meta.get_id();
LOG_INFO("start fill empty field {} (data type {}) for growing segment {}",
field_meta.get_data_type(),
field_id.get(),
id_);
// append meta only needed when schema is old // append meta only needed when schema is old
// loading old segment with new schema will have meta appended // loading old segment with new schema will have meta appended
if (!insert_record_.is_data_exist(field_id)) { if (!insert_record_.is_data_exist(field_id)) {
@ -1286,9 +1311,10 @@ SegmentGrowingImpl::fill_empty_field(const FieldMeta& field_meta) {
insert_record_.get_data_base(field_id)->set_data_raw( insert_record_.get_data_base(field_id)->set_data_raw(
0, total_row_num, data.get(), field_meta); 0, total_row_num, data.get(), field_meta);
LOG_INFO("Growing segment {} fill empty field {} done", LOG_INFO("fill empty field {} (data type {}) for growing segment {} done",
this->get_segment_id(), field_meta.get_data_type(),
field_meta.get_id().get()); field_id.get(),
id_);
} }
} // namespace milvus::segcore } // namespace milvus::segcore

View File

@ -97,6 +97,11 @@ func (m *collectionManager) PutOrRef(collectionID int64, schema *schemapb.Collec
collection.schema.Store(schema) collection.schema.Store(schema)
collection.ccollection.UpdateSchema(schema, loadMeta.GetSchemaVersion()) collection.ccollection.UpdateSchema(schema, loadMeta.GetSchemaVersion())
collection.schemaVersion = loadMeta.GetSchemaVersion() collection.schemaVersion = loadMeta.GetSchemaVersion()
log.Info("update collection schema",
zap.Int64("collectionID", collectionID),
zap.Uint64("schemaVersion", loadMeta.GetSchemaVersion()),
zap.Any("schema", schema),
)
} }
collection.Ref(1) collection.Ref(1)
return nil return nil