From 1fda4bcae4b3ec4fcfc98560f25c2a2675e831f0 Mon Sep 17 00:00:00 2001 From: Buqian Zheng Date: Fri, 21 Nov 2025 12:39:05 +0800 Subject: [PATCH] enhance: [2.5] add ScalarFieldProto& overload to avoid unnecessary copies (#45744) 1. Array.h: Add an output_data(ScalarFieldProto&) overload to both the Array and ArrayView classes (in this diff the signature is spelled output_data(ScalarArray&)). 2. Use std::string_view instead of std::string for VARCHAR and GEOMETRY types to avoid extra string copies. 3. Call Reserve(length_) before writing to proto objects to reduce memory reallocations. A simple test shows that these optimizations improve Array-of-VARCHAR bulk_subscript performance by 20%. issue: https://github.com/milvus-io/milvus/issues/45679 pr: https://github.com/milvus-io/milvus/pull/45743 Signed-off-by: Buqian Zheng --- internal/core/src/common/Array.h | 52 ++++++++++++++----- .../src/segcore/ChunkedSegmentSealedImpl.cpp | 2 +- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/internal/core/src/common/Array.h b/internal/core/src/common/Array.h index 5dab13f552..4464903d04 100644 --- a/internal/core/src/common/Array.h +++ b/internal/core/src/common/Array.h @@ -293,11 +293,11 @@ class Array { return offsets_ptr_; } - ScalarArray - output_data() const { - ScalarArray data_array; + void + output_data(ScalarArray& data_array) const { switch (element_type_) { case DataType::BOOL: { + data_array.mutable_bool_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_bool_data()->add_data(element); @@ -307,6 +307,7 @@ class Array { case DataType::INT8: case DataType::INT16: case DataType::INT32: { + data_array.mutable_int_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_int_data()->add_data(element); @@ -314,6 +315,7 @@ class Array { break; } case DataType::INT64: { + data_array.mutable_long_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = 
get_data(j); data_array.mutable_long_data()->add_data(element); @@ -322,13 +324,15 @@ class Array { } case DataType::STRING: case DataType::VARCHAR: { + data_array.mutable_string_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { - auto element = get_data(j); - data_array.mutable_string_data()->add_data(element); + auto element = get_data(j); + data_array.mutable_string_data()->add_data(element.data(), element.size()); } break; } case DataType::FLOAT: { + data_array.mutable_float_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_float_data()->add_data(element); @@ -336,6 +340,7 @@ class Array { break; } case DataType::DOUBLE: { + data_array.mutable_double_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_double_data()->add_data(element); @@ -343,9 +348,10 @@ class Array { break; } case DataType::GEOMETRY: { + data_array.mutable_geometry_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { - auto element = get_data(j); - data_array.mutable_geometry_data()->add_data(element); + auto element = get_data(j); + data_array.mutable_geometry_data()->add_data(element.data(), element.size()); } break; } @@ -353,6 +359,12 @@ class Array { // empty array } } + } + + ScalarArray + output_data() const { + ScalarArray data_array; + output_data(data_array); return data_array; } @@ -538,11 +550,11 @@ class ArrayView { return reinterpret_cast(data_)[index]; } - ScalarArray - output_data() const { - ScalarArray data_array; + void + output_data(ScalarArray& data_array) const { switch (element_type_) { case DataType::BOOL: { + data_array.mutable_bool_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_bool_data()->add_data(element); @@ -552,6 +564,7 @@ class ArrayView { case DataType::INT8: case DataType::INT16: case 
DataType::INT32: { + data_array.mutable_int_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_int_data()->add_data(element); @@ -559,6 +572,7 @@ class ArrayView { break; } case DataType::INT64: { + data_array.mutable_long_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_long_data()->add_data(element); @@ -567,13 +581,15 @@ class ArrayView { } case DataType::STRING: case DataType::VARCHAR: { + data_array.mutable_string_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { - auto element = get_data(j); - data_array.mutable_string_data()->add_data(element); + auto element = get_data(j); + data_array.mutable_string_data()->add_data(element.data(), element.size()); } break; } case DataType::FLOAT: { + data_array.mutable_float_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_float_data()->add_data(element); @@ -581,6 +597,7 @@ class ArrayView { break; } case DataType::DOUBLE: { + data_array.mutable_double_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { auto element = get_data(j); data_array.mutable_double_data()->add_data(element); @@ -588,9 +605,10 @@ class ArrayView { break; } case DataType::GEOMETRY: { + data_array.mutable_geometry_data()->mutable_data()->Reserve(length_); for (int j = 0; j < length_; ++j) { - auto element = get_data(j); - data_array.mutable_geometry_data()->add_data(element); + auto element = get_data(j); + data_array.mutable_geometry_data()->add_data(element.data(), element.size()); } break; } @@ -598,6 +616,12 @@ class ArrayView { // empty array } } + } + + ScalarArray + output_data() const { + ScalarArray data_array; + output_data(data_array); return data_array; } diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp 
b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp index c8f6b6cf17..adf8687669 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp @@ -1436,7 +1436,7 @@ ChunkedSegmentSealedImpl::bulk_subscript_array_impl( auto field = reinterpret_cast(column); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; - dst->at(i) = std::move(field->RawAt(offset)); + field->operator[](offset).output_data(dst->at(i)); } }