enhance: add ScalarFieldProto& overload to avoid unnecessary copies (#45743)

1. Array.h: Add output_data(ScalarFieldProto&) overload for both Array
and ArrayView classes
2. Use std::string_view instead of std::string for VARCHAR and GEOMETRY
types to avoid extra string copies
3. Call Reserve(length_) before writing to proto objects to reduce
memory reallocations

A simple test shows that these optimizations improve the performance of
bulk_subscript on Array-of-VARCHAR fields by 20%.

issue: https://github.com/milvus-io/milvus/issues/45679

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
This commit is contained in:
Buqian Zheng 2025-11-21 18:35:05 +08:00 committed by GitHub
parent f51fcc09ae
commit e00ad1098f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 65 additions and 27 deletions

View File

@ -296,11 +296,12 @@ class Array {
return offsets_ptr_.get();
}
ScalarFieldProto
output_data() const {
ScalarFieldProto data_array;
void
output_data(ScalarFieldProto& data_array) const {
switch (element_type_) {
case DataType::BOOL: {
data_array.mutable_bool_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<bool>(j);
data_array.mutable_bool_data()->add_data(element);
@ -310,6 +311,7 @@ class Array {
case DataType::INT8:
case DataType::INT16:
case DataType::INT32: {
data_array.mutable_int_data()->mutable_data()->Reserve(length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<int>(j);
data_array.mutable_int_data()->add_data(element);
@ -317,6 +319,8 @@ class Array {
break;
}
case DataType::INT64: {
data_array.mutable_long_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<int64_t>(j);
data_array.mutable_long_data()->add_data(element);
@ -325,13 +329,18 @@ class Array {
}
case DataType::STRING:
case DataType::VARCHAR: {
data_array.mutable_string_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<std::string>(j);
data_array.mutable_string_data()->add_data(element);
auto element = get_data<std::string_view>(j);
data_array.mutable_string_data()->add_data(element.data(),
element.size());
}
break;
}
case DataType::FLOAT: {
data_array.mutable_float_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<float>(j);
data_array.mutable_float_data()->add_data(element);
@ -339,6 +348,8 @@ class Array {
break;
}
case DataType::DOUBLE: {
data_array.mutable_double_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<double>(j);
data_array.mutable_double_data()->add_data(element);
@ -346,9 +357,12 @@ class Array {
break;
}
case DataType::GEOMETRY: {
data_array.mutable_geometry_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<std::string>(j);
data_array.mutable_geometry_data()->add_data(element);
auto element = get_data<std::string_view>(j);
data_array.mutable_geometry_data()->add_data(
element.data(), element.size());
}
break;
}
@ -356,6 +370,12 @@ class Array {
// empty array
}
}
}
// Convenience overload that returns the result by value: creates an empty
// ScalarFieldProto, fills it via output_data(ScalarFieldProto&), and
// returns it.
ScalarFieldProto
output_data() const {
ScalarFieldProto data_array;
output_data(data_array);
return data_array;
}
@ -541,11 +561,12 @@ class ArrayView {
return reinterpret_cast<T*>(data_)[index];
}
ScalarFieldProto
output_data() const {
ScalarFieldProto data_array;
void
output_data(ScalarFieldProto& data_array) const {
switch (element_type_) {
case DataType::BOOL: {
data_array.mutable_bool_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<bool>(j);
data_array.mutable_bool_data()->add_data(element);
@ -555,6 +576,7 @@ class ArrayView {
case DataType::INT8:
case DataType::INT16:
case DataType::INT32: {
data_array.mutable_int_data()->mutable_data()->Reserve(length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<int>(j);
data_array.mutable_int_data()->add_data(element);
@ -562,6 +584,8 @@ class ArrayView {
break;
}
case DataType::INT64: {
data_array.mutable_long_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<int64_t>(j);
data_array.mutable_long_data()->add_data(element);
@ -570,13 +594,18 @@ class ArrayView {
}
case DataType::STRING:
case DataType::VARCHAR: {
data_array.mutable_string_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<std::string>(j);
data_array.mutable_string_data()->add_data(element);
auto element = get_data<std::string_view>(j);
data_array.mutable_string_data()->add_data(element.data(),
element.size());
}
break;
}
case DataType::FLOAT: {
data_array.mutable_float_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<float>(j);
data_array.mutable_float_data()->add_data(element);
@ -584,6 +613,8 @@ class ArrayView {
break;
}
case DataType::DOUBLE: {
data_array.mutable_double_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<double>(j);
data_array.mutable_double_data()->add_data(element);
@ -591,9 +622,12 @@ class ArrayView {
break;
}
case DataType::GEOMETRY: {
data_array.mutable_geometry_data()->mutable_data()->Reserve(
length_);
for (int j = 0; j < length_; ++j) {
auto element = get_data<std::string>(j);
data_array.mutable_geometry_data()->add_data(element);
auto element = get_data<std::string_view>(j);
data_array.mutable_geometry_data()->add_data(
element.data(), element.size());
}
break;
}
@ -601,6 +635,12 @@ class ArrayView {
// empty array
}
}
}
// Convenience overload that returns the result by value: creates an empty
// ScalarFieldProto, fills it via output_data(ScalarFieldProto&), and
// returns it.
ScalarFieldProto
output_data() const {
ScalarFieldProto data_array;
output_data(data_array);
return data_array;
}

View File

@ -629,16 +629,15 @@ class ChunkedArrayColumn : public ChunkedColumnBase {
void
BulkArrayAt(milvus::OpContext* op_ctx,
std::function<void(ScalarFieldProto&&, size_t)> fn,
std::function<void(const ArrayView&, size_t)> fn,
const int64_t* offsets,
int64_t count) const override {
auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count);
auto ca = SemiInlineGet(slot_->PinCells(op_ctx, cids));
for (int64_t i = 0; i < count; i++) {
auto array = static_cast<ArrayChunk*>(ca->get_cell_of(cids[i]))
->View(offsets_in_chunk[i])
.output_data();
fn(std::move(array), i);
auto view = static_cast<ArrayChunk*>(ca->get_cell_of(cids[i]))
->View(offsets_in_chunk[i]);
fn(view, i);
}
}

View File

@ -626,7 +626,7 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
void
BulkArrayAt(milvus::OpContext* op_ctx,
std::function<void(ScalarFieldProto&&, size_t)> fn,
std::function<void(const ArrayView&, size_t)> fn,
const int64_t* offsets,
int64_t count) const override {
if (!IsChunkedArrayColumnDataType(data_type_)) {
@ -639,10 +639,9 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
for (int64_t i = 0; i < count; i++) {
auto* group_chunk = ca->get_cell_of(cids[i]);
auto chunk = group_chunk->GetChunk(field_id_);
auto array = static_cast<ArrayChunk*>(chunk.get())
->View(offsets_in_chunk[i])
.output_data();
fn(std::move(array), i);
auto view = static_cast<ArrayChunk*>(chunk.get())
->View(offsets_in_chunk[i]);
fn(view, i);
}
}

View File

@ -187,7 +187,7 @@ class ChunkedColumnInterface {
virtual void
BulkArrayAt(milvus::OpContext* op_ctx,
std::function<void(ScalarFieldProto&&, size_t)> fn,
std::function<void(const ArrayView&, size_t)> fn,
const int64_t* offsets,
int64_t count) const {
ThrowInfo(ErrorCode::Unsupported,

View File

@ -1678,8 +1678,8 @@ ChunkedSegmentSealedImpl::bulk_subscript_array_impl(
google::protobuf::RepeatedPtrField<T>* dst) {
column->BulkArrayAt(
op_ctx,
[dst](ScalarFieldProto&& array, size_t i) {
dst->at(i) = std::move(array);
[dst](const ArrayView& view, size_t i) {
view.output_data(dst->at(i));
},
seg_offsets,
count);