From 09218bfd3d2d9f0d4b7d9370767b00badbf6cb4f Mon Sep 17 00:00:00 2001 From: yah01 Date: Fri, 1 Sep 2023 10:15:01 +0800 Subject: [PATCH] Optimize loading by reduce 1x copy while reading data (#26746) Signed-off-by: yah01 --- internal/core/src/storage/FieldData.cpp | 2 + .../core/src/storage/FieldDataInterface.h | 57 +++++++++++++++++-- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/internal/core/src/storage/FieldData.cpp b/internal/core/src/storage/FieldData.cpp index 1ed8012187..2d7cf85a89 100644 --- a/internal/core/src/storage/FieldData.cpp +++ b/internal/core/src/storage/FieldData.cpp @@ -15,7 +15,9 @@ // limitations under the License. #include "storage/FieldData.h" +#include "arrow/array/array_binary.h" #include "common/Json.h" +#include "simdjson/padded_string.h" namespace milvus::storage { diff --git a/internal/core/src/storage/FieldDataInterface.h b/internal/core/src/storage/FieldDataInterface.h index 2dd236af59..3165972ab8 100644 --- a/internal/core/src/storage/FieldDataInterface.h +++ b/internal/core/src/storage/FieldDataInterface.h @@ -19,12 +19,14 @@ #include #include #include +#include #include #include #include #include #include "arrow/api.h" +#include "arrow/array/array_binary.h" #include "common/FieldMeta.h" #include "common/Utils.h" #include "common/VectorTrait.h" @@ -113,6 +115,12 @@ class FieldDataImpl : public FieldDataBase { void FillFieldData(const std::shared_ptr array) override; + virtual void + FillFieldData(const std::shared_ptr& array){}; + + virtual void + FillFieldData(const std::shared_ptr& array){}; + std::string GetName() const { return "FieldDataImpl"; @@ -212,7 +220,7 @@ class FieldDataStringImpl : public FieldDataImpl { } int64_t - Size() const { + Size() const override { int64_t data_size = 0; for (size_t offset = 0; offset < length(); ++offset) { data_size += field_data_[offset].size(); @@ -222,13 +230,33 @@ class FieldDataStringImpl : public FieldDataImpl { } int64_t - Size(ssize_t offset) const { + Size(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); return field_data_[offset].size(); } + + void + FillFieldData(const std::shared_ptr& array) override { + auto n = array->length(); + if (n == 0) { + return; + } + + std::lock_guard lck(tell_mutex_); + if (length_ + n > get_num_rows()) { + resize_field_data(length_ + n); + } + + auto i = 0; + for (const auto& str : *array) { + field_data_[length_ + i] = str.value(); + i++; + } + length_ += n; + } }; class FieldDataJsonImpl : public FieldDataImpl { @@ -238,7 +266,7 @@ class FieldDataJsonImpl : public FieldDataImpl { } int64_t - Size() const { + Size() const override { int64_t data_size = 0; for (size_t offset = 0; offset < length(); ++offset) { data_size += field_data_[offset].data().size(); @@ -248,13 +276,34 @@ class FieldDataJsonImpl : public FieldDataImpl { } int64_t - Size(ssize_t offset) const { + Size(ssize_t offset) const override { AssertInfo(offset < get_num_rows(), "field data subscript out of range"); AssertInfo(offset < length(), "subscript position don't has valid value"); return field_data_[offset].data().size(); } + + void + FillFieldData(const std::shared_ptr& array) override { + auto n = array->length(); + if (n == 0) { + return; + } + + std::lock_guard lck(tell_mutex_); + if (length_ + n > get_num_rows()) { + resize_field_data(length_ + n); + } + + auto i = 0; + for (const auto& json : *array) { + field_data_[length_ + i] = + Json(simdjson::padded_string(json.value())); + i++; + } + length_ += n; + } }; } // namespace milvus::storage