Spade A 911a8df17c
feat: impl StructArray -- data storage support in segcore (#42406)
Ref https://github.com/milvus-io/milvus/issues/42148
This PR mainly enables segcore to support array of vector (read and
write, but not indexing). Now only float vector as the element type is
supported.

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
2025-06-12 14:38:35 +08:00

305 lines
9.7 KiB
C++

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/replace.hpp>
#include <cstddef>
#include <cstdlib>
#include <optional>
#include <string>
#include <string_view>
#include "common/EasyAssert.h"
#include "simdjson.h"
#include "fmt/core.h"
#include "simdjson/common_defs.h"
#include "simdjson/dom/array.h"
#include "simdjson/dom/document.h"
#include "simdjson/dom/element.h"
#include "simdjson/error.h"
#include "simdjson/padded_string.h"
#include "rapidjson/document.h"
#include "rapidjson/error/en.h"
#include "rapidjson/writer.h"
#include "rapidjson/stringbuffer.h"
namespace milvus {
// function to extract specific keys and convert them to json
// rapidjson is suitable for extract and reconstruct serialization
// instead of simdjson which not suitable for serialization
inline std::string
ExtractSubJson(const std::string& json, const std::vector<std::string>& keys) {
rapidjson::Document doc;
doc.Parse(json.c_str());
if (doc.HasParseError()) {
PanicInfo(ErrorCode::UnexpectedError,
"json parse failed, error:{}",
rapidjson::GetParseError_En(doc.GetParseError()));
}
rapidjson::Document result_doc;
result_doc.SetObject();
rapidjson::Document::AllocatorType& allocator = result_doc.GetAllocator();
for (const auto& key : keys) {
if (doc.HasMember(key.c_str())) {
result_doc.AddMember(rapidjson::Value(key.c_str(), allocator),
doc[key.c_str()],
allocator);
}
}
rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
result_doc.Accept(writer);
return buffer.GetString();
}
// Shorthand for the simdjson on-demand document type.
using document = simdjson::ondemand::document;
// Shorthand for simdjson's result type wrapping a value of type T; callers
// check `.error()` on it before using the value.
template <typename T>
using value_result = simdjson::simdjson_result<T>;
class Json {
public:
Json() = default;
explicit Json(simdjson::padded_string data) : own_data_(std::move(data)) {
data_ = own_data_.value();
}
Json(const char* data, size_t len, size_t cap) : data_(data, len) {
AssertInfo(
len + simdjson::SIMDJSON_PADDING <= cap,
"create json without enough memory size for SIMD, len={}, cap={}",
len,
cap);
}
// WARN: this is used for fast non-copy construction,
// MUST make sure that the data points to a memory that
// with size at least len + SIMDJSON_PADDING
Json(const char* data, size_t len)
: data_(data, len, len + simdjson::SIMDJSON_PADDING) {
}
Json(const Json& json) {
if (json.own_data_.has_value()) {
own_data_ = simdjson::padded_string(
json.own_data_.value().data(), json.own_data_.value().length());
data_ = own_data_.value();
} else {
data_ = json.data_;
}
};
Json(Json&& json) noexcept {
if (json.own_data_.has_value()) {
own_data_ = std::move(json.own_data_);
data_ = own_data_.value();
} else {
data_ = json.data_;
}
}
Json&
operator=(const Json& json) {
if (json.own_data_.has_value()) {
own_data_ = simdjson::padded_string(
json.own_data_.value().data(), json.own_data_.value().length());
data_ = own_data_.value();
} else {
data_ = json.data_;
}
return *this;
}
operator std::string_view() const {
return data_;
}
value_result<document>
doc() const {
if (data_.size() == 0) {
return {};
}
thread_local simdjson::ondemand::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc =
parser.iterate(data_, data_.size() + simdjson::SIMDJSON_PADDING);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {}: {}",
data_,
simdjson::error_message(doc.error()));
return doc;
}
value_result<document>
doc(uint16_t offset, uint16_t length) const {
thread_local simdjson::ondemand::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.iterate(
data_.data() + offset, length, length + simdjson::SIMDJSON_PADDING);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {} offset {}, length {}: {}, "
"total_json:{}",
std::string(data_.data() + offset, length),
offset,
length,
simdjson::error_message(doc.error()),
data_);
return doc;
}
value_result<simdjson::dom::element>
dom_doc() const {
if (data_.size() == 0) {
return {};
}
thread_local simdjson::dom::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.parse(data_);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {}: {}",
data_,
simdjson::error_message(doc.error()));
return doc;
}
value_result<simdjson::dom::element>
dom_doc(uint16_t offset, uint16_t length) const {
thread_local simdjson::dom::parser parser;
// it's always safe to add the padding,
// as we have allocated the memory with this padding
auto doc = parser.parse(data_.data() + offset, length);
AssertInfo(doc.error() == simdjson::SUCCESS,
"failed to parse the json {}: {}",
std::string(data_.data() + offset, length),
simdjson::error_message(doc.error()));
return doc;
}
bool
exist(std::string_view pointer) const {
auto doc = this->doc();
if (pointer.empty()) {
return doc.error() == simdjson::SUCCESS && !doc.is_null();
} else {
auto res = doc.at_pointer(pointer);
return res.error() == simdjson::SUCCESS && !res.is_null();
}
}
// construct JSON pointer with provided path
static std::string
pointer(std::vector<std::string> nested_path) {
if (nested_path.empty()) {
return "";
}
std::for_each(
nested_path.begin(), nested_path.end(), [](std::string& key) {
boost::replace_all(key, "~", "~0");
boost::replace_all(key, "/", "~1");
});
auto pointer = "/" + boost::algorithm::join(nested_path, "/");
return pointer;
}
auto
type(const std::string& pointer) const {
return pointer.empty() ? doc().type()
: doc().at_pointer(pointer).type();
}
auto
get_number_type(const std::string& pointer) const {
return pointer.empty() ? doc().get_number_type()
: doc().at_pointer(pointer).get_number_type();
}
template <typename T>
value_result<T>
at(std::string_view pointer) const {
if (pointer == "") {
if constexpr (std::is_same_v<std::string_view, T> ||
std::is_same_v<std::string, T>) {
return doc().get_string(false);
} else if constexpr (std::is_same_v<bool, T>) {
return doc().get_bool();
} else if constexpr (std::is_same_v<int64_t, T>) {
return doc().get_int64();
} else if constexpr (std::is_same_v<double, T>) {
return doc().get_double();
}
}
return doc().at_pointer(pointer).get<T>();
}
template <typename T>
value_result<T>
at(uint16_t offset, uint16_t length) const {
return doc(offset, length).get<T>();
}
std::string_view
at_string(uint16_t offset, uint16_t length) const {
return std::string_view(data_.data() + offset, length);
}
value_result<simdjson::dom::array>
array_at(uint16_t offset, uint16_t length) const {
return dom_doc(offset, length).get_array();
}
// get dom array by JSON pointer,
// call `size()` to get array size,
// call `at()` to get array element by index,
// iterate through array elements by iterator.
value_result<simdjson::dom::array>
array_at(std::string_view pointer) const {
return dom_doc().at_pointer(pointer).get_array();
}
size_t
size() const {
return data_.size();
}
std::string_view
data() const {
return data_;
}
const char*
c_str() const {
return data_.data();
}
private:
std::optional<simdjson::padded_string>
own_data_{}; // this could be empty, then the Json will be just s view on bytes
simdjson::padded_string_view data_{};
};
} // namespace milvus