milvus/internal/core/src/common/FieldDataInterface.h
congqixia 8d1ea751a6
fix: Support JSON default values in FillFieldData (#45455)
Related to #45445

Previously, FillFieldData for JSON fields would assert and fail when a
default_value was provided, blocking index creation for JSON fields with
default values (including dynamic fields like $meta).

This change enables JSON default value support by:
- Removing the assertion that blocked default values
- Parsing bytes_data into Json objects when default_value is present
- Properly filling data_ array and setting valid_data_ bitset to true
- Maintaining null behavior when no default_value is provided

Impact:
- Fixes index creation failure for JSON fields with default values
- Resolves upgrade issues from 2.5 to 2.6.5 where dynamic fields with
default values couldn't be indexed
- Index builds that were stuck in InProgress state can now complete

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
2025-11-11 10:35:36 +08:00

946 lines
27 KiB
C++

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstddef>
#include <iostream>
#include <memory>
#include <type_traits>
#include <vector>
#include <string>
#include <mutex>
#include <shared_mutex>
#include "log/Log.h"
#include "Types.h"
#include "arrow/api.h"
#include "arrow/array/array_binary.h"
#include "arrow/chunked_array.h"
#include "common/FieldMeta.h"
#include "common/Utils.h"
#include "common/VectorTrait.h"
#include "common/EasyAssert.h"
#include "common/Array.h"
#include "common/VectorArray.h"
#include "knowhere/dataset.h"
#include "common/TypeTraits.h"
namespace milvus {
// Abstract interface for the column data of a single field.
// The base only stores the immutable data type and nullable flag; storage,
// fill logic and size accounting are all supplied by subclasses.
class FieldDataBase {
public:
// Records the field's data type and whether it was declared nullable.
explicit FieldDataBase(DataType data_type, bool nullable)
: data_type_(data_type), nullable_(nullable) {
}
virtual ~FieldDataBase() = default;
// For all FieldDataImpl subclasses, source is a pointer to element_count of
// Type
virtual void
FillFieldData(const void* source, ssize_t element_count) = 0;
// Fill overload that also receives a validity buffer and an offset.
// NOTE(review): whether `offset` applies to the source or the destination
// is not visible in this header - confirm against the implementation.
virtual void
FillFieldData(const void* field_data,
const uint8_t* valid_data,
ssize_t element_count,
ssize_t offset) = 0;
// Fill from every chunk of an arrow chunked array.
virtual void
FillFieldData(const std::shared_ptr<arrow::ChunkedArray> arrays) = 0;
// Fill from a single arrow array.
virtual void
FillFieldData(const std::shared_ptr<arrow::Array> array) = 0;
// Fill element_count rows from an optional default value (no source data).
virtual void
FillFieldData(const std::optional<DefaultValueType> default_value,
ssize_t element_count) = 0;
// For all FieldDataImpl subclasses, this method returns Type* that points
// at all rows in this field data.
virtual void*
Data() = 0;
// Pointer to the validity buffer that accompanies Data().
virtual uint8_t*
ValidData() = 0;
// For all FieldDataImpl subclasses, this method returns a Type* that points
// at the offset-th row of this field data.
virtual const void*
RawValue(ssize_t offset) const = 0;
// Returns the serialized bytes size of all rows.
virtual int64_t
Size() const = 0;
// Byte size of the value data only.
virtual int64_t
DataSize() const = 0;
// Byte size of the validity buffer only.
virtual int64_t
ValidDataSize() const = 0;
// Returns the serialized bytes size of the index-th row.
virtual int64_t
DataSize(ssize_t index) const = 0;
// Number of filled rows
virtual size_t
Length() const = 0;
// True when the number of filled rows has reached the row capacity.
virtual bool
IsFull() const = 0;
virtual bool
IsNullable() const = 0;
// Request room for `cap` rows.
virtual void
Reserve(size_t cap) = 0;
public:
// row capacity
virtual int64_t
get_num_rows() const = 0;
// each row is represented as how many Type elements
virtual int64_t
get_dim() const = 0;
// Data type given at construction; never changes afterwards.
DataType
get_data_type() const {
return data_type_;
}
// Number of null rows tracked by the implementation.
virtual int64_t
get_null_count() const = 0;
// Whether the row at `offset` holds a non-null value.
virtual bool
is_valid(ssize_t offset) const = 0;
protected:
const DataType data_type_;
const bool nullable_;
};
// Field data backed by a bitmap that is produced at runtime.
//
// The object is built from a TargetBitmap whose storage is moved into data_.
// It is always non-nullable (the base is constructed with nullable=false), and
// every storage-oriented entry point (FillFieldData, ValidData, RawValue,
// get_null_count, is_valid) throws NotImplemented.
template <typename Type>
class FieldBitsetImpl : public FieldDataBase {
 public:
    FieldBitsetImpl() = delete;
    FieldBitsetImpl(FieldBitsetImpl&&) = delete;
    FieldBitsetImpl(const FieldBitsetImpl&) = delete;

    FieldBitsetImpl&
    operator=(FieldBitsetImpl&&) = delete;

    FieldBitsetImpl&
    operator=(const FieldBitsetImpl&) = delete;

    // Takes over the bitmap's storage. length_ is the bitmap's bit count and
    // cap_ is the number of bits the adopted words can hold.
    explicit FieldBitsetImpl(DataType data_type, TargetBitmap&& bitmap)
        : FieldDataBase(data_type, false), length_(bitmap.size()) {
        data_ = std::move(bitmap).into();
        cap_ = data_.size() * sizeof(Type) * 8;
        Assert(cap_ >= length_);
    }

    // FillFieldData used for read and write with storage,
    // no need to implement for bitset which used in runtime process.
    void
    FillFieldData(const void* source, ssize_t element_count) override {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const void* source, ssize_t element_count) "
                  "not implemented for bitset");
    }

    void
    FillFieldData(const void* field_data,
                  const uint8_t* valid_data,
                  ssize_t element_count,
                  ssize_t offset) override {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const void* field_data, "
                  "const uint8_t* valid_data, ssize_t element_count, "
                  "ssize_t offset) not implemented for bitset");
    }

    void
    FillFieldData(const std::shared_ptr<arrow::Array> array) override {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const std::shared_ptr<arrow::Array>& array) "
                  "not implemented for bitset");
    }

    void
    FillFieldData(const std::shared_ptr<arrow::ChunkedArray> arrays) override {
        ThrowInfo(
            NotImplemented,
            "FillFieldData(const std::shared_ptr<arrow::ChunkedArray>& arrays) "
            "not implemented for bitset");
    }

    void
    FillFieldData(const std::optional<DefaultValueType> default_value,
                  ssize_t element_count) override {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const std::optional<DefaultValueType> "
                  "default_value, "
                  "ssize_t element_count) not implemented for bitset");
    }

    virtual void
    FillFieldData(const std::shared_ptr<arrow::StringArray>& array) {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const std::shared_ptr<arrow::StringArray>& "
                  "array) not implemented for bitset");
    }

    virtual void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const std::shared_ptr<arrow::BinaryArray>& "
                  "array) not implemented for bitset");
    }

    std::string
    GetName() const {
        return "FieldBitsetImpl";
    }

    // Pointer to the first storage word of the bitmap.
    void*
    Data() override {
        return data_.data();
    }

    uint8_t*
    ValidData() override {
        ThrowInfo(NotImplemented, "ValidData() not implemented for bitset");
    }

    const void*
    RawValue(ssize_t offset) const override {
        ThrowInfo(NotImplemented,
                  "RawValue(ssize_t offset) not implemented for bitset");
    }

    int64_t
    Size() const override {
        return DataSize() + ValidDataSize();
    }

    // NOTE(review): get_num_rows() is a bit count here, so this is
    // sizeof(Type) per bit rather than per storage word - confirm intended.
    int64_t
    DataSize() const override {
        return sizeof(Type) * get_num_rows();
    }

    int64_t
    DataSize(ssize_t offset) const override {
        return sizeof(Type);
    }

    // There is no validity buffer for a bitset.
    int64_t
    ValidDataSize() const override {
        return 0;
    }

    size_t
    Length() const override {
        return get_length();
    }

    bool
    IsFull() const override {
        auto cap_num_rows = get_num_rows();
        auto filled_num_rows = get_length();
        return cap_num_rows == filled_num_rows;
    }

    bool
    IsNullable() const override {
        return false;
    }

    // Grows the storage so it can hold `cap` bits. `cap` must be a multiple
    // of the bit width of Type; smaller or equal capacities are a no-op.
    void
    Reserve(size_t cap) override {
        std::lock_guard lck(cap_mutex_);
        AssertInfo(cap % (8 * sizeof(Type)) == 0,
                   "Reserve bitset size must be a multiple of {}",
                   8 * sizeof(Type));
        if (cap > cap_) {
            data_.resize(cap / (8 * sizeof(Type)));
            cap_ = cap;
        }
    }

 public:
    // Capacity in bits.
    int64_t
    get_num_rows() const override {
        std::shared_lock lck(cap_mutex_);
        return cap_;
    }

    // Number of bits actually in use.
    size_t
    get_length() const {
        std::shared_lock lck(length_mutex_);
        return length_;
    }

    int64_t
    get_dim() const override {
        return 1;
    }

    int64_t
    get_null_count() const override {
        ThrowInfo(NotImplemented,
                  "get_null_count() not implemented for bitset");
    }

    bool
    is_valid(ssize_t offset) const override {
        ThrowInfo(NotImplemented,
                  "is_valid(ssize_t offset) not implemented for bitset");
    }

 private:
    FixedVector<Type> data_{};
    // capacity that data_ can store
    int64_t cap_;
    mutable std::shared_mutex cap_mutex_;
    // number of actual elements in data_
    size_t length_{};
    mutable std::shared_mutex length_mutex_;
};
// Generic in-memory column buffer.
//
// data_ holds num_rows_ * dim_ elements of Type. When is_type_entire_row is
// true each element is a whole row and dim_ is forced to 1. For nullable
// fields valid_data_ is a bitmap with one bit per row (bit i lives in byte
// i >> 3 at position i & 7) whose new bytes are initialised to 0xFF.
//
// num_rows_ (capacity) is guarded by num_rows_mutex_; length_ (filled rows)
// and null_count_ are guarded by tell_mutex_.
template <typename Type, bool is_type_entire_row = false>
class FieldDataImpl : public FieldDataBase {
 public:
    FieldDataImpl(FieldDataImpl&&) = delete;
    FieldDataImpl(const FieldDataImpl&) = delete;

    FieldDataImpl&
    operator=(FieldDataImpl&&) = delete;

    FieldDataImpl&
    operator=(const FieldDataImpl&) = delete;

 public:
    // Creates an empty buffer with capacity for buffered_num_rows rows.
    // Throws NotImplemented for a nullable vector data type.
    explicit FieldDataImpl(ssize_t dim,
                           DataType data_type,
                           bool nullable,
                           int64_t buffered_num_rows = 0)
        : FieldDataBase(data_type, nullable),
          num_rows_(buffered_num_rows),
          dim_(is_type_entire_row ? 1 : dim) {
        data_.resize(num_rows_ * dim_);
        if (nullable) {
            if (IsVectorDataType(data_type)) {
                ThrowInfo(NotImplemented, "vector type not support null");
            }
            valid_data_.resize((num_rows_ + 7) / 8, 0xFF);
        }
    }

    // Adopts an existing value buffer for a non-nullable field.
    explicit FieldDataImpl(size_t dim,
                           DataType type,
                           bool nullable,
                           FixedVector<Type>&& data)
        : FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) {
        AssertInfo(!nullable, "need to fill valid_data when nullable is true");
        data_ = std::move(data);
        // Use the effective per-row width (dim_), not the raw argument: for
        // entire-row types every element is one row regardless of `dim`.
        Assert(data_.size() % dim_ == 0);
        num_rows_ = data_.size() / dim_;
    }

    // Adopts existing value and validity buffers for a nullable field.
    explicit FieldDataImpl(size_t dim,
                           DataType type,
                           bool nullable,
                           FixedVector<Type>&& data,
                           FixedVector<uint8_t>&& valid_data)
        : FieldDataBase(type, nullable), dim_(is_type_entire_row ? 1 : dim) {
        AssertInfo(nullable,
                   "no need to fill valid_data when nullable is false");
        data_ = std::move(data);
        valid_data_ = std::move(valid_data);
        // See the note in the non-nullable adopting constructor.
        Assert(data_.size() % dim_ == 0);
        num_rows_ = data_.size() / dim_;
    }

    void
    FillFieldData(const void* source, ssize_t element_count) override;

    void
    FillFieldData(const void* field_data,
                  const uint8_t* valid_data,
                  ssize_t element_count,
                  ssize_t offset) override;

    void
    FillFieldData(const std::shared_ptr<arrow::ChunkedArray> arrays) override;

    void
    FillFieldData(const std::shared_ptr<arrow::Array> array) override;

    void
    FillFieldData(const std::optional<DefaultValueType> default_value,
                  ssize_t element_count) override;

    // Typed arrow overloads; subclasses that can consume these arrays
    // override them, the default throws NotImplemented.
    virtual void
    FillFieldData(const std::shared_ptr<arrow::StringArray>& array) {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const std::shared_ptr<arrow::StringArray>& "
                  "array) not implemented by default");
    }

    virtual void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) {
        ThrowInfo(NotImplemented,
                  "FillFieldData(const std::shared_ptr<arrow::BinaryArray>& "
                  "array) not implemented by default");
    }

    std::string
    GetName() const {
        return "FieldDataImpl";
    }

    void*
    Data() override {
        return data_.data();
    }

    uint8_t*
    ValidData() override {
        return valid_data_.data();
    }

    // Address of element `offset` in data_; asserts that offset is below both
    // the capacity and the filled length.
    // NOTE(review): this indexes by element, not by offset * dim_ - confirm
    // that is what callers of multi-dimensional fields expect.
    const void*
    RawValue(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        return &data_[offset];
    }

    int64_t
    Size() const override {
        return DataSize() + ValidDataSize();
    }

    // Bytes used by the filled rows (fixed-width element accounting).
    int64_t
    DataSize() const override {
        return sizeof(Type) * length() * dim_;
    }

    // Bytes used by a single row; offset is only range-checked.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        return sizeof(Type) * dim_;
    }

    // Bytes of validity bitmap covering the filled rows; 0 if not nullable.
    int64_t
    ValidDataSize() const override {
        if (nullable_) {
            return sizeof(uint8_t) * (length() + 7) / 8;
        }
        return 0;
    }

    // Filled row count. Unlike length(), this reads length_ without taking
    // tell_mutex_.
    size_t
    Length() const override {
        return length_;
    }

    bool
    IsFull() const override {
        auto buffered_num_rows = get_num_rows();
        auto filled_num_rows = length();
        return buffered_num_rows == filled_num_rows;
    }

    bool
    IsNullable() const override {
        return nullable_;
    }

    // Grows capacity to `cap` rows when larger than the current capacity and
    // sizes the validity bitmap to match for nullable fields.
    void
    Reserve(size_t cap) override {
        std::lock_guard lck(num_rows_mutex_);
        if (cap > num_rows_) {
            num_rows_ = cap;
            data_.resize(num_rows_ * dim_);
        }
        if (nullable_) {
            valid_data_.resize((num_rows_ + 7) / 8, 0xFF);
        }
    }

 public:
    int64_t
    get_num_rows() const override {
        std::shared_lock lck(num_rows_mutex_);
        return num_rows_;
    }

    // Grows capacity to num_rows (never shrinks); new validity bytes are
    // initialised to 0xFF.
    void
    resize_field_data(int64_t num_rows) {
        std::lock_guard lck(num_rows_mutex_);
        if (num_rows > num_rows_) {
            num_rows_ = num_rows;
            data_.resize(num_rows_ * dim_);
            if (nullable_) {
                valid_data_.resize((num_rows + 7) / 8, 0xFF);
            }
        }
    }

    // Filled row count, read under a shared lock on tell_mutex_.
    size_t
    length() const {
        std::shared_lock lck(tell_mutex_);
        return length_;
    }

    int64_t
    get_dim() const override {
        return dim_;
    }

    int64_t
    get_null_count() const override {
        std::shared_lock lck(tell_mutex_);
        return null_count_;
    }

    // Returns the validity bit of row `offset`; always true for non-nullable
    // fields. Asserts that offset is below capacity and filled length.
    bool
    is_valid(ssize_t offset) const override {
        std::shared_lock lck(tell_mutex_);
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        // Read length_ directly: tell_mutex_ is already held in shared mode
        // here, and re-locking a std::shared_mutex from the owning thread
        // (as length() would) is undefined behaviour.
        AssertInfo(offset < length_,
                   "subscript position don't has valid value");
        if (!nullable_) {
            return true;
        }
        auto bit = (valid_data_[offset >> 3] >> ((offset & 0x07))) & 1;
        return bit;
    }

 protected:
    FixedVector<Type> data_{};
    FixedVector<uint8_t> valid_data_{};
    // number of elements data_ can hold
    int64_t num_rows_;
    mutable std::shared_mutex num_rows_mutex_;
    int64_t null_count_{0};
    // number of actual elements in data_
    size_t length_{};
    mutable std::shared_mutex tell_mutex_;

 private:
    const ssize_t dim_;
};
// Field data for string columns: every row is one std::string, so sizes are
// computed from the actual string lengths rather than sizeof(Type).
class FieldDataStringImpl : public FieldDataImpl<std::string, true> {
 public:
    explicit FieldDataStringImpl(DataType data_type,
                                 bool nullable,
                                 int64_t total_num_rows = 0)
        : FieldDataImpl<std::string, true>(
              1, data_type, nullable, total_num_rows) {
    }

    // Sum of the byte lengths of all filled strings.
    int64_t
    DataSize() const override {
        int64_t data_size = 0;
        for (size_t offset = 0; offset < length(); ++offset) {
            data_size += data_[offset].size();
        }
        return data_size;
    }

    // Byte length of the string stored at `offset`.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        return data_[offset].size();
    }

    // Appends every element of an arrow string array, growing capacity when
    // needed. Null elements leave their row untouched and are only reflected
    // in the validity bitmap / null count.
    void
    FillFieldData(const std::shared_ptr<arrow::StringArray>& array) override {
        auto n = array->length();
        if (n == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        // Updated under tell_mutex_ (the lock get_null_count() reads under)
        // and accumulated because each call appends rows.
        null_count_ += array->null_count();
        if (length_ + n > get_num_rows()) {
            resize_field_data(length_ + n);
        }
        auto i = 0;
        for (const auto& str : *array) {
            // Iteration yields an optional; calling value() on a null entry
            // would throw std::bad_optional_access, so skip nulls the same
            // way the geometry and json implementations do.
            if (str.has_value()) {
                data_[length_ + i] = str.value();
            }
            i++;
        }
        if (IsNullable()) {
            auto valid_data = array->null_bitmap_data();
            // A missing bitmap leaves the destination bits as they are.
            if (valid_data != nullptr) {
                bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                    valid_data,
                    array->offset(),
                    valid_data_.data(),
                    length_,
                    n);
            }
        }
        length_ += n;
    }
};
// Field data for geometry columns: every row is one binary blob kept in a
// std::string, read from arrow BINARY arrays.
class FieldDataGeometryImpl : public FieldDataImpl<std::string, true> {
 public:
    explicit FieldDataGeometryImpl(DataType data_type,
                                   bool nullable,
                                   int64_t total_num_rows = 0)
        : FieldDataImpl<std::string, true>(
              1, data_type, nullable, total_num_rows) {
    }

    // Sum of the byte lengths of all filled blobs.
    int64_t
    DataSize() const override {
        int64_t data_size = 0;
        for (size_t offset = 0; offset < length(); ++offset) {
            data_size += data_[offset].size();
        }
        return data_size;
    }

    // Byte length of the blob stored at `offset`.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        return data_[offset].size();
    }

    // Accepts a generic arrow array, asserts that it is BINARY and forwards
    // to the BinaryArray overload.
    void
    FillFieldData(const std::shared_ptr<arrow::Array> array) override {
        AssertInfo(array->type()->id() == arrow::Type::type::BINARY,
                   "inconsistent data type, expected: {}, got: {}",
                   "BINARY",
                   array->type()->ToString());
        auto geometry_array =
            std::dynamic_pointer_cast<arrow::BinaryArray>(array);
        FillFieldData(geometry_array);
    }

    // Appends every element of an arrow binary array, growing capacity when
    // needed. Null elements leave their row untouched.
    void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) override {
        auto n = array->length();
        if (n == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        // Updated under tell_mutex_ (the lock get_null_count() reads under)
        // and accumulated because each call appends rows.
        null_count_ += array->null_count();
        if (length_ + n > get_num_rows()) {
            resize_field_data(length_ + n);
        }
        auto i = 0;
        for (const auto& geometry : *array) {
            if (geometry.has_value()) {
                data_[length_ + i] = geometry.value();
            }
            i++;
        }
        if (IsNullable()) {
            auto valid_data = array->null_bitmap_data();
            // A missing bitmap leaves the destination bits as they are.
            if (valid_data != nullptr) {
                bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                    valid_data,
                    array->offset(),
                    valid_data_.data(),
                    length_,
                    n);
            }
        }
        length_ += n;
    }
};
// Field data for JSON columns: every row is one Json object, read from arrow
// BINARY arrays or synthesised from a default value.
class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
 public:
    explicit FieldDataJsonImpl(DataType data_type,
                               bool nullable,
                               int64_t total_num_rows = 0)
        : FieldDataImpl<Json, true>(1, data_type, nullable, total_num_rows) {
    }

    // Sum of the byte lengths of all filled JSON documents.
    int64_t
    DataSize() const override {
        int64_t data_size = 0;
        for (size_t offset = 0; offset < length(); ++offset) {
            data_size += data_[offset].data().size();
        }
        return data_size;
    }

    // Byte length of the JSON document stored at `offset`.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        return data_[offset].data().size();
    }

    // Accepts a generic arrow array, asserts that it is BINARY and forwards
    // to the BinaryArray overload.
    void
    FillFieldData(const std::shared_ptr<arrow::Array> array) override {
        AssertInfo(array->type()->id() == arrow::Type::type::BINARY,
                   "inconsistent data type, expected: {}, got: {}",
                   "BINARY",
                   array->type()->ToString());
        auto json_array = std::dynamic_pointer_cast<arrow::BinaryArray>(array);
        FillFieldData(json_array);
    }

    // used for generate added field which has no related binlogs
    //
    // Appends element_count rows. With a default value (which must carry
    // bytes data) every row becomes that JSON document and is marked valid;
    // without one every row is marked null, which requires a nullable field.
    void
    FillFieldData(const std::optional<DefaultValueType> default_value,
                  ssize_t element_count) override {
        if (element_count == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        if (length_ + element_count > get_num_rows()) {
            resize_field_data(length_ + element_count);
        }
        if (default_value.has_value()) {
            AssertInfo(default_value->has_bytes_data(),
                       "json type default_value shall be bytes data");
            const auto& bytes = default_value->bytes_data();
            // Build the Json through a padded_string, exactly like the
            // BinaryArray overload, so the stored rows do not depend on the
            // lifetime of `default_value` (a by-value parameter).
            // NOTE(review): Json(const char*, size_t) presumably only wraps
            // the pointer - confirm against the Json class.
            const Json default_json(
                simdjson::padded_string(bytes.data(), bytes.size()));
            std::fill(data_.data() + length_,
                      data_.data() + length_ + element_count,
                      default_json);
            // valid_data_ is only allocated for nullable fields.
            if (IsNullable()) {
                bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_fill(
                    valid_data_.data(), length_, element_count, true);
            }
        } else {
            // Nulls can only be recorded in the validity bitmap, which does
            // not exist for non-nullable fields.
            AssertInfo(IsNullable(),
                       "json field must be nullable to be filled with nulls");
            null_count_ += element_count;
            bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_fill(
                valid_data_.data(), length_, element_count, false);
        }
        length_ += element_count;
    }

    // Appends every element of an arrow binary array, growing capacity when
    // needed. Null elements leave their row untouched.
    void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) override {
        auto n = array->length();
        if (n == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        // Updated under tell_mutex_ (the lock get_null_count() reads under)
        // and accumulated because each call appends rows.
        null_count_ += array->null_count();
        if (length_ + n > get_num_rows()) {
            resize_field_data(length_ + n);
        }
        auto i = 0;
        for (const auto& json : *array) {
            if (json.has_value()) {
                data_[length_ + i] =
                    Json(simdjson::padded_string(json.value()));
            }
            i++;
        }
        if (IsNullable()) {
            auto valid_data = array->null_bitmap_data();
            // A missing bitmap leaves the destination bits as they are.
            if (valid_data != nullptr) {
                bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                    valid_data,
                    array->offset(),
                    valid_data_.data(),
                    length_,
                    n);
            }
        }
        length_ += n;
    }

    // only for test
    // Appends copies of the given Json objects; the validity bitmap is not
    // touched.
    void
    add_json_data(const std::vector<Json>& json) {
        std::lock_guard lck(tell_mutex_);
        if (length_ + json.size() > get_num_rows()) {
            resize_field_data(length_ + json.size());
        }
        for (size_t i = 0; i < json.size(); ++i) {
            data_[length_ + i] = json[i];
        }
        length_ += json.size();
    }
};
// Field data for sparse float vectors: each row is one SparseRow and the
// field is always non-nullable. vec_dim_ tracks the largest row dimension
// seen across all fills.
class FieldDataSparseVectorImpl
    : public FieldDataImpl<knowhere::sparse::SparseRow<SparseValueType>, true> {
 public:
    // Only VECTOR_SPARSE_U32_F32 is accepted as data type.
    explicit FieldDataSparseVectorImpl(DataType data_type,
                                       int64_t total_num_rows = 0)
        : FieldDataImpl<knowhere::sparse::SparseRow<SparseValueType>, true>(
              /*dim=*/1, data_type, false, total_num_rows) {
        AssertInfo(data_type == DataType::VECTOR_SPARSE_U32_F32,
                   "invalid data type for sparse vector");
    }

    // Total payload bytes of all filled rows.
    int64_t
    DataSize() const override {
        int64_t total_bytes = 0;
        size_t row_idx = 0;
        while (row_idx < length()) {
            total_bytes += data_[row_idx].data_byte_size();
            ++row_idx;
        }
        return total_bytes;
    }

    // Payload bytes of the row at `offset`.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        const auto& target_row = data_[offset];
        return target_row.data_byte_size();
    }

    // source is a pointer to element_count of
    // knowhere::sparse::SparseRow<SparseValueType>
    void
    FillFieldData(const void* source, ssize_t element_count) override {
        if (element_count == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        const auto required_rows = length_ + element_count;
        if (required_rows > get_num_rows()) {
            resize_field_data(required_rows);
        }
        const auto* rows =
            static_cast<const knowhere::sparse::SparseRow<SparseValueType>*>(
                source);
        // Track the widest row first, then copy the whole batch in one go.
        for (int64_t idx = 0; idx < element_count; ++idx) {
            vec_dim_ = std::max(vec_dim_, rows[idx].dim());
        }
        std::copy_n(rows, element_count, data_.data() + length_);
        length_ += element_count;
    }

    // each binary in array is a knowhere::sparse::SparseRow<SparseValueType>
    void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) override {
        auto n = array->length();
        if (n == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        if (length_ + n > get_num_rows()) {
            resize_field_data(length_ + n);
        }
        for (int64_t idx = 0; idx < array->length(); ++idx) {
            const auto bytes = array->GetView(idx);
            auto& slot = data_[length_ + idx];
            slot = CopyAndWrapSparseRow(bytes.data(), bytes.size());
            vec_dim_ = std::max(vec_dim_, slot.dim());
        }
        length_ += n;
    }

    // Largest row dimension observed so far.
    int64_t
    Dim() const {
        return vec_dim_;
    }

 private:
    int64_t vec_dim_ = 0;
};
// Field data for array columns: each row is one Array, so byte sizes come
// from the rows themselves rather than sizeof(Type).
class FieldDataArrayImpl : public FieldDataImpl<Array, true> {
 public:
    explicit FieldDataArrayImpl(DataType data_type,
                                bool nullable,
                                int64_t total_num_rows = 0)
        : FieldDataImpl<Array, true>(1, data_type, nullable, total_num_rows) {
    }

    // Total bytes of all filled rows.
    int64_t
    DataSize() const override {
        int64_t total_bytes = 0;
        size_t row_idx = 0;
        while (row_idx < length()) {
            total_bytes += data_[row_idx].byte_size();
            ++row_idx;
        }
        return total_bytes;
    }

    // Bytes of the row at `offset`.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        const auto& target_row = data_[offset];
        return target_row.byte_size();
    }
};
// is_type_entire_row is set to true because each element in data_ is a
// whole VectorArray. The field is always non-nullable.
class FieldDataVectorArrayImpl : public FieldDataImpl<VectorArray, true> {
 public:
    explicit FieldDataVectorArrayImpl(DataType data_type,
                                      int64_t total_num_rows = 0)
        : FieldDataImpl<VectorArray, true>(
              1, data_type, false, total_num_rows) {
    }

    // Total bytes of all filled rows.
    int64_t
    DataSize() const override {
        int64_t total_bytes = 0;
        size_t row_idx = 0;
        while (row_idx < length()) {
            total_bytes += data_[row_idx].byte_size();
            ++row_idx;
        }
        return total_bytes;
    }

    // Bytes of the row at `offset`.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        const auto& target_row = data_[offset];
        return target_row.byte_size();
    }
};
} // namespace milvus