milvus/internal/core/src/common/FieldData.h
marcelo-cjl 3b599441fd
feat: Add nullable vector support for proxy and querynode (#46305)
related: #45993 

This commit extends nullable vector support to the proxy layer and
querynode, and adds comprehensive validation, search reduce, and field
data handling for nullable vectors with sparse storage.
    
Proxy layer changes:
- Update validate_util.go checkAligned() with a getExpectedVectorRows()
  helper to validate nullable vector field alignment using the valid
  data count
- Update checkFloatVectorFieldData/checkSparseFloatVectorFieldData for
  nullable vector validation with proper row count expectations
- Add FieldDataIdxComputer in typeutil/schema.go for logical-to-physical
  index translation during search reduce operations
- Update search_reduce_util.go reduceSearchResultData to use
  idxComputers for correct field data indexing with nullable vectors
- Update task.go, task_query.go, task_upsert.go for nullable vector
  handling
- Update msg_pack.go with nullable vector field data processing
    
QueryNode layer changes:
- Update segments/result.go for nullable vector result handling
- Update segments/search_reduce.go with nullable vector offset
  translation
    
Storage and index changes:
- Update data_codec.go and utils.go for nullable vector serialization
- Update indexcgowrapper/dataset.go and index.go for nullable vector
  indexing
    
Utility changes:
- Add FieldDataIdxComputer struct with Compute() method for efficient
  logical-to-physical index mapping across multiple field data
- Update EstimateEntitySize() and AppendFieldData() with a fieldIdxs
  parameter
- Update funcutil.go with nullable vector support functions

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Full support for nullable vector fields (float, binary, float16,
bfloat16, int8, sparse) across ingest, storage, indexing, search and
retrieval; logical↔physical offset mapping preserves row semantics.
  * Client: compaction control and compaction-state APIs.

* **Bug Fixes**
* Improved validation for adding vector fields (nullable + dimension
checks) and corrected search/query behavior for nullable vectors.

* **Chores**
  * Persisted validity maps with indexes and on-disk formats.

* **Tests**
  * Extensive new and updated end-to-end nullable-vector tests.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: marcelo-cjl <marcelo.chen@zilliz.com>
2025-12-24 10:13:19 +08:00

450 lines
14 KiB
C++

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <memory>
#include <utility>
#include <unordered_map>
#include <vector>
#include <oneapi/tbb/concurrent_queue.h>
#include "common/FieldDataInterface.h"
#include "common/Channel.h"
#include "common/ArrowDataWrapper.h"
namespace milvus {
/// Generic field data holder, usable for scalar element types and PkType.
///
/// Both constructors forward to FieldDataImpl<Type, true> with a leading
/// argument of 1, i.e. every row holds exactly one `Type` value.
template <typename Type>
class FieldData : public FieldDataImpl<Type, true> {
    // Reject element types that are neither scalar nor the primary-key type.
    static_assert(IsScalar<Type> || std::is_same_v<Type, PkType>);

 public:
    /// Constructs field data without initial contents; `buffered_num_rows`
    /// is handed to the base class unchanged.
    explicit FieldData(DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataImpl<Type, true>(
              /*dim=*/1, data_type, nullable, buffered_num_rows) {
    }

    /// Constructs field data that takes over `inner_data` (moved into the
    /// base class).
    explicit FieldData(DataType data_type,
                       bool nullable,
                       FixedVector<Type>&& inner_data)
        : FieldDataImpl<Type, true>(
              1, data_type, nullable, std::move(inner_data)) {
    }
};
/// FieldData specialization for string columns; all storage behaviour comes
/// from FieldDataStringImpl.
template <>
class FieldData<std::string> : public FieldDataStringImpl {
    // Same element-type guard as the primary template, spelled out for
    // std::string.
    static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);

 public:
    /// Forwards all arguments to FieldDataStringImpl unchanged.
    explicit FieldData(DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataStringImpl(data_type, nullable, buffered_num_rows) {
    }
};
template <>
class FieldData<Json> : public FieldDataJsonImpl {
public:
static_assert(IsScalar<std::string> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataJsonImpl(data_type, nullable, buffered_num_rows) {
}
};
/// FieldData specialization for geometry columns; all storage behaviour
/// comes from FieldDataGeometryImpl.
template <>
class FieldData<Geometry> : public FieldDataGeometryImpl {
    // Geometry must be classified as a scalar element type.
    static_assert(IsScalar<Geometry>);

 public:
    /// Forwards all arguments to FieldDataGeometryImpl unchanged.
    explicit FieldData(DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataGeometryImpl(data_type, nullable, buffered_num_rows) {
    }
};
template <>
class FieldData<Array> : public FieldDataArrayImpl {
public:
static_assert(IsScalar<Array> || std::is_same_v<std::string, PkType>);
explicit FieldData(DataType data_type,
bool nullable,
int64_t buffered_num_rows = 0)
: FieldDataArrayImpl(data_type, nullable, buffered_num_rows) {
}
};
/// FieldData specialization for VECTOR_ARRAY columns. In addition to the
/// base-class storage it remembers the vector dimension and the element
/// data type supplied at construction.
template <>
class FieldData<VectorArray> : public FieldDataVectorArrayImpl {
 public:
    /// Builds the base with DataType::VECTOR_ARRAY and stores `dim` and
    /// `element_type`. Asserts that `element_type` is not DataType::NONE.
    explicit FieldData(int64_t dim,
                       DataType element_type,
                       int64_t buffered_num_rows = 0)
        : FieldDataVectorArrayImpl(DataType::VECTOR_ARRAY, buffered_num_rows),
          dim_(dim),
          element_type_(element_type) {
        AssertInfo(element_type != DataType::NONE,
                   "element_type must be specified for VECTOR_ARRAY");
    }

    /// Dimension passed to the constructor.
    int64_t
    get_dim() const override {
        return dim_;
    }

    /// Element data type currently recorded for this field.
    DataType
    get_element_type() const {
        return element_type_;
    }

    /// Overwrites the recorded element data type (no validation is done).
    void
    set_element_type(DataType element_type) {
        element_type_ = element_type;
    }

    /// Returns a pointer to the stored element at `offset`.
    /// Asserts that `offset` is below both get_num_rows() and length().
    const VectorArray*
    value_at(ssize_t offset) const {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        const auto& element = data_[offset];
        return &element;
    }

 private:
    int64_t dim_;
    DataType element_type_;
};
/// Field data for dense vector types: RawValue() addresses a row as `dim_`
/// consecutive `Type` elements of `data_`.
///
/// When the field is nullable, `data_` is sized by the number of valid rows
/// (see resize_field_data), so a row's logical offset (its row number) can
/// differ from its physical offset (its slot in `data_`). `l2p_mapping_`
/// stores that translation and RawValue() consults it.
template <typename Type, bool is_type_entire_row = false>
class FieldDataVectorImpl : public FieldDataImpl<Type, is_type_entire_row> {
 private:
    /// Translates logical row offsets into physical offsets.
    ///
    /// Each build() call records one batch either in `l2p_map` (few valid
    /// rows) or in `l2p_vec` (otherwise). A result of -1 denotes a row with
    /// no stored value.
    struct LogicalToPhysicalMapping {
        // False until build() has processed a non-empty batch; while false,
        // logical and physical offsets are treated as identical.
        bool mapping{false};
        // Valid rows of batches recorded in map mode (logical -> physical).
        std::unordered_map<int64_t, int64_t> l2p_map;
        // Rows of batches recorded in vector mode, indexed by logical
        // offset; -1 marks a null (or not-yet-recorded) row.
        std::vector<int64_t> l2p_vec;

        /// Returns the physical offset of `logical_offset`, or -1 when the
        /// row is null or unknown. Without any built mapping the logical
        /// offset is returned unchanged.
        int64_t
        get_physical_offset(int64_t logical_offset) const {
            if (!mapping) {
                return logical_offset;
            }
            // build() chooses map or vector storage per batch, so both
            // containers can hold entries at the same time. Check the map
            // first and fall back to the vector; consulting only the map
            // whenever it is non-empty would report every row of a
            // vector-mode batch as null.
            auto it = l2p_map.find(logical_offset);
            if (it != l2p_map.end()) {
                return it->second;
            }
            // The lower bound keeps a negative offset from indexing l2p_vec.
            if (logical_offset >= 0 &&
                logical_offset < static_cast<int64_t>(l2p_vec.size())) {
                return l2p_vec[logical_offset];
            }
            return -1;
        }

        /// Records the mapping for one batch of rows.
        ///
        /// @param valid_data     validity bitmap read at absolute logical
        ///                       bit positions (bit k of byte k/8, least
        ///                       significant bit first); nullptr means every
        ///                       row of the batch is valid
        /// @param start_physical physical offset given to the first valid row
        /// @param start_logical  logical offset of the first row of the batch
        /// @param total_count    number of rows in the batch; 0 is a no-op
        /// @param valid_count    number of valid rows as counted by the
        ///                       caller; only used to pick the storage mode
        void
        build(const uint8_t* valid_data,
              int64_t start_physical,
              int64_t start_logical,
              int64_t total_count,
              int64_t valid_count) {
            if (total_count == 0) {
                return;
            }
            mapping = true;

            // use map when valid ratio < 10%
            bool use_map = (valid_count * 10 < total_count);

            if (use_map) {
                int64_t physical_idx = start_physical;
                for (int64_t i = 0; i < total_count; ++i) {
                    int64_t bit_pos = start_logical + i;
                    if (valid_data == nullptr ||
                        ((valid_data[bit_pos >> 3] >> (bit_pos & 0x07)) & 1)) {
                        l2p_map[start_logical + i] = physical_idx++;
                    }
                }
            } else {
                // resize l2p_vec if needed
                int64_t required_size = start_logical + total_count;
                if (static_cast<int64_t>(l2p_vec.size()) < required_size) {
                    l2p_vec.resize(required_size, -1);
                }
                int64_t physical_idx = start_physical;
                for (int64_t i = 0; i < total_count; ++i) {
                    int64_t bit_pos = start_logical + i;
                    if (valid_data == nullptr ||
                        ((valid_data[bit_pos >> 3] >> (bit_pos & 0x07)) & 1)) {
                        l2p_vec[start_logical + i] = physical_idx++;
                    } else {
                        l2p_vec[start_logical + i] = -1;
                    }
                }
            }
        }
    };

    /// Grows the storage of a nullable field (asserts `nullable_`).
    ///
    /// Under `num_rows_mutex_`: if `num_rows` exceeds `num_rows_`, updates
    /// `num_rows_` and resizes the validity bitmap to ceil(num_rows / 8)
    /// bytes, zero-filling new bytes; if `valid_count` exceeds
    /// `valid_count_`, resizes `data_` to `valid_count * dim_` elements.
    /// `valid_count_` itself is not modified here.
    void
    resize_field_data(int64_t num_rows, int64_t valid_count) {
        Assert(this->nullable_);
        std::lock_guard lck(this->num_rows_mutex_);
        if (num_rows > this->num_rows_) {
            this->num_rows_ = num_rows;
            this->valid_data_.resize((num_rows + 7) / 8, 0x00);
        }
        if (valid_count > this->valid_count_) {
            this->data_.resize(valid_count * this->dim_);
        }
    }

    LogicalToPhysicalMapping l2p_mapping_;

 public:
    // Inherit the base-class constructors and expose its resize_field_data
    // overloads alongside the private two-argument one above.
    using FieldDataImpl<Type, is_type_entire_row>::FieldDataImpl;
    using FieldDataImpl<Type, is_type_entire_row>::resize_field_data;

    /// Fill overload that also receives a validity bitmap; declared here,
    /// defined outside this header.
    void
    FillFieldData(const void* field_data,
                  const uint8_t* valid_data,
                  ssize_t element_count,
                  ssize_t offset) override;

    /// Returns a pointer to the first element of the row at logical
    /// `offset`, or nullptr when the mapping reports no stored value (-1).
    const void*
    RawValue(ssize_t offset) const override {
        auto physical_offset = l2p_mapping_.get_physical_offset(offset);
        if (physical_offset == -1) {
            return nullptr;
        }
        return &this->data_[physical_offset * this->dim_];
    }

    /// Total byte size of the stored vectors: counts `valid_count_` rows for
    /// nullable fields and `length_` rows otherwise.
    int64_t
    DataSize() const override {
        auto dim = this->dim_;
        if (this->nullable_) {
            return sizeof(Type) * this->valid_count_ * dim;
        }
        return sizeof(Type) * this->length_ * dim;
    }

    /// Byte size of a single row (`sizeof(Type) * dim_`); `offset` is only
    /// range-checked against get_num_rows().
    int64_t
    DataSize(ssize_t offset) const override {
        auto dim = this->dim_;
        AssertInfo(offset < this->get_num_rows(),
                   "field data subscript out of range");
        return sizeof(Type) * dim;
    }

    /// Number of rows that carry a value: `valid_count_` for nullable
    /// fields, get_num_rows() otherwise.
    int64_t
    get_valid_rows() const override {
        if (this->nullable_) {
            return this->valid_count_;
        }
        return this->get_num_rows();
    }
};
/// Field data for sparse float vectors. Each stored element is one complete
/// knowhere sparse row, so the base class is built with a dimension of 1;
/// the largest row dimension seen while filling is tracked separately.
class FieldDataSparseVectorImpl
    : public FieldDataVectorImpl<knowhere::sparse::SparseRow<SparseValueType>,
                                 true> {
    using Base =
        FieldDataVectorImpl<knowhere::sparse::SparseRow<SparseValueType>, true>;

 public:
    // Keep the base-class FillFieldData overloads visible next to the ones
    // overridden below (needed for the nullable fill path).
    using Base::FillFieldData;

    /// Asserts that `data_type` is DataType::VECTOR_SPARSE_U32_F32.
    explicit FieldDataSparseVectorImpl(DataType data_type,
                                       bool nullable = false,
                                       int64_t total_num_rows = 0)
        : Base(/*dim=*/1, data_type, nullable, total_num_rows) {
        AssertInfo(data_type == DataType::VECTOR_SPARSE_U32_F32,
                   "invalid data type for sparse vector");
    }

    /// Sum of data_byte_size() over the first `valid_count_` (nullable) or
    /// `length_` (non-nullable) stored rows.
    int64_t
    DataSize() const override {
        const size_t stored_rows = nullable_ ? valid_count_ : length_;
        int64_t total_bytes = 0;
        for (size_t row = 0; row < stored_rows; ++row) {
            total_bytes += data_[row].data_byte_size();
        }
        return total_bytes;
    }

    /// Byte size of the row stored at `offset`. Asserts that `offset` is
    /// below get_num_rows() and below the stored row count.
    int64_t
    DataSize(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        size_t count = nullable_ ? valid_count_ : length_;
        AssertInfo(
            offset < count,
            "subscript position don't has valid value offset={}, count={}",
            offset,
            count);
        const auto& stored_row = data_[offset];
        return stored_row.data_byte_size();
    }

    /// Appends `element_count` sparse rows read from `source`, which is
    /// interpreted as an array of SparseRow<SparseValueType>. Grows the
    /// storage when needed and updates the maximum dimension seen so far.
    void
    FillFieldData(const void* source, ssize_t element_count) override {
        if (element_count == 0) {
            return;
        }

        std::lock_guard lck(tell_mutex_);
        const auto needed_rows = length_ + element_count;
        if (needed_rows > get_num_rows()) {
            FieldDataImpl::resize_field_data(needed_rows);
        }

        const auto* rows =
            static_cast<const knowhere::sparse::SparseRow<SparseValueType>*>(
                source);
        // Copy first, then fold the row dimensions into vec_dim_; the two
        // steps only read from `rows` and do not depend on each other.
        std::copy_n(rows, element_count, data_.data() + length_);
        for (int64_t idx = 0; idx < element_count; ++idx) {
            vec_dim_ = std::max(vec_dim_, rows[idx].dim());
        }
        length_ += element_count;
    }

    /// Appends every entry of `array`, converting each binary view with
    /// CopyAndWrapSparseRow, and updates the maximum dimension seen so far.
    void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) override {
        auto n = array->length();
        if (n == 0) {
            return;
        }

        std::lock_guard lck(tell_mutex_);
        const auto needed_rows = length_ + n;
        if (needed_rows > get_num_rows()) {
            FieldDataImpl::resize_field_data(needed_rows);
        }

        for (int64_t idx = 0; idx < n; ++idx) {
            auto view = array->GetView(idx);
            auto& slot = data_[length_ + idx];
            slot = CopyAndWrapSparseRow(view.data(), view.size());
            vec_dim_ = std::max(vec_dim_, slot.dim());
        }
        length_ += n;
    }

    /// Largest row dimension observed by the FillFieldData overloads above.
    int64_t
    Dim() const {
        return vec_dim_;
    }

 private:
    int64_t vec_dim_ = 0;
};
/// FieldData specialization for FloatVector, stored as `float` elements via
/// FieldDataVectorImpl<float, false>.
template <>
class FieldData<FloatVector> : public FieldDataVectorImpl<float, false> {
 public:
    /// Forwards all arguments to the base-class constructor unchanged.
    explicit FieldData(int64_t dim,
                       DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataVectorImpl<float, false>(
              dim, data_type, nullable, buffered_num_rows) {
    }
};
/// FieldData specialization for BinaryVector, stored as `uint8_t` elements.
///
/// The base class is given `dim / 8` as its dimension, while get_dim()
/// keeps reporting the original `dim` passed by the caller.
template <>
class FieldData<BinaryVector> : public FieldDataVectorImpl<uint8_t, false> {
 public:
    /// Asserts that `dim` is a multiple of 8.
    explicit FieldData(int64_t dim,
                       DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataVectorImpl<uint8_t, false>(
              dim / 8, data_type, nullable, buffered_num_rows),
          binary_dim_(dim) {
        Assert(dim % 8 == 0);
    }

    /// Returns the `dim` passed to the constructor, not the `dim / 8`
    /// value handed to the base class.
    int64_t
    get_dim() const override {
        return binary_dim_;
    }

 private:
    int64_t binary_dim_;
};
/// FieldData specialization for Float16Vector, stored as `float16` elements
/// via FieldDataVectorImpl<float16, false>.
template <>
class FieldData<Float16Vector> : public FieldDataVectorImpl<float16, false> {
 public:
    /// Forwards all arguments to the base-class constructor unchanged.
    explicit FieldData(int64_t dim,
                       DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataVectorImpl<float16, false>(
              dim, data_type, nullable, buffered_num_rows) {
    }
};
/// FieldData specialization for BFloat16Vector, stored as `bfloat16`
/// elements via FieldDataVectorImpl<bfloat16, false>.
template <>
class FieldData<BFloat16Vector> : public FieldDataVectorImpl<bfloat16, false> {
 public:
    /// Forwards all arguments to the base-class constructor unchanged.
    explicit FieldData(int64_t dim,
                       DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataVectorImpl<bfloat16, false>(
              dim, data_type, nullable, buffered_num_rows) {
    }
};
/// FieldData specialization for SparseFloatVector; all behaviour comes from
/// FieldDataSparseVectorImpl. Unlike the dense vector specializations it
/// takes no dimension argument and `nullable` defaults to false.
template <>
class FieldData<SparseFloatVector> : public FieldDataSparseVectorImpl {
 public:
    /// Forwards all arguments to FieldDataSparseVectorImpl unchanged.
    explicit FieldData(DataType data_type,
                       bool nullable = false,
                       int64_t buffered_num_rows = 0)
        : FieldDataSparseVectorImpl(data_type, nullable, buffered_num_rows) {
    }
};
/// FieldData specialization for Int8Vector, stored as `int8` elements via
/// FieldDataVectorImpl<int8, false>.
template <>
class FieldData<Int8Vector> : public FieldDataVectorImpl<int8, false> {
 public:
    /// Forwards all arguments to the base-class constructor unchanged.
    explicit FieldData(int64_t dim,
                       DataType data_type,
                       bool nullable,
                       int64_t buffered_num_rows = 0)
        : FieldDataVectorImpl<int8, false>(
              dim, data_type, nullable, buffered_num_rows) {
    }
};
// Shared-ownership handle to a field data object, held through the
// FieldDataBase interface.
using FieldDataPtr = std::shared_ptr<FieldDataBase>;
// Channel whose items are FieldDataPtr handles.
using FieldDataChannel = Channel<FieldDataPtr>;
// Shared-ownership handle to a FieldDataChannel.
using FieldDataChannelPtr = std::shared_ptr<FieldDataChannel>;
// Declaration only; the definition is not in this header.
// NOTE(review): judging by the name and parameters this presumably creates
// scalar field data of `type` with room for `cap_rows` rows — confirm
// against the definition.
FieldDataPtr
InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows);
} // namespace milvus