marcelo-cjl 3b599441fd
feat: Add nullable vector support for proxy and querynode (#46305)
related: #45993 

This commit extends nullable vector support to the proxy layer,
querynode,
and adds comprehensive validation, search reduce, and field data
handling
    for nullable vectors with sparse storage.
    
    Proxy layer changes:
- Update validate_util.go checkAligned() with getExpectedVectorRows()
helper
      to validate nullable vector field alignment using valid data count
- Update checkFloatVectorFieldData/checkSparseFloatVectorFieldData for
      nullable vector validation with proper row count expectations
- Add FieldDataIdxComputer in typeutil/schema.go for logical-to-physical
      index translation during search reduce operations
- Update search_reduce_util.go reduceSearchResultData to use
idxComputers
      for correct field data indexing with nullable vectors
- Update task.go, task_query.go, task_upsert.go for nullable vector
handling
    - Update msg_pack.go with nullable vector field data processing
    
    QueryNode layer changes:
    - Update segments/result.go for nullable vector result handling
- Update segments/search_reduce.go with nullable vector offset
translation
    
    Storage and index changes:
- Update data_codec.go and utils.go for nullable vector serialization
- Update indexcgowrapper/dataset.go and index.go for nullable vector
indexing
    
    Utility changes:
- Add FieldDataIdxComputer struct with Compute() method for efficient
      logical-to-physical index mapping across multiple field data
- Update EstimateEntitySize() and AppendFieldData() with fieldIdxs
parameter
    - Update funcutil.go with nullable vector support functions

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Full support for nullable vector fields (float, binary, float16,
bfloat16, int8, sparse) across ingest, storage, indexing, search and
retrieval; logical↔physical offset mapping preserves row semantics.
  * Client: compaction control and compaction-state APIs.

* **Bug Fixes**
* Improved validation for adding vector fields (nullable + dimension
checks) and corrected search/query behavior for nullable vectors.

* **Chores**
  * Persisted validity maps with indexes and on-disk formats.

* **Tests**
  * Extensive new and updated end-to-end nullable-vector tests.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: marcelo-cjl <marcelo.chen@zilliz.com>
2025-12-24 10:13:19 +08:00

622 lines
20 KiB
C++

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <sys/types.h>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string_view>
#include <utility>
#include <vector>
#include "arrow/array/array_base.h"
#include "arrow/record_batch.h"
#include "common/Array.h"
#include "common/File.h"
#include "common/VectorArray.h"
#include "common/ChunkTarget.h"
#include "common/EasyAssert.h"
#include "common/FieldDataInterface.h"
#include "common/Json.h"
#include "common/Span.h"
#include "knowhere/sparse_utils.h"
#include "simdjson/common_defs.h"
#include "sys/mman.h"
#include "common/Types.h"
#include "cachinglayer/Utils.h"
namespace milvus {
constexpr uint64_t MMAP_STRING_PADDING = 1;
constexpr uint64_t MMAP_GEOMETRY_PADDING = 1;
constexpr uint64_t MMAP_ARRAY_PADDING = 1;
// Shared mmap region manager for group chunks
class ChunkMmapGuard {
public:
ChunkMmapGuard(char* mmap_ptr, size_t mmap_size, std::string file_path)
: mmap_ptr_(mmap_ptr), mmap_size_(mmap_size), file_path_(file_path) {
}
~ChunkMmapGuard() {
if (mmap_ptr_ != nullptr) {
munmap(mmap_ptr_, mmap_size_);
}
if (!file_path_.empty()) {
unlink(file_path_.c_str());
}
}
char*
get_ptr() const {
return mmap_ptr_;
}
bool
is_file_backed() const {
return !file_path_.empty();
}
private:
char* mmap_ptr_;
size_t mmap_size_;
const std::string file_path_;
};
class Chunk {
public:
Chunk() = default;
Chunk(int64_t row_nums,
char* data,
uint64_t size,
bool nullable,
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
: data_(data),
row_nums_(row_nums),
size_(size),
nullable_(nullable),
chunk_mmap_guard_(chunk_mmap_guard) {
if (nullable) {
valid_.reserve(row_nums);
for (int i = 0; i < row_nums; i++) {
valid_.push_back((data[i >> 3] >> (i & 0x07)) & 1);
}
}
}
virtual ~Chunk() {
// The ChunkMmapGuard will handle the unmapping and unlinking of the file if it is file backed
}
uint64_t
Size() const {
return size_;
}
cachinglayer::ResourceUsage
CellByteSize() const {
if (chunk_mmap_guard_ && chunk_mmap_guard_->is_file_backed()) {
return cachinglayer::ResourceUsage(0, static_cast<int64_t>(size_));
}
return cachinglayer::ResourceUsage(static_cast<int64_t>(size_), 0);
}
int64_t
RowNums() const {
return row_nums_;
}
virtual const char*
ValueAt(int64_t idx) const = 0;
virtual const char*
Data() const {
return data_;
}
const char*
RawData() const {
return data_;
}
FixedVector<bool>&
Valid() {
return valid_;
}
virtual bool
isValid(int offset) const {
if (nullable_) {
return valid_[offset];
}
return true;
};
protected:
char* data_;
int64_t row_nums_;
uint64_t size_;
bool nullable_;
FixedVector<bool>
valid_; // parse null bitmap to valid_ to be compatible with SpanBase
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard_{nullptr};
};
// for fixed size data, includes fixed size array
class FixedWidthChunk : public Chunk {
public:
FixedWidthChunk(int32_t row_nums,
int32_t dim,
char* data,
uint64_t size,
uint64_t element_size,
bool nullable,
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
: Chunk(row_nums, data, size, nullable, chunk_mmap_guard),
dim_(dim),
element_size_(element_size) {
auto null_bitmap_bytes_num = nullable_ ? (row_nums_ + 7) / 8 : 0;
data_start_ = data_ + null_bitmap_bytes_num;
};
milvus::SpanBase
Span() const {
return milvus::SpanBase(data_start_,
nullable_ ? valid_.data() : nullptr,
row_nums_,
element_size_ * dim_);
}
const char*
ValueAt(int64_t idx) const override {
return data_start_ + idx * element_size_ * dim_;
}
const char*
Data() const override {
return data_start_;
}
private:
int dim_;
int element_size_;
const char* data_start_;
};
// A StringChunk is a class that represents a collection of strings stored in a contiguous memory block.
// It is initialized with the number of rows, a pointer to the data, the size of the data, and a boolean
// indicating whether the data can contain null values. The data is accessed using offsets, which are
// stored after an optional null bitmap. Each string is represented by a range in the data block, defined
// by these offsets.
//
// Example of a valid StringChunk:
//
// Suppose we have a data block containing the strings "apple", "banana", and "cherry", and we want to
// create a StringChunk for these strings. The data block might look like this:
//
// [null_bitmap][offsets][string_data]
// [00000000] [17, 22, 28, 34] ["apple", "banana", "cherry"]
//
// Here, the null_bitmap is empty (indicating no nulls), the offsets array indicates the start of each
// string in the data block, and the string_data contains the actual string content.
//
// StringChunk exampleChunk(3, dataPointer, dataSize, false);
//
// In this example, 'exampleChunk' is a StringChunk with 3 rows, a pointer to the data stored in 'dataPointer',
// a total data size of 'dataSize', and it does not support nullability.
class StringChunk : public Chunk {
public:
StringChunk() = default;
StringChunk(int32_t row_nums,
char* data,
uint64_t size,
bool nullable,
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
: Chunk(row_nums, data, size, nullable, chunk_mmap_guard) {
auto null_bitmap_bytes_num = nullable_ ? (row_nums_ + 7) / 8 : 0;
offsets_ = reinterpret_cast<uint32_t*>(data + null_bitmap_bytes_num);
}
std::string_view
operator[](const int i) const {
if (i < 0 || i >= row_nums_) {
ThrowInfo(ErrorCode::OutOfRange,
"index out of range {} at {}",
i,
row_nums_);
}
return {data_ + offsets_[i], offsets_[i + 1] - offsets_[i]};
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
StringViews(std::optional<std::pair<int64_t, int64_t>> offset_len);
int
binary_search_string(std::string_view target) {
// only supported sorted pk
int left = 0;
int right = row_nums_ - 1; // `right` should be num_rows_ - 1
int result =
-1; // Initialize result to store the first occurrence index
while (left <= right) {
int mid = left + (right - left) / 2;
std::string_view midString = (*this)[mid];
auto cmp = midString.compare(target);
if (cmp == 0) {
result = mid; // Store the index of match
right = mid - 1; // Continue searching in the left half
} else if (cmp < 0) {
// midString < target
left = mid + 1;
} else {
// midString > target
right = mid - 1;
}
}
return result;
}
int
lower_bound_string(std::string_view target) {
int left = 0;
int right = row_nums_;
while (left < right) {
int mid = left + (right - left) / 2;
std::string_view midString = (*this)[mid];
if (midString < target) {
left = mid + 1;
} else {
right = mid;
}
}
return left;
}
int
upper_bound_string(std::string_view target) {
int left = 0;
int right = row_nums_;
while (left < right) {
int mid = left + (right - left) / 2;
std::string_view midString = (*this)[mid];
if (midString <= target) {
left = mid + 1;
} else {
right = mid;
}
}
return left;
}
std::pair<std::vector<std::string_view>, FixedVector<bool>>
ViewsByOffsets(const FixedVector<int32_t>& offsets);
const char*
ValueAt(int64_t idx) const override {
return (*this)[idx].data();
}
uint32_t*
Offsets() {
return offsets_;
}
protected:
uint32_t* offsets_;
};
using JSONChunk = StringChunk;
using GeometryChunk = StringChunk;
// An ArrayChunk is a class that represents a collection of arrays stored in a contiguous memory block.
// It is initialized with the number of rows, a pointer to the data, the size of the data, the element type,
// and a boolean indicating whether the data can contain null values. The data is accessed using offsets and lengths,
// which are stored after an optional null bitmap. Each array is represented by a range in the data block.
//
// Example of a valid ArrayChunk:
//
// Suppose we have a data block containing arrays of integers [1, 2, 3], [4, 5], and [6, 7, 8, 9], and we want to
// create an ArrayChunk for these arrays. The data block might look like this:
//
// [null_bitmap][offsets_lens][array_data]
// [00000000] [29, 3, 41, 2, 49, 4, 65] [1, 2, 3, 4, 5, 6, 7, 8, 9]
//
// For string arrays, the structure is more complex as each string element needs its own offset:
// [null_bitmap][offsets_lens][array1_offsets][array1_data][array2_offsets][array2_data][array3_offsets][array3_data]
// [00000000] [29, 3, 53, 2, 69, 4, 101] [0, 5, 11, 16] ["hello", "world", "!"] [0, 3, 6] ["foo", "bar"] [0, 6, 12, 18, 24] ["apple", "orange", "banana", "grape"]
//
// Here, the null_bitmap is empty (indicating no nulls), the offsets_lens array contains pairs of (offset, length)
// for each array, and the array_data contains the actual array elements.
//
// ArrayChunk exampleChunk(3, dataPointer, dataSize, DataType::INT32, false);
//
// In this example, 'exampleChunk' is an ArrayChunk with 3 rows, a pointer to the data stored in 'dataPointer',
// a total data size of 'dataSize', element type INT32, and it does not support nullability.
class ArrayChunk : public Chunk {
public:
ArrayChunk(int32_t row_nums,
char* data,
uint64_t size,
milvus::DataType element_type,
bool nullable,
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
: Chunk(row_nums, data, size, nullable, chunk_mmap_guard),
element_type_(element_type) {
auto null_bitmap_bytes_num = 0;
if (nullable) {
null_bitmap_bytes_num = (row_nums + 7) / 8;
}
offsets_lens_ =
reinterpret_cast<uint32_t*>(data + null_bitmap_bytes_num);
}
ArrayView
View(int idx) const {
int idx_off = 2 * idx;
auto offset = offsets_lens_[idx_off];
auto len = offsets_lens_[idx_off + 1];
auto next_offset = offsets_lens_[idx_off + 2];
auto data_ptr = data_ + offset;
uint32_t offsets_bytes_len = 0;
uint32_t* offsets_ptr = nullptr;
if (IsStringDataType(element_type_)) {
offsets_bytes_len = len * sizeof(uint32_t);
offsets_ptr = reinterpret_cast<uint32_t*>(data_ptr);
}
return ArrayView(data_ptr + offsets_bytes_len,
len,
next_offset - offset - offsets_bytes_len,
element_type_,
offsets_ptr);
}
std::pair<std::vector<ArrayView>, FixedVector<bool>>
ViewsByOffsets(const FixedVector<int32_t>& offsets) {
std::vector<ArrayView> views;
FixedVector<bool> valid_res;
size_t size = offsets.size();
views.reserve(size);
valid_res.reserve(size);
for (auto i = 0; i < size; ++i) {
views.emplace_back(View(offsets[i]));
valid_res.emplace_back(isValid(offsets[i]));
}
return {std::move(views), std::move(valid_res)};
}
std::pair<std::vector<ArrayView>, FixedVector<bool>>
Views(std::optional<std::pair<int64_t, int64_t>> offset_len =
std::nullopt) const {
auto start_offset = 0;
auto len = row_nums_;
if (offset_len.has_value()) {
start_offset = offset_len->first;
len = offset_len->second;
AssertInfo(start_offset >= 0 && start_offset < row_nums_,
"Retrieve array views with out-of-bound offset:{}, "
"len:{}, wrong",
start_offset,
len);
AssertInfo(len > 0 && len <= row_nums_,
"Retrieve array views with out-of-bound offset:{}, "
"len:{}, wrong",
start_offset,
len);
AssertInfo(start_offset + len <= row_nums_,
"Retrieve array views with out-of-bound offset:{}, "
"len:{}, wrong",
start_offset,
len);
}
std::vector<ArrayView> views;
views.reserve(len);
auto end_offset = start_offset + len;
for (auto i = start_offset; i < end_offset; i++) {
views.emplace_back(View(i));
}
if (nullable_) {
FixedVector<bool> res_valid(valid_.begin() + start_offset,
valid_.begin() + end_offset);
return {std::move(views), std::move(res_valid)};
}
return {std::move(views), {}};
}
const char*
ValueAt(int64_t idx) const override {
ThrowInfo(ErrorCode::Unsupported,
"ArrayChunk::ValueAt is not supported");
}
private:
milvus::DataType element_type_;
uint32_t* offsets_lens_;
};
// A VectorArrayChunk is similar to an ArrayChunk but is specialized for storing arrays of vectors.
// Key differences and characteristics:
// - No Nullability: VectorArrayChunk does not support null values. Unlike ArrayChunk, it does not have a null bitmap.
// - Fixed Vector Dimensions: All vectors within a VectorArrayChunk have the same, fixed dimension, specified at creation.
// However, each row (array of vectors) can contain a variable number of these fixed-dimension vectors.
//
// Due to these characteristics, the data layout is simpler:
// [offsets_lens][all_vector_data_concatenated]
//
// Example:
// Suppose we have a data block containing arrays of vectors [[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[10, 11, 12]], and [[13, 14, 15], [16, 17, 18]], and we want to
// create a VectorArrayChunk for these arrays. The data block might look like this:
//
// [offsets_lens][all_vector_data_concatenated]
// [28, 3, 36, 1, 76, 2, 100] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
class VectorArrayChunk : public Chunk {
public:
VectorArrayChunk(int64_t dim,
int32_t row_nums,
char* data,
uint64_t size,
milvus::DataType element_type,
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
: Chunk(row_nums, data, size, false, chunk_mmap_guard),
dim_(dim),
element_type_(element_type) {
offsets_lens_ = reinterpret_cast<uint32_t*>(data);
auto offset = 0;
offsets_.reserve(row_nums_ + 1);
offsets_.push_back(offset);
for (int64_t i = 0; i < row_nums_; i++) {
offset += offsets_lens_[i * 2 + 1];
offsets_.push_back(offset);
}
}
VectorArrayView
View(int64_t idx) const {
int idx_off = 2 * idx;
auto offset = offsets_lens_[idx_off];
auto len = offsets_lens_[idx_off + 1];
auto next_offset = offsets_lens_[idx_off + 2];
auto data_ptr = data_ + offset;
return VectorArrayView(
data_ptr, dim_, len, next_offset - offset, element_type_);
}
std::pair<std::vector<VectorArrayView>, FixedVector<bool>>
Views(std::optional<std::pair<int64_t, int64_t>> offset_len =
std::nullopt) const {
auto start_offset = 0;
auto len = row_nums_;
if (offset_len.has_value()) {
start_offset = offset_len->first;
len = offset_len->second;
AssertInfo(
start_offset >= 0 && start_offset < row_nums_,
"Retrieve vector array views with out-of-bound offset:{}, "
"len:{}, wrong",
start_offset,
len);
AssertInfo(
len > 0 && len <= row_nums_,
"Retrieve vector array views with out-of-bound offset:{}, "
"len:{}, wrong",
start_offset,
len);
AssertInfo(
start_offset + len <= row_nums_,
"Retrieve vector array views with out-of-bound offset:{}, "
"len:{}, wrong",
start_offset,
len);
}
std::vector<VectorArrayView> views;
views.reserve(len);
auto end_offset = start_offset + len;
for (int64_t i = start_offset; i < end_offset; i++) {
views.emplace_back(View(i));
}
// vector array does not support null, so just return {}.
return {std::move(views), {}};
}
const char*
ValueAt(int64_t idx) const override {
ThrowInfo(ErrorCode::Unsupported,
"VectorArrayChunk::ValueAt is not supported");
}
const char*
Data() const override {
return data_ + offsets_lens_[0];
}
const size_t*
Offsets() const {
return offsets_.data();
}
private:
int64_t dim_;
uint32_t* offsets_lens_;
milvus::DataType element_type_;
std::vector<size_t> offsets_;
};
class SparseFloatVectorChunk : public Chunk {
public:
SparseFloatVectorChunk(int32_t row_nums,
char* data,
uint64_t size,
bool nullable,
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
: Chunk(row_nums, data, size, nullable, chunk_mmap_guard) {
auto null_bitmap_bytes_num = nullable ? (row_nums + 7) / 8 : 0;
auto offsets_ptr =
reinterpret_cast<uint64_t*>(data + null_bitmap_bytes_num);
if (nullable_) {
for (int i = 0; i < row_nums; i++) {
if (isValid(i)) {
vec_.emplace_back(
(offsets_ptr[i + 1] - offsets_ptr[i]) /
knowhere::sparse::SparseRow<
SparseValueType>::element_size(),
reinterpret_cast<uint8_t*>(data + offsets_ptr[i]),
false);
dim_ = std::max(dim_, vec_.back().dim());
}
}
} else {
vec_.resize(row_nums);
for (int i = 0; i < row_nums; i++) {
vec_[i] = {(offsets_ptr[i + 1] - offsets_ptr[i]) /
knowhere::sparse::SparseRow<
SparseValueType>::element_size(),
reinterpret_cast<uint8_t*>(data + offsets_ptr[i]),
false};
dim_ = std::max(dim_, vec_[i].dim());
}
}
}
const char*
Data() const override {
return static_cast<const char*>(static_cast<const void*>(vec_.data()));
}
const char*
ValueAt(int64_t i) const override {
return static_cast<const char*>(
static_cast<const void*>(vec_.data() + i));
}
// only for test
std::vector<knowhere::sparse::SparseRow<SparseValueType>>&
Vec() {
return vec_;
}
int64_t
Dim() {
return dim_;
}
private:
int64_t dim_ = 0;
std::vector<knowhere::sparse::SparseRow<SparseValueType>> vec_;
};
} // namespace milvus