related: #45993

This commit extends nullable vector support to the proxy layer and querynode, and adds comprehensive validation, search-reduce, and field-data handling for nullable vectors with sparse storage.

Proxy layer changes:
- Update `checkAligned()` in validate_util.go with a `getExpectedVectorRows()` helper to validate nullable vector field alignment against the valid-data count
- Update `checkFloatVectorFieldData`/`checkSparseFloatVectorFieldData` to validate nullable vector fields with the correct expected row counts
- Add `FieldDataIdxComputer` in typeutil/schema.go for logical-to-physical index translation during search-reduce operations
- Update `reduceSearchResultData` in search_reduce_util.go to use idxComputers for correct field data indexing with nullable vectors
- Update task.go, task_query.go, and task_upsert.go for nullable vector handling
- Update msg_pack.go with nullable vector field-data processing

QueryNode layer changes:
- Update segments/result.go for nullable vector result handling
- Update segments/search_reduce.go with nullable vector offset translation

Storage and index changes:
- Update data_codec.go and utils.go for nullable vector serialization
- Update indexcgowrapper/dataset.go and index.go for nullable vector indexing

Utility changes:
- Add a `FieldDataIdxComputer` struct with a `Compute()` method for efficient logical-to-physical index mapping across multiple field data
- Update `EstimateEntitySize()` and `AppendFieldData()` with a `fieldIdxs` parameter
- Update funcutil.go with nullable vector support functions

## Summary by CodeRabbit

* **New Features**
  * Full support for nullable vector fields (float, binary, float16, bfloat16, int8, sparse) across ingest, storage, indexing, search, and retrieval; logical↔physical offset mapping preserves row semantics.
  * Client: compaction control and compaction-state APIs.
* **Bug Fixes**
  * Improved validation for adding vector fields (nullable + dimension checks) and corrected search/query behavior for nullable vectors.
* **Chores**
  * Persisted validity maps with indexes and on-disk formats.
* **Tests**
  * Extensive new and updated end-to-end nullable-vector tests.

Signed-off-by: marcelo-cjl <marcelo.chen@zilliz.com>
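The idea behind `FieldDataIdxComputer` is that sparse storage materializes only valid rows, so a logical row index must be translated to a physical (dense) index by counting the valid rows that precede it. A minimal sketch of that mapping, written in C++ to match this file rather than the Go original; the `IdxComputer` name, its `valid` member, and the linear scan are illustrative assumptions, not the actual milvus implementation (which presumably amortizes sequential lookups):

```cpp
#include <cstddef>
#include <optional>
#include <vector>

// Illustrative stand-in for FieldDataIdxComputer: maps a logical row index
// to the physical (dense) index of its vector, given the row validity map.
struct IdxComputer {
    std::vector<bool> valid;  // validity of each logical row

    // Returns the physical index for a logical row, or std::nullopt if the
    // row is null (null rows have no physical slot under sparse storage).
    std::optional<std::size_t> Compute(std::size_t logical) const {
        if (!valid[logical]) {
            return std::nullopt;
        }
        std::size_t physical = 0;
        for (std::size_t i = 0; i < logical; ++i) {
            physical += valid[i] ? 1 : 0;
        }
        return physical;
    }
};

// Example: rows {valid, null, valid, valid, null} — logical row 3 is the
// third valid row, so Compute(3) == 2, while Compute(1) == std::nullopt.
```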
312 lines · 11 KiB · C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include "arrow/array/array_primitive.h"
#include "arrow/record_batch.h"
#include "arrow/type_fwd.h"

#include "bitset/detail/element_wise.h"
#include "common/Chunk.h"
#include "common/ChunkTarget.h"
#include "common/EasyAssert.h"
#include "common/FieldMeta.h"

namespace milvus {

class ChunkWriterBase {
 public:
    explicit ChunkWriterBase(bool nullable) : nullable_(nullable) {
    }

    virtual std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& data) = 0;

    virtual void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) = 0;

 protected:
    void
    write_null_bit_maps(
        const std::vector<std::tuple<const uint8_t*, int64_t, int64_t>>&
            null_bitmaps,
        const std::shared_ptr<ChunkTarget>& target) {
        if (nullable_) {
            // merge all null bitmaps at the bit level: e.g. bitmaps
            // [0xFF, 0x00] with sizes [7, 8] must merge to [0x7F, 0x00],
            // not the byte-wise concatenation [0xFF, 0x00]; otherwise the
            // null indices of later chunks would be dislocated
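            // Worked example: the first bitmap fills bits 0-6 of byte 0, so
            // the second bitmap's 8 bits must start at bit 7 (the high bit
            // of byte 0, then the low 7 bits of byte 1). Byte-wise
            // concatenation would start them at bit 8 instead, landing every
            // validity bit of the second chunk one row too late.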
            std::vector<uint8_t> merged_null_bitmap;
            int64_t size_total_bit = 0;
            for (auto [data, size_bits, offset_bits] : null_bitmaps) {
                // resize in bytes
                merged_null_bitmap.resize((size_total_bit + size_bits + 7) / 8,
                                          0xFF);
                if (data != nullptr) {
                    bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                        data,
                        offset_bits,
                        merged_null_bitmap.data(),
                        size_total_bit,
                        size_bits);
                } else {
                    // a null bitmap pointer means every value is valid: Arrow
                    // omits the validity bitmap when there are no nulls, so
                    // append an all-ones (always-valid) bitmap explicitly;
                    // (size_bits + 7) / 8 bytes suffice to hold size_bits bits
                    std::vector<uint8_t> null_bitmap((size_bits + 7) / 8,
                                                     0xff);
                    bitset::detail::ElementWiseBitsetPolicy<uint8_t>::op_copy(
                        null_bitmap.data(),
                        0,
                        merged_null_bitmap.data(),
                        size_total_bit,
                        size_bits);
                }
                size_total_bit += size_bits;
            }
            target->write(merged_null_bitmap.data(), (size_total_bit + 7) / 8);
        }
    }

 protected:
    size_t row_nums_ = 0;
    bool nullable_ = false;
};

template <typename ArrowType, typename T>
class ChunkWriter final : public ChunkWriterBase {
 public:
    ChunkWriter(int dim, bool nullable) : ChunkWriterBase(nullable), dim_(dim) {
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override {
        size_t size = 0;
        size_t row_nums = 0;
        for (const auto& data : array_vec) {
            row_nums += data->length();
            auto array = std::static_pointer_cast<ArrowType>(data);
            size += array->length() * dim_ * sizeof(T);
        }
        if (nullable_) {
            size += (row_nums + 7) / 8;
        }
        row_nums_ = row_nums;
        return {size, row_nums};
    }

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override {
        // Chunk layout:
        // 1. Null bitmap (if nullable_ = true): indicates which values are null
        // 2. Data values: contiguous storage of data elements in the order
        //    data1, data2, ..., dataN, where each element occupies dim_ * sizeof(T)
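        // Note that this dense layout reserves a dim_ * sizeof(T) slot for
        // every row, including null ones (whose contents are undefined);
        // only NullableVectorChunkWriter below stores valid rows sparsely.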
        if (nullable_) {
            // tuple <data, size, offset>
            std::vector<std::tuple<const uint8_t*, int64_t, int64_t>>
                null_bitmaps;
            for (const auto& data : array_vec) {
                null_bitmaps.emplace_back(
                    data->null_bitmap_data(), data->length(), data->offset());
            }
            write_null_bit_maps(null_bitmaps, target);
        }

        for (const auto& data : array_vec) {
            auto array = std::static_pointer_cast<ArrowType>(data);
            auto data_ptr = array->raw_values();
            target->write(data_ptr, array->length() * dim_ * sizeof(T));
        }
    }

 private:
    const int64_t dim_;
};

template <typename T>
class NullableVectorChunkWriter final : public ChunkWriterBase {
 public:
    NullableVectorChunkWriter(int64_t dim, bool nullable)
        : ChunkWriterBase(nullable), dim_(dim) {
        Assert(nullable && "NullableVectorChunkWriter requires nullable=true");
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override {
        size_t size = 0;
        size_t row_nums = 0;

        for (const auto& data : array_vec) {
            row_nums += data->length();
            auto binary_array =
                std::static_pointer_cast<arrow::BinaryArray>(data);
            int64_t valid_count = data->length() - binary_array->null_count();
            size += valid_count * dim_ * sizeof(T);
        }

        // null bitmap size
        size += (row_nums + 7) / 8;
        row_nums_ = row_nums;
        return {size, row_nums};
    }
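
    // Unlike ChunkWriter, only the valid_count rows are materialized, so a
    // vector's physical offset in the chunk differs from its logical row
    // index; readers translate between the two using the null bitmap (the
    // logical-to-physical mapping described in the commit message above).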
    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override {
        std::vector<std::tuple<const uint8_t*, int64_t, int64_t>> null_bitmaps;
        for (const auto& data : array_vec) {
            null_bitmaps.emplace_back(
                data->null_bitmap_data(), data->length(), data->offset());
        }
        write_null_bit_maps(null_bitmaps, target);

        for (const auto& data : array_vec) {
            auto binary_array =
                std::static_pointer_cast<arrow::BinaryArray>(data);
            auto data_offset = binary_array->value_offset(0);
            auto data_ptr = binary_array->value_data()->data() + data_offset;
            int64_t valid_count = data->length() - binary_array->null_count();
            target->write(data_ptr, valid_count * dim_ * sizeof(T));
        }
    }

 private:
    const int64_t dim_;
};

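// Specialization for booleans: Arrow bit-packs boolean values, so there is no
// raw byte buffer to copy; each value is unpacked and written as one byte.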
template <>
inline void
ChunkWriter<arrow::BooleanArray, bool>::write_to_target(
    const arrow::ArrayVector& array_vec,
    const std::shared_ptr<ChunkTarget>& target) {
    if (nullable_) {
        // tuple <data, size, offset>
        std::vector<std::tuple<const uint8_t*, int64_t, int64_t>> null_bitmaps;
        for (const auto& data : array_vec) {
            null_bitmaps.emplace_back(
                data->null_bitmap_data(), data->length(), data->offset());
        }
        write_null_bit_maps(null_bitmaps, target);
    }

    for (const auto& data : array_vec) {
        auto array = std::dynamic_pointer_cast<arrow::BooleanArray>(data);
        for (int64_t i = 0; i < array->length(); i++) {
            auto value = array->Value(i);
            target->write(&value, sizeof(bool));
        }
    }
}

class StringChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;

 private:
    std::vector<std::string_view> strs_;
};

class JSONChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;
};

class GeometryChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;
};

class ArrayChunkWriter : public ChunkWriterBase {
 public:
    ArrayChunkWriter(const milvus::DataType element_type, bool nullable)
        : ChunkWriterBase(nullable), element_type_(element_type) {
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;

 private:
    const milvus::DataType element_type_;
};

class VectorArrayChunkWriter : public ChunkWriterBase {
 public:
    VectorArrayChunkWriter(int64_t dim, const milvus::DataType element_type)
        : ChunkWriterBase(false), element_type_(element_type), dim_(dim) {
    }

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;

 private:
    const milvus::DataType element_type_;
    const int64_t dim_;
};

class SparseFloatVectorChunkWriter : public ChunkWriterBase {
 public:
    using ChunkWriterBase::ChunkWriterBase;

    std::pair<size_t, size_t>
    calculate_size(const arrow::ArrayVector& array_vec) override;

    void
    write_to_target(const arrow::ArrayVector& array_vec,
                    const std::shared_ptr<ChunkTarget>& target) override;
};

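// Factory helpers: create_chunk builds a single Chunk for one field from its
// Arrow arrays; create_group_chunk builds chunks for several fields at once.
// A non-empty file_path directs chunk data to that file instead of keeping it
// purely in memory.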
std::unique_ptr<Chunk>
create_chunk(const FieldMeta& field_meta,
             const arrow::ArrayVector& array_vec,
             const std::string& file_path = "",
             proto::common::LoadPriority load_priority =
                 proto::common::LoadPriority::HIGH);

std::unordered_map<FieldId, std::shared_ptr<Chunk>>
create_group_chunk(const std::vector<FieldId>& field_ids,
                   const std::vector<FieldMeta>& field_metas,
                   const std::vector<arrow::ArrayVector>& array_vec,
                   const std::string& file_path = "",
                   proto::common::LoadPriority load_priority =
                       proto::common::LoadPriority::HIGH);

arrow::ArrayVector
read_single_column_batches(std::shared_ptr<arrow::RecordBatchReader> reader);

}  // namespace milvus
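
To see concretely why `write_null_bit_maps` merges validity bitmaps at the bit level rather than concatenating bytes, here is a small self-contained sketch. It is plain C++ with no milvus dependencies; `append_bits` is a hypothetical stand-in for `ElementWiseBitsetPolicy<uint8_t>::op_copy`, and padding bits are zero-filled here for readability:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Append `size_bits` bits from `src` (starting at src bit 0) to `dst`,
// starting at bit position `dst_bit`; dst grows as needed. Bits are
// LSB-first within each byte, matching Arrow's bitmap convention.
static void
append_bits(std::vector<uint8_t>& dst,
            int64_t& dst_bit,
            const uint8_t* src,
            int64_t size_bits) {
    dst.resize((dst_bit + size_bits + 7) / 8, 0x00);
    for (int64_t i = 0; i < size_bits; ++i) {
        const int64_t pos = dst_bit + i;
        if ((src[i / 8] >> (i % 8)) & 1) {
            dst[pos / 8] |= static_cast<uint8_t>(1u << (pos % 8));
        }
    }
    dst_bit += size_bits;
}

int main() {
    // Two validity bitmaps: 7 rows, all valid (0xFF); 8 rows, all null (0x00).
    const uint8_t a[] = {0xFF};
    const uint8_t b[] = {0x00};
    std::vector<uint8_t> merged;
    int64_t nbits = 0;
    append_bits(merged, nbits, a, 7);
    append_bits(merged, nbits, b, 8);
    for (uint8_t byte : merged) {
        std::printf("%02X ", byte);  // prints "7F 00": rows 0-6 valid, 7-14 null
    }
    std::printf("\n");
    return 0;
}
```

Byte-wise concatenation of the two inputs would instead give `FF 00`, wrongly marking logical row 7 (the first row of the second array) as valid.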