mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-29 06:55:27 +08:00
related: #45993 This commit extends nullable vector support to the proxy layer, querynode, and adds comprehensive validation, search reduce, and field data handling for nullable vectors with sparse storage. Proxy layer changes: - Update validate_util.go checkAligned() with getExpectedVectorRows() helper to validate nullable vector field alignment using valid data count - Update checkFloatVectorFieldData/checkSparseFloatVectorFieldData for nullable vector validation with proper row count expectations - Add FieldDataIdxComputer in typeutil/schema.go for logical-to-physical index translation during search reduce operations - Update search_reduce_util.go reduceSearchResultData to use idxComputers for correct field data indexing with nullable vectors - Update task.go, task_query.go, task_upsert.go for nullable vector handling - Update msg_pack.go with nullable vector field data processing QueryNode layer changes: - Update segments/result.go for nullable vector result handling - Update segments/search_reduce.go with nullable vector offset translation Storage and index changes: - Update data_codec.go and utils.go for nullable vector serialization - Update indexcgowrapper/dataset.go and index.go for nullable vector indexing Utility changes: - Add FieldDataIdxComputer struct with Compute() method for efficient logical-to-physical index mapping across multiple field data - Update EstimateEntitySize() and AppendFieldData() with fieldIdxs parameter - Update funcutil.go with nullable vector support functions <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Full support for nullable vector fields (float, binary, float16, bfloat16, int8, sparse) across ingest, storage, indexing, search and retrieval; logical↔physical offset mapping preserves row semantics. * Client: compaction control and compaction-state APIs. 
* **Bug Fixes** * Improved validation for adding vector fields (nullable + dimension checks) and corrected search/query behavior for nullable vectors. * **Chores** * Persisted validity maps with indexes and on-disk formats. * **Tests** * Extensive new and updated end-to-end nullable-vector tests. <sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub> <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: marcelo-cjl <marcelo.chen@zilliz.com>
97 lines
3.2 KiB
C++
97 lines
3.2 KiB
C++
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <cstdint>
|
|
#include <mutex>
|
|
#include <shared_mutex>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
namespace milvus {

// Bidirectional offset mapping for nullable vector storage.
//
// Translates between logical offsets (row positions, nulls included) and
// physical offsets (positions among the valid, non-null rows only).
//
// Two storage modes are supported:
// - vec mode: uses vectors for both L2P and P2L, efficient when the valid
//   ratio is >= 10%
// - map mode: uses unordered_map for L2P, efficient when the valid ratio
//   is < 10%
//   NOTE(review): a P2L map (p2l_map_) is declared as well; whether map mode
//   answers reverse lookups from it or from p2l_vec_ is decided in the
//   implementation file, which is not visible from this header.
//
// The class holds a std::shared_mutex member, which is neither copyable nor
// movable, so OffsetMapping objects cannot be copied or moved either.
// NOTE(review): the locking discipline (which methods take mutex_, and in
// which mode) lives in the implementation file - confirm there before
// relying on any thread-safety guarantee.
//
// The public API speaks int64_t, while offsets are stored as int32_t to save
// memory. NOTE(review): offsets above INT32_MAX cannot be represented by the
// storage; confirm the implementation guards against that.
class OffsetMapping {
 public:
    OffsetMapping() = default;

    // Build the mapping from valid_data (bool array format, one entry per
    // logical row; true means the row holds a value).
    //
    // The storage mode is selected automatically from the valid ratio
    // (< 10% uses map mode); there is no parameter to force a mode.
    //
    // valid_data:     validity flags, expected to hold total_count entries
    // total_count:    number of logical rows described by valid_data
    // start_logical:  presumably the logical offset of valid_data[0]
    //                 - TODO confirm against the implementation
    // start_physical: presumably the physical offset given to the first
    //                 valid row - TODO confirm against the implementation
    void
    Build(const bool* valid_data,
          int64_t total_count,
          int64_t start_logical = 0,
          int64_t start_physical = 0);

    // Build the mapping incrementally (always uses vec mode for incremental
    // builds). Parameters mirror Build(), but the start offsets have no
    // defaults and must be supplied by the caller.
    void
    BuildIncremental(const bool* valid_data,
                     int64_t count,
                     int64_t start_logical,
                     int64_t start_physical);

    // Get physical offset from logical offset. Returns -1 if null.
    int64_t
    GetPhysicalOffset(int64_t logical_offset) const;

    // Get logical offset from physical offset. Returns -1 if not found.
    int64_t
    GetLogicalOffset(int64_t physical_offset) const;

    // Check if a logical offset is valid (not null).
    bool
    IsValid(int64_t logical_offset) const;

    // Get count of valid (non-null) elements.
    int64_t
    GetValidCount() const;

    // Check if mapping is enabled.
    bool
    IsEnabled() const;

    // Get next physical offset (for incremental builds).
    int64_t
    GetNextPhysicalOffset() const;

    // Get total logical count (including nulls).
    int64_t
    GetTotalCount() const;

 private:
    bool enabled_{false};
    bool use_map_{false};  // true: use map for L2P, false: use vec

    // Vec mode storage (uses int32_t to save memory)
    std::vector<int32_t> l2p_vec_;  // logical -> physical, -1 means null
    std::vector<int32_t> p2l_vec_;  // physical -> logical

    // Map mode storage (for sparse valid data)
    std::unordered_map<int32_t, int32_t> l2p_map_;  // logical -> physical
    std::unordered_map<int32_t, int32_t> p2l_map_;  // physical -> logical

    int64_t valid_count_{0};
    int64_t total_count_{0};  // total logical count (including nulls)

    // mutable so that const member functions are able to lock it.
    mutable std::shared_mutex mutex_;
};

}  // namespace milvus
|