mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
This commit adds sparse float vector support to segcore with the following: 1. data type enum declarations 2. Adds corresponding data structures for handling sparse float vectors in various scenarios, including: * FieldData as a bridge between the binlog and the in memory data structures * mmap::Column as the in memory representation of a sparse float vector column of a sealed segment; * ConcurrentVector as the in memory representation of a sparse float vector of a growing segment which supports inserts. 3. Adds logic in payload reader/writer to serialize/deserialize from/to binlog 4. Adds the ability to allow the index node to build sparse float vector index 5. Adds the ability to allow the query node to build growing index for growing segment and temp index for sealed segment without index built This commit also includes some code cleanness, comment improvement, and some unit tests for sparse vector. https://github.com/milvus-io/milvus/issues/29419 Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
460 lines
14 KiB
C++
460 lines
14 KiB
C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
|
|
|
#pragma once
|
|
|
|
#include <cstddef>
|
|
#include <optional>
|
|
#include <map>
|
|
#include <memory>
|
|
|
|
#include <tbb/concurrent_vector.h>
|
|
#include <index/Index.h>
|
|
#include <index/ScalarIndex.h>
|
|
|
|
#include "AckResponder.h"
|
|
#include "InsertRecord.h"
|
|
#include "common/Schema.h"
|
|
#include "common/IndexMeta.h"
|
|
#include "IndexConfigGenerator.h"
|
|
#include "log/Log.h"
|
|
#include "segcore/SegcoreConfig.h"
|
|
#include "index/VectorIndex.h"
|
|
|
|
namespace milvus::segcore {
|
|
|
|
// this should be concurrent
|
|
// All concurrent
|
|
class FieldIndexing {
|
|
public:
|
|
explicit FieldIndexing(const FieldMeta& field_meta,
|
|
const SegcoreConfig& segcore_config)
|
|
: field_meta_(field_meta), segcore_config_(segcore_config) {
|
|
}
|
|
FieldIndexing(const FieldIndexing&) = delete;
|
|
FieldIndexing&
|
|
operator=(const FieldIndexing&) = delete;
|
|
virtual ~FieldIndexing() = default;
|
|
|
|
// Do this in parallel
|
|
virtual void
|
|
BuildIndexRange(int64_t ack_beg,
|
|
int64_t ack_end,
|
|
const VectorBase* vec_base) = 0;
|
|
|
|
virtual void
|
|
AppendSegmentIndexDense(int64_t reserved_offset,
|
|
int64_t size,
|
|
const VectorBase* vec_base,
|
|
const void* data_source) = 0;
|
|
|
|
// new_data_dim is the dimension of the new data being appended(data_source)
|
|
virtual void
|
|
AppendSegmentIndexSparse(int64_t reserved_offset,
|
|
int64_t size,
|
|
int64_t new_data_dim,
|
|
const VectorBase* vec_base,
|
|
const void* data_source) = 0;
|
|
|
|
virtual void
|
|
GetDataFromIndex(const int64_t* seg_offsets,
|
|
int64_t count,
|
|
int64_t element_size,
|
|
void* output) = 0;
|
|
|
|
virtual int64_t
|
|
get_build_threshold() const = 0;
|
|
|
|
virtual bool
|
|
sync_data_with_index() const = 0;
|
|
|
|
virtual bool
|
|
has_raw_data() const {
|
|
return true;
|
|
}
|
|
|
|
const FieldMeta&
|
|
get_field_meta() {
|
|
return field_meta_;
|
|
}
|
|
|
|
virtual idx_t
|
|
get_index_cursor() = 0;
|
|
|
|
int64_t
|
|
get_size_per_chunk() const {
|
|
return segcore_config_.get_chunk_rows();
|
|
}
|
|
|
|
virtual index::IndexBase*
|
|
get_chunk_indexing(int64_t chunk_id) const = 0;
|
|
|
|
virtual index::IndexBase*
|
|
get_segment_indexing() const = 0;
|
|
|
|
protected:
|
|
// additional info
|
|
const FieldMeta& field_meta_;
|
|
const SegcoreConfig& segcore_config_;
|
|
};
|
|
|
|
template <typename T>
|
|
class ScalarFieldIndexing : public FieldIndexing {
|
|
public:
|
|
using FieldIndexing::FieldIndexing;
|
|
|
|
void
|
|
BuildIndexRange(int64_t ack_beg,
|
|
int64_t ack_end,
|
|
const VectorBase* vec_base) override;
|
|
|
|
void
|
|
AppendSegmentIndexDense(int64_t reserved_offset,
|
|
int64_t size,
|
|
const VectorBase* vec_base,
|
|
const void* data_source) override {
|
|
PanicInfo(Unsupported,
|
|
"scalar index doesn't support append vector segment index");
|
|
}
|
|
|
|
void
|
|
AppendSegmentIndexSparse(int64_t reserved_offset,
|
|
int64_t size,
|
|
int64_t new_data_dim,
|
|
const VectorBase* vec_base,
|
|
const void* data_source) override {
|
|
PanicInfo(Unsupported,
|
|
"scalar index doesn't support append vector segment index");
|
|
}
|
|
|
|
void
|
|
GetDataFromIndex(const int64_t* seg_offsets,
|
|
int64_t count,
|
|
int64_t element_size,
|
|
void* output) override {
|
|
PanicInfo(Unsupported,
|
|
"scalar index don't support get data from index");
|
|
}
|
|
idx_t
|
|
get_index_cursor() override {
|
|
return 0;
|
|
}
|
|
|
|
int64_t
|
|
get_build_threshold() const override {
|
|
return 0;
|
|
}
|
|
|
|
bool
|
|
sync_data_with_index() const override {
|
|
return false;
|
|
}
|
|
|
|
// concurrent
|
|
index::ScalarIndex<T>*
|
|
get_chunk_indexing(int64_t chunk_id) const override {
|
|
Assert(!field_meta_.is_vector());
|
|
return data_.at(chunk_id).get();
|
|
}
|
|
|
|
index::IndexBase*
|
|
get_segment_indexing() const override {
|
|
return nullptr;
|
|
}
|
|
|
|
private:
|
|
tbb::concurrent_vector<index::ScalarIndexPtr<T>> data_;
|
|
};
|
|
|
|
class VectorFieldIndexing : public FieldIndexing {
|
|
public:
|
|
using FieldIndexing::FieldIndexing;
|
|
|
|
explicit VectorFieldIndexing(const FieldMeta& field_meta,
|
|
const FieldIndexMeta& field_index_meta,
|
|
int64_t segment_max_row_count,
|
|
const SegcoreConfig& segcore_config);
|
|
|
|
void
|
|
BuildIndexRange(int64_t ack_beg,
|
|
int64_t ack_end,
|
|
const VectorBase* vec_base) override;
|
|
|
|
void
|
|
AppendSegmentIndexDense(int64_t reserved_offset,
|
|
int64_t size,
|
|
const VectorBase* field_raw_data,
|
|
const void* data_source) override;
|
|
|
|
void
|
|
AppendSegmentIndexSparse(int64_t reserved_offset,
|
|
int64_t size,
|
|
int64_t new_data_dim,
|
|
const VectorBase* field_raw_data,
|
|
const void* data_source) override;
|
|
|
|
void
|
|
GetDataFromIndex(const int64_t* seg_offsets,
|
|
int64_t count,
|
|
int64_t element_size,
|
|
void* output) override;
|
|
|
|
int64_t
|
|
get_build_threshold() const override {
|
|
return config_->GetBuildThreshold();
|
|
}
|
|
|
|
// concurrent
|
|
index::IndexBase*
|
|
get_chunk_indexing(int64_t chunk_id) const override {
|
|
Assert(field_meta_.is_vector());
|
|
return data_.at(chunk_id).get();
|
|
}
|
|
index::IndexBase*
|
|
get_segment_indexing() const override {
|
|
return index_.get();
|
|
}
|
|
|
|
bool
|
|
sync_data_with_index() const override;
|
|
|
|
bool
|
|
has_raw_data() const override;
|
|
|
|
idx_t
|
|
get_index_cursor() override;
|
|
|
|
knowhere::Json
|
|
get_build_params() const;
|
|
|
|
SearchInfo
|
|
get_search_params(const SearchInfo& searchInfo) const;
|
|
|
|
private:
|
|
// current number of rows in index.
|
|
std::atomic<idx_t> index_cur_ = 0;
|
|
// whether the growing index has been built.
|
|
std::atomic<bool> built_;
|
|
// whether all insertd data has been added to growing index and can be
|
|
// searched.
|
|
std::atomic<bool> sync_with_index_;
|
|
std::unique_ptr<VecIndexConfig> config_;
|
|
std::unique_ptr<index::VectorIndex> index_;
|
|
tbb::concurrent_vector<std::unique_ptr<index::VectorIndex>> data_;
|
|
};
|
|
|
|
std::unique_ptr<FieldIndexing>
|
|
CreateIndex(const FieldMeta& field_meta,
|
|
const FieldIndexMeta& field_index_meta,
|
|
int64_t segment_max_row_count,
|
|
const SegcoreConfig& segcore_config);
|
|
|
|
class IndexingRecord {
|
|
public:
|
|
explicit IndexingRecord(const Schema& schema,
|
|
const IndexMetaPtr& indexMetaPtr,
|
|
const SegcoreConfig& segcore_config)
|
|
: schema_(schema),
|
|
index_meta_(indexMetaPtr),
|
|
segcore_config_(segcore_config) {
|
|
Initialize();
|
|
}
|
|
|
|
void
|
|
Initialize() {
|
|
int offset_id = 0;
|
|
for (auto& [field_id, field_meta] : schema_.get_fields()) {
|
|
++offset_id;
|
|
if (field_meta.is_vector() &&
|
|
segcore_config_.get_enable_interim_segment_index()) {
|
|
// TODO: skip binary small index now, reenable after config.yaml is ready
|
|
if (field_meta.get_data_type() == DataType::VECTOR_BINARY) {
|
|
continue;
|
|
}
|
|
|
|
if (index_meta_ == nullptr) {
|
|
LOG_INFO("miss index meta for growing interim index");
|
|
continue;
|
|
}
|
|
//Small-Index enabled, create index for vector field only
|
|
if (index_meta_->GetIndexMaxRowCount() > 0 &&
|
|
index_meta_->HasFiled(field_id)) {
|
|
auto vec_filed_meta =
|
|
index_meta_->GetFieldIndexMeta(field_id);
|
|
//Disable growing index for flat
|
|
if (!vec_filed_meta.IsFlatIndex()) {
|
|
field_indexings_.try_emplace(
|
|
field_id,
|
|
CreateIndex(field_meta,
|
|
vec_filed_meta,
|
|
index_meta_->GetIndexMaxRowCount(),
|
|
segcore_config_));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
assert(offset_id == schema_.size());
|
|
}
|
|
|
|
// concurrent, reentrant
|
|
template <bool is_sealed>
|
|
void
|
|
AppendingIndex(int64_t reserved_offset,
|
|
int64_t size,
|
|
FieldId fieldId,
|
|
const DataArray* stream_data,
|
|
const InsertRecord<is_sealed>& record) {
|
|
if (!is_in(fieldId)) {
|
|
return;
|
|
}
|
|
auto& indexing = field_indexings_.at(fieldId);
|
|
auto type = indexing->get_field_meta().get_data_type();
|
|
auto field_raw_data = record.get_field_data_base(fieldId);
|
|
if (type == DataType::VECTOR_FLOAT &&
|
|
reserved_offset + size >= indexing->get_build_threshold()) {
|
|
indexing->AppendSegmentIndexDense(
|
|
reserved_offset,
|
|
size,
|
|
field_raw_data,
|
|
stream_data->vectors().float_vector().data().data());
|
|
} else if (type == DataType::VECTOR_SPARSE_FLOAT) {
|
|
auto data = SparseBytesToRows(
|
|
stream_data->vectors().sparse_float_vector().contents());
|
|
indexing->AppendSegmentIndexSparse(
|
|
reserved_offset,
|
|
size,
|
|
stream_data->vectors().sparse_float_vector().dim(),
|
|
field_raw_data,
|
|
data.get());
|
|
}
|
|
}
|
|
|
|
// concurrent, reentrant
|
|
template <bool is_sealed>
|
|
void
|
|
AppendingIndex(int64_t reserved_offset,
|
|
int64_t size,
|
|
FieldId fieldId,
|
|
const FieldDataPtr data,
|
|
const InsertRecord<is_sealed>& record) {
|
|
if (!is_in(fieldId)) {
|
|
return;
|
|
}
|
|
auto& indexing = field_indexings_.at(fieldId);
|
|
auto type = indexing->get_field_meta().get_data_type();
|
|
const void* p = data->Data();
|
|
|
|
if (type == DataType::VECTOR_FLOAT &&
|
|
reserved_offset + size >= indexing->get_build_threshold()) {
|
|
auto vec_base = record.get_field_data_base(fieldId);
|
|
indexing->AppendSegmentIndexDense(
|
|
reserved_offset, size, vec_base, data->Data());
|
|
} else if (type == DataType::VECTOR_SPARSE_FLOAT) {
|
|
auto vec_base = record.get_field_data_base(fieldId);
|
|
indexing->AppendSegmentIndexSparse(
|
|
reserved_offset,
|
|
size,
|
|
std::dynamic_pointer_cast<const FieldData<SparseFloatVector>>(
|
|
data)
|
|
->Dim(),
|
|
vec_base,
|
|
p);
|
|
}
|
|
}
|
|
|
|
void
|
|
GetDataFromIndex(FieldId fieldId,
|
|
const int64_t* seg_offsets,
|
|
int64_t count,
|
|
int64_t element_size,
|
|
void* output_raw) const {
|
|
if (is_in(fieldId)) {
|
|
auto& indexing = field_indexings_.at(fieldId);
|
|
if (indexing->get_field_meta().is_vector() &&
|
|
indexing->get_field_meta().get_data_type() ==
|
|
DataType::VECTOR_FLOAT) {
|
|
indexing->GetDataFromIndex(
|
|
seg_offsets, count, element_size, output_raw);
|
|
}
|
|
}
|
|
}
|
|
|
|
// result shows the index has synchronized with all inserted data or not
|
|
bool
|
|
SyncDataWithIndex(FieldId fieldId) const {
|
|
if (is_in(fieldId)) {
|
|
const FieldIndexing& indexing = get_field_indexing(fieldId);
|
|
return indexing.sync_data_with_index();
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool
|
|
HasRawData(FieldId fieldId) const {
|
|
if (is_in(fieldId) && SyncDataWithIndex(fieldId)) {
|
|
const FieldIndexing& indexing = get_field_indexing(fieldId);
|
|
return indexing.has_raw_data();
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// concurrent
|
|
int64_t
|
|
get_finished_ack() const {
|
|
return finished_ack_.GetAck();
|
|
}
|
|
|
|
const FieldIndexing&
|
|
get_field_indexing(FieldId field_id) const {
|
|
Assert(field_indexings_.count(field_id));
|
|
return *field_indexings_.at(field_id);
|
|
}
|
|
|
|
const VectorFieldIndexing&
|
|
get_vec_field_indexing(FieldId field_id) const {
|
|
auto& field_indexing = get_field_indexing(field_id);
|
|
auto ptr = dynamic_cast<const VectorFieldIndexing*>(&field_indexing);
|
|
AssertInfo(ptr, "invalid indexing");
|
|
return *ptr;
|
|
}
|
|
|
|
bool
|
|
is_in(FieldId field_id) const {
|
|
return field_indexings_.count(field_id);
|
|
}
|
|
|
|
template <typename T>
|
|
auto
|
|
get_scalar_field_indexing(FieldId field_id) const
|
|
-> const ScalarFieldIndexing<T>& {
|
|
auto& entry = get_field_indexing(field_id);
|
|
auto ptr = dynamic_cast<const ScalarFieldIndexing<T>*>(&entry);
|
|
AssertInfo(ptr, "invalid indexing");
|
|
return *ptr;
|
|
}
|
|
|
|
private:
|
|
const Schema& schema_;
|
|
IndexMetaPtr index_meta_;
|
|
const SegcoreConfig& segcore_config_;
|
|
|
|
// control info
|
|
std::atomic<int64_t> resource_ack_ = 0;
|
|
// std::atomic<int64_t> finished_ack_ = 0;
|
|
AckResponder finished_ack_;
|
|
std::mutex mutex_;
|
|
|
|
// field_offset => indexing
|
|
std::map<FieldId, std::unique_ptr<FieldIndexing>> field_indexings_;
|
|
};
|
|
|
|
} // namespace milvus::segcore
|