feat: [Sparse Float Vector] segcore basics and index building (#30357)

This commit adds sparse float vector support to segcore with the
following:

1. data type enum declarations
2. Adds corresponding data structures for handling sparse float vectors
in various scenarios, including:
* FieldData as a bridge between the binlog and the in memory data
structures
* mmap::Column as the in memory representation of a sparse float vector
column of a sealed segment;
* ConcurrentVector as the in memory representation of a sparse float
vector of a growing segment which supports inserts.
3. Adds logic in payload reader/writer to serialize/deserialize from/to
binlog
4. Adds the ability to allow the index node to build sparse float vector
index
5. Adds the ability to allow the query node to build growing index for
growing segment and temp index for sealed segment without index built

This commit also includes some code cleanup, comment improvements, and
some unit tests for sparse vectors.

https://github.com/milvus-io/milvus/issues/29419

Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
This commit is contained in:
Buqian Zheng 2024-03-11 14:45:02 +08:00 committed by GitHub
parent a99143dd52
commit 070dfc77bf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
58 changed files with 1229 additions and 625 deletions

View File

@ -26,9 +26,9 @@
namespace milvus { namespace milvus {
template <typename Type, bool is_scalar> template <typename Type, bool is_type_entire_row>
void void
FieldDataImpl<Type, is_scalar>::FillFieldData(const void* source, FieldDataImpl<Type, is_type_entire_row>::FillFieldData(const void* source,
ssize_t element_count) { ssize_t element_count) {
if (element_count == 0) { if (element_count == 0) {
return; return;
@ -57,9 +57,9 @@ GetDataInfoFromArray(const std::shared_ptr<arrow::Array> array) {
return std::make_pair(typed_array->raw_values(), element_count); return std::make_pair(typed_array->raw_values(), element_count);
} }
template <typename Type, bool is_scalar> template <typename Type, bool is_type_entire_row>
void void
FieldDataImpl<Type, is_scalar>::FillFieldData( FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
const std::shared_ptr<arrow::Array> array) { const std::shared_ptr<arrow::Array> array) {
AssertInfo(array != nullptr, "null arrow array"); AssertInfo(array != nullptr, "null arrow array");
auto element_count = array->length(); auto element_count = array->length();
@ -159,6 +159,18 @@ FieldDataImpl<Type, is_scalar>::FillFieldData(
array); array);
return FillFieldData(array_info.first, array_info.second); return FillFieldData(array_info.first, array_info.second);
} }
case DataType::VECTOR_SPARSE_FLOAT: {
AssertInfo(array->type()->id() == arrow::Type::type::BINARY,
"inconsistent data type");
auto arr = std::dynamic_pointer_cast<arrow::BinaryArray>(array);
std::vector<knowhere::sparse::SparseRow<float>> values;
for (size_t index = 0; index < element_count; ++index) {
auto view = arr->GetString(index);
values.push_back(
CopyAndWrapSparseRow(view.data(), view.size()));
}
return FillFieldData(values.data(), element_count);
}
default: { default: {
throw SegcoreError(DataTypeInvalid, throw SegcoreError(DataTypeInvalid,
GetName() + "::FillFieldData" + GetName() + "::FillFieldData" +
@ -186,6 +198,7 @@ template class FieldDataImpl<int8_t, false>;
template class FieldDataImpl<float, false>; template class FieldDataImpl<float, false>;
template class FieldDataImpl<float16, false>; template class FieldDataImpl<float16, false>;
template class FieldDataImpl<bfloat16, false>; template class FieldDataImpl<bfloat16, false>;
template class FieldDataImpl<knowhere::sparse::SparseRow<float>, true>;
FieldDataPtr FieldDataPtr
InitScalarFieldData(const DataType& type, int64_t cap_rows) { InitScalarFieldData(const DataType& type, int64_t cap_rows) {

View File

@ -121,6 +121,14 @@ class FieldData<BFloat16Vector> : public FieldDataImpl<bfloat16, false> {
} }
}; };
template <>
class FieldData<SparseFloatVector> : public FieldDataSparseVectorImpl {
public:
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataSparseVectorImpl(data_type, buffered_num_rows) {
}
};
using FieldDataPtr = std::shared_ptr<FieldDataBase>; using FieldDataPtr = std::shared_ptr<FieldDataBase>;
using FieldDataChannel = Channel<FieldDataPtr>; using FieldDataChannel = Channel<FieldDataPtr>;
using FieldDataChannelPtr = std::shared_ptr<FieldDataChannel>; using FieldDataChannelPtr = std::shared_ptr<FieldDataChannel>;

View File

@ -32,6 +32,7 @@
#include "common/VectorTrait.h" #include "common/VectorTrait.h"
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "common/Array.h" #include "common/Array.h"
#include "knowhere/dataset.h"
namespace milvus { namespace milvus {
@ -43,24 +44,33 @@ class FieldDataBase {
} }
virtual ~FieldDataBase() = default; virtual ~FieldDataBase() = default;
// For all FieldDataImpl subclasses, source is a pointer to element_count of
// Type
virtual void virtual void
FillFieldData(const void* source, ssize_t element_count) = 0; FillFieldData(const void* source, ssize_t element_count) = 0;
virtual void virtual void
FillFieldData(const std::shared_ptr<arrow::Array> array) = 0; FillFieldData(const std::shared_ptr<arrow::Array> array) = 0;
// For all FieldDataImpl subclasses, this method returns Type* that points
// at all rows in this field data.
virtual void* virtual void*
Data() = 0; Data() = 0;
// For all FieldDataImpl subclasses, this method returns a Type* that points
// at the offset-th row of this field data.
virtual const void* virtual const void*
RawValue(ssize_t offset) const = 0; RawValue(ssize_t offset) const = 0;
// Returns the serialized bytes size of all rows.
virtual int64_t virtual int64_t
Size() const = 0; Size() const = 0;
// Returns the serialized bytes size of the index-th row.
virtual int64_t virtual int64_t
Size(ssize_t index) const = 0; Size(ssize_t index) const = 0;
// Number of filled rows
virtual size_t virtual size_t
Length() const = 0; Length() const = 0;
@ -71,9 +81,11 @@ class FieldDataBase {
Reserve(size_t cap) = 0; Reserve(size_t cap) = 0;
public: public:
// row capacity
virtual int64_t virtual int64_t
get_num_rows() const = 0; get_num_rows() const = 0;
// each row is represented as how many Type elements
virtual int64_t virtual int64_t
get_dim() const = 0; get_dim() const = 0;
@ -86,11 +98,9 @@ class FieldDataBase {
const DataType data_type_; const DataType data_type_;
}; };
template <typename Type, bool is_scalar = false> template <typename Type, bool is_type_entire_row = false>
class FieldDataImpl : public FieldDataBase { class FieldDataImpl : public FieldDataBase {
public: public:
// constants
using Chunk = FixedVector<Type>;
FieldDataImpl(FieldDataImpl&&) = delete; FieldDataImpl(FieldDataImpl&&) = delete;
FieldDataImpl(const FieldDataImpl&) = delete; FieldDataImpl(const FieldDataImpl&) = delete;
@ -105,13 +115,16 @@ class FieldDataImpl : public FieldDataBase {
int64_t buffered_num_rows = 0) int64_t buffered_num_rows = 0)
: FieldDataBase(data_type), : FieldDataBase(data_type),
num_rows_(buffered_num_rows), num_rows_(buffered_num_rows),
dim_(is_scalar ? 1 : dim) { dim_(is_type_entire_row ? 1 : dim) {
field_data_.resize(num_rows_ * dim_); field_data_.resize(num_rows_ * dim_);
} }
explicit FieldDataImpl(size_t dim, DataType type, Chunk&& field_data) explicit FieldDataImpl(size_t dim,
: FieldDataBase(type), dim_(is_scalar ? 1 : dim) { DataType type,
FixedVector<Type>&& field_data)
: FieldDataBase(type), dim_(is_type_entire_row ? 1 : dim) {
field_data_ = std::move(field_data); field_data_ = std::move(field_data);
Assert(field_data.size() % dim == 0);
num_rows_ = field_data.size() / dim; num_rows_ = field_data.size() / dim;
} }
@ -122,10 +135,18 @@ class FieldDataImpl : public FieldDataBase {
FillFieldData(const std::shared_ptr<arrow::Array> array) override; FillFieldData(const std::shared_ptr<arrow::Array> array) override;
virtual void virtual void
FillFieldData(const std::shared_ptr<arrow::StringArray>& array){}; FillFieldData(const std::shared_ptr<arrow::StringArray>& array) {
PanicInfo(NotImplemented,
"FillFieldData(const std::shared_ptr<arrow::StringArray>& "
"array) not implemented by default");
}
virtual void virtual void
FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array){}; FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) {
PanicInfo(NotImplemented,
"FillFieldData(const std::shared_ptr<arrow::BinaryArray>& "
"array) not implemented by default");
}
std::string std::string
GetName() const { GetName() const {
@ -209,9 +230,11 @@ class FieldDataImpl : public FieldDataBase {
} }
protected: protected:
Chunk field_data_; FixedVector<Type> field_data_;
// number of elements field_data_ can hold
int64_t num_rows_; int64_t num_rows_;
mutable std::shared_mutex num_rows_mutex_; mutable std::shared_mutex num_rows_mutex_;
// number of actual elements in field_data_
size_t length_{}; size_t length_{};
mutable std::shared_mutex tell_mutex_; mutable std::shared_mutex tell_mutex_;
@ -322,6 +345,89 @@ class FieldDataJsonImpl : public FieldDataImpl<Json, true> {
} }
}; };
// In-memory field data for a sparse float vector column. Each row is a
// knowhere::sparse::SparseRow<float>, so rows are variable-length and the
// base template is instantiated with is_type_entire_row = true (dim is
// fixed to 1 and unused for this type).
class FieldDataSparseVectorImpl
    : public FieldDataImpl<knowhere::sparse::SparseRow<float>, true> {
 public:
    // total_num_rows is a capacity hint forwarded to the base class; rows
    // are actually populated via the FillFieldData overloads below.
    explicit FieldDataSparseVectorImpl(DataType data_type,
                                       int64_t total_num_rows = 0)
        : FieldDataImpl<knowhere::sparse::SparseRow<float>, true>(
              /*dim=*/1, data_type, total_num_rows),
          vec_dim_(0) {
        AssertInfo(data_type == DataType::VECTOR_SPARSE_FLOAT,
                   "invalid data type for sparse vector");
    }

    // Serialized byte size of all filled rows. Sparse rows have no fixed
    // per-row size, so this sums each row's own byte size.
    int64_t
    Size() const override {
        int64_t data_size = 0;
        for (size_t i = 0; i < length(); ++i) {
            data_size += field_data_[i].data_byte_size();
        }
        return data_size;
    }

    // Serialized byte size of the offset-th row; offset must be within the
    // already-filled range.
    int64_t
    Size(ssize_t offset) const override {
        AssertInfo(offset < get_num_rows(),
                   "field data subscript out of range");
        AssertInfo(offset < length(),
                   "subscript position don't has valid value");
        return field_data_[offset].data_byte_size();
    }

    // source is a pointer to element_count of
    // knowhere::sparse::SparseRow<float>
    void
    FillFieldData(const void* source, ssize_t element_count) override {
        if (element_count == 0) {
            return;
        }
        // tell_mutex_ guards length_ and the fill position against
        // concurrent fills.
        std::lock_guard lck(tell_mutex_);
        if (length_ + element_count > get_num_rows()) {
            resize_field_data(length_ + element_count);
        }
        auto ptr =
            static_cast<const knowhere::sparse::SparseRow<float>*>(source);
        // Track the maximum row.dim() across all rows; reported via Dim().
        for (int64_t i = 0; i < element_count; ++i) {
            auto& row = ptr[i];
            vec_dim_ = std::max(vec_dim_, row.dim());
        }
        std::copy_n(ptr, element_count, field_data_.data() + length_);
        length_ += element_count;
    }

    // each binary in array is a knowhere::sparse::SparseRow<float>
    void
    FillFieldData(const std::shared_ptr<arrow::BinaryArray>& array) override {
        auto n = array->length();
        if (n == 0) {
            return;
        }
        std::lock_guard lck(tell_mutex_);
        if (length_ + n > get_num_rows()) {
            resize_field_data(length_ + n);
        }
        for (int64_t i = 0; i < array->length(); ++i) {
            auto view = array->GetView(i);
            // Deep-copies the serialized bytes into an owning sparse row.
            auto& row = field_data_[length_ + i];
            row = CopyAndWrapSparseRow(view.data(), view.size());
            vec_dim_ = std::max(vec_dim_, row.dim());
        }
        length_ += n;
    }

    // Maximum of row.dim() over all rows filled so far — the effective
    // dimensionality of this sparse vector column.
    int64_t
    Dim() const {
        return vec_dim_;
    }

 private:
    int64_t vec_dim_;
};
class FieldDataArrayImpl : public FieldDataImpl<Array, true> { class FieldDataArrayImpl : public FieldDataImpl<Array, true> {
public: public:
explicit FieldDataArrayImpl(DataType data_type, int64_t total_num_rows = 0) explicit FieldDataArrayImpl(DataType data_type, int64_t total_num_rows = 0)

View File

@ -54,6 +54,10 @@ datatype_sizeof(DataType data_type, int dim = 1) {
case DataType::VECTOR_BFLOAT16: { case DataType::VECTOR_BFLOAT16: {
return sizeof(bfloat16) * dim; return sizeof(bfloat16) * dim;
} }
// Not supporting VECTOR_SPARSE_FLOAT here intentionally. We can't
// easily estimate the size of a sparse float vector. Callers of this
// method must handle this case themselves and must not pass
// VECTOR_SPARSE_FLOAT data_type.
default: { default: {
throw SegcoreError(DataTypeInvalid, throw SegcoreError(DataTypeInvalid,
fmt::format("invalid type is {}", data_type)); fmt::format("invalid type is {}", data_type));
@ -100,6 +104,9 @@ datatype_name(DataType data_type) {
case DataType::VECTOR_BFLOAT16: { case DataType::VECTOR_BFLOAT16: {
return "vector_bfloat16"; return "vector_bfloat16";
} }
case DataType::VECTOR_SPARSE_FLOAT: {
return "vector_sparse_float";
}
default: { default: {
PanicInfo(DataTypeInvalid, "Unsupported DataType({})", data_type); PanicInfo(DataTypeInvalid, "Unsupported DataType({})", data_type);
} }
@ -111,7 +118,13 @@ datatype_is_vector(DataType datatype) {
return datatype == DataType::VECTOR_BINARY || return datatype == DataType::VECTOR_BINARY ||
datatype == DataType::VECTOR_FLOAT || datatype == DataType::VECTOR_FLOAT ||
datatype == DataType::VECTOR_FLOAT16 || datatype == DataType::VECTOR_FLOAT16 ||
datatype == DataType::VECTOR_BFLOAT16; datatype == DataType::VECTOR_BFLOAT16 ||
datatype == DataType::VECTOR_SPARSE_FLOAT;
}
inline bool
datatype_is_sparse_vector(DataType datatype) {
return datatype == DataType::VECTOR_SPARSE_FLOAT;
} }
inline bool inline bool
@ -153,6 +166,7 @@ datatype_is_variable(DataType datatype) {
case DataType::STRING: case DataType::STRING:
case DataType::ARRAY: case DataType::ARRAY:
case DataType::JSON: case DataType::JSON:
case DataType::VECTOR_SPARSE_FLOAT:
return true; return true;
default: default:
return false; return false;
@ -217,6 +231,8 @@ class FieldMeta {
Assert(datatype_is_array(type_)); Assert(datatype_is_array(type_));
} }
// Passing in any value for dim is ok for a sparse vector, as it will never be
// used: get_dim() is not allowed to be invoked on a sparse vector field.
FieldMeta(const FieldName& name, FieldMeta(const FieldName& name,
FieldId id, FieldId id,
DataType type, DataType type,
@ -232,6 +248,8 @@ class FieldMeta {
int64_t int64_t
get_dim() const { get_dim() const {
Assert(datatype_is_vector(type_)); Assert(datatype_is_vector(type_));
// should not attempt to get dim() of a sparse vector from schema.
Assert(!datatype_is_sparse_vector(type_));
Assert(vector_info_.has_value()); Assert(vector_info_.has_value());
return vector_info_->dim_; return vector_info_->dim_;
} }
@ -282,6 +300,9 @@ class FieldMeta {
size_t size_t
get_sizeof() const { get_sizeof() const {
AssertInfo(!datatype_is_sparse_vector(type_),
"should not attempt to get_sizeof() of a sparse vector from "
"schema");
static const size_t ARRAY_SIZE = 128; static const size_t ARRAY_SIZE = 128;
static const size_t JSON_SIZE = 512; static const size_t JSON_SIZE = 512;
if (is_vector()) { if (is_vector()) {

View File

@ -54,8 +54,11 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
auto type_map = RepeatedKeyValToMap(child.type_params()); auto type_map = RepeatedKeyValToMap(child.type_params());
auto index_map = RepeatedKeyValToMap(child.index_params()); auto index_map = RepeatedKeyValToMap(child.index_params());
int64_t dim = 0;
if (!datatype_is_sparse_vector(data_type)) {
AssertInfo(type_map.count("dim"), "dim not found"); AssertInfo(type_map.count("dim"), "dim not found");
auto dim = boost::lexical_cast<int64_t>(type_map.at("dim")); dim = boost::lexical_cast<int64_t>(type_map.at("dim"));
}
if (!index_map.count("metric_type")) { if (!index_map.count("metric_type")) {
schema->AddField(name, field_id, data_type, dim, std::nullopt); schema->AddField(name, field_id, data_type, dim, std::nullopt);
} else { } else {

View File

@ -132,11 +132,6 @@ class Schema {
return fields_.at(field_id); return fields_.at(field_id);
} }
auto
get_total_sizeof() const {
return total_sizeof_;
}
FieldId FieldId
get_field_id(const FieldName& field_name) const { get_field_id(const FieldName& field_name) const {
AssertInfo(name_ids_.count(field_name), "Cannot find field_name"); AssertInfo(name_ids_.count(field_name), "Cannot find field_name");
@ -181,9 +176,6 @@ class Schema {
fields_.emplace(field_id, field_meta); fields_.emplace(field_id, field_meta);
field_ids_.emplace_back(field_id); field_ids_.emplace_back(field_id);
auto field_sizeof = field_meta.get_sizeof();
total_sizeof_ += field_sizeof;
} }
private: private:
@ -197,7 +189,6 @@ class Schema {
std::unordered_map<FieldName, FieldId> name_ids_; // field_name -> field_id std::unordered_map<FieldName, FieldId> name_ids_; // field_name -> field_id
std::unordered_map<FieldId, FieldName> id_names_; // field_id -> field_name std::unordered_map<FieldId, FieldName> id_names_; // field_id -> field_name
int64_t total_sizeof_ = 0;
std::optional<FieldId> primary_field_id_opt_; std::optional<FieldId> primary_field_id_opt_;
}; };

View File

@ -60,9 +60,9 @@ class Span;
// TODO: refine Span to support T=FloatVector // TODO: refine Span to support T=FloatVector
template <typename T> template <typename T>
class Span< class Span<T,
T, typename std::enable_if_t<IsSparse<T> || IsScalar<T> ||
typename std::enable_if_t<IsScalar<T> || std::is_same_v<T, PkType>>> { std::is_same_v<T, PkType>>> {
public: public:
using embedded_type = T; using embedded_type = T;
explicit Span(const T* data, int64_t row_count) explicit Span(const T* data, int64_t row_count)

View File

@ -80,6 +80,7 @@ enum class DataType {
VECTOR_FLOAT = 101, VECTOR_FLOAT = 101,
VECTOR_FLOAT16 = 102, VECTOR_FLOAT16 = 102,
VECTOR_BFLOAT16 = 103, VECTOR_BFLOAT16 = 103,
VECTOR_SPARSE_FLOAT = 104,
}; };
using Timestamp = uint64_t; // TODO: use TiKV-like timestamp using Timestamp = uint64_t; // TODO: use TiKV-like timestamp
@ -92,7 +93,7 @@ using ScalarArray = proto::schema::ScalarField;
using DataArray = proto::schema::FieldData; using DataArray = proto::schema::FieldData;
using VectorArray = proto::schema::VectorField; using VectorArray = proto::schema::VectorField;
using IdArray = proto::schema::IDs; using IdArray = proto::schema::IDs;
using InsertData = proto::segcore::InsertRecord; using InsertRecordProto = proto::segcore::InsertRecord;
using PkType = std::variant<std::monostate, int64_t, std::string>; using PkType = std::variant<std::monostate, int64_t, std::string>;
inline size_t inline size_t
@ -379,6 +380,9 @@ struct fmt::formatter<milvus::DataType> : formatter<string_view> {
case milvus::DataType::VECTOR_BFLOAT16: case milvus::DataType::VECTOR_BFLOAT16:
name = "VECTOR_BFLOAT16"; name = "VECTOR_BFLOAT16";
break; break;
case milvus::DataType::VECTOR_SPARSE_FLOAT:
name = "VECTOR_SPARSE_FLOAT";
break;
} }
return formatter<string_view>::format(name, ctx); return formatter<string_view>::format(name, ctx);
} }

View File

@ -31,6 +31,7 @@
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "knowhere/dataset.h" #include "knowhere/dataset.h"
#include "knowhere/expected.h" #include "knowhere/expected.h"
#include "knowhere/sparse_utils.h"
#include "simdjson.h" #include "simdjson.h"
namespace milvus { namespace milvus {
@ -213,4 +214,51 @@ GetCommonPrefix(const std::string& str1, const std::string& str2) {
return str1.substr(0, i); return str1.substr(0, i);
} }
// Copies the serialized sparse row in [data, data + size) into a newly
// allocated knowhere::sparse::SparseRow<float> and returns it. The returned
// row owns its memory and is independent of the source buffer.
inline knowhere::sparse::SparseRow<float>
CopyAndWrapSparseRow(const void* data, size_t size) {
    // Each element is a fixed-size (index, value) pair, so a valid
    // serialization must be a whole number of elements. The original code
    // silently truncated trailing bytes; reject malformed input instead.
    AssertInfo(size % knowhere::sparse::SparseRow<float>::element_size() == 0,
               "invalid serialized sparse row: size is not a multiple of the "
               "element size");
    size_t num_elements =
        size / knowhere::sparse::SparseRow<float>::element_size();
    knowhere::sparse::SparseRow<float> row(num_elements);
    std::memcpy(row.data(), data, size);
    // TODO(SPARSE): validate element contents (e.g. sorted indices, finite
    // values).
    return row;
}
// Iterable is a list of bytes, each is a byte array representation of a single
// sparse float row. This helper function converts such byte arrays into a list
// of knowhere::sparse::SparseRow<float>. The resulting list is a deep copy of
// the source data.
// Iterable is a list of bytes, each is a byte array representation of a single
// sparse float row. This helper function converts such byte arrays into a list
// of knowhere::sparse::SparseRow<float>. The resulting list is a deep copy of
// the source data.
template <typename Iterable>
std::unique_ptr<knowhere::sparse::SparseRow<float>[]>
SparseBytesToRows(const Iterable& rows) {
    AssertInfo(rows.size() > 0, "at least 1 sparse row should be provided");
    auto res =
        std::make_unique<knowhere::sparse::SparseRow<float>[]>(rows.size());
    for (size_t i = 0; i < rows.size(); ++i) {
        // CopyAndWrapSparseRow returns a prvalue, which already move-assigns;
        // the original wrapped it in a redundant std::move
        // (clang-tidy: performance-move-const-arg).
        res[i] = CopyAndWrapSparseRow(rows[i].data(), rows[i].size());
    }
    return res;
}
// SparseRowsToProto converts a vector of knowhere::sparse::SparseRow<float> to
// a milvus::proto::schema::SparseFloatArray. The resulting proto is a deep copy
// of the source data.
// SparseRowsToProto converts an array of `rows` knowhere::sparse::SparseRow<float>
// to a milvus::proto::schema::SparseFloatArray. The resulting proto is a deep
// copy of the source data. Rows with zero elements are emitted as empty
// content entries to preserve row alignment.
inline void SparseRowsToProto(const knowhere::sparse::SparseRow<float>* source,
                              int64_t rows,
                              milvus::proto::schema::SparseFloatArray* proto) {
    AssertInfo(rows == 0 || source != nullptr,
               "null sparse row source with non-zero row count");
    int64_t max_dim = 0;
    // NOTE: the original checked `source + i == nullptr` to detect an "empty
    // row", but pointer arithmetic on a non-null base never yields null, so
    // that branch was dead. A row with no elements is the meaningful empty
    // condition. Loop index is int64_t to match `rows` (no signed/unsigned
    // mix).
    for (int64_t i = 0; i < rows; ++i) {
        const auto& row = source[i];
        if (row.size() == 0) {
            // empty row
            proto->add_contents();
            continue;
        }
        max_dim = std::max(max_dim, row.dim());
        proto->add_contents(row.data(), row.data_byte_size());
    }
    proto->set_dim(max_dim);
}
} // namespace milvus } // namespace milvus

View File

@ -48,20 +48,11 @@ class BFloat16Vector : public VectorTrait {
static constexpr auto metric_type = DataType::VECTOR_BFLOAT16; static constexpr auto metric_type = DataType::VECTOR_BFLOAT16;
}; };
template <typename VectorType> class SparseFloatVector : public VectorTrait {
inline constexpr int64_t public:
element_sizeof(int64_t dim) { using embedded_type = float;
static_assert(std::is_base_of_v<VectorType, VectorTrait>); static constexpr auto metric_type = DataType::VECTOR_SPARSE_FLOAT;
if constexpr (std::is_same_v<VectorType, FloatVector>) { };
return dim * sizeof(float);
} else if constexpr (std::is_same_v<VectorType, Float16Vector>) {
return dim * sizeof(float16);
} else if constexpr (std::is_same_v<VectorType, BFloat16Vector>) {
return dim * sizeof(bfloat16);
} else {
return dim / 8;
}
}
template <typename T> template <typename T>
constexpr bool IsVector = std::is_base_of_v<VectorTrait, T>; constexpr bool IsVector = std::is_base_of_v<VectorTrait, T>;
@ -73,6 +64,10 @@ constexpr bool IsScalar =
std::is_same_v<T, Array> || std::is_same_v<T, ArrayView> || std::is_same_v<T, Array> || std::is_same_v<T, ArrayView> ||
std::is_same_v<T, proto::plan::Array>; std::is_same_v<T, proto::plan::Array>;
template <typename T>
constexpr bool IsSparse = std::is_same_v<T, SparseFloatVector> ||
std::is_same_v<T, knowhere::sparse::SparseRow<float>>;
template <typename T, typename Enabled = void> template <typename T, typename Enabled = void>
struct EmbeddedTypeImpl; struct EmbeddedTypeImpl;
@ -86,11 +81,15 @@ struct EmbeddedTypeImpl<T, std::enable_if_t<IsVector<T>>> {
using type = std::conditional_t< using type = std::conditional_t<
std::is_same_v<T, FloatVector>, std::is_same_v<T, FloatVector>,
float, float,
std::conditional_t<std::is_same_v<T, Float16Vector>, std::conditional_t<
std::is_same_v<T, Float16Vector>,
float16, float16,
std::conditional_t<std::is_same_v<T, BFloat16Vector>, std::conditional_t<
std::is_same_v<T, BFloat16Vector>,
bfloat16, bfloat16,
uint8_t>>>; std::conditional_t<std::is_same_v<T, SparseFloatVector>,
void,
uint8_t>>>>;
}; };
template <typename T> template <typename T>

View File

@ -52,6 +52,7 @@ enum CDataType {
FloatVector = 101, FloatVector = 101,
Float16Vector = 102, Float16Vector = 102,
BFloat16Vector = 103, BFloat16Vector = 103,
SparseFloatVector = 104,
}; };
typedef enum CDataType CDataType; typedef enum CDataType CDataType;

View File

@ -205,7 +205,8 @@ IndexFactory::CreateVectorIndex(
} }
} else { // create mem index } else { // create mem index
switch (data_type) { switch (data_type) {
case DataType::VECTOR_FLOAT: { case DataType::VECTOR_FLOAT:
case DataType::VECTOR_SPARSE_FLOAT: {
return std::make_unique<VectorMemIndex<float>>( return std::make_unique<VectorMemIndex<float>>(
index_type, metric_type, version, file_manager_context); index_type, metric_type, version, file_manager_context);
} }
@ -311,7 +312,8 @@ IndexFactory::CreateVectorIndex(
} }
} else { // create mem index } else { // create mem index
switch (data_type) { switch (data_type) {
case DataType::VECTOR_FLOAT: { case DataType::VECTOR_FLOAT:
case DataType::VECTOR_SPARSE_FLOAT: {
return std::make_unique<VectorMemIndex<float>>( return std::make_unique<VectorMemIndex<float>>(
create_index_info, file_manager_context, space); create_index_info, file_manager_context, space);
} }

View File

@ -68,6 +68,30 @@ unsupported_index_combinations() {
static std::vector<std::tuple<IndexType, MetricType>> ret{ static std::vector<std::tuple<IndexType, MetricType>> ret{
std::make_tuple(knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT, std::make_tuple(knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT,
knowhere::metric::L2), knowhere::metric::L2),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::L2),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::COSINE),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::HAMMING),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::JACCARD),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::SUBSTRUCTURE),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::SUPERSTRUCTURE),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::L2),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::COSINE),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::HAMMING),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::JACCARD),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::SUBSTRUCTURE),
std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::SUPERSTRUCTURE),
}; };
return ret; return ret;
} }

View File

@ -483,10 +483,15 @@ VectorMemIndex<T>::Build(const Config& config) {
auto insert_files = auto insert_files =
GetValueFromConfig<std::vector<std::string>>(config, "insert_files"); GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
AssertInfo(insert_files.has_value(), AssertInfo(insert_files.has_value(),
"insert file paths is empty when build disk ann index"); "insert file paths is empty when building in memory index");
auto field_datas = auto field_datas =
file_manager_->CacheRawDataToMemory(insert_files.value()); file_manager_->CacheRawDataToMemory(insert_files.value());
Config build_config;
build_config.update(config);
build_config.erase("insert_files");
build_config.erase(VEC_OPT_FIELDS);
if (GetIndexType().find("SPARSE") == std::string::npos) {
int64_t total_size = 0; int64_t total_size = 0;
int64_t total_num_rows = 0; int64_t total_num_rows = 0;
int64_t dim = 0; int64_t dim = 0;
@ -500,6 +505,7 @@ VectorMemIndex<T>::Build(const Config& config) {
auto buf = std::shared_ptr<uint8_t[]>(new uint8_t[total_size]); auto buf = std::shared_ptr<uint8_t[]>(new uint8_t[total_size]);
int64_t offset = 0; int64_t offset = 0;
// TODO: avoid copying
for (auto data : field_datas) { for (auto data : field_datas) {
std::memcpy(buf.get() + offset, data->Data(), data->Size()); std::memcpy(buf.get() + offset, data->Data(), data->Size());
offset += data->Size(); offset += data->Size();
@ -507,13 +513,38 @@ VectorMemIndex<T>::Build(const Config& config) {
} }
field_datas.clear(); field_datas.clear();
Config build_config;
build_config.update(config);
build_config.erase("insert_files");
build_config.erase(VEC_OPT_FIELDS);
auto dataset = GenDataset(total_num_rows, dim, buf.get()); auto dataset = GenDataset(total_num_rows, dim, buf.get());
BuildWithDataset(dataset, build_config); BuildWithDataset(dataset, build_config);
} else {
// sparse
int64_t total_rows = 0;
int64_t dim = 0;
for (auto field_data : field_datas) {
total_rows += field_data->Length();
dim = std::max(
dim,
std::dynamic_pointer_cast<FieldData<SparseFloatVector>>(
field_data)
->Dim());
}
std::vector<knowhere::sparse::SparseRow<float>> vec(total_rows);
int64_t offset = 0;
for (auto field_data : field_datas) {
auto ptr = static_cast<const knowhere::sparse::SparseRow<float>*>(
field_data->Data());
AssertInfo(ptr, "failed to cast field data to sparse rows");
for (size_t i = 0; i < field_data->Length(); ++i) {
// this does a deep copy of field_data's data.
// TODO: avoid copying by enforcing field data to give up
// ownership.
vec[offset + i] = ptr[i];
}
offset += field_data->Length();
}
auto dataset = GenDataset(total_rows, dim, vec.data());
dataset->SetIsSparse(true);
BuildWithDataset(dataset, build_config);
}
} }
template <typename T> template <typename T>

View File

@ -66,6 +66,7 @@ class IndexFactory {
case DataType::VECTOR_FLOAT16: case DataType::VECTOR_FLOAT16:
case DataType::VECTOR_BFLOAT16: case DataType::VECTOR_BFLOAT16:
case DataType::VECTOR_BINARY: case DataType::VECTOR_BINARY:
case DataType::VECTOR_SPARSE_FLOAT:
return std::make_unique<VecIndexCreator>(type, config, context); return std::make_unique<VecIndexCreator>(type, config, context);
default: default:
throw SegcoreError( throw SegcoreError(
@ -101,6 +102,7 @@ class IndexFactory {
case DataType::VECTOR_BINARY: case DataType::VECTOR_BINARY:
case DataType::VECTOR_FLOAT16: case DataType::VECTOR_FLOAT16:
case DataType::VECTOR_BFLOAT16: case DataType::VECTOR_BFLOAT16:
case DataType::VECTOR_SPARSE_FLOAT:
return std::make_unique<VecIndexCreator>( return std::make_unique<VecIndexCreator>(
type, field_name, dim, config, file_manager_context, space); type, field_name, dim, config, file_manager_context, space);
default: default:

View File

@ -351,6 +351,32 @@ BuildBinaryVecIndex(CIndex index, int64_t data_size, const uint8_t* vectors) {
return status; return status;
} }
// Builds a sparse float vector index over `row_num` serialized sparse rows
// pointed to by `vectors` (dim is the max dimension across the rows).
// Returns Success on success; on failure returns UnexpectedError with a
// strdup'd error message (caller owns/frees it, per the C API convention).
CStatus
BuildSparseFloatVecIndex(CIndex index,
                         int64_t row_num,
                         int64_t dim,
                         const uint8_t* vectors) {
    auto status = CStatus();
    try {
        AssertInfo(
            index,
            "failed to build sparse float vector index, passed index was null");
        auto real_index =
            reinterpret_cast<milvus::indexbuilder::IndexCreatorBase*>(index);
        auto cIndex =
            dynamic_cast<milvus::indexbuilder::VecIndexCreator*>(real_index);
        // The original dereferenced the dynamic_cast result unchecked; if the
        // handle is not a vector index creator the cast returns null and
        // Build() below would be a null dereference.
        AssertInfo(cIndex,
                   "failed to build sparse float vector index, passed index "
                   "is not a vector index");
        auto ds = knowhere::GenDataSet(row_num, dim, vectors);
        ds->SetIsSparse(true);
        cIndex->Build(ds);
        status.error_code = Success;
        status.error_msg = "";
    } catch (const std::exception& e) {
        status.error_code = UnexpectedError;
        status.error_msg = strdup(e.what());
    }
    return status;
}
// field_data: // field_data:
// 1, serialized proto::schema::BoolArray, if type is bool; // 1, serialized proto::schema::BoolArray, if type is bool;
// 2, serialized proto::schema::StringArray, if type is string; // 2, serialized proto::schema::StringArray, if type is string;

View File

@ -20,6 +20,7 @@ extern "C" {
#include "common/binary_set_c.h" #include "common/binary_set_c.h"
#include "indexbuilder/type_c.h" #include "indexbuilder/type_c.h"
// used only in test
CStatus CStatus
CreateIndexV0(enum CDataType dtype, CreateIndexV0(enum CDataType dtype,
const char* serialized_type_params, const char* serialized_type_params,
@ -43,6 +44,13 @@ BuildFloat16VecIndex(CIndex index, int64_t data_size, const uint8_t* vectors);
CStatus CStatus
BuildBFloat16VecIndex(CIndex index, int64_t data_size, const uint8_t* vectors); BuildBFloat16VecIndex(CIndex index, int64_t data_size, const uint8_t* vectors);
CStatus
BuildSparseFloatVecIndex(CIndex index,
int64_t row_num,
int64_t dim,
const uint8_t* vectors);
// field_data: // field_data:
// 1, serialized proto::schema::BoolArray, if type is bool; // 1, serialized proto::schema::BoolArray, if type is bool;
// 2, serialized proto::schema::StringArray, if type is string; // 2, serialized proto::schema::StringArray, if type is string;

View File

@ -34,6 +34,10 @@
#include "fmt/format.h" #include "fmt/format.h"
#include "log/Log.h" #include "log/Log.h"
#include "mmap/Utils.h" #include "mmap/Utils.h"
#include "common/FieldData.h"
#include "common/FieldDataInterface.h"
#include "common/Array.h"
#include "knowhere/dataset.h"
namespace milvus { namespace milvus {
@ -134,7 +138,7 @@ class ColumnBase {
column.size_ = 0; column.size_ = 0;
} }
const char* virtual const char*
Data() const { Data() const {
return data_; return data_;
} }
@ -144,14 +148,14 @@ class ColumnBase {
return num_rows_; return num_rows_;
}; };
const size_t virtual size_t
ByteSize() const { ByteSize() const {
return cap_size_ + padding_; return cap_size_ + padding_;
} }
// The capacity of the column, // The capacity of the column,
// DO NOT call this for variable length column. // DO NOT call this for variable length column(including SparseFloatColumn).
size_t virtual size_t
Capacity() const { Capacity() const {
return cap_size_ / type_size_; return cap_size_ / type_size_;
} }
@ -159,8 +163,8 @@ class ColumnBase {
virtual SpanBase virtual SpanBase
Span() const = 0; Span() const = 0;
void virtual void
AppendBatch(const FieldDataPtr& data) { AppendBatch(const FieldDataPtr data) {
size_t required_size = size_ + data->Size(); size_t required_size = size_ + data->Size();
if (required_size > cap_size_) { if (required_size > cap_size_) {
Expand(required_size * 2 + padding_); Expand(required_size * 2 + padding_);
@ -174,7 +178,7 @@ class ColumnBase {
} }
// Append one row // Append one row
void virtual void
Append(const char* data, size_t size) { Append(const char* data, size_t size) {
size_t required_size = size_ + size; size_t required_size = size_ + size;
if (required_size > cap_size_) { if (required_size > cap_size_) {
@ -263,6 +267,80 @@ class Column : public ColumnBase {
} }
}; };
// Sparse float vectors are not yet supported by mmap, so SparseFloatColumn
// stores its rows in an in-memory vector and does not use the storage
// fields of the base class (such as ColumnBase::data_).
class SparseFloatColumn : public ColumnBase {
 public:
    // memory mode ctor
    SparseFloatColumn(const FieldMeta& field_meta) : ColumnBase(0, field_meta) {
    }
    // mmap mode ctor: present only for interface parity; always fails
    // because mmap of sparse float vectors is not supported yet.
    SparseFloatColumn(const File& file,
                      size_t size,
                      const FieldMeta& field_meta)
        : ColumnBase(file, size, field_meta) {
        AssertInfo(false, "SparseFloatColumn mmap mode not supported");
    }
    SparseFloatColumn(SparseFloatColumn&& column) noexcept
        : ColumnBase(std::move(column)),
          dim_(column.dim_),
          vec_(std::move(column.vec_)) {
    }
    ~SparseFloatColumn() override = default;
    // Returns the row array; each element is a
    // knowhere::sparse::SparseRow<float>.
    const char*
    Data() const override {
        return static_cast<const char*>(static_cast<const void*>(vec_.data()));
    }
    // This is used to advice mmap prefetch, we don't currently support mmap
    // for sparse float vector thus not implemented for now.
    size_t
    ByteSize() const override {
        throw std::runtime_error(
            "ByteSize not supported for sparse float column");
    }
    size_t
    Capacity() const override {
        throw std::runtime_error(
            "Capacity not supported for sparse float column");
    }
    SpanBase
    Span() const override {
        throw std::runtime_error("Span not supported for sparse float column");
    }
    // Appends all rows of `data` and updates the column-wide max dimension.
    void
    AppendBatch(const FieldDataPtr data) override {
        auto ptr = static_cast<const knowhere::sparse::SparseRow<float>*>(
            data->Data());
        // hoisted out of the loop: Length() is invoked once instead of
        // once per iteration.
        const size_t row_count = data->Length();
        vec_.insert(vec_.end(), ptr, ptr + row_count);
        for (size_t i = 0; i < row_count; ++i) {
            dim_ = std::max(dim_, ptr[i].dim());
        }
        num_rows_ += row_count;
    }
    // Single-row append is intentionally unsupported; use AppendBatch.
    void
    Append(const char* data, size_t size) override {
        throw std::runtime_error(
            "Append not supported for sparse float column");
    }
    // Max dimension observed across all appended rows.
    int64_t
    Dim() const {
        return dim_;
    }

 private:
    int64_t dim_ = 0;
    std::vector<knowhere::sparse::SparseRow<float>> vec_;
};
template <typename T> template <typename T>
class VariableColumn : public ColumnBase { class VariableColumn : public ColumnBase {
public: public:

View File

@ -31,52 +31,6 @@
namespace milvus { namespace milvus {
inline size_t
GetDataSize(const std::vector<FieldDataPtr>& datas) {
size_t total_size{0};
for (const auto& data : datas) {
total_size += data->Size();
}
return total_size;
}
inline void*
FillField(DataType data_type, const FieldDataPtr& data, void* dst) {
char* dest = reinterpret_cast<char*>(dst);
if (datatype_is_variable(data_type)) {
switch (data_type) {
case DataType::STRING:
case DataType::VARCHAR: {
for (ssize_t i = 0; i < data->get_num_rows(); ++i) {
auto str =
static_cast<const std::string*>(data->RawValue(i));
memcpy(dest, str->data(), str->size());
dest += str->size();
}
break;
}
case DataType::JSON: {
for (ssize_t i = 0; i < data->get_num_rows(); ++i) {
auto padded_string =
static_cast<const Json*>(data->RawValue(i))->data();
memcpy(dest, padded_string.data(), padded_string.size());
dest += padded_string.size();
}
break;
}
default:
PanicInfo(
DataTypeInvalid, "not supported data type {}", data_type);
}
} else {
memcpy(dst, data->Data(), data->Size());
dest += data->Size();
}
return dest;
}
inline size_t inline size_t
WriteFieldData(File& file, WriteFieldData(File& file,
DataType data_type, DataType data_type,
@ -124,6 +78,12 @@ WriteFieldData(File& file,
} }
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
// TODO(SPARSE): this is for mmap to write data to disk so that
// the file can be mmapped into memory.
throw std::runtime_error(
"WriteFieldData for VECTOR_SPARSE_FLOAT not implemented");
}
default: default:
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
"not supported data type {}", "not supported data type {}",

View File

@ -24,7 +24,6 @@ set(SEGCORE_FILES
SegmentGrowingImpl.cpp SegmentGrowingImpl.cpp
SegmentSealedImpl.cpp SegmentSealedImpl.cpp
FieldIndexing.cpp FieldIndexing.cpp
InsertRecord.cpp
Reduce.cpp Reduce.cpp
metrics_c.cpp metrics_c.cpp
plan_c.cpp plan_c.cpp
@ -35,7 +34,6 @@ set(SEGCORE_FILES
SegcoreConfig.cpp SegcoreConfig.cpp
IndexConfigGenerator.cpp IndexConfigGenerator.cpp
segcore_init_c.cpp segcore_init_c.cpp
ScalarIndex.cpp
TimestampIndex.cpp TimestampIndex.cpp
Utils.cpp Utils.cpp
ConcurrentVector.cpp) ConcurrentVector.cpp)

View File

@ -36,8 +36,16 @@ VectorBase::set_data_raw(ssize_t element_offset,
} else if (field_meta.get_data_type() == DataType::VECTOR_BFLOAT16) { } else if (field_meta.get_data_type() == DataType::VECTOR_BFLOAT16) {
return set_data_raw( return set_data_raw(
element_offset, VEC_FIELD_DATA(data, bfloat16), element_count); element_offset, VEC_FIELD_DATA(data, bfloat16), element_count);
} else if (field_meta.get_data_type() ==
DataType::VECTOR_SPARSE_FLOAT) {
return set_data_raw(
element_offset,
SparseBytesToRows(
data->vectors().sparse_float_vector().contents())
.get(),
element_count);
} else { } else {
PanicInfo(DataTypeInvalid, "unsupported"); PanicInfo(DataTypeInvalid, "unsupported vector type");
} }
} }

View File

@ -93,9 +93,6 @@ class VectorBase {
} }
virtual ~VectorBase() = default; virtual ~VectorBase() = default;
virtual void
grow_to_at_least(int64_t element_count) = 0;
virtual void virtual void
set_data_raw(ssize_t element_offset, set_data_raw(ssize_t element_offset,
const void* source, const void* source,
@ -105,12 +102,13 @@ class VectorBase {
set_data_raw(ssize_t element_offset, set_data_raw(ssize_t element_offset,
const std::vector<FieldDataPtr>& data) = 0; const std::vector<FieldDataPtr>& data) = 0;
void virtual void
set_data_raw(ssize_t element_offset, set_data_raw(ssize_t element_offset,
ssize_t element_count, ssize_t element_count,
const DataArray* data, const DataArray* data,
const FieldMeta& field_meta); const FieldMeta& field_meta);
// used only by sealed segment to load system field
virtual void virtual void
fill_chunk_data(const std::vector<FieldDataPtr>& data) = 0; fill_chunk_data(const std::vector<FieldDataPtr>& data) = 0;
@ -135,7 +133,7 @@ class VectorBase {
const int64_t size_per_chunk_; const int64_t size_per_chunk_;
}; };
template <typename Type, bool is_scalar = false> template <typename Type, bool is_type_entire_row = false>
class ConcurrentVectorImpl : public VectorBase { class ConcurrentVectorImpl : public VectorBase {
public: public:
// constants // constants
@ -149,7 +147,7 @@ class ConcurrentVectorImpl : public VectorBase {
operator=(const ConcurrentVectorImpl&) = delete; operator=(const ConcurrentVectorImpl&) = delete;
using TraitType = std::conditional_t< using TraitType = std::conditional_t<
is_scalar, is_type_entire_row,
Type, Type,
std::conditional_t< std::conditional_t<
std::is_same_v<Type, float>, std::is_same_v<Type, float>,
@ -162,27 +160,16 @@ class ConcurrentVectorImpl : public VectorBase {
BinaryVector>>>>; BinaryVector>>>>;
public: public:
explicit ConcurrentVectorImpl(ssize_t dim, int64_t size_per_chunk) explicit ConcurrentVectorImpl(ssize_t elements_per_row,
: VectorBase(size_per_chunk), Dim(is_scalar ? 1 : dim) { int64_t size_per_chunk)
// Assert(is_scalar ? dim == 1 : dim != 1); : VectorBase(size_per_chunk),
} elements_per_row_(is_type_entire_row ? 1 : elements_per_row) {
void
grow_to_at_least(int64_t element_count) override {
auto chunk_count = upper_div(element_count, size_per_chunk_);
chunks_.emplace_to_at_least(chunk_count, Dim * size_per_chunk_);
}
void
grow_on_demand(int64_t element_count) {
auto chunk_count = upper_div(element_count, size_per_chunk_);
chunks_.emplace_to_at_least(chunk_count, Dim * element_count);
} }
Span<TraitType> Span<TraitType>
get_span(int64_t chunk_id) const { get_span(int64_t chunk_id) const {
auto& chunk = get_chunk(chunk_id); auto& chunk = get_chunk(chunk_id);
if constexpr (is_scalar) { if constexpr (is_type_entire_row) {
return Span<TraitType>(chunk.data(), chunk.size()); return Span<TraitType>(chunk.data(), chunk.size());
} else if constexpr (std::is_same_v<Type, int64_t> || // NOLINT } else if constexpr (std::is_same_v<Type, int64_t> || // NOLINT
std::is_same_v<Type, int>) { std::is_same_v<Type, int>) {
@ -191,7 +178,8 @@ class ConcurrentVectorImpl : public VectorBase {
} else { } else {
static_assert( static_assert(
std::is_same_v<typename TraitType::embedded_type, Type>); std::is_same_v<typename TraitType::embedded_type, Type>);
return Span<TraitType>(chunk.data(), chunk.size(), Dim); return Span<TraitType>(
chunk.data(), chunk.size(), elements_per_row_);
} }
} }
@ -201,15 +189,14 @@ class ConcurrentVectorImpl : public VectorBase {
} }
void void
fill_chunk_data(const std::vector<FieldDataPtr>& datas) fill_chunk_data(const std::vector<FieldDataPtr>& datas) override {
override { // used only for sealed segment AssertInfo(chunks_.size() == 0, "non empty concurrent vector");
AssertInfo(chunks_.size() == 0, "no empty concurrent vector");
int64_t element_count = 0; int64_t element_count = 0;
for (auto& field_data : datas) { for (auto& field_data : datas) {
element_count += field_data->get_num_rows(); element_count += field_data->get_num_rows();
} }
chunks_.emplace_to_at_least(1, Dim * element_count); chunks_.emplace_to_at_least(1, elements_per_row_ * element_count);
int64_t offset = 0; int64_t offset = 0;
for (auto& field_data : datas) { for (auto& field_data : datas) {
auto num_rows = field_data->get_num_rows(); auto num_rows = field_data->get_num_rows();
@ -236,11 +223,70 @@ class ConcurrentVectorImpl : public VectorBase {
if (element_count == 0) { if (element_count == 0) {
return; return;
} }
this->grow_to_at_least(element_offset + element_count); chunks_.emplace_to_at_least(
upper_div(element_offset + element_count, size_per_chunk_),
elements_per_row_ * size_per_chunk_);
set_data( set_data(
element_offset, static_cast<const Type*>(source), element_count); element_offset, static_cast<const Type*>(source), element_count);
} }
// Read-only access to the chunk at chunk_index.
const Chunk&
get_chunk(ssize_t chunk_index) const {
return chunks_[chunk_index];
}
// Mutable access to the chunk at index.
Chunk&
get_chunk(ssize_t index) {
return chunks_[index];
}
// Type-erased pointer to the underlying storage of the chunk at chunk_index.
const void*
get_chunk_data(ssize_t chunk_index) const override {
return chunks_[chunk_index].data();
}
// Returns a pointer to the start of the row at element_index within its
// chunk. Internal-style accessor; prefer span/chunk accessors for bulk
// access.
const Type*
get_element(ssize_t element_index) const {
auto chunk_id = element_index / size_per_chunk_;
auto chunk_offset = element_index % size_per_chunk_;
return get_chunk(chunk_id).data() + chunk_offset * elements_per_row_;
}
// Scalar-style element access; only valid when each row holds a single
// element (elements_per_row_ == 1), which is asserted below.
const Type&
operator[](ssize_t element_index) const {
AssertInfo(
elements_per_row_ == 1,
fmt::format(
"The value of elements_per_row_ is not 1, elements_per_row_={}",
elements_per_row_));
auto chunk_id = element_index / size_per_chunk_;
auto chunk_offset = element_index % size_per_chunk_;
return get_chunk(chunk_id)[chunk_offset];
}
// Number of chunks currently allocated.
ssize_t
num_chunk() const override {
return chunks_.size();
}
// Returns true iff no chunk contains any element.
bool
empty() override {
for (size_t i = 0; i < chunks_.size(); i++) {
if (get_chunk(i).size() > 0) {
return false;
}
}
return true;
}
// Drops all chunks (and thus all stored data).
void
clear() {
chunks_.clear();
}
private:
void void
set_data(ssize_t element_offset, set_data(ssize_t element_offset,
const Type* source, const Type* source,
@ -277,60 +323,6 @@ class ConcurrentVectorImpl : public VectorBase {
} }
} }
const Chunk&
get_chunk(ssize_t chunk_index) const {
return chunks_[chunk_index];
}
Chunk&
get_chunk(ssize_t index) {
return chunks_[index];
}
const void*
get_chunk_data(ssize_t chunk_index) const override {
return chunks_[chunk_index].data();
}
// just for fun, don't use it directly
const Type*
get_element(ssize_t element_index) const {
auto chunk_id = element_index / size_per_chunk_;
auto chunk_offset = element_index % size_per_chunk_;
return get_chunk(chunk_id).data() + chunk_offset * Dim;
}
const Type&
operator[](ssize_t element_index) const {
AssertInfo(Dim == 1,
fmt::format("The value of Dim is not 1, Dim={}", Dim));
auto chunk_id = element_index / size_per_chunk_;
auto chunk_offset = element_index % size_per_chunk_;
return get_chunk(chunk_id)[chunk_offset];
}
ssize_t
num_chunk() const override {
return chunks_.size();
}
bool
empty() override {
for (size_t i = 0; i < chunks_.size(); i++) {
if (get_chunk(i).size() > 0) {
return false;
}
}
return true;
}
void
clear() {
chunks_.clear();
}
private:
void void
fill_chunk(ssize_t chunk_id, fill_chunk(ssize_t chunk_id,
ssize_t chunk_offset, ssize_t chunk_offset,
@ -349,12 +341,12 @@ class ConcurrentVectorImpl : public VectorBase {
Chunk& chunk = chunks_[chunk_id]; Chunk& chunk = chunks_[chunk_id];
auto ptr = chunk.data(); auto ptr = chunk.data();
std::copy_n(source + source_offset * Dim, std::copy_n(source + source_offset * elements_per_row_,
element_count * Dim, element_count * elements_per_row_,
ptr + chunk_offset * Dim); ptr + chunk_offset * elements_per_row_);
} }
const ssize_t Dim; const ssize_t elements_per_row_;
private: private:
ThreadSafeVector<Chunk> chunks_; ThreadSafeVector<Chunk> chunks_;
@ -370,6 +362,40 @@ class ConcurrentVector : public ConcurrentVectorImpl<Type, true> {
} }
}; };
// ConcurrentVector specialization for sparse float vectors. Each row is a
// knowhere::sparse::SparseRow<float> (treated as one element per row), and
// the specialization additionally tracks the max dimension seen so far.
template <>
class ConcurrentVector<SparseFloatVector>
    : public ConcurrentVectorImpl<knowhere::sparse::SparseRow<float>, true> {
 public:
    explicit ConcurrentVector(int64_t size_per_chunk)
        : ConcurrentVectorImpl<knowhere::sparse::SparseRow<float>,
                               true>::ConcurrentVectorImpl(1, size_per_chunk),
          dim_(0) {
    }

    // Inserts element_count rows starting at element_offset, updating the
    // tracked max dimension before delegating to the base implementation.
    void
    set_data_raw(ssize_t element_offset,
                 const void* source,
                 ssize_t element_count) override {
        auto* src =
            static_cast<const knowhere::sparse::SparseRow<float>*>(source);
        // loop index is ssize_t to match element_count and avoid
        // truncation on very large batches.
        for (ssize_t i = 0; i < element_count; ++i) {
            dim_ = std::max(dim_, src[i].dim());
        }
        ConcurrentVectorImpl<knowhere::sparse::SparseRow<float>,
                             true>::set_data_raw(element_offset,
                                                 source,
                                                 element_count);
    }

    // Max dimension observed across all inserted rows.
    int64_t
    Dim() const {
        return dim_;
    }

 private:
    int64_t dim_;
};
template <> template <>
class ConcurrentVector<FloatVector> class ConcurrentVector<FloatVector>
: public ConcurrentVectorImpl<float, false> { : public ConcurrentVectorImpl<float, false> {

View File

@ -11,6 +11,7 @@
#include <string> #include <string>
#include <thread> #include <thread>
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "fmt/format.h" #include "fmt/format.h"
#include "index/ScalarIndexSort.h" #include "index/ScalarIndexSort.h"
@ -29,8 +30,8 @@ VectorFieldIndexing::VectorFieldIndexing(const FieldMeta& field_meta,
int64_t segment_max_row_count, int64_t segment_max_row_count,
const SegcoreConfig& segcore_config) const SegcoreConfig& segcore_config)
: FieldIndexing(field_meta, segcore_config), : FieldIndexing(field_meta, segcore_config),
build(false), built_(false),
sync_with_index(false), sync_with_index_(false),
config_(std::make_unique<VecIndexConfig>(segment_max_row_count, config_(std::make_unique<VecIndexConfig>(segment_max_row_count,
field_index_meta, field_index_meta,
segcore_config, segcore_config,
@ -45,6 +46,7 @@ void
VectorFieldIndexing::BuildIndexRange(int64_t ack_beg, VectorFieldIndexing::BuildIndexRange(int64_t ack_beg,
int64_t ack_end, int64_t ack_end,
const VectorBase* vec_base) { const VectorBase* vec_base) {
// No BuildIndexRange support for sparse vector.
AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT, AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT,
"Data type of vector field is not VECTOR_FLOAT"); "Data type of vector field is not VECTOR_FLOAT");
auto dim = field_meta_.get_dim(); auto dim = field_meta_.get_dim();
@ -85,13 +87,65 @@ VectorFieldIndexing::GetDataFromIndex(const int64_t* seg_offsets,
} }
void void
VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset, VectorFieldIndexing::AppendSegmentIndexSparse(int64_t reserved_offset,
int64_t size,
int64_t new_data_dim,
const VectorBase* field_raw_data,
const void* data_source) {
auto conf = get_build_params();
auto source = dynamic_cast<const ConcurrentVector<SparseFloatVector>*>(
field_raw_data);
AssertInfo(source,
"field_raw_data can't cast to "
"ConcurrentVector<SparseFloatVector> type");
AssertInfo(size > 0, "append 0 sparse rows to index is not allowed");
if (!built_) {
AssertInfo(!sync_with_index_, "index marked synced before built");
idx_t total_rows = reserved_offset + size;
idx_t chunk_id = 0;
auto dim = source->Dim();
while (total_rows > 0) {
auto mat = static_cast<const knowhere::sparse::SparseRow<float>*>(
source->get_chunk_data(chunk_id));
auto rows = std::min(source->get_size_per_chunk(), total_rows);
auto dataset = knowhere::GenDataSet(rows, dim, mat);
dataset->SetIsSparse(true);
try {
if (chunk_id == 0) {
index_->BuildWithDataset(dataset, conf);
} else {
index_->AddWithDataset(dataset, conf);
}
} catch (SegcoreError& error) {
LOG_ERROR("growing sparse index build error: {}", error.what());
return;
}
index_cur_.fetch_add(rows);
total_rows -= rows;
chunk_id++;
}
built_ = true;
sync_with_index_ = true;
// if not built_, new rows in data_source have already been added to
// source(ConcurrentVector<SparseFloatVector>) and thus added to the
// index, thus no need to add again.
return;
}
auto dataset = knowhere::GenDataSet(size, new_data_dim, data_source);
dataset->SetIsSparse(true);
index_->AddWithDataset(dataset, conf);
index_cur_.fetch_add(size);
}
void
VectorFieldIndexing::AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size, int64_t size,
const VectorBase* field_raw_data, const VectorBase* field_raw_data,
const void* data_source) { const void* data_source) {
AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT, AssertInfo(field_meta_.get_data_type() == DataType::VECTOR_FLOAT,
"Data type of vector field is not VECTOR_FLOAT"); "Data type of vector field is not VECTOR_FLOAT");
auto dim = field_meta_.get_dim(); auto dim = field_meta_.get_dim();
auto conf = get_build_params(); auto conf = get_build_params();
auto source = auto source =
@ -100,8 +154,9 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
auto size_per_chunk = source->get_size_per_chunk(); auto size_per_chunk = source->get_size_per_chunk();
//append vector [vector_id_beg, vector_id_end] into index //append vector [vector_id_beg, vector_id_end] into index
//build index [vector_id_beg, build_threshold) when index not exist //build index [vector_id_beg, build_threshold) when index not exist
if (!build) { if (!built_) {
idx_t vector_id_beg = index_cur_.load(); idx_t vector_id_beg = index_cur_.load();
Assert(vector_id_beg == 0);
idx_t vector_id_end = get_build_threshold() - 1; idx_t vector_id_end = get_build_threshold() - 1;
auto chunk_id_beg = vector_id_beg / size_per_chunk; auto chunk_id_beg = vector_id_beg / size_per_chunk;
auto chunk_id_end = vector_id_end / size_per_chunk; auto chunk_id_end = vector_id_end / size_per_chunk;
@ -143,7 +198,7 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
return; return;
} }
index_cur_.fetch_add(vec_num); index_cur_.fetch_add(vec_num);
build = true; built_ = true;
} }
//append rest data when index has built //append rest data when index has built
idx_t vector_id_beg = index_cur_.load(); idx_t vector_id_beg = index_cur_.load();
@ -153,11 +208,12 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
int64_t vec_num = vector_id_end - vector_id_beg + 1; int64_t vec_num = vector_id_end - vector_id_beg + 1;
if (vec_num <= 0) { if (vec_num <= 0) {
sync_with_index.store(true); sync_with_index_.store(true);
return; return;
} }
if (sync_with_index.load()) { if (sync_with_index_.load()) {
Assert(size == vec_num);
auto dataset = knowhere::GenDataSet(vec_num, dim, data_source); auto dataset = knowhere::GenDataSet(vec_num, dim, data_source);
index_->AddWithDataset(dataset, conf); index_->AddWithDataset(dataset, conf);
index_cur_.fetch_add(vec_num); index_cur_.fetch_add(vec_num);
@ -179,7 +235,7 @@ VectorFieldIndexing::AppendSegmentIndex(int64_t reserved_offset,
index_->AddWithDataset(dataset, conf); index_->AddWithDataset(dataset, conf);
index_cur_.fetch_add(chunk_sz); index_cur_.fetch_add(chunk_sz);
} }
sync_with_index.store(true); sync_with_index_.store(true);
} }
} }
@ -188,6 +244,8 @@ VectorFieldIndexing::get_build_params() const {
auto config = config_->GetBuildBaseParams(); auto config = config_->GetBuildBaseParams();
config[knowhere::meta::DIM] = std::to_string(field_meta_.get_dim()); config[knowhere::meta::DIM] = std::to_string(field_meta_.get_dim());
config[knowhere::meta::NUM_BUILD_THREAD] = std::to_string(1); config[knowhere::meta::NUM_BUILD_THREAD] = std::to_string(1);
// for sparse float vector: drop_ratio_build config is not allowed to be set
// on growing segment index.
return config; return config;
} }
@ -203,7 +261,7 @@ VectorFieldIndexing::get_index_cursor() {
} }
bool bool
VectorFieldIndexing::sync_data_with_index() const { VectorFieldIndexing::sync_data_with_index() const {
return sync_with_index.load(); return sync_with_index_.load();
} }
bool bool
@ -243,17 +301,10 @@ CreateIndex(const FieldMeta& field_meta,
int64_t segment_max_row_count, int64_t segment_max_row_count,
const SegcoreConfig& segcore_config) { const SegcoreConfig& segcore_config) {
if (field_meta.is_vector()) { if (field_meta.is_vector()) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) { if (field_meta.get_data_type() == DataType::VECTOR_FLOAT ||
return std::make_unique<VectorFieldIndexing>(field_meta, field_meta.get_data_type() == DataType::VECTOR_FLOAT16 ||
field_index_meta, field_meta.get_data_type() == DataType::VECTOR_BFLOAT16 ||
segment_max_row_count, field_meta.get_data_type() == DataType::VECTOR_SPARSE_FLOAT) {
segcore_config);
} else if (field_meta.get_data_type() == DataType::VECTOR_FLOAT16) {
return std::make_unique<VectorFieldIndexing>(field_meta,
field_index_meta,
segment_max_row_count,
segcore_config);
} else if (field_meta.get_data_type() == DataType::VECTOR_BFLOAT16) {
return std::make_unique<VectorFieldIndexing>(field_meta, return std::make_unique<VectorFieldIndexing>(field_meta,
field_index_meta, field_index_meta,
segment_max_row_count, segment_max_row_count,

View File

@ -51,11 +51,19 @@ class FieldIndexing {
const VectorBase* vec_base) = 0; const VectorBase* vec_base) = 0;
virtual void virtual void
AppendSegmentIndex(int64_t reserved_offset, AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size, int64_t size,
const VectorBase* vec_base, const VectorBase* vec_base,
const void* data_source) = 0; const void* data_source) = 0;
// new_data_dim is the dimension of the new data being appended(data_source)
virtual void
AppendSegmentIndexSparse(int64_t reserved_offset,
int64_t size,
int64_t new_data_dim,
const VectorBase* vec_base,
const void* data_source) = 0;
virtual void virtual void
GetDataFromIndex(const int64_t* seg_offsets, GetDataFromIndex(const int64_t* seg_offsets,
int64_t count, int64_t count,
@ -109,12 +117,22 @@ class ScalarFieldIndexing : public FieldIndexing {
const VectorBase* vec_base) override; const VectorBase* vec_base) override;
void void
AppendSegmentIndex(int64_t reserved_offset, AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size, int64_t size,
const VectorBase* vec_base, const VectorBase* vec_base,
const void* data_source) override { const void* data_source) override {
PanicInfo(Unsupported, PanicInfo(Unsupported,
"scalar index don't support append segment index"); "scalar index doesn't support append vector segment index");
}
void
AppendSegmentIndexSparse(int64_t reserved_offset,
int64_t size,
int64_t new_data_dim,
const VectorBase* vec_base,
const void* data_source) override {
PanicInfo(Unsupported,
"scalar index doesn't support append vector segment index");
} }
void void
@ -171,11 +189,18 @@ class VectorFieldIndexing : public FieldIndexing {
const VectorBase* vec_base) override; const VectorBase* vec_base) override;
void void
AppendSegmentIndex(int64_t reserved_offset, AppendSegmentIndexDense(int64_t reserved_offset,
int64_t size, int64_t size,
const VectorBase* field_raw_data, const VectorBase* field_raw_data,
const void* data_source) override; const void* data_source) override;
void
AppendSegmentIndexSparse(int64_t reserved_offset,
int64_t size,
int64_t new_data_dim,
const VectorBase* field_raw_data,
const void* data_source) override;
void void
GetDataFromIndex(const int64_t* seg_offsets, GetDataFromIndex(const int64_t* seg_offsets,
int64_t count, int64_t count,
@ -214,9 +239,13 @@ class VectorFieldIndexing : public FieldIndexing {
get_search_params(const SearchInfo& searchInfo) const; get_search_params(const SearchInfo& searchInfo) const;
private: private:
// current number of rows in index.
std::atomic<idx_t> index_cur_ = 0; std::atomic<idx_t> index_cur_ = 0;
std::atomic<bool> build; // whether the growing index has been built.
std::atomic<bool> sync_with_index; std::atomic<bool> built_;
// whether all inserted data has been added to the growing index and can be
// searched.
std::atomic<bool> sync_with_index_;
std::unique_ptr<VecIndexConfig> config_; std::unique_ptr<VecIndexConfig> config_;
std::unique_ptr<index::VectorIndex> index_; std::unique_ptr<index::VectorIndex> index_;
tbb::concurrent_vector<std::unique_ptr<index::VectorIndex>> data_; tbb::concurrent_vector<std::unique_ptr<index::VectorIndex>> data_;
@ -283,19 +312,28 @@ class IndexingRecord {
FieldId fieldId, FieldId fieldId,
const DataArray* stream_data, const DataArray* stream_data,
const InsertRecord<is_sealed>& record) { const InsertRecord<is_sealed>& record) {
if (is_in(fieldId)) { if (!is_in(fieldId)) {
return;
}
auto& indexing = field_indexings_.at(fieldId); auto& indexing = field_indexings_.at(fieldId);
if (indexing->get_field_meta().is_vector() && auto type = indexing->get_field_meta().get_data_type();
indexing->get_field_meta().get_data_type() ==
DataType::VECTOR_FLOAT &&
reserved_offset + size >= indexing->get_build_threshold()) {
auto field_raw_data = record.get_field_data_base(fieldId); auto field_raw_data = record.get_field_data_base(fieldId);
indexing->AppendSegmentIndex( if (type == DataType::VECTOR_FLOAT &&
reserved_offset + size >= indexing->get_build_threshold()) {
indexing->AppendSegmentIndexDense(
reserved_offset, reserved_offset,
size, size,
field_raw_data, field_raw_data,
stream_data->vectors().float_vector().data().data()); stream_data->vectors().float_vector().data().data());
} } else if (type == DataType::VECTOR_SPARSE_FLOAT) {
auto data = SparseBytesToRows(
stream_data->vectors().sparse_float_vector().contents());
indexing->AppendSegmentIndexSparse(
reserved_offset,
size,
stream_data->vectors().sparse_float_vector().dim(),
field_raw_data,
data.get());
} }
} }
@ -307,16 +345,28 @@ class IndexingRecord {
FieldId fieldId, FieldId fieldId,
const FieldDataPtr data, const FieldDataPtr data,
const InsertRecord<is_sealed>& record) { const InsertRecord<is_sealed>& record) {
if (is_in(fieldId)) { if (!is_in(fieldId)) {
return;
}
auto& indexing = field_indexings_.at(fieldId); auto& indexing = field_indexings_.at(fieldId);
if (indexing->get_field_meta().is_vector() && auto type = indexing->get_field_meta().get_data_type();
indexing->get_field_meta().get_data_type() == const void* p = data->Data();
DataType::VECTOR_FLOAT &&
if (type == DataType::VECTOR_FLOAT &&
reserved_offset + size >= indexing->get_build_threshold()) { reserved_offset + size >= indexing->get_build_threshold()) {
auto vec_base = record.get_field_data_base(fieldId); auto vec_base = record.get_field_data_base(fieldId);
indexing->AppendSegmentIndex( indexing->AppendSegmentIndexDense(
reserved_offset, size, vec_base, data->Data()); reserved_offset, size, vec_base, data->Data());
} } else if (type == DataType::VECTOR_SPARSE_FLOAT) {
auto vec_base = record.get_field_data_base(fieldId);
indexing->AppendSegmentIndexSparse(
reserved_offset,
size,
std::dynamic_pointer_cast<const FieldData<SparseFloatVector>>(
data)
->Dim(),
vec_base,
p);
} }
} }
@ -396,14 +446,12 @@ class IndexingRecord {
IndexMetaPtr index_meta_; IndexMetaPtr index_meta_;
const SegcoreConfig& segcore_config_; const SegcoreConfig& segcore_config_;
private:
// control info // control info
std::atomic<int64_t> resource_ack_ = 0; std::atomic<int64_t> resource_ack_ = 0;
// std::atomic<int64_t> finished_ack_ = 0; // std::atomic<int64_t> finished_ack_ = 0;
AckResponder finished_ack_; AckResponder finished_ack_;
std::mutex mutex_; std::mutex mutex_;
private:
// field_offset => indexing // field_offset => indexing
std::map<FieldId, std::unique_ptr<FieldIndexing>> field_indexings_; std::map<FieldId, std::unique_ptr<FieldIndexing>> field_indexings_;
}; };

View File

@ -20,8 +20,23 @@ VecIndexConfig::VecIndexConfig(const int64_t max_index_row_cout,
: max_index_row_count_(max_index_row_cout), config_(config) { : max_index_row_count_(max_index_row_cout), config_(config) {
origin_index_type_ = index_meta_.GetIndexType(); origin_index_type_ = index_meta_.GetIndexType();
metric_type_ = index_meta_.GeMetricType(); metric_type_ = index_meta_.GeMetricType();
// Currently for dense vector index, if the segment is growing, we use IVFCC
// as the index type; if the segment is sealed but its index has not been
// built by the index node, we use IVFFLAT as the temp index type and
// release it once the index node has finished building the index and query
// node has loaded it.
// But for sparse vector indexes (INDEX_SPARSE_INVERTED_INDEX and
// INDEX_SPARSE_WAND), the index itself can be used as the temp index
// type, so we can avoid the extra "release temp and load" step.
if (origin_index_type_ ==
knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX ||
origin_index_type_ == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
index_type_ = origin_index_type_;
} else {
index_type_ = support_index_types.at(segment_type); index_type_ = support_index_types.at(segment_type);
}
build_params_[knowhere::meta::METRIC_TYPE] = metric_type_; build_params_[knowhere::meta::METRIC_TYPE] = metric_type_;
build_params_[knowhere::indexparam::NLIST] = build_params_[knowhere::indexparam::NLIST] =
std::to_string(config_.get_nlist()); std::to_string(config_.get_nlist());
@ -29,6 +44,8 @@ VecIndexConfig::VecIndexConfig(const int64_t max_index_row_cout,
std::max((int)(config_.get_chunk_rows() / config_.get_nlist()), 48)); std::max((int)(config_.get_chunk_rows() / config_.get_nlist()), 48));
search_params_[knowhere::indexparam::NPROBE] = search_params_[knowhere::indexparam::NPROBE] =
std::to_string(config_.get_nprobe()); std::to_string(config_.get_nprobe());
// note for sparse vector index: drop_ratio_build is not allowed for growing
// segment index.
LOG_INFO( LOG_INFO(
"VecIndexConfig: origin_index_type={}, index_type={}, metric_type={}", "VecIndexConfig: origin_index_type={}, index_type={}, metric_type={}",
origin_index_type_, origin_index_type_,
@ -38,6 +55,14 @@ VecIndexConfig::VecIndexConfig(const int64_t max_index_row_cout,
int64_t int64_t
VecIndexConfig::GetBuildThreshold() const noexcept { VecIndexConfig::GetBuildThreshold() const noexcept {
// For sparse, do not impose a threshold and start using index with any
// number of rows. Unlike dense vector index, growing sparse vector index
// does not require a minimum number of rows to train.
if (origin_index_type_ ==
knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX ||
origin_index_type_ == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
return 0;
}
assert(VecIndexConfig::index_build_ratio.count(index_type_)); assert(VecIndexConfig::index_build_ratio.count(index_type_));
auto ratio = VecIndexConfig::index_build_ratio.at(index_type_); auto ratio = VecIndexConfig::index_build_ratio.at(index_type_);
assert(ratio >= 0.0 && ratio < 1.0); assert(ratio >= 0.0 && ratio < 1.0);

View File

@ -27,6 +27,8 @@ enum class IndexConfigLevel {
SYSTEM_ASSIGN = 3 SYSTEM_ASSIGN = 3
}; };
// this is the config used for generating growing index or the temp sealed index
// when the segment is sealed before the index is built.
class VecIndexConfig { class VecIndexConfig {
inline static const std::map<SegmentType, std::string> support_index_types = inline static const std::map<SegmentType, std::string> support_index_types =
{{SegmentType::Growing, knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC}, {{SegmentType::Growing, knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC},

View File

@ -1,12 +0,0 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "InsertRecord.h"

View File

@ -346,6 +346,11 @@ struct InsertRecord {
this->append_field_data<BFloat16Vector>( this->append_field_data<BFloat16Vector>(
field_id, field_meta.get_dim(), size_per_chunk); field_id, field_meta.get_dim(), size_per_chunk);
continue; continue;
} else if (field_meta.get_data_type() ==
DataType::VECTOR_SPARSE_FLOAT) {
this->append_field_data<SparseFloatVector>(field_id,
size_per_chunk);
continue;
} else { } else {
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
fmt::format("unsupported vector type", fmt::format("unsupported vector type",
@ -524,8 +529,7 @@ struct InsertRecord {
AssertInfo(fields_data_.find(field_id) != fields_data_.end(), AssertInfo(fields_data_.find(field_id) != fields_data_.end(),
"Cannot find field_data with field_id: " + "Cannot find field_data with field_id: " +
std::to_string(field_id.get())); std::to_string(field_id.get()));
auto ptr = fields_data_.at(field_id).get(); return fields_data_.at(field_id).get();
return ptr;
} }
// get field data in given type, const version // get field data in given type, const version
@ -552,7 +556,7 @@ struct InsertRecord {
template <typename Type> template <typename Type>
void void
append_field_data(FieldId field_id, int64_t size_per_chunk) { append_field_data(FieldId field_id, int64_t size_per_chunk) {
static_assert(IsScalar<Type>); static_assert(IsScalar<Type> || IsSparse<Type>);
fields_data_.emplace( fields_data_.emplace(
field_id, std::make_unique<ConcurrentVector<Type>>(size_per_chunk)); field_id, std::make_unique<ConcurrentVector<Type>>(size_per_chunk));
} }
@ -608,7 +612,6 @@ struct InsertRecord {
std::unique_ptr<OffsetMap> pk2offset_; std::unique_ptr<OffsetMap> pk2offset_;
private: private:
// std::vector<std::unique_ptr<VectorBase>> fields_data_;
std::unordered_map<FieldId, std::unique_ptr<VectorBase>> fields_data_{}; std::unordered_map<FieldId, std::unique_ptr<VectorBase>> fields_data_{};
mutable std::shared_mutex shared_mutex_{}; mutable std::shared_mutex shared_mutex_{};
}; };

View File

@ -1,88 +0,0 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "common/EasyAssert.h"
#include "ScalarIndex.h"
namespace milvus::segcore {
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
ScalarIndexVector::do_search_ids(const IdArray& ids) const {
auto res_ids = std::make_unique<IdArray>();
// TODO: support string array
static_assert(std::is_same_v<T, int64_t>);
AssertInfo(ids.has_int_id(), "ids doesn't have int_id field");
auto src_ids = ids.int_id();
auto dst_ids = res_ids->mutable_int_id();
std::vector<SegOffset> dst_offsets;
// TODO: a possible optimization:
// TODO: sort the input id array to make access cache friendly
// assume no repeated key now
// TODO: support repeated key
for (auto id : src_ids.data()) {
using Pair = std::pair<T, SegOffset>;
auto [iter_beg, iter_end] =
std::equal_range(mapping_.begin(),
mapping_.end(),
std::make_pair(id, SegOffset(0)),
[](const Pair& left, const Pair& right) {
return left.first < right.first;
});
for (auto& iter = iter_beg; iter != iter_end; iter++) {
auto [entry_id, entry_offset] = *iter;
dst_ids->add_data(entry_id);
dst_offsets.push_back(entry_offset);
}
}
return {std::move(res_ids), std::move(dst_offsets)};
}
std::pair<std::vector<idx_t>, std::vector<SegOffset>>
ScalarIndexVector::do_search_ids(const std::vector<idx_t>& ids) const {
std::vector<SegOffset> dst_offsets;
std::vector<idx_t> dst_ids;
for (auto id : ids) {
using Pair = std::pair<T, SegOffset>;
auto [iter_beg, iter_end] =
std::equal_range(mapping_.begin(),
mapping_.end(),
std::make_pair(id, SegOffset(0)),
[](const Pair& left, const Pair& right) {
return left.first < right.first;
});
for (auto& iter = iter_beg; iter != iter_end; iter++) {
auto [entry_id, entry_offset] = *iter_beg;
dst_ids.emplace_back(entry_id);
dst_offsets.push_back(entry_offset);
}
}
return {std::move(dst_ids), std::move(dst_offsets)};
}
void
ScalarIndexVector::append_data(const ScalarIndexVector::T* ids,
int64_t count,
SegOffset base) {
for (int64_t i = 0; i < count; ++i) {
auto offset = base + SegOffset(i);
mapping_.emplace_back(ids[i], offset);
}
}
void
ScalarIndexVector::build() {
std::sort(mapping_.begin(), mapping_.end());
}
} // namespace milvus::segcore

View File

@ -1,66 +0,0 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "common/Types.h"
#include "pb/schema.pb.h"
namespace milvus::segcore {
class ScalarIndexBase {
public:
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
do_search_ids(const IdArray& ids) const = 0;
virtual std::pair<std::vector<idx_t>, std::vector<SegOffset>>
do_search_ids(const std::vector<idx_t>& ids) const = 0;
virtual ~ScalarIndexBase() = default;
virtual std::string
debug() const = 0;
};
class ScalarIndexVector : public ScalarIndexBase {
using T = int64_t;
public:
// TODO: use proto::schema::ids
void
append_data(const T* ids, int64_t count, SegOffset base);
void
build();
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
do_search_ids(const IdArray& ids) const override;
std::pair<std::vector<idx_t>, std::vector<SegOffset>>
do_search_ids(const std::vector<idx_t>& ids) const override;
std::string
debug() const override {
std::string dbg_str;
for (auto pr : mapping_) {
dbg_str += "<" + std::to_string(pr.first) + "->" +
std::to_string(pr.second.get()) + ">";
}
return dbg_str;
}
private:
std::vector<std::pair<T, SegOffset>> mapping_;
};
} // namespace milvus::segcore

View File

@ -32,7 +32,7 @@ class SegmentGrowing : public SegmentInternalInterface {
int64_t size, int64_t size,
const int64_t* row_ids, const int64_t* row_ids,
const Timestamp* timestamps, const Timestamp* timestamps,
const InsertData* insert_data) = 0; const InsertRecordProto* insert_record_proto) = 0;
SegmentType SegmentType
type() const override { type() const override {

View File

@ -87,15 +87,13 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
int64_t num_rows, int64_t num_rows,
const int64_t* row_ids, const int64_t* row_ids,
const Timestamp* timestamps_raw, const Timestamp* timestamps_raw,
const InsertData* insert_data) { const InsertRecordProto* insert_record_proto) {
AssertInfo(insert_data->num_rows() == num_rows, AssertInfo(insert_record_proto->num_rows() == num_rows,
"Entities_raw count not equal to insert size"); "Entities_raw count not equal to insert size");
// AssertInfo(insert_data->fields_data_size() == schema_->size(),
// "num fields of insert data not equal to num of schema fields");
// step 1: check insert data if valid // step 1: check insert data if valid
std::unordered_map<FieldId, int64_t> field_id_to_offset; std::unordered_map<FieldId, int64_t> field_id_to_offset;
int64_t field_offset = 0; int64_t field_offset = 0;
for (const auto& field : insert_data->fields_data()) { for (const auto& field : insert_record_proto->fields_data()) {
auto field_id = FieldId(field.field_id()); auto field_id = FieldId(field.field_id());
AssertInfo(!field_id_to_offset.count(field_id), "duplicate field data"); AssertInfo(!field_id_to_offset.count(field_id), "duplicate field data");
field_id_to_offset.emplace(field_id, field_offset++); field_id_to_offset.emplace(field_id, field_offset++);
@ -122,7 +120,7 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
insert_record_.get_field_data_base(field_id)->set_data_raw( insert_record_.get_field_data_base(field_id)->set_data_raw(
reserved_offset, reserved_offset,
num_rows, num_rows,
&insert_data->fields_data(data_offset), &insert_record_proto->fields_data(data_offset),
field_meta); field_meta);
} }
//insert vector data into index //insert vector data into index
@ -131,13 +129,15 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
reserved_offset, reserved_offset,
num_rows, num_rows,
field_id, field_id,
&insert_data->fields_data(data_offset), &insert_record_proto->fields_data(data_offset),
insert_record_); insert_record_);
} }
// update average row data size // update average row data size
auto field_data_size = GetRawDataSizeOfDataArray( auto field_data_size = GetRawDataSizeOfDataArray(
&insert_data->fields_data(data_offset), field_meta, num_rows); &insert_record_proto->fields_data(data_offset),
field_meta,
num_rows);
if (datatype_is_variable(field_meta.get_data_type())) { if (datatype_is_variable(field_meta.get_data_type())) {
SegmentInternalInterface::set_field_avg_size( SegmentInternalInterface::set_field_avg_size(
field_id, num_rows, field_data_size); field_id, num_rows, field_data_size);
@ -153,7 +153,7 @@ SegmentGrowingImpl::Insert(int64_t reserved_offset,
AssertInfo(field_id.get() != INVALID_FIELD_ID, "Primary key is -1"); AssertInfo(field_id.get() != INVALID_FIELD_ID, "Primary key is -1");
std::vector<PkType> pks(num_rows); std::vector<PkType> pks(num_rows);
ParsePksFromFieldData( ParsePksFromFieldData(
pks, insert_data->fields_data(field_id_to_offset[field_id])); pks, insert_record_proto->fields_data(field_id_to_offset[field_id]));
for (int i = 0; i < num_rows; ++i) { for (int i = 0; i < num_rows; ++i) {
insert_record_.insert_pk(pks[i], reserved_offset + i); insert_record_.insert_pk(pks[i], reserved_offset + i);
} }

View File

@ -45,7 +45,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t size, int64_t size,
const int64_t* row_ids, const int64_t* row_ids,
const Timestamp* timestamps, const Timestamp* timestamps,
const InsertData* insert_data) override; const InsertRecordProto* insert_record_proto) override;
bool bool
Contain(const PkType& pk) const override { Contain(const PkType& pk) const override {

View File

@ -37,6 +37,7 @@
#include "common/FieldData.h" #include "common/FieldData.h"
#include "common/Types.h" #include "common/Types.h"
#include "log/Log.h" #include "log/Log.h"
#include "mmap/Utils.h"
#include "pb/schema.pb.h" #include "pb/schema.pb.h"
#include "mmap/Types.h" #include "mmap/Types.h"
#include "query/ScalarIndex.h" #include "query/ScalarIndex.h"
@ -252,7 +253,7 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& load_info) {
field_data_info.channel->set_capacity(parallel_degree * 2); field_data_info.channel->set_capacity(parallel_degree * 2);
auto& pool = auto& pool =
ThreadPools::GetThreadPool(milvus::ThreadPoolPriority::MIDDLE); ThreadPools::GetThreadPool(milvus::ThreadPoolPriority::MIDDLE);
auto load_future = pool.Submit( pool.Submit(
LoadFieldDatasFromRemote, insert_files, field_data_info.channel); LoadFieldDatasFromRemote, insert_files, field_data_info.channel);
LOG_INFO("segment {} submits load field {} task to thread pool", LOG_INFO("segment {} submits load field {} task to thread pool",
@ -272,6 +273,7 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& load_info) {
void void
SegmentSealedImpl::LoadFieldDataV2(const LoadFieldDataInfo& load_info) { SegmentSealedImpl::LoadFieldDataV2(const LoadFieldDataInfo& load_info) {
// TODO(SPARSE): support storage v2
// NOTE: lock only when data is ready to avoid starvation // NOTE: lock only when data is ready to avoid starvation
// only one field for now, parallel load field data in golang // only one field for now, parallel load field data in golang
size_t num_rows = storage::GetNumRowsForLoadInfo(load_info); size_t num_rows = storage::GetNumRowsForLoadInfo(load_info);
@ -435,6 +437,16 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
column = std::move(var_column); column = std::move(var_column);
break; break;
} }
case milvus::DataType::VECTOR_SPARSE_FLOAT: {
auto col = std::make_shared<SparseFloatColumn>(field_meta);
FieldDataPtr field_data;
while (data.channel->pop(field_data)) {
stats_.mem_size += field_data->Size();
col->AppendBatch(field_data);
}
column = std::move(col);
break;
}
default: { default: {
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
fmt::format("unsupported data type", data_type)); fmt::format("unsupported data type", data_type));
@ -566,6 +578,7 @@ SegmentSealedImpl::MapFieldData(const FieldId field_id, FieldDataInfo& data) {
column = std::move(arr_column); column = std::move(arr_column);
break; break;
} }
// TODO(SPARSE) support mmap
default: { default: {
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
fmt::format("unsupported data type {}", data_type)); fmt::format("unsupported data type {}", data_type));
@ -1514,14 +1527,17 @@ SegmentSealedImpl::generate_binlog_index(const FieldId field_id) {
auto& field_index_meta = col_index_meta_->GetFieldIndexMeta(field_id); auto& field_index_meta = col_index_meta_->GetFieldIndexMeta(field_id);
auto& index_params = field_index_meta.GetIndexParams(); auto& index_params = field_index_meta.GetIndexParams();
bool is_sparse =
field_meta.get_data_type() == DataType::VECTOR_SPARSE_FLOAT;
auto enable_binlog_index = [&]() { auto enable_binlog_index = [&]() {
// checkout config // checkout config
if (!segcore_config_.get_enable_interim_segment_index()) { if (!segcore_config_.get_enable_interim_segment_index()) {
return false; return false;
} }
// check data type // check data type
if (!field_meta.is_vector() || if (field_meta.get_data_type() != DataType::VECTOR_FLOAT &&
field_meta.get_data_type() != DataType::VECTOR_FLOAT) { !is_sparse) {
return false; return false;
} }
// check index type // check index type
@ -1546,7 +1562,7 @@ SegmentSealedImpl::generate_binlog_index(const FieldId field_id) {
std::shared_lock lck(mutex_); std::shared_lock lck(mutex_);
row_count = num_rows_.value(); row_count = num_rows_.value();
} }
auto dim = field_meta.get_dim();
// generate index params // generate index params
auto field_binlog_config = std::unique_ptr<VecIndexConfig>( auto field_binlog_config = std::unique_ptr<VecIndexConfig>(
new VecIndexConfig(row_count, new VecIndexConfig(row_count,
@ -1556,19 +1572,24 @@ SegmentSealedImpl::generate_binlog_index(const FieldId field_id) {
if (row_count < field_binlog_config->GetBuildThreshold()) { if (row_count < field_binlog_config->GetBuildThreshold()) {
return false; return false;
} }
auto build_config = field_binlog_config->GetBuildBaseParams();
build_config[knowhere::meta::DIM] = std::to_string(dim);
build_config[knowhere::meta::NUM_BUILD_THREAD] = std::to_string(1);
auto index_metric = field_binlog_config->GetMetricType();
std::shared_ptr<ColumnBase> vec_data{}; std::shared_ptr<ColumnBase> vec_data{};
{ {
std::shared_lock lck(mutex_); std::shared_lock lck(mutex_);
vec_data = fields_.at(field_id); vec_data = fields_.at(field_id);
} }
auto dim = is_sparse
? dynamic_cast<SparseFloatColumn*>(vec_data.get())->Dim()
: field_meta.get_dim();
auto build_config = field_binlog_config->GetBuildBaseParams();
build_config[knowhere::meta::DIM] = std::to_string(dim);
build_config[knowhere::meta::NUM_BUILD_THREAD] = std::to_string(1);
auto index_metric = field_binlog_config->GetMetricType();
auto dataset = auto dataset =
knowhere::GenDataSet(row_count, dim, (void*)vec_data->Data()); knowhere::GenDataSet(row_count, dim, (void*)vec_data->Data());
dataset->SetIsOwner(false); dataset->SetIsOwner(false);
dataset->SetIsSparse(is_sparse);
index::IndexBasePtr vec_index = index::IndexBasePtr vec_index =
std::make_unique<index::VectorMemIndex<float>>( std::make_unique<index::VectorMemIndex<float>>(

View File

@ -24,7 +24,6 @@
#include "ConcurrentVector.h" #include "ConcurrentVector.h"
#include "DeletedRecord.h" #include "DeletedRecord.h"
#include "ScalarIndex.h"
#include "SealedIndexingRecord.h" #include "SealedIndexingRecord.h"
#include "SegmentSealed.h" #include "SegmentSealed.h"
#include "TimestampIndex.h" #include "TimestampIndex.h"

View File

@ -21,8 +21,9 @@
#include "index/ScalarIndex.h" #include "index/ScalarIndex.h"
#include "log/Log.h" #include "log/Log.h"
#include "mmap/Utils.h" #include "mmap/Utils.h"
#include "storage/ThreadPool.h" #include "common/FieldData.h"
#include "storage/RemoteChunkManagerSingleton.h" #include "storage/RemoteChunkManagerSingleton.h"
#include "common/Common.h"
#include "storage/ThreadPools.h" #include "storage/ThreadPools.h"
#include "storage/Util.h" #include "storage/Util.h"
@ -205,6 +206,11 @@ GetRawDataSizeOfDataArray(const DataArray* data,
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
// TODO(SPARSE, size)
result += data->vectors().sparse_float_vector().ByteSizeLong();
break;
}
default: { default: {
PanicInfo( PanicInfo(
DataTypeInvalid, DataTypeInvalid,
@ -338,6 +344,10 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) {
obj->resize(length * sizeof(bfloat16)); obj->resize(length * sizeof(bfloat16));
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
// does nothing here
break;
}
default: { default: {
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
fmt::format("unsupported datatype {}", data_type)); fmt::format("unsupported datatype {}", data_type));
@ -446,8 +456,11 @@ CreateVectorDataArrayFrom(const void* data_raw,
field_meta.get_data_type())); field_meta.get_data_type()));
auto vector_array = data_array->mutable_vectors(); auto vector_array = data_array->mutable_vectors();
auto dim = field_meta.get_dim(); auto dim = 0;
if (!datatype_is_sparse_vector(data_type)) {
dim = field_meta.get_dim();
vector_array->set_dim(dim); vector_array->set_dim(dim);
}
switch (data_type) { switch (data_type) {
case DataType::VECTOR_FLOAT: { case DataType::VECTOR_FLOAT: {
auto length = count * dim; auto length = count * dim;
@ -479,6 +492,15 @@ CreateVectorDataArrayFrom(const void* data_raw,
obj->assign(data, length * sizeof(bfloat16)); obj->assign(data, length * sizeof(bfloat16));
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
SparseRowsToProto(
reinterpret_cast<const knowhere::sparse::SparseRow<float>*>(
data_raw),
count,
vector_array->mutable_sparse_float_vector());
vector_array->set_dim(vector_array->sparse_float_vector().dim());
break;
}
default: { default: {
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
fmt::format("unsupported datatype {}", data_type)); fmt::format("unsupported datatype {}", data_type));
@ -534,6 +556,15 @@ MergeDataArray(
auto data = VEC_FIELD_DATA(src_field_data, binary); auto data = VEC_FIELD_DATA(src_field_data, binary);
auto obj = vector_array->mutable_binary_vector(); auto obj = vector_array->mutable_binary_vector();
obj->assign(data + src_offset * num_bytes, num_bytes); obj->assign(data + src_offset * num_bytes, num_bytes);
} else if (field_meta.get_data_type() ==
DataType::VECTOR_SPARSE_FLOAT) {
auto src = src_field_data->vectors().sparse_float_vector();
auto dst = vector_array->mutable_sparse_float_vector();
if (src.dim() > dst->dim()) {
dst->set_dim(src.dim());
}
vector_array->set_dim(dst->dim());
*dst->mutable_contents() = src.contents();
} else { } else {
PanicInfo(DataTypeInvalid, PanicInfo(DataTypeInvalid,
fmt::format("unsupported datatype {}", data_type)); fmt::format("unsupported datatype {}", data_type));

View File

@ -207,12 +207,17 @@ Insert(CSegmentInterface c_segment,
const uint64_t data_info_len) { const uint64_t data_info_len) {
try { try {
auto segment = static_cast<milvus::segcore::SegmentGrowing*>(c_segment); auto segment = static_cast<milvus::segcore::SegmentGrowing*>(c_segment);
auto insert_data = std::make_unique<milvus::InsertData>(); auto insert_record_proto =
auto suc = insert_data->ParseFromArray(data_info, data_info_len); std::make_unique<milvus::InsertRecordProto>();
auto suc =
insert_record_proto->ParseFromArray(data_info, data_info_len);
AssertInfo(suc, "failed to parse insert data from records"); AssertInfo(suc, "failed to parse insert data from records");
segment->Insert( segment->Insert(reserved_offset,
reserved_offset, size, row_ids, timestamps, insert_data.get()); size,
row_ids,
timestamps,
insert_record_proto.get());
return milvus::SuccessCStatus(); return milvus::SuccessCStatus();
} catch (std::exception& e) { } catch (std::exception& e) {
return milvus::FailureCStatus(&e); return milvus::FailureCStatus(&e);

View File

@ -15,6 +15,7 @@
// limitations under the License. // limitations under the License.
#include "ChunkCache.h" #include "ChunkCache.h"
#include "mmap/Utils.h"
namespace milvus::storage { namespace milvus::storage {

View File

@ -58,7 +58,7 @@ class ChunkManager {
Read(const std::string& filepath, void* buf, uint64_t len) = 0; Read(const std::string& filepath, void* buf, uint64_t len) = 0;
/** /**
* @brief Write buffer to file with offset * @brief Write buffer to file without offset
* @param filepath * @param filepath
* @param buf * @param buf
* @param len * @param len

View File

@ -215,7 +215,8 @@ std::vector<uint8_t>
BaseEventData::Serialize() { BaseEventData::Serialize() {
auto data_type = field_data->get_data_type(); auto data_type = field_data->get_data_type();
std::shared_ptr<PayloadWriter> payload_writer; std::shared_ptr<PayloadWriter> payload_writer;
if (milvus::datatype_is_vector(data_type)) { if (milvus::datatype_is_vector(data_type) &&
data_type != DataType::VECTOR_SPARSE_FLOAT) {
payload_writer = payload_writer =
std::make_unique<PayloadWriter>(data_type, field_data->get_dim()); std::make_unique<PayloadWriter>(data_type, field_data->get_dim());
} else { } else {
@ -259,6 +260,18 @@ BaseEventData::Serialize() {
} }
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
for (size_t offset = 0; offset < field_data->get_num_rows();
++offset) {
auto row =
static_cast<const knowhere::sparse::SparseRow<float>*>(
field_data->RawValue(offset));
payload_writer->add_one_binary_payload(
static_cast<const uint8_t*>(row->data()),
row->data_byte_size());
}
break;
}
default: { default: {
auto payload = auto payload =
Payload{data_type, Payload{data_type,

View File

@ -59,7 +59,9 @@ PayloadReader::init(std::shared_ptr<arrow::io::BufferReader> input) {
int64_t column_index = 0; int64_t column_index = 0;
auto file_meta = arrow_reader->parquet_reader()->metadata(); auto file_meta = arrow_reader->parquet_reader()->metadata();
dim_ = datatype_is_vector(column_type_) // dim is unused for sparse float vector
dim_ = (datatype_is_vector(column_type_) &&
column_type_ != DataType::VECTOR_SPARSE_FLOAT)
? GetDimensionFromFileMetaData( ? GetDimensionFromFileMetaData(
file_meta->schema()->Column(column_index), column_type_) file_meta->schema()->Column(column_index), column_type_)
: 1; : 1;

View File

@ -31,6 +31,9 @@ PayloadWriter::PayloadWriter(const DataType column_type)
// create payload writer for vector data type // create payload writer for vector data type
PayloadWriter::PayloadWriter(const DataType column_type, int dim) PayloadWriter::PayloadWriter(const DataType column_type, int dim)
: column_type_(column_type) { : column_type_(column_type) {
AssertInfo(column_type != DataType::VECTOR_SPARSE_FLOAT,
"PayloadWriter for Sparse Float Vector should be created "
"using the constructor without dimension");
init_dimension(dim); init_dimension(dim);
} }
@ -58,7 +61,9 @@ PayloadWriter::add_one_string_payload(const char* str, int str_size) {
void void
PayloadWriter::add_one_binary_payload(const uint8_t* data, int length) { PayloadWriter::add_one_binary_payload(const uint8_t* data, int length) {
AssertInfo(output_ == nullptr, "payload writer has been finished"); AssertInfo(output_ == nullptr, "payload writer has been finished");
AssertInfo(milvus::datatype_is_binary(column_type_), "mismatch data type"); AssertInfo(milvus::datatype_is_binary(column_type_) ||
milvus::datatype_is_sparse_vector(column_type_),
"mismatch data type");
AddOneBinaryToArrowBuilder(builder_, data, length); AddOneBinaryToArrowBuilder(builder_, data, length);
rows_.fetch_add(1); rows_.fetch_add(1);
} }

View File

@ -39,8 +39,10 @@
#include "storage/OpenDALChunkManager.h" #include "storage/OpenDALChunkManager.h"
#endif #endif
#include "storage/Types.h" #include "storage/Types.h"
#include "storage/ThreadPools.h"
#include "storage/Util.h" #include "storage/Util.h"
#include "storage/ThreadPools.h"
#include "storage/MemFileManagerImpl.h"
#include "storage/DiskFileManagerImpl.h"
namespace milvus::storage { namespace milvus::storage {
@ -170,6 +172,12 @@ AddPayloadToArrowBuilder(std::shared_ptr<arrow::ArrayBuilder> builder,
add_vector_payload(builder, const_cast<uint8_t*>(raw_data), length); add_vector_payload(builder, const_cast<uint8_t*>(raw_data), length);
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
PanicInfo(DataTypeInvalid,
"Sparse Float Vector payload should be added by calling "
"add_one_binary_payload",
data_type);
}
default: { default: {
PanicInfo(DataTypeInvalid, "unsupported data type {}", data_type); PanicInfo(DataTypeInvalid, "unsupported data type {}", data_type);
} }
@ -242,6 +250,10 @@ CreateArrowBuilder(DataType data_type) {
case DataType::JSON: { case DataType::JSON: {
return std::make_shared<arrow::BinaryBuilder>(); return std::make_shared<arrow::BinaryBuilder>();
} }
// sparse float vector doesn't require a dim
case DataType::VECTOR_SPARSE_FLOAT: {
return std::make_shared<arrow::BinaryBuilder>();
}
default: { default: {
PanicInfo( PanicInfo(
DataTypeInvalid, "unsupported numeric data type {}", data_type); DataTypeInvalid, "unsupported numeric data type {}", data_type);
@ -311,6 +323,10 @@ CreateArrowSchema(DataType data_type) {
case DataType::JSON: { case DataType::JSON: {
return arrow::schema({arrow::field("val", arrow::binary())}); return arrow::schema({arrow::field("val", arrow::binary())});
} }
// sparse float vector doesn't require a dim
case DataType::VECTOR_SPARSE_FLOAT: {
return arrow::schema({arrow::field("val", arrow::binary())});
}
default: { default: {
PanicInfo( PanicInfo(
DataTypeInvalid, "unsupported numeric data type {}", data_type); DataTypeInvalid, "unsupported numeric data type {}", data_type);
@ -341,6 +357,9 @@ CreateArrowSchema(DataType data_type, int dim) {
return arrow::schema({arrow::field( return arrow::schema({arrow::field(
"val", arrow::fixed_size_binary(dim * sizeof(bfloat16)))}); "val", arrow::fixed_size_binary(dim * sizeof(bfloat16)))});
} }
case DataType::VECTOR_SPARSE_FLOAT: {
return arrow::schema({arrow::field("val", arrow::binary())});
}
default: { default: {
PanicInfo( PanicInfo(
DataTypeInvalid, "unsupported vector data type {}", data_type); DataTypeInvalid, "unsupported vector data type {}", data_type);
@ -364,6 +383,11 @@ GetDimensionFromFileMetaData(const parquet::ColumnDescriptor* schema,
case DataType::VECTOR_BFLOAT16: { case DataType::VECTOR_BFLOAT16: {
return schema->type_length() / sizeof(bfloat16); return schema->type_length() / sizeof(bfloat16);
} }
case DataType::VECTOR_SPARSE_FLOAT: {
PanicInfo(DataTypeInvalid,
fmt::format("GetDimensionFromFileMetaData should not be "
"called for sparse vector"));
}
default: default:
PanicInfo(DataTypeInvalid, "unsupported data type {}", data_type); PanicInfo(DataTypeInvalid, "unsupported data type {}", data_type);
} }
@ -501,11 +525,12 @@ EncodeAndUploadFieldSlice(ChunkManager* chunk_manager,
field_data->FillFieldData(buf, element_count); field_data->FillFieldData(buf, element_count);
auto insertData = std::make_shared<InsertData>(field_data); auto insertData = std::make_shared<InsertData>(field_data);
insertData->SetFieldDataMeta(field_data_meta); insertData->SetFieldDataMeta(field_data_meta);
auto serialized_index_data = insertData->serialize_to_remote_file(); auto serialized_inserted_data = insertData->serialize_to_remote_file();
auto serialized_index_size = serialized_index_data.size(); auto serialized_inserted_data_size = serialized_inserted_data.size();
chunk_manager->Write( chunk_manager->Write(object_key,
object_key, serialized_index_data.data(), serialized_index_size); serialized_inserted_data.data(),
return std::make_pair(std::move(object_key), serialized_index_size); serialized_inserted_data_size);
return std::make_pair(std::move(object_key), serialized_inserted_data_size);
} }
std::vector<std::future<std::unique_ptr<DataCodec>>> std::vector<std::future<std::unique_ptr<DataCodec>>>
@ -738,6 +763,9 @@ CreateFieldData(const DataType& type, int64_t dim, int64_t total_num_rows) {
case DataType::VECTOR_BFLOAT16: case DataType::VECTOR_BFLOAT16:
return std::make_shared<FieldData<BFloat16Vector>>( return std::make_shared<FieldData<BFloat16Vector>>(
dim, type, total_num_rows); dim, type, total_num_rows);
case DataType::VECTOR_SPARSE_FLOAT:
return std::make_shared<FieldData<SparseFloatVector>>(
type, total_num_rows);
default: default:
throw SegcoreError( throw SegcoreError(
DataTypeInvalid, DataTypeInvalid,

View File

@ -31,6 +31,8 @@ typedef struct CBuffer {
} CBuffer; } CBuffer;
//============= payload writer ====================== //============= payload writer ======================
// TODO(SPARSE): CPayloadWriter is no longer used as we switch to the payload
// writer in golang. Thus not implementing sparse float vector support here.
typedef void* CPayloadWriter; typedef void* CPayloadWriter;
CPayloadWriter CPayloadWriter
NewPayloadWriter(int columnType); NewPayloadWriter(int columnType);

View File

@ -141,7 +141,7 @@ class BinlogIndexTest : public ::testing::TestWithParam<Param> {
std::shared_ptr<float[]> vec_data; std::shared_ptr<float[]> vec_data;
}; };
INSTANTIATE_TEST_CASE_P(MetricTypeParameters, INSTANTIATE_TEST_SUITE_P(MetricTypeParameters,
BinlogIndexTest, BinlogIndexTest,
::testing::Values(knowhere::metric::L2)); ::testing::Values(knowhere::metric::L2));

View File

@ -34,8 +34,7 @@ TEST(ConcurrentVector, TestSingle) {
for (auto& x : vec) { for (auto& x : vec) {
x = data++; x = data++;
} }
c_vec.grow_to_at_least(total_count + insert_size); c_vec.set_data_raw(total_count, vec.data(), insert_size);
c_vec.set_data(total_count, vec.data(), insert_size);
total_count += insert_size; total_count += insert_size;
} }
ASSERT_EQ(c_vec.num_chunk(), (total_count + 31) / 32); ASSERT_EQ(c_vec.num_chunk(), (total_count + 31) / 32);
@ -66,8 +65,7 @@ TEST(ConcurrentVector, TestMultithreads) {
x = data++ * threads + thread_id; x = data++ * threads + thread_id;
} }
auto offset = ack_counter.fetch_add(insert_size); auto offset = ack_counter.fetch_add(insert_size);
c_vec.grow_to_at_least(offset + insert_size); c_vec.set_data_raw(offset, vec.data(), insert_size);
c_vec.set_data(offset, vec.data(), insert_size);
total_count += insert_size; total_count += insert_size;
} }
assert(data == total_count * dim); assert(data == total_count * dim);

View File

@ -22,6 +22,8 @@
#include "storage/Util.h" #include "storage/Util.h"
#include "common/Consts.h" #include "common/Consts.h"
#include "common/Json.h" #include "common/Json.h"
#include "test_utils/Constants.h"
#include "test_utils/DataGen.h"
using namespace milvus; using namespace milvus;
@ -274,6 +276,45 @@ TEST(storage, InsertDataFloatVector) {
ASSERT_EQ(data, new_data); ASSERT_EQ(data, new_data);
} }
TEST(storage, InsertDataSparseFloat) {
auto n_rows = 100;
auto vecs = milvus::segcore::GenerateRandomSparseFloatVector(
n_rows, kTestSparseDim, kTestSparseVectorDensity);
auto field_data = milvus::storage::CreateFieldData(
storage::DataType::VECTOR_SPARSE_FLOAT, kTestSparseDim, n_rows);
field_data->FillFieldData(vecs.get(), n_rows);
storage::InsertData insert_data(field_data);
storage::FieldDataMeta field_data_meta{100, 101, 102, 103};
insert_data.SetFieldDataMeta(field_data_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::StorageType::Remote);
std::shared_ptr<uint8_t[]> serialized_data_ptr(serialized_bytes.data(),
[&](uint8_t*) {});
auto new_insert_data = storage::DeserializeFileData(
serialized_data_ptr, serialized_bytes.size());
ASSERT_EQ(new_insert_data->GetCodecType(), storage::InsertDataType);
ASSERT_EQ(new_insert_data->GetTimeRage(),
std::make_pair(Timestamp(0), Timestamp(100)));
auto new_payload = new_insert_data->GetFieldData();
ASSERT_TRUE(new_payload->get_data_type() ==
storage::DataType::VECTOR_SPARSE_FLOAT);
ASSERT_EQ(new_payload->get_num_rows(), n_rows);
auto new_data = static_cast<const knowhere::sparse::SparseRow<float>*>(
new_payload->Data());
for (auto i = 0; i < n_rows; ++i) {
auto& original = vecs[i];
auto& new_vec = new_data[i];
ASSERT_EQ(original.size(), new_vec.size());
for (auto j = 0; j < original.size(); ++j) {
ASSERT_EQ(original[j].id, new_vec[j].id);
ASSERT_EQ(original[j].val, new_vec[j].val);
}
}
}
TEST(storage, InsertDataBinaryVector) { TEST(storage, InsertDataBinaryVector) {
std::vector<uint8_t> data = {1, 2, 3, 4, 5, 6, 7, 8}; std::vector<uint8_t> data = {1, 2, 3, 4, 5, 6, 7, 8};
int DIM = 16; int DIM = 16;

View File

@ -155,7 +155,7 @@ class GrowingIndexGetVectorTest : public ::testing::TestWithParam<Param> {
const char* metricType; const char* metricType;
}; };
INSTANTIATE_TEST_CASE_P(IndexTypeParameters, INSTANTIATE_TEST_SUITE_P(IndexTypeParameters,
GrowingIndexGetVectorTest, GrowingIndexGetVectorTest,
::testing::Values(knowhere::metric::L2, ::testing::Values(knowhere::metric::L2,
knowhere::metric::COSINE, knowhere::metric::COSINE,

View File

@ -79,6 +79,70 @@ TEST(FloatVecIndex, All) {
{ DeleteBinarySet(binary_set); } { DeleteBinarySet(binary_set); }
} }
TEST(SparseFloatVecIndex, All) {
auto index_type = knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX;
auto metric_type = knowhere::metric::IP;
indexcgo::TypeParams type_params;
indexcgo::IndexParams index_params;
std::tie(type_params, index_params) =
generate_params(index_type, metric_type);
std::string type_params_str, index_params_str;
bool ok = google::protobuf::TextFormat::PrintToString(type_params,
&type_params_str);
assert(ok);
ok = google::protobuf::TextFormat::PrintToString(index_params,
&index_params_str);
assert(ok);
auto dataset = GenDatasetWithDataType(
NB, metric_type, milvus::DataType::VECTOR_SPARSE_FLOAT);
auto xb_data = dataset.get_col<knowhere::sparse::SparseRow<float>>(
milvus::FieldId(100));
CDataType dtype = SparseFloatVector;
CIndex index;
CStatus status;
CBinarySet binary_set;
CIndex copy_index;
{
status = CreateIndexV0(
dtype, type_params_str.c_str(), index_params_str.c_str(), &index);
ASSERT_EQ(milvus::Success, status.error_code);
}
{
status = BuildSparseFloatVecIndex(
index,
NB,
kTestSparseDim,
static_cast<const uint8_t*>(
static_cast<const void*>(xb_data.data())));
ASSERT_EQ(milvus::Success, status.error_code);
}
{
status = SerializeIndexToBinarySet(index, &binary_set);
ASSERT_EQ(milvus::Success, status.error_code);
}
{
status = CreateIndexV0(dtype,
type_params_str.c_str(),
index_params_str.c_str(),
&copy_index);
ASSERT_EQ(milvus::Success, status.error_code);
}
{
status = LoadIndexFromBinarySet(copy_index, binary_set);
ASSERT_EQ(milvus::Success, status.error_code);
}
{
status = DeleteIndex(index);
ASSERT_EQ(milvus::Success, status.error_code);
}
{
status = DeleteIndex(copy_index);
ASSERT_EQ(milvus::Success, status.error_code);
}
{ DeleteBinarySet(binary_set); }
}
TEST(Float16VecIndex, All) { TEST(Float16VecIndex, All) {
auto index_type = knowhere::IndexEnum::INDEX_FAISS_IVFPQ; auto index_type = knowhere::IndexEnum::INDEX_FAISS_IVFPQ;
auto metric_type = knowhere::metric::L2; auto metric_type = knowhere::metric::L2;

View File

@ -59,35 +59,23 @@ class IndexWrapperTest : public ::testing::TestWithParam<Param> {
search_conf = generate_search_conf(index_type, metric_type); search_conf = generate_search_conf(index_type, metric_type);
std::map<knowhere::MetricType, bool> is_binary_map = { std::map<knowhere::MetricType, DataType> index_to_vec_type = {
{knowhere::IndexEnum::INDEX_FAISS_IDMAP, false}, {knowhere::IndexEnum::INDEX_FAISS_IDMAP, DataType::VECTOR_FLOAT},
{knowhere::IndexEnum::INDEX_FAISS_IVFPQ, false}, {knowhere::IndexEnum::INDEX_FAISS_IVFPQ, DataType::VECTOR_FLOAT},
{knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, false}, {knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, DataType::VECTOR_FLOAT},
{knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, false}, {knowhere::IndexEnum::INDEX_FAISS_IVFSQ8, DataType::VECTOR_FLOAT},
{knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT, true}, {knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT,
{knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, true}, DataType::VECTOR_BINARY},
{knowhere::IndexEnum::INDEX_HNSW, false}, {knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP,
DataType::VECTOR_BINARY},
{knowhere::IndexEnum::INDEX_HNSW, DataType::VECTOR_FLOAT},
{knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
DataType::VECTOR_SPARSE_FLOAT},
{knowhere::IndexEnum::INDEX_SPARSE_WAND,
DataType::VECTOR_SPARSE_FLOAT},
}; };
is_binary = is_binary_map[index_type]; vec_field_data_type = index_to_vec_type[index_type];
if (is_binary) {
vec_field_data_type = DataType::VECTOR_BINARY;
} else {
vec_field_data_type = DataType::VECTOR_FLOAT;
}
auto dataset = GenDataset(NB, metric_type, is_binary);
if (!is_binary) {
xb_data = dataset.get_col<float>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataSet(NB, DIM, xb_data.data());
xq_dataset = knowhere::GenDataSet(
NQ, DIM, xb_data.data() + DIM * query_offset);
} else {
xb_bin_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataSet(NB, DIM, xb_bin_data.data());
xq_dataset = knowhere::GenDataSet(
NQ, DIM, xb_bin_data.data() + DIM * query_offset);
}
} }
void void
@ -101,18 +89,13 @@ class IndexWrapperTest : public ::testing::TestWithParam<Param> {
std::string type_params_str, index_params_str; std::string type_params_str, index_params_str;
Config config; Config config;
milvus::Config search_conf; milvus::Config search_conf;
bool is_binary;
DataType vec_field_data_type; DataType vec_field_data_type;
knowhere::DataSetPtr xb_dataset; int64_t query_offset = 1;
FixedVector<float> xb_data; int64_t NB = 10;
FixedVector<uint8_t> xb_bin_data;
knowhere::DataSetPtr xq_dataset;
int64_t query_offset = 100;
int64_t NB = 10000;
StorageConfig storage_config_; StorageConfig storage_config_;
}; };
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_SUITE_P(
IndexTypeParameters, IndexTypeParameters,
IndexWrapperTest, IndexWrapperTest,
::testing::Values( ::testing::Values(
@ -126,7 +109,11 @@ INSTANTIATE_TEST_CASE_P(
knowhere::metric::JACCARD), knowhere::metric::JACCARD),
std::pair(knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, std::pair(knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP,
knowhere::metric::JACCARD), knowhere::metric::JACCARD),
std::pair(knowhere::IndexEnum::INDEX_HNSW, knowhere::metric::L2))); std::pair(knowhere::IndexEnum::INDEX_HNSW, knowhere::metric::L2),
std::pair(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
knowhere::metric::IP),
std::pair(knowhere::IndexEnum::INDEX_SPARSE_WAND,
knowhere::metric::IP)));
TEST_P(IndexWrapperTest, BuildAndQuery) { TEST_P(IndexWrapperTest, BuildAndQuery) {
milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100}; milvus::storage::FieldDataMeta field_data_meta{1, 2, 3, 100};
@ -139,20 +126,29 @@ TEST_P(IndexWrapperTest, BuildAndQuery) {
std::to_string(knowhere::Version::GetCurrentVersion().VersionNumber()); std::to_string(knowhere::Version::GetCurrentVersion().VersionNumber());
auto index = milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex( auto index = milvus::indexbuilder::IndexFactory::GetInstance().CreateIndex(
vec_field_data_type, config, file_manager_context); vec_field_data_type, config, file_manager_context);
auto dataset = GenDataset(NB, metric_type, is_binary);
knowhere::DataSetPtr xb_dataset; knowhere::DataSetPtr xb_dataset;
FixedVector<uint8_t> bin_vecs; if (vec_field_data_type == DataType::VECTOR_BINARY) {
FixedVector<float> f_vecs; auto dataset = GenDataset(NB, metric_type, true);
if (is_binary) { auto bin_vecs = dataset.get_col<uint8_t>(milvus::FieldId(100));
bin_vecs = dataset.get_col<uint8_t>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataSet(NB, DIM, bin_vecs.data()); xb_dataset = knowhere::GenDataSet(NB, DIM, bin_vecs.data());
ASSERT_NO_THROW(index->Build(xb_dataset));
} else if (vec_field_data_type == DataType::VECTOR_SPARSE_FLOAT) {
auto dataset = GenDatasetWithDataType(
NB, metric_type, milvus::DataType::VECTOR_SPARSE_FLOAT);
auto sparse_vecs = dataset.get_col<knowhere::sparse::SparseRow<float>>(
milvus::FieldId(100));
xb_dataset =
knowhere::GenDataSet(NB, kTestSparseDim, sparse_vecs.data());
xb_dataset->SetIsSparse(true);
ASSERT_NO_THROW(index->Build(xb_dataset));
} else { } else {
f_vecs = dataset.get_col<float>(milvus::FieldId(100)); // VECTOR_FLOAT
auto dataset = GenDataset(NB, metric_type, false);
auto f_vecs = dataset.get_col<float>(milvus::FieldId(100));
xb_dataset = knowhere::GenDataSet(NB, DIM, f_vecs.data()); xb_dataset = knowhere::GenDataSet(NB, DIM, f_vecs.data());
ASSERT_NO_THROW(index->Build(xb_dataset));
} }
ASSERT_NO_THROW(index->Build(xb_dataset));
auto binary_set = index->Serialize(); auto binary_set = index->Serialize();
FixedVector<std::string> index_files; FixedVector<std::string> index_files;
for (auto& binary : binary_set.binary_map_) { for (auto& binary : binary_set.binary_map_) {
@ -164,21 +160,53 @@ TEST_P(IndexWrapperTest, BuildAndQuery) {
vec_field_data_type, config, file_manager_context); vec_field_data_type, config, file_manager_context);
auto vec_index = auto vec_index =
static_cast<milvus::indexbuilder::VecIndexCreator*>(copy_index.get()); static_cast<milvus::indexbuilder::VecIndexCreator*>(copy_index.get());
if (vec_field_data_type != DataType::VECTOR_SPARSE_FLOAT) {
ASSERT_EQ(vec_index->dim(), DIM); ASSERT_EQ(vec_index->dim(), DIM);
}
ASSERT_NO_THROW(vec_index->Load(binary_set)); ASSERT_NO_THROW(vec_index->Load(binary_set));
if (vec_field_data_type == DataType::VECTOR_SPARSE_FLOAT) {
// TODO(SPARSE): complete test in PR adding search/query to sparse
// float vector.
return;
}
milvus::SearchInfo search_info; milvus::SearchInfo search_info;
search_info.topk_ = K; search_info.topk_ = K;
search_info.metric_type_ = metric_type; search_info.metric_type_ = metric_type;
search_info.search_params_ = search_conf; search_info.search_params_ = search_conf;
auto result = vec_index->Query(xq_dataset, search_info, nullptr); std::unique_ptr<SearchResult> result;
if (vec_field_data_type == DataType::VECTOR_FLOAT) {
auto dataset = GenDataset(NB, metric_type, false);
auto xb_data = dataset.get_col<float>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataSet(NB, DIM, xb_data.data());
auto xq_dataset =
knowhere::GenDataSet(NQ, DIM, xb_data.data() + DIM * query_offset);
result = vec_index->Query(xq_dataset, search_info, nullptr);
} else if (vec_field_data_type == DataType::VECTOR_SPARSE_FLOAT) {
auto dataset = GenDatasetWithDataType(
NQ, metric_type, milvus::DataType::VECTOR_SPARSE_FLOAT);
auto xb_data = dataset.get_col<knowhere::sparse::SparseRow<float>>(
milvus::FieldId(100));
auto xq_dataset =
knowhere::GenDataSet(NQ, kTestSparseDim, xb_data.data());
xq_dataset->SetIsSparse(true);
result = vec_index->Query(xq_dataset, search_info, nullptr);
} else {
auto dataset = GenDataset(NB, metric_type, true);
auto xb_bin_data = dataset.get_col<uint8_t>(milvus::FieldId(100));
auto xb_dataset = knowhere::GenDataSet(NB, DIM, xb_bin_data.data());
auto xq_dataset = knowhere::GenDataSet(
NQ, DIM, xb_bin_data.data() + DIM * query_offset);
result = vec_index->Query(xq_dataset, search_info, nullptr);
}
EXPECT_EQ(result->total_nq_, NQ); EXPECT_EQ(result->total_nq_, NQ);
EXPECT_EQ(result->unity_topK_, K); EXPECT_EQ(result->unity_topK_, K);
EXPECT_EQ(result->distances_.size(), NQ * K); EXPECT_EQ(result->distances_.size(), NQ * K);
EXPECT_EQ(result->seg_offsets_.size(), NQ * K); EXPECT_EQ(result->seg_offsets_.size(), NQ * K);
if (!is_binary) { if (vec_field_data_type == DataType::VECTOR_FLOAT) {
EXPECT_EQ(result->seg_offsets_[0], query_offset); EXPECT_EQ(result->seg_offsets_[0], query_offset);
} }
} }

View File

@ -383,7 +383,7 @@ class IndexTest : public ::testing::TestWithParam<Param> {
StorageConfig storage_config_; StorageConfig storage_config_;
}; };
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_SUITE_P(
IndexTypeParameters, IndexTypeParameters,
IndexTest, IndexTest,
::testing::Values( ::testing::Values(
@ -990,7 +990,7 @@ TEST(Indexing, SearchDiskAnnWithInvalidParam) {
// boost::filesystem::path mmap_file_path; // boost::filesystem::path mmap_file_path;
//}; //};
// //
//INSTANTIATE_TEST_CASE_P( //INSTANTIATE_TEST_SUITE_P(
// IndexTypeParameters, // IndexTypeParameters,
// IndexTestV2, // IndexTestV2,
// testing::Combine( // testing::Combine(

View File

@ -157,7 +157,7 @@ class RangeSearchSortTest
float dist_min = 0.0, dist_max = 100.0; float dist_min = 0.0, dist_max = 100.0;
}; };
INSTANTIATE_TEST_CASE_P(RangeSearchSortParameters, INSTANTIATE_TEST_SUITE_P(RangeSearchSortParameters,
RangeSearchSortTest, RangeSearchSortTest,
::testing::Values(knowhere::metric::L2, ::testing::Values(knowhere::metric::L2,
knowhere::metric::IP, knowhere::metric::IP,

View File

@ -15,7 +15,6 @@
#include "knowhere/comp/index_param.h" #include "knowhere/comp/index_param.h"
#include "query/Expr.h" #include "query/Expr.h"
#include "query/ExprImpl.h" #include "query/ExprImpl.h"
#include "segcore/ScalarIndex.h"
#include "test_utils/DataGen.h" #include "test_utils/DataGen.h"
#include "exec/expression/Expr.h" #include "exec/expression/Expr.h"
#include "plan/PlanNode.h" #include "plan/PlanNode.h"
@ -30,32 +29,6 @@ RetrieveUsingDefaultOutputSize(SegmentInterface* segment,
return segment->Retrieve(plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE); return segment->Retrieve(plan, timestamp, DEFAULT_MAX_OUTPUT_SIZE);
} }
TEST(Retrieve, ScalarIndex) {
SUCCEED();
auto index = std::make_unique<ScalarIndexVector>();
std::vector<int64_t> data;
int N = 1000;
auto req_ids = std::make_unique<IdArray>();
auto req_ids_arr = req_ids->mutable_int_id();
for (int i = 0; i < N; ++i) {
data.push_back(i * 3 % N);
req_ids_arr->add_data(i);
}
index->append_data(data.data(), N, SegOffset(10000));
index->build();
auto [res_ids, res_offsets] = index->do_search_ids(*req_ids);
auto res_ids_arr = res_ids->int_id();
for (int i = 0; i < N; ++i) {
auto res_offset = res_offsets[i].get() - 10000;
auto res_id = res_ids_arr.data(i);
auto std_id = (res_offset * 3 % N);
ASSERT_EQ(res_id, std_id);
}
}
TEST(Retrieve, AutoID) { TEST(Retrieve, AutoID) {
auto schema = std::make_shared<Schema>(); auto schema = std::make_shared<Schema>();
auto fid_64 = schema->AddDebugField("i64", DataType::INT64); auto fid_64 = schema->AddDebugField("i64", DataType::INT64);

View File

@ -9,6 +9,10 @@
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License // or implied. See the License for the specific language governing permissions and limitations under the License
#include <vector>
#include <memory>
#include <cstring>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string.h> #include <string.h>
#include <boost/uuid/uuid.hpp> #include <boost/uuid/uuid.hpp>
@ -19,6 +23,8 @@
#include "common/Types.h" #include "common/Types.h"
#include "common/Utils.h" #include "common/Utils.h"
#include "common/Exception.h" #include "common/Exception.h"
#include "knowhere/sparse_utils.h"
#include "pb/schema.pb.h"
#include "query/Utils.h" #include "query/Utils.h"
#include "test_utils/DataGen.h" #include "test_utils/DataGen.h"
@ -131,8 +137,7 @@ TEST(Util, upper_bound) {
std::vector<Timestamp> data{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<Timestamp> data{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
ConcurrentVector<Timestamp> timestamps(1); ConcurrentVector<Timestamp> timestamps(1);
timestamps.grow_to_at_least(data.size()); timestamps.set_data_raw(0, data.data(), data.size());
timestamps.set_data(0, data.data(), data.size());
ASSERT_EQ(1, upper_bound(timestamps, 0, data.size(), 0)); ASSERT_EQ(1, upper_bound(timestamps, 0, data.size(), 0));
ASSERT_EQ(5, upper_bound(timestamps, 0, data.size(), 4)); ASSERT_EQ(5, upper_bound(timestamps, 0, data.size(), 4));

View File

@ -13,3 +13,6 @@
constexpr int64_t TestChunkSize = 32 * 1024; constexpr int64_t TestChunkSize = 32 * 1024;
constexpr char TestLocalPath[] = "/tmp/milvus/local_data/"; constexpr char TestLocalPath[] = "/tmp/milvus/local_data/";
constexpr char TestRemotePath[] = "/tmp/milvus/remote_data"; constexpr char TestRemotePath[] = "/tmp/milvus/remote_data";
constexpr int64_t kTestSparseDim = 10000;
constexpr float kTestSparseVectorDensity = 0.0003;

View File

@ -16,7 +16,9 @@
#include <memory> #include <memory>
#include <random> #include <random>
#include <string> #include <string>
#include <cmath>
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include "Constants.h" #include "Constants.h"
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
@ -42,7 +44,7 @@ namespace milvus::segcore {
struct GeneratedData { struct GeneratedData {
std::vector<idx_t> row_ids_; std::vector<idx_t> row_ids_;
std::vector<Timestamp> timestamps_; std::vector<Timestamp> timestamps_;
InsertData* raw_; InsertRecordProto* raw_;
std::vector<FieldId> field_ids; std::vector<FieldId> field_ids;
SchemaPtr schema_; SchemaPtr schema_;
@ -92,7 +94,8 @@ struct GeneratedData {
} }
auto& field_meta = schema_->operator[](field_id); auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_vector()) { if (field_meta.is_vector() &&
field_meta.get_data_type() != DataType::VECTOR_SPARSE_FLOAT) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) { if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
int len = raw_->num_rows() * field_meta.get_dim(); int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len); ret.resize(len);
@ -111,7 +114,6 @@ struct GeneratedData {
std::copy_n(src_data, len, ret.data()); std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() == } else if (field_meta.get_data_type() ==
DataType::VECTOR_FLOAT16) { DataType::VECTOR_FLOAT16) {
// int len = raw_->num_rows() * field_meta.get_dim() * sizeof(float16);
int len = raw_->num_rows() * field_meta.get_dim(); int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len); ret.resize(len);
auto src_data = reinterpret_cast<const T*>( auto src_data = reinterpret_cast<const T*>(
@ -119,7 +121,6 @@ struct GeneratedData {
std::copy_n(src_data, len, ret.data()); std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() == } else if (field_meta.get_data_type() ==
DataType::VECTOR_BFLOAT16) { DataType::VECTOR_BFLOAT16) {
// int len = raw_->num_rows() * field_meta.get_dim() * sizeof(bfloat16);
int len = raw_->num_rows() * field_meta.get_dim(); int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len); ret.resize(len);
auto src_data = reinterpret_cast<const T*>( auto src_data = reinterpret_cast<const T*>(
@ -131,7 +132,13 @@ struct GeneratedData {
return std::move(ret); return std::move(ret);
} }
if constexpr (std::is_same_v<T, ScalarArray>) { if constexpr (std::is_same_v<T,
knowhere::sparse::SparseRow<float>>) {
auto sparse_float_array =
target_field_data.vectors().sparse_float_vector();
auto rows = SparseBytesToRows(sparse_float_array.contents());
std::copy_n(rows.get(), raw_->num_rows(), ret.data());
} else if constexpr (std::is_same_v<T, ScalarArray>) {
auto ret_data = reinterpret_cast<ScalarArray*>(ret.data()); auto ret_data = reinterpret_cast<ScalarArray*>(ret.data());
auto src_data = target_field_data.scalars().array_data().data(); auto src_data = target_field_data.scalars().array_data().data();
std::copy(src_data.begin(), src_data.end(), ret_data); std::copy(src_data.begin(), src_data.end(), ret_data);
@ -238,8 +245,50 @@ struct GeneratedData {
int array_len); int array_len);
}; };
inline GeneratedData inline std::unique_ptr<knowhere::sparse::SparseRow<float>[]>
DataGen(SchemaPtr schema, GenerateRandomSparseFloatVector(size_t rows,
size_t cols,
float density,
int seed = 42) {
int32_t num_elements = static_cast<int32_t>(rows * cols * density);
std::mt19937 rng(seed);
auto real_distrib = std::uniform_real_distribution<float>(0, 1);
auto row_distrib = std::uniform_int_distribution<int32_t>(0, rows - 1);
auto col_distrib = std::uniform_int_distribution<int32_t>(0, cols - 1);
std::vector<std::map<int32_t, float>> data(rows);
for (int32_t i = 0; i < num_elements; ++i) {
auto row = row_distrib(rng);
while (data[row].size() == (size_t)cols) {
row = row_distrib(rng);
}
auto col = col_distrib(rng);
while (data[row].find(col) != data[row].end()) {
col = col_distrib(rng);
}
auto val = real_distrib(rng);
data[row][col] = val;
}
auto tensor = std::make_unique<knowhere::sparse::SparseRow<float>[]>(rows);
for (int32_t i = 0; i < rows; ++i) {
if (data[i].size() == 0) {
continue;
}
knowhere::sparse::SparseRow<float> row(data[i].size());
size_t j = 0;
for (auto& [idx, val] : data[i]) {
row.set_at(j++, idx, val);
}
tensor[i] = std::move(row);
}
return tensor;
}
inline GeneratedData DataGen(SchemaPtr schema,
int64_t N, int64_t N,
uint64_t seed = 42, uint64_t seed = 42,
uint64_t ts_offset = 0, uint64_t ts_offset = 0,
@ -250,7 +299,7 @@ DataGen(SchemaPtr schema,
std::normal_distribution<> distr(0, 1); std::normal_distribution<> distr(0, 1);
int offset = 0; int offset = 0;
auto insert_data = std::make_unique<InsertData>(); auto insert_data = std::make_unique<InsertRecordProto>();
auto insert_cols = [&insert_data]( auto insert_cols = [&insert_data](
auto& data, int64_t count, auto& field_meta) { auto& data, int64_t count, auto& field_meta) {
auto array = milvus::segcore::CreateDataArrayFrom( auto array = milvus::segcore::CreateDataArrayFrom(
@ -309,6 +358,15 @@ DataGen(SchemaPtr schema,
insert_cols(final, N, field_meta); insert_cols(final, N, field_meta);
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
auto res = GenerateRandomSparseFloatVector(
N, kTestSparseDim, kTestSparseVectorDensity, seed);
auto array = milvus::segcore::CreateDataArrayFrom(
res.get(), N, field_meta);
insert_data->mutable_fields_data()->AddAllocated(
array.release());
break;
}
case DataType::VECTOR_BFLOAT16: { case DataType::VECTOR_BFLOAT16: {
auto dim = field_meta.get_dim(); auto dim = field_meta.get_dim();
@ -526,7 +584,7 @@ DataGenForJsonArray(SchemaPtr schema,
std::default_random_engine er(seed); std::default_random_engine er(seed);
std::normal_distribution<> distr(0, 1); std::normal_distribution<> distr(0, 1);
auto insert_data = std::make_unique<InsertData>(); auto insert_data = std::make_unique<InsertRecordProto>();
auto insert_cols = [&insert_data]( auto insert_cols = [&insert_data](
auto& data, int64_t count, auto& field_meta) { auto& data, int64_t count, auto& field_meta) {
auto array = milvus::segcore::CreateDataArrayFrom( auto array = milvus::segcore::CreateDataArrayFrom(
@ -777,6 +835,23 @@ CreateBFloat16PlaceholderGroupFromBlob(int64_t num_queries,
return raw_group; return raw_group;
} }
inline auto
CreateSparseFloatPlaceholderGroup(int64_t num_queries, int64_t seed = 42) {
namespace ser = milvus::proto::common;
ser::PlaceholderGroup raw_group;
auto value = raw_group.add_placeholders();
value->set_tag("$0");
value->set_type(ser::PlaceholderType::SparseFloatVector);
auto sparse_vecs = GenerateRandomSparseFloatVector(
num_queries, kTestSparseDim, kTestSparseVectorDensity, seed);
for (int i = 0; i < num_queries; ++i) {
value->add_values(sparse_vecs[i].data(),
sparse_vecs[i].data_byte_size());
}
return raw_group;
}
inline auto inline auto
SearchResultToVector(const SearchResult& sr) { SearchResultToVector(const SearchResult& sr) {
int64_t num_queries = sr.total_nq_; int64_t num_queries = sr.total_nq_;
@ -850,6 +925,12 @@ CreateFieldDataFromDataArray(ssize_t raw_count,
createFieldData(raw_data, DataType::VECTOR_BFLOAT16, dim); createFieldData(raw_data, DataType::VECTOR_BFLOAT16, dim);
break; break;
} }
case DataType::VECTOR_SPARSE_FLOAT: {
auto sparse_float_array = data->vectors().sparse_float_vector();
auto rows = SparseBytesToRows(sparse_float_array.contents());
createFieldData(rows.get(), DataType::VECTOR_SPARSE_FLOAT, 0);
break;
}
default: { default: {
PanicInfo(Unsupported, "unsupported"); PanicInfo(Unsupported, "unsupported");
} }

View File

@ -37,31 +37,6 @@ using namespace milvus;
using namespace milvus::segcore; using namespace milvus::segcore;
namespace { namespace {
const char*
get_default_schema_config() {
static std::string conf = R"(name: "default-collection"
fields: <
fieldID: 100
name: "fakevec"
data_type: FloatVector
type_params: <
key: "dim"
value: "16"
>
index_params: <
key: "metric_type"
value: "L2"
>
>
fields: <
fieldID: 101
name: "age"
data_type: Int64
is_primary_key: true
>)";
static std::string fake_conf = "";
return conf.c_str();
}
std::string std::string
generate_max_float_query_data(int all_nq, int max_float_nq) { generate_max_float_query_data(int all_nq, int max_float_nq) {

View File

@ -98,6 +98,11 @@ generate_build_conf(const milvus::IndexType& index_type,
{milvus::index::DISK_ANN_BUILD_DRAM_BUDGET, std::to_string(32)}, {milvus::index::DISK_ANN_BUILD_DRAM_BUDGET, std::to_string(32)},
{milvus::index::DISK_ANN_BUILD_THREAD_NUM, std::to_string(2)}, {milvus::index::DISK_ANN_BUILD_THREAD_NUM, std::to_string(2)},
}; };
} else if (index_type == knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX ||
index_type == knowhere::IndexEnum::INDEX_SPARSE_WAND) {
return knowhere::Json{
{knowhere::meta::METRIC_TYPE, metric_type},
};
} }
return knowhere::Json(); return knowhere::Json();
} }
@ -235,6 +240,10 @@ GenDatasetWithDataType(int64_t N,
schema->AddDebugField( schema->AddDebugField(
"fakevec", milvus::DataType::VECTOR_FLOAT, dim, metric_type); "fakevec", milvus::DataType::VECTOR_FLOAT, dim, metric_type);
return milvus::segcore::DataGen(schema, N); return milvus::segcore::DataGen(schema, N);
} else if (data_type == milvus::DataType::VECTOR_SPARSE_FLOAT) {
schema->AddDebugField(
"fakevec", milvus::DataType::VECTOR_SPARSE_FLOAT, 0, metric_type);
return milvus::segcore::DataGen(schema, N);
} else { } else {
schema->AddDebugField( schema->AddDebugField(
"fakebinvec", milvus::DataType::VECTOR_BINARY, dim, metric_type); "fakebinvec", milvus::DataType::VECTOR_BINARY, dim, metric_type);