feat: impl StructArray -- data storage support in segcore (#42406)

Ref https://github.com/milvus-io/milvus/issues/42148
This PR enables segcore to support arrays of vectors (read and write, but not indexing). Currently, only float vector is supported as the element type.
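
A rough sketch of the new surface for reviewers (illustrative only, not part of the diff; based on the VectorArray class added in common/VectorArray.h):

#include "common/VectorArray.h"

void example() {
    // One row: an array of two dim-3 float vectors.
    milvus::VectorFieldProto proto;
    proto.set_dim(3);
    for (float v : {1.f, 2.f, 3.f, 4.f, 5.f, 6.f}) {
        proto.mutable_float_vector()->add_data(v);
    }
    milvus::VectorArray row(proto);               // length() == 2, dim() == 3
    const float* first = row.get_data<float>(0);  // points at {1, 2, 3}
    milvus::VectorFieldProto round_trip = row.output_data();
    (void)first;
    (void)round_trip;
}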

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
Spade A 2025-06-12 14:38:35 +08:00 committed by GitHub
parent 57c60af00d
commit 911a8df17c
60 changed files with 2082 additions and 590 deletions

View File

@ -6,7 +6,7 @@ require (
github.com/blang/semver/v4 v4.0.0
github.com/cockroachdb/errors v1.9.1
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250520065019-02ce2e62a9fd
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847
github.com/milvus-io/milvus/pkg/v2 v2.0.0-20250319085209-5a6b4e56d59e
github.com/quasilyte/go-ruleguard/dsl v0.3.22
github.com/samber/lo v1.27.0

View File

@ -318,8 +318,8 @@ github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfr
github.com/mediocregopher/radix/v3 v3.4.2/go.mod h1:8FL3F6UQRXHXIBSPUs5h0RybMF8i4n7wVopoX3x7Bv8=
github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc=
github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250520065019-02ce2e62a9fd h1:nriBHIny3LiSU56kP2FiIVtxh/0EEFXBpZk4WirsR7s=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250520065019-02ce2e62a9fd/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847 h1:6MFC+EIDe+n1aSX0BZ4s1/XbbPaEzQCx6JBoM1NTTz0=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus/pkg/v2 v2.0.0-20250319085209-5a6b4e56d59e h1:VCr43pG4efacDbM4au70fh8/5hNTftoWzm1iEumvDWM=
github.com/milvus-io/milvus/pkg/v2 v2.0.0-20250319085209-5a6b4e56d59e/go.mod h1:37AWzxVs2NS4QUJrkcbeLUwi+4Av0h5mEdjLI62EANU=
github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc=

2
go.mod
View File

@ -21,7 +21,7 @@ require (
github.com/grpc-ecosystem/go-grpc-middleware v1.3.0
github.com/klauspost/compress v1.17.9
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250527033021-e6b398e94ee6
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847
github.com/minio/minio-go/v7 v7.0.73
github.com/panjf2000/ants/v2 v2.11.3 // indirect
github.com/pingcap/log v1.1.1-0.20221015072633-39906604fb81

4
go.sum
View File

@ -737,8 +737,8 @@ github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6 h1:YHMFI6L
github.com/milvus-io/cgosymbolizer v0.0.0-20250318084424-114f4050c3a6/go.mod h1:DvXTE/K/RtHehxU8/GtDs4vFtfw64jJ3PaCnFri8CRg=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250527033021-e6b398e94ee6 h1:/m5rWCbnFL0Pg2KLS6Lw/lzQfdEb4qa2dLEqE4QjpkU=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250527033021-e6b398e94ee6/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847 h1:6MFC+EIDe+n1aSX0BZ4s1/XbbPaEzQCx6JBoM1NTTz0=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/pulsar-client-go v0.12.1 h1:O2JZp1tsYiO7C0MQ4hrUY/aJXnn2Gry6hpm7UodghmE=
github.com/milvus-io/pulsar-client-go v0.12.1/go.mod h1:dkutuH4oS2pXiGm+Ti7fQZ4MRjrMPZ8IJeEGAWMeckk=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=

View File

@ -19,6 +19,7 @@
#include <type_traits>
#include <utility>
#include <vector>
#include <memory>
#include <arrow/array.h>
#include <arrow/array/builder_primitive.h>
@ -33,13 +34,7 @@ class Array {
public:
Array() = default;
~Array() {
delete[] data_;
if (offsets_ptr_) {
// only deallocate offsets for string type array
delete[] offsets_ptr_;
}
}
~Array() = default;
Array(char* data,
int len,
@ -47,78 +42,78 @@ class Array {
DataType element_type,
const uint32_t* offsets_ptr)
: size_(size), length_(len), element_type_(element_type) {
data_ = new char[size];
std::copy(data, data + size, data_);
data_ = std::make_unique<char[]>(size);
std::copy(data, data + size, data_.get());
if (IsVariableDataType(element_type)) {
AssertInfo(offsets_ptr != nullptr,
"For variable type elements in array, offsets_ptr must "
"be non-null");
offsets_ptr_ = new uint32_t[len];
std::copy(offsets_ptr, offsets_ptr + len, offsets_ptr_);
offsets_ptr_ = std::make_unique<uint32_t[]>(len);
std::copy(offsets_ptr, offsets_ptr + len, offsets_ptr_.get());
}
}
explicit Array(const ScalarArray& field_data) {
explicit Array(const ScalarFieldProto& field_data) {
switch (field_data.data_case()) {
case ScalarArray::kBoolData: {
case ScalarFieldProto::kBoolData: {
element_type_ = DataType::BOOL;
length_ = field_data.bool_data().data().size();
auto data = new bool[length_];
size_ = length_;
data_ = std::make_unique<char[]>(size_);
for (int i = 0; i < length_; ++i) {
data[i] = field_data.bool_data().data(i);
reinterpret_cast<bool*>(data_.get())[i] =
field_data.bool_data().data(i);
}
data_ = reinterpret_cast<char*>(data);
break;
}
case ScalarArray::kIntData: {
case ScalarFieldProto::kIntData: {
element_type_ = DataType::INT32;
length_ = field_data.int_data().data().size();
size_ = length_ * sizeof(int32_t);
data_ = new char[size_];
data_ = std::make_unique<char[]>(size_);
for (int i = 0; i < length_; ++i) {
reinterpret_cast<int*>(data_)[i] =
reinterpret_cast<int*>(data_.get())[i] =
field_data.int_data().data(i);
}
break;
}
case ScalarArray::kLongData: {
case ScalarFieldProto::kLongData: {
element_type_ = DataType::INT64;
length_ = field_data.long_data().data().size();
size_ = length_ * sizeof(int64_t);
data_ = new char[size_];
data_ = std::make_unique<char[]>(size_);
for (int i = 0; i < length_; ++i) {
reinterpret_cast<int64_t*>(data_)[i] =
reinterpret_cast<int64_t*>(data_.get())[i] =
field_data.long_data().data(i);
}
break;
}
case ScalarArray::kFloatData: {
case ScalarFieldProto::kFloatData: {
element_type_ = DataType::FLOAT;
length_ = field_data.float_data().data().size();
size_ = length_ * sizeof(float);
data_ = new char[size_];
data_ = std::make_unique<char[]>(size_);
for (int i = 0; i < length_; ++i) {
reinterpret_cast<float*>(data_)[i] =
reinterpret_cast<float*>(data_.get())[i] =
field_data.float_data().data(i);
}
break;
}
case ScalarArray::kDoubleData: {
case ScalarFieldProto::kDoubleData: {
element_type_ = DataType::DOUBLE;
length_ = field_data.double_data().data().size();
size_ = length_ * sizeof(double);
data_ = new char[size_];
data_ = std::make_unique<char[]>(size_);
for (int i = 0; i < length_; ++i) {
reinterpret_cast<double*>(data_)[i] =
reinterpret_cast<double*>(data_.get())[i] =
field_data.double_data().data(i);
}
break;
}
case ScalarArray::kStringData: {
case ScalarFieldProto::kStringData: {
element_type_ = DataType::STRING;
length_ = field_data.string_data().data().size();
offsets_ptr_ = new uint32_t[length_];
offsets_ptr_ = std::make_unique<uint32_t[]>(length_);
for (int i = 0; i < length_; ++i) {
offsets_ptr_[i] = size_;
size_ +=
@ -126,11 +121,11 @@ class Array {
.data(i)
.size(); //type risk here between uint32_t vs size_t
}
data_ = new char[size_];
data_ = std::make_unique<char[]>(size_);
for (int i = 0; i < length_; ++i) {
std::copy_n(field_data.string_data().data(i).data(),
field_data.string_data().data(i).size(),
data_ + offsets_ptr_[i]);
data_.get() + offsets_ptr_[i]);
}
break;
}
@ -144,35 +139,43 @@ class Array {
: length_{array.length_},
size_{array.size_},
element_type_{array.element_type_} {
data_ = new char[array.size_];
std::copy(array.data_, array.data_ + array.size_, data_);
data_ = std::make_unique<char[]>(array.size_);
std::copy(
array.data_.get(), array.data_.get() + array.size_, data_.get());
if (IsVariableDataType(array.element_type_)) {
AssertInfo(array.get_offsets_data() != nullptr,
"for array with variable length elements, offsets_ptr"
"must not be nullptr");
offsets_ptr_ = new uint32_t[length_];
std::copy_n(array.get_offsets_data(), array.length(), offsets_ptr_);
offsets_ptr_ = std::make_unique<uint32_t[]>(length_);
std::copy_n(
array.get_offsets_data(), array.length(), offsets_ptr_.get());
}
}
friend void
swap(Array& array1, Array& array2) noexcept {
using std::swap;
swap(array1.data_, array2.data_);
swap(array1.length_, array2.length_);
swap(array1.size_, array2.size_);
swap(array1.element_type_, array2.element_type_);
swap(array1.offsets_ptr_, array2.offsets_ptr_);
}
Array&
operator=(const Array& array) {
delete[] data_;
if (offsets_ptr_) {
delete[] offsets_ptr_;
}
length_ = array.length_;
size_ = array.size_;
element_type_ = array.element_type_;
data_ = new char[size_];
std::copy(array.data_, array.data_ + size_, data_);
if (IsVariableDataType(element_type_)) {
AssertInfo(array.get_offsets_data() != nullptr,
"for array with variable length elements, offsets_ptr"
"must not be nullptr");
offsets_ptr_ = new uint32_t[length_];
std::copy_n(array.get_offsets_data(), array.length(), offsets_ptr_);
}
Array temp(array);
swap(*this, temp);
return *this;
}
Array(Array&& other) noexcept : Array() {
swap(*this, other);
}
Array&
operator=(Array&& other) noexcept {
swap(*this, other);
return *this;
}
@ -258,7 +261,7 @@ class Array {
(index == length_ - 1)
? size_ - offsets_ptr_[length_ - 1]
: offsets_ptr_[index + 1] - offsets_ptr_[index];
return T(data_ + offsets_ptr_[index], element_length);
return T(data_.get() + offsets_ptr_[index], element_length);
}
if constexpr (std::is_same_v<T, int> || std::is_same_v<T, int64_t> ||
std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
@ -268,32 +271,32 @@ class Array {
case DataType::INT16:
case DataType::INT32:
return static_cast<T>(
reinterpret_cast<int32_t*>(data_)[index]);
reinterpret_cast<int32_t*>(data_.get())[index]);
case DataType::INT64:
return static_cast<T>(
reinterpret_cast<int64_t*>(data_)[index]);
reinterpret_cast<int64_t*>(data_.get())[index]);
case DataType::FLOAT:
return static_cast<T>(
reinterpret_cast<float*>(data_)[index]);
reinterpret_cast<float*>(data_.get())[index]);
case DataType::DOUBLE:
return static_cast<T>(
reinterpret_cast<double*>(data_)[index]);
reinterpret_cast<double*>(data_.get())[index]);
default:
PanicInfo(Unsupported,
"unsupported element type for array");
}
}
return reinterpret_cast<T*>(data_)[index];
return reinterpret_cast<T*>(data_.get())[index];
}
uint32_t*
get_offsets_data() const {
return offsets_ptr_;
return offsets_ptr_.get();
}
ScalarArray
ScalarFieldProto
output_data() const {
ScalarArray data_array;
ScalarFieldProto data_array;
switch (element_type_) {
case DataType::BOOL: {
for (int j = 0; j < length_; ++j) {
@ -364,7 +367,7 @@ class Array {
const char*
data() const {
return data_;
return data_.get();
}
bool
@ -442,11 +445,11 @@ class Array {
}
private:
char* data_{nullptr};
std::unique_ptr<char[]> data_{nullptr};
int length_ = 0;
int size_ = 0;
DataType element_type_ = DataType::NONE;
uint32_t* offsets_ptr_{nullptr};
std::unique_ptr<uint32_t[]> offsets_ptr_{nullptr};
};
class ArrayView {
@ -528,9 +531,9 @@ class ArrayView {
return reinterpret_cast<T*>(data_)[index];
}
ScalarArray
ScalarFieldProto
output_data() const {
ScalarArray data_array;
ScalarFieldProto data_array;
switch (element_type_) {
case DataType::BOOL: {
for (int j = 0; j < length_; ++j) {

View File

@ -11,9 +11,6 @@
#include <sys/mman.h>
#include <cstdint>
#include "common/Array.h"
#include "common/Span.h"
#include "common/Types.h"
#include "common/Chunk.h"
namespace milvus {

View File

@ -21,6 +21,7 @@
#include "arrow/array/array_base.h"
#include "arrow/record_batch.h"
#include "common/Array.h"
#include "common/VectorArray.h"
#include "common/ChunkTarget.h"
#include "common/EasyAssert.h"
#include "common/FieldDataInterface.h"
@ -231,11 +232,11 @@ using JSONChunk = StringChunk;
// create an ArrayChunk for these arrays. The data block might look like this:
//
// [null_bitmap][offsets_lens][array_data]
// [00000000] [24, 3, 36, 2, 44, 4, 60] [1, 2, 3, 4, 5, 6, 7, 8, 9]
// [00000000] [29, 3, 41, 2, 49, 4, 65] [1, 2, 3, 4, 5, 6, 7, 8, 9]
//
// For string arrays, the structure is more complex as each string element needs its own offset:
// [null_bitmap][offsets_lens][array1_offsets][array1_data][array2_offsets][array2_data][array3_offsets][array3_data]
// [00000000] [24, 3, 48, 2, 64, 4, 96] [0, 5, 11, 16] ["hello", "world", "!"] [0, 3, 6] ["foo", "bar"] [0, 6, 12, 18, 24] ["apple", "orange", "banana", "grape"]
// [00000000] [29, 3, 53, 2, 69, 4, 101] [0, 5, 11, 16] ["hello", "world", "!"] [0, 3, 6] ["foo", "bar"] [0, 6, 12, 18, 24] ["apple", "orange", "banana", "grape"]
//
// Here, the null_bitmap is empty (indicating no nulls), the offsets_lens array contains pairs of (offset, length)
// for each array, and the array_data contains the actual array elements.
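
The offset arithmetic above can be reproduced with a small standalone sketch (illustrative, not part of this diff; assumes int32 elements and a single null-bitmap byte):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<std::vector<int32_t>> rows = {{1, 2, 3}, {4, 5}, {6, 7, 8, 9}};
    // 1 null-bitmap byte + (2 * rows + 1) uint32 offset/len entries
    uint32_t offset = 1 + sizeof(uint32_t) * (2 * rows.size() + 1);
    for (const auto& row : rows) {
        std::printf("%u, %zu, ", offset, row.size());  // (offset, len) pair
        offset += row.size() * sizeof(int32_t);
    }
    std::printf("%u\n", offset);  // final end offset; prints 29, 3, 41, 2, 49, 4, 65
    return 0;
}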
@ -331,6 +332,67 @@ class ArrayChunk : public Chunk {
uint32_t* offsets_lens_;
};
// A VectorArrayChunk is similar to an ArrayChunk but is specialized for storing arrays of vectors.
// Key differences and characteristics:
// - No Nullability: VectorArrayChunk does not support null values. Unlike ArrayChunk, it does not have a null bitmap.
// - Fixed Vector Dimensions: All vectors within a VectorArrayChunk have the same, fixed dimension, specified at creation.
// However, each row (array of vectors) can contain a variable number of these fixed-dimension vectors.
//
// Due to these characteristics, the data layout is simpler:
// [offsets_lens][all_vector_data_concatenated]
//
// Example:
// Suppose we have a data block containing arrays of vectors [[1, 2, 3], [4, 5, 6], [7, 8, 9]], [[10, 11, 12]], and [[13, 14, 15], [16, 17, 18]], and we want to
// create a VectorArrayChunk for these arrays. The data block might look like this:
//
// [offsets_lens][all_vector_data_concatenated]
// [28, 3, 64, 1, 76, 2, 100] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] (offset arithmetic sketched after this class)
class VectorArrayChunk : public Chunk {
public:
VectorArrayChunk(int64_t dim,
int32_t row_nums,
char* data,
uint64_t size,
milvus::DataType element_type)
: Chunk(row_nums, data, size, false),
dim_(dim),
element_type_(element_type) {
offsets_lens_ = reinterpret_cast<uint32_t*>(data);
}
VectorArrayView
View(int64_t idx) const {
int idx_off = 2 * idx;
auto offset = offsets_lens_[idx_off];
auto len = offsets_lens_[idx_off + 1];
auto next_offset = offsets_lens_[idx_off + 2];
auto data_ptr = data_ + offset;
return VectorArrayView(
data_ptr, dim_, len, next_offset - offset, element_type_);
}
std::vector<VectorArrayView>
Views() const {
std::vector<VectorArrayView> views;
views.reserve(row_nums_);
for (int64_t i = 0; i < row_nums_; i++) {
views.emplace_back(View(i));
}
return views;
}
const char*
ValueAt(int64_t idx) const override {
PanicInfo(ErrorCode::Unsupported,
"VectorArrayChunk::ValueAt is not supported");
}
private:
int64_t dim_;
uint32_t* offsets_lens_;
milvus::DataType element_type_;
};
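
A worked check of the offsets_lens in the example above (a standalone sketch, not part of this diff; assumes dim 3 and float elements as in the comment):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t dim = 3;
    std::vector<int> lens = {3, 1, 2};  // number of vectors per row
    // (2 * rows + 1) uint32 offset/len entries; no null bitmap
    uint32_t offset = sizeof(uint32_t) * (2 * lens.size() + 1);
    for (int len : lens) {
        std::printf("%u, %d, ", offset, len);  // (offset, len) pair
        offset += len * dim * sizeof(float);
    }
    std::printf("%u\n", offset);  // final end offset; prints 28, 3, 64, 1, 76, 2, 100
    return 0;
}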
class SparseFloatVectorChunk : public Chunk {
public:
SparseFloatVectorChunk(int32_t row_nums,

View File

@ -155,15 +155,15 @@ ArrayChunkWriter::write(const arrow::ArrayVector& array_vec) {
auto array = std::dynamic_pointer_cast<arrow::BinaryArray>(data);
for (int i = 0; i < array->length(); i++) {
auto str = array->GetView(i);
ScalarArray scalar_array;
ScalarFieldProto scalar_array;
scalar_array.ParseFromArray(str.data(), str.size());
auto arr = Array(scalar_array);
size += arr.byte_size();
arrays.push_back(std::move(arr));
if (is_string) {
// element offsets size
size += sizeof(uint32_t) * arr.length();
}
arrays.push_back(std::move(arr));
}
row_nums_ += array->length();
if (nullable_) {
@ -229,6 +229,78 @@ ArrayChunkWriter::finish() {
row_nums_, data, size, element_type_, nullable_);
}
// 1. Deserialize VectorFieldProto (proto::schema::VectorField) from arrow::ArrayVector,
//    where each element is itself a serialized VectorFieldProto representing one vector array.
// 2. Transform this vector of VectorFieldProto into a vector of our local representation, VectorArray.
// 3. The contents of these VectorArrays are concatenated into target_.
//    See the comments of VectorArrayChunk for details of the format.
void
VectorArrayChunkWriter::write(const arrow::ArrayVector& arrow_array_vec) {
auto size = 0;
std::vector<VectorArray> vector_arrays;
vector_arrays.reserve(arrow_array_vec.size());
for (const auto& data : arrow_array_vec) {
auto array = std::dynamic_pointer_cast<arrow::BinaryArray>(data);
for (size_t i = 0; i < array->length(); i++) {
auto str = array->GetView(i);
VectorFieldProto vector_field;
vector_field.ParseFromArray(str.data(), str.size());
auto arr = VectorArray(vector_field);
size += arr.byte_size();
vector_arrays.push_back(std::move(arr));
}
row_nums_ += array->length();
}
// offsets + lens
size += sizeof(uint32_t) * (row_nums_ * 2 + 1) + MMAP_ARRAY_PADDING;
if (file_) {
target_ = std::make_shared<MmapChunkTarget>(*file_, file_offset_);
} else {
target_ = std::make_shared<MemChunkTarget>(size);
}
int offsets_num = row_nums_ + 1;
int len_num = row_nums_;
uint32_t offset_start_pos =
target_->tell() + sizeof(uint32_t) * (offsets_num + len_num);
std::vector<uint32_t> offsets(offsets_num);
std::vector<uint32_t> lens(len_num);
for (size_t i = 0; i < vector_arrays.size(); i++) {
auto& arr = vector_arrays[i];
offsets[i] = offset_start_pos;
lens[i] = arr.length();
offset_start_pos += arr.byte_size();
}
if (offsets_num > 0) {
offsets[offsets_num - 1] = offset_start_pos;
}
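// Interleave (offset, len) pairs; the final entry is the end offset alone, with no len.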
for (int i = 0; i < offsets.size(); i++) {
if (i == offsets.size() - 1) {
target_->write(&offsets[i], sizeof(uint32_t));
break;
}
target_->write(&offsets[i], sizeof(uint32_t));
target_->write(&lens[i], sizeof(uint32_t));
}
for (auto& arr : vector_arrays) {
target_->write(arr.data(), arr.byte_size());
}
}
std::unique_ptr<Chunk>
VectorArrayChunkWriter::finish() {
char padding[MMAP_ARRAY_PADDING];
target_->write(padding, MMAP_ARRAY_PADDING);
auto [data, size] = target_->get();
return std::make_unique<VectorArrayChunk>(
dim_, row_nums_, data, size, element_type_);
}
void
SparseFloatVectorChunkWriter::write(const arrow::ArrayVector& array_vec) {
auto size = 0;
@ -382,6 +454,11 @@ create_chunk(const FieldMeta& field_meta,
w = std::make_shared<SparseFloatVectorChunkWriter>(nullable);
break;
}
case milvus::DataType::VECTOR_ARRAY: {
w = std::make_shared<VectorArrayChunkWriter>(
dim, field_meta.get_element_type());
break;
}
default:
PanicInfo(Unsupported, "Unsupported data type");
}
@ -486,6 +563,11 @@ create_chunk(const FieldMeta& field_meta,
file, file_offset, nullable);
break;
}
case milvus::DataType::VECTOR_ARRAY: {
w = std::make_shared<VectorArrayChunkWriter>(
dim, field_meta.get_element_type(), file, file_offset);
break;
}
default:
PanicInfo(Unsupported, "Unsupported data type");
}

View File

@ -219,6 +219,31 @@ class ArrayChunkWriter : public ChunkWriterBase {
const milvus::DataType element_type_;
};
class VectorArrayChunkWriter : public ChunkWriterBase {
public:
VectorArrayChunkWriter(int64_t dim, const milvus::DataType element_type)
: ChunkWriterBase(false), element_type_(element_type), dim_(dim) {
}
VectorArrayChunkWriter(int64_t dim,
const milvus::DataType element_type,
File& file,
size_t offset)
: ChunkWriterBase(file, offset, false),
element_type_(element_type),
dim_(dim) {
}
void
write(const arrow::ArrayVector& array_vec) override;
std::unique_ptr<Chunk>
finish() override;
private:
const milvus::DataType element_type_;
int64_t dim_;
};
class SparseFloatVectorChunkWriter : public ChunkWriterBase {
public:
using ChunkWriterBase::ChunkWriterBase;

View File

@ -234,7 +234,7 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
std::vector<Array> values(element_count);
int null_number = 0;
for (size_t index = 0; index < element_count; ++index) {
ScalarArray field_data;
ScalarFieldProto field_data;
if (array_array->GetString(index) == "") {
null_number++;
continue;
@ -274,6 +274,22 @@ FieldDataImpl<Type, is_type_entire_row>::FillFieldData(
}
return FillFieldData(values.data(), element_count);
}
case DataType::VECTOR_ARRAY: {
auto array_array =
std::dynamic_pointer_cast<arrow::BinaryArray>(array);
std::vector<VectorArray> values(element_count);
for (size_t index = 0; index < element_count; ++index) {
VectorFieldProto field_data;
if (array_array->GetString(index) == "") {
PanicInfo(DataTypeInvalid, "empty vector array");
}
auto success =
field_data.ParseFromString(array_array->GetString(index));
AssertInfo(success, "parse from string failed");
values[index] = VectorArray(field_data);
}
return FillFieldData(values.data(), element_count);
}
default: {
PanicInfo(DataTypeInvalid,
GetName() + "::FillFieldData" +
@ -423,6 +439,7 @@ template class FieldDataImpl<float, false>;
template class FieldDataImpl<float16, false>;
template class FieldDataImpl<bfloat16, false>;
template class FieldDataImpl<knowhere::sparse::SparseRow<float>, true>;
template class FieldDataImpl<VectorArray, true>;
FieldDataPtr
InitScalarFieldData(const DataType& type, bool nullable, int64_t cap_rows) {

View File

@ -80,6 +80,20 @@ class FieldData<Array> : public FieldDataArrayImpl {
}
};
template <>
class FieldData<VectorArray> : public FieldDataVectorArrayImpl {
public:
explicit FieldData(DataType data_type, int64_t buffered_num_rows = 0)
: FieldDataVectorArrayImpl(data_type, buffered_num_rows) {
}
int64_t
get_dim() const override {
PanicInfo(Unsupported,
"Call get_dim on FieldData<VectorArray> is not supported");
}
};
template <>
class FieldData<FloatVector> : public FieldDataImpl<float, false> {
public:

View File

@ -34,7 +34,9 @@
#include "common/VectorTrait.h"
#include "common/EasyAssert.h"
#include "common/Array.h"
#include "common/VectorArray.h"
#include "knowhere/dataset.h"
#include "common/TypeTraits.h"
namespace milvus {
@ -818,4 +820,32 @@ class FieldDataArrayImpl : public FieldDataImpl<Array, true> {
}
};
// is_type_entire_row is set to true as each element in data_ is a VectorArray
class FieldDataVectorArrayImpl : public FieldDataImpl<VectorArray, true> {
public:
explicit FieldDataVectorArrayImpl(DataType data_type,
int64_t total_num_rows = 0)
: FieldDataImpl<VectorArray, true>(
1, data_type, false, total_num_rows) {
}
int64_t
DataSize() const override {
int64_t data_size = 0;
for (size_t offset = 0; offset < length(); ++offset) {
data_size += data_[offset].byte_size();
}
return data_size;
}
int64_t
DataSize(ssize_t offset) const override {
AssertInfo(offset < get_num_rows(),
"field data subscript out of range");
AssertInfo(offset < length(),
"subscript position don't has valid value");
return data_[offset].byte_size();
}
};
} // namespace milvus

View File

@ -81,6 +81,7 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
}
auto data_type = DataType(schema_proto.data_type());
auto element_type = DataType(schema_proto.element_type());
auto default_value = [&]() -> std::optional<DefaultValueType> {
if (!schema_proto.has_default_value()) {
@ -89,6 +90,17 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
return schema_proto.default_value();
}();
if (data_type == DataType::VECTOR_ARRAY) {
// TODO(SpadeA): revisit the code when index build for vector array is ready
int64_t dim = 0;
auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
AssertInfo(type_map.count("dim"), "dim not found");
dim = boost::lexical_cast<int64_t>(type_map.at("dim"));
return FieldMeta{
name, field_id, data_type, element_type, dim, std::nullopt};
}
if (IsVectorDataType(data_type)) {
auto type_map = RepeatedKeyValToMap(schema_proto.type_params());
auto index_map = RepeatedKeyValToMap(schema_proto.index_params());

View File

@ -126,6 +126,23 @@ class FieldMeta {
Assert(!nullable);
}
// array of vector type
FieldMeta(FieldName name,
FieldId id,
DataType type,
DataType element_type,
int64_t dim,
std::optional<knowhere::MetricType> metric_type)
: name_(std::move(name)),
id_(id),
type_(type),
nullable_(false),
element_type_(element_type),
vector_info_(VectorInfo{dim, std::move(metric_type)}) {
Assert(type_ == DataType::VECTOR_ARRAY);
Assert(IsVectorDataType(element_type_));
}
int64_t
get_dim() const {
Assert(IsVectorDataType(type_));
@ -218,7 +235,11 @@ class FieldMeta {
"schema");
static const size_t ARRAY_SIZE = 128;
static const size_t JSON_SIZE = 512;
if (is_vector()) {
// assume float vector with dim 512, array length 10
static const size_t VECTOR_ARRAY_SIZE = 512 * 10 * 4;
if (type_ == DataType::VECTOR_ARRAY) {
return VECTOR_ARRAY_SIZE;
} else if (is_vector()) {
return GetDataTypeSize(type_, get_dim());
} else if (is_string()) {
Assert(string_info_.has_value());

View File

@ -231,7 +231,8 @@ class Json {
: doc().at_pointer(pointer).type();
}
auto get_number_type(const std::string& pointer) const {
auto
get_number_type(const std::string& pointer) const {
return pointer.empty() ? doc().get_number_type()
: doc().at_pointer(pointer).get_number_type();
}

View File

@ -1,3 +1,14 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <string>
#include <vector>

View File

@ -40,8 +40,7 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
// NOTE: only two system
for (const milvus::proto::schema::FieldSchema& child :
schema_proto.fields()) {
auto process_field = [&schema, &schema_proto](const auto& child) {
auto field_id = FieldId(child.fieldid());
auto f = FieldMeta::ParseFrom(child);
@ -59,6 +58,18 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
"repetitive dynamic field");
schema->set_dynamic_field_id(field_id);
}
};
for (const milvus::proto::schema::FieldSchema& child :
schema_proto.fields()) {
process_field(child);
}
for (const milvus::proto::schema::StructArrayFieldSchema& child :
schema_proto.struct_array_fields()) {
for (const auto& sub_field : child.fields()) {
process_field(sub_field);
}
}
AssertInfo(schema->get_primary_field_id().has_value(),

View File

@ -108,6 +108,24 @@ class Schema {
return field_id;
}
// array of vector type
FieldId
AddDebugVectorArrayField(const std::string& name,
DataType element_type,
int64_t dim,
std::optional<knowhere::MetricType> metric_type) {
auto field_id = FieldId(debug_id);
debug_id++;
auto field_meta = FieldMeta(FieldName(name),
field_id,
DataType::VECTOR_ARRAY,
element_type,
dim,
metric_type);
this->AddField(std::move(field_meta));
return field_id;
}
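// Hypothetical test usage (values illustrative):
//   auto fid = schema->AddDebugVectorArrayField(
//       "vec_array", DataType::VECTOR_FLOAT, /*dim=*/128, std::nullopt);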
// scalar type
void
AddField(const FieldName& name,

View File

@ -23,6 +23,7 @@
#include "Types.h"
#include "VectorTrait.h"
#include "TypeTraits.h"
namespace milvus {
// type erasure to work around virtual restriction

View File

@ -0,0 +1,66 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <type_traits>
#include "Array.h"
#include "Types.h"
#include "VectorArray.h"
namespace milvus {
template <typename T>
constexpr bool IsVector = std::is_base_of_v<VectorTrait, T>;
template <typename T>
constexpr bool IsScalar =
std::is_fundamental_v<T> || std::is_same_v<T, std::string> ||
std::is_same_v<T, Json> || std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Array> || std::is_same_v<T, ArrayView> ||
std::is_same_v<T, proto::plan::Array>;
template <typename T>
constexpr bool IsSparse = std::is_same_v<T, SparseFloatVector> ||
std::is_same_v<T, knowhere::sparse::SparseRow<float>>;
template <typename T>
constexpr bool IsVariableType =
std::is_same_v<T, std::string> || std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Array> || std::is_same_v<T, ArrayView> ||
std::is_same_v<T, proto::plan::Array> || std::is_same_v<T, Json> ||
IsSparse<T> || std::is_same_v<T, VectorArray> ||
std::is_same_v<T, VectorArrayView>;
template <typename T>
constexpr bool IsVariableTypeSupportInChunk =
std::is_same_v<T, std::string> || std::is_same_v<T, Array> ||
std::is_same_v<T, Json> ||
std::is_same_v<T, knowhere::sparse::SparseRow<float>>;
template <typename T>
using ChunkViewType = std::conditional_t<
std::is_same_v<T, std::string>,
std::string_view,
std::conditional_t<std::is_same_v<T, Array>,
ArrayView,
std::conditional_t<std::is_same_v<T, VectorArray>,
VectorArrayView,
T>>>;
} // namespace milvus
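
The trait plumbing can be sanity-checked with a few static_asserts (a sketch; valid in any translation unit that includes this header):

#include <string>
#include <string_view>
#include <type_traits>
#include "common/TypeTraits.h"

using namespace milvus;
static_assert(std::is_same_v<ChunkViewType<std::string>, std::string_view>);
static_assert(std::is_same_v<ChunkViewType<Array>, ArrayView>);
static_assert(std::is_same_v<ChunkViewType<VectorArray>, VectorArrayView>);
static_assert(IsVariableType<VectorArray>);
static_assert(!IsVariableTypeSupportInChunk<VectorArray>);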

View File

@ -92,6 +92,7 @@ enum class DataType {
VECTOR_BFLOAT16 = 103,
VECTOR_SPARSE_FLOAT = 104,
VECTOR_INT8 = 105,
VECTOR_ARRAY = 106,
};
using Timestamp = uint64_t; // TODO: use TiKV-like timestamp
@ -100,9 +101,9 @@ constexpr auto MAX_ROW_COUNT = std::numeric_limits<idx_t>::max();
using OpType = proto::plan::OpType;
using ArithOpType = proto::plan::ArithOpType;
using ScalarArray = proto::schema::ScalarField;
using ScalarFieldProto = proto::schema::ScalarField;
using DataArray = proto::schema::FieldData;
using VectorArray = proto::schema::VectorField;
using VectorFieldProto = proto::schema::VectorField;
using IdArray = proto::schema::IDs;
using InsertRecordProto = proto::segcore::InsertRecord;
using PkType = std::variant<std::monostate, int64_t, std::string>;
@ -246,6 +247,8 @@ GetDataTypeName(DataType data_type) {
return "vector_sparse_float";
case DataType::VECTOR_INT8:
return "vector_int8";
case DataType::VECTOR_ARRAY:
return "vector_array";
default:
PanicInfo(DataTypeInvalid, "Unsupported DataType({})", data_type);
}
@ -327,7 +330,7 @@ IsJsonDataType(DataType data_type) {
inline bool
IsArrayDataType(DataType data_type) {
return data_type == DataType::ARRAY;
return data_type == DataType::ARRAY || data_type == DataType::VECTOR_ARRAY;
}
inline bool
@ -396,10 +399,16 @@ IsFloatVectorDataType(DataType data_type) {
IsSparseFloatVectorDataType(data_type);
}
inline bool
IsVectorArrayDataType(DataType data_type) {
return data_type == DataType::VECTOR_ARRAY;
}
inline bool
IsVectorDataType(DataType data_type) {
return IsBinaryVectorDataType(data_type) ||
IsFloatVectorDataType(data_type) || IsIntVectorDataType(data_type);
IsFloatVectorDataType(data_type) || IsIntVectorDataType(data_type) ||
IsVectorArrayDataType(data_type);
}
inline bool
@ -651,6 +660,15 @@ struct TypeTraits<DataType::VECTOR_FLOAT> {
static constexpr const char* Name = "VECTOR_FLOAT";
};
template <>
struct TypeTraits<DataType::VECTOR_ARRAY> {
using NativeType = void;
static constexpr DataType TypeKind = DataType::VECTOR_ARRAY;
static constexpr bool IsPrimitiveType = false;
static constexpr bool IsFixedWidth = false;
static constexpr const char* Name = "VECTOR_ARRAY";
};
inline DataType
FromValCase(milvus::proto::plan::GenericValue::ValCase val_case) {
switch (val_case) {
@ -735,6 +753,9 @@ struct fmt::formatter<milvus::DataType> : formatter<string_view> {
case milvus::DataType::VECTOR_INT8:
name = "VECTOR_INT8";
break;
case milvus::DataType::VECTOR_ARRAY:
name = "VECTOR_ARRAY";
break;
}
return formatter<string_view>::format(name, ctx);
}

View File

@ -0,0 +1,352 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "FieldMeta.h"
#include "Types.h"
#include "common/VectorTrait.h"
namespace milvus {
// Internal representation of proto::schema::VectorField which is recognized as one row
// of data type VECTOR_ARRAY
class VectorArray : public milvus::VectorTrait {
public:
VectorArray() = default;
~VectorArray() = default;
// One row of VectorFieldProto
explicit VectorArray(const VectorFieldProto& vector_field) {
dim_ = vector_field.dim();
switch (vector_field.data_case()) {
case VectorFieldProto::kFloatVector: {
element_type_ = DataType::VECTOR_FLOAT;
// data size should be array length * dim
length_ = vector_field.float_vector().data().size() / dim_;
size_ =
vector_field.float_vector().data().size() * sizeof(float);
data_ = std::make_unique<char[]>(size_);
std::copy(vector_field.float_vector().data().begin(),
vector_field.float_vector().data().end(),
reinterpret_cast<float*>(data_.get()));
break;
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(vector_field.data_case()));
}
}
}
explicit VectorArray(const VectorArray& other)
: VectorArray(other.data_.get(),
other.length_,
other.dim_,
other.size_,
other.element_type_) {
}
friend void
swap(VectorArray& array1, VectorArray& array2) noexcept {
using std::swap;
swap(array1.data_, array2.data_);
swap(array1.size_, array2.size_);
swap(array1.length_, array2.length_);
swap(array1.dim_, array2.dim_);
swap(array1.element_type_, array2.element_type_);
}
VectorArray(VectorArray&& other) noexcept : VectorArray() {
swap(*this, other);
}
VectorArray&
operator=(const VectorArray& other) {
VectorArray temp(other);
swap(*this, temp);
return *this;
}
VectorArray&
operator=(VectorArray&& other) noexcept {
swap(*this, other);
return *this;
}
bool
operator==(const VectorArray& other) const {
if (element_type_ != other.element_type_ || length_ != other.length_ ||
size_ != other.size_) {
return false;
}
if (length_ == 0) {
return true;
}
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
auto* a = reinterpret_cast<const float*>(data_.get());
auto* b = reinterpret_cast<const float*>(other.data_.get());
return std::equal(
a, a + length_ * dim_, b, [](float x, float y) {
return std::abs(x - y) < 1e-6f;
});
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
}
template <typename VectorElement>
VectorElement*
get_data(const int index) const {
AssertInfo(index >= 0 && index < length_,
"index out of range, index={}, length={}",
index,
length_);
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
static_assert(std::is_same_v<VectorElement, float>,
"VectorElement must be float for VECTOR_FLOAT");
return reinterpret_cast<VectorElement*>(data_.get()) +
index * dim_;
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
}
VectorFieldProto
output_data() const {
VectorFieldProto vector_field;
vector_field.set_dim(dim_);
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
auto data = reinterpret_cast<const float*>(data_.get());
vector_field.mutable_float_vector()->mutable_data()->Add(
data, data + length_ * dim_);
break;
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
return vector_field;
}
int
length() const {
return length_;
}
size_t
byte_size() const {
return size_;
}
int64_t
dim() const {
return dim_;
}
DataType
get_element_type() const {
return element_type_;
}
const char*
data() const {
return data_.get();
}
bool
is_same_array(const VectorFieldProto& vector_field) {
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
if (vector_field.data_case() !=
VectorFieldProto::kFloatVector) {
return false;
}
if (length_ !=
vector_field.float_vector().data().size() / dim_) {
return false;
}
if (length_ == 0) {
return true;
}
const float* a = reinterpret_cast<const float*>(data_.get());
const float* b = vector_field.float_vector().data().data();
return std::equal(
a, a + length_ * dim_, b, [](float x, float y) {
return std::abs(x - y) < 1e-6f;
});
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
}
private:
VectorArray(
char* data, int len, int dim, size_t size, DataType element_type)
: size_(size), length_(len), dim_(dim), element_type_(element_type) {
data_ = std::make_unique<char[]>(size);
std::copy(data, data + size, data_.get());
}
int64_t dim_ = 0;
std::unique_ptr<char[]> data_;
// number of vectors in this array
int length_ = 0;
// size of the array in bytes
int size_ = 0;
DataType element_type_ = DataType::NONE;
};
class VectorArrayView {
public:
VectorArrayView() = default;
VectorArrayView(const VectorArrayView& other)
: VectorArrayView(other.data_,
other.dim_,
other.length_,
other.size_,
other.element_type_) {
}
VectorArrayView(
char* data, int64_t dim, int len, size_t size, DataType element_type)
: data_(data),
dim_(dim),
length_(len),
size_(size),
element_type_(element_type) {
}
template <typename VectorElement>
VectorElement*
get_data(const int index) const {
AssertInfo(index >= 0 && index < length_,
"index out of range, index={}, length={}",
index,
length_);
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
static_assert(std::is_same_v<VectorElement, float>,
"VectorElement must be float for VECTOR_FLOAT");
return reinterpret_cast<VectorElement*>(data_) + index * dim_;
}
default: {
// TODO(SpadeA): add other vector types.
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
}
VectorFieldProto
output_data() const {
VectorFieldProto vector_array;
vector_array.set_dim(dim_);
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
auto data = reinterpret_cast<const float*>(data_);
vector_array.mutable_float_vector()->mutable_data()->Add(
data, data + length_ * dim_);
break;
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
return vector_array;
}
bool
is_same_array(const VectorFieldProto& vector_field) {
switch (element_type_) {
case DataType::VECTOR_FLOAT: {
if (vector_field.data_case() !=
VectorFieldProto::kFloatVector) {
return false;
}
if (length_ !=
vector_field.float_vector().data().size() / dim_) {
return false;
}
if (length_ == 0) {
return true;
}
const float* a = reinterpret_cast<const float*>(data_);
const float* b = vector_field.float_vector().data().data();
return std::equal(
a, a + length_ * dim_, b, [](float x, float y) {
return std::abs(x - y) < 1e-6f;
});
}
default: {
// TODO(SpadeA): add other vector types
PanicInfo(NotImplemented,
"Not implemented vector type: {}",
static_cast<int>(element_type_));
}
}
}
private:
char* data_{nullptr};
int64_t dim_ = 0;
// number of vectors in this array
int length_ = 0;
// size of the array in bytes
int size_ = 0;
DataType element_type_ = DataType::NONE;
};
} // namespace milvus
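
A short usage sketch of the view type over a borrowed buffer (illustrative, not part of this diff; the layout matches the concatenated data of a VectorArrayChunk):

#include "common/VectorArray.h"

void view_example() {
    float buf[6] = {1, 2, 3, 4, 5, 6};  // two dim-3 float vectors
    milvus::VectorArrayView view(reinterpret_cast<char*>(buf),
                                 /*dim=*/3,
                                 /*len=*/2,
                                 /*size=*/sizeof(buf),
                                 milvus::DataType::VECTOR_FLOAT);
    const float* second = view.get_data<float>(1);        // -> {4, 5, 6}
    milvus::VectorFieldProto proto = view.output_data();  // copies all six floats
    (void)second;
    (void)proto;
}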

View File

@ -136,39 +136,6 @@ class Int8Vector : public VectorTrait {
proto::common::PlaceholderType::Int8Vector;
};
template <typename T>
constexpr bool IsVector = std::is_base_of_v<VectorTrait, T>;
template <typename T>
constexpr bool IsScalar =
std::is_fundamental_v<T> || std::is_same_v<T, std::string> ||
std::is_same_v<T, Json> || std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Array> || std::is_same_v<T, ArrayView> ||
std::is_same_v<T, proto::plan::Array>;
template <typename T>
constexpr bool IsSparse = std::is_same_v<T, SparseFloatVector> ||
std::is_same_v<T, knowhere::sparse::SparseRow<float>>;
template <typename T>
constexpr bool IsVariableType =
std::is_same_v<T, std::string> || std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Array> || std::is_same_v<T, ArrayView> ||
std::is_same_v<T, proto::plan::Array> || std::is_same_v<T, Json> ||
IsSparse<T>;
template <typename T>
constexpr bool IsVariableTypeSupportInChunk =
std::is_same_v<T, std::string> || std::is_same_v<T, Array> ||
std::is_same_v<T, Json> ||
std::is_same_v<T, knowhere::sparse::SparseRow<float>>;
template <typename T>
using ChunkViewType = std::conditional_t<
std::is_same_v<T, std::string>,
std::string_view,
std::conditional_t<std::is_same_v<T, Array>, ArrayView, T>>;
struct FundamentalTag {};
struct StringTag {};

View File

@ -125,8 +125,8 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
int processed_cursor = 0;
auto execute_sub_batch =
[&bitmap_input,
&processed_cursor]<FilterType filter_type = FilterType::sequential>(
[&bitmap_input, &
processed_cursor ]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -134,23 +134,23 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
TargetBitmapView res,
TargetBitmapView valid_res,
const std::string& pointer) {
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = data[offset].exist(pointer);
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
processed_cursor += size;
};
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = data[offset].exist(pointer);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {

View File

@ -318,9 +318,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(EvalCtx& context) {
}
int processed_cursor = 0;
auto execute_sub_batch =
[op_type,
&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
[ op_type, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const milvus::ArrayView* data,
const bool* valid_data,
const int32_t* offsets,
@ -329,186 +328,185 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(EvalCtx& context) {
TargetBitmapView valid_res,
ValueType val,
int index) {
switch (op_type) {
case proto::plan::GreaterThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::GreaterEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Equal: {
UnaryElementFuncForArray<ValueType,
proto::plan::Equal,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::NotEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::NotEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PrefixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PrefixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Match: {
UnaryElementFuncForArray<ValueType,
proto::plan::Match,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PostfixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PostfixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::InnerMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::InnerMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
default:
PanicInfo(
OpTypeInvalid,
fmt::format(
"unsupported operator type for unary expr: {}",
op_type));
switch (op_type) {
case proto::plan::GreaterThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
processed_cursor += size;
};
case proto::plan::GreaterEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Equal: {
UnaryElementFuncForArray<ValueType,
proto::plan::Equal,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::NotEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::NotEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PrefixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PrefixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Match: {
UnaryElementFuncForArray<ValueType,
proto::plan::Match,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PostfixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PostfixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::InnerMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::InnerMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
default:
PanicInfo(
OpTypeInvalid,
fmt::format("unsupported operator type for unary expr: {}",
op_type));
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
processed_size =
@ -708,18 +706,16 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) {
} while (false)
int processed_cursor = 0;
auto execute_sub_batch = [op_type,
pointer,
&processed_cursor,
&bitmap_input]<FilterType filter_type =
FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ExprValueType val) {
auto execute_sub_batch =
[ op_type, pointer, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ExprValueType val) {
bool has_bitmap_input = !bitmap_input.empty();
switch (op_type) {
case proto::plan::GreaterThan: {
@ -1679,17 +1675,16 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData(EvalCtx& context) {
auto expr_type = expr_->op_type_;
size_t processed_cursor = 0;
auto execute_sub_batch = [expr_type,
&processed_cursor,
&bitmap_input]<FilterType filter_type =
FilterType::sequential>(
const T* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
IndexInnerType val) {
auto execute_sub_batch =
[ expr_type, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const T* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
IndexInnerType val) {
switch (expr_type) {
case proto::plan::GreaterThan: {
UnaryElementFunc<T, proto::plan::GreaterThan, filter_type> func;

View File

@ -109,13 +109,13 @@ class IndexFactory {
storage::FileManagerContext());
IndexBasePtr
CreateJsonIndex(IndexType index_type,
JsonCastType cast_dtype,
const std::string& nested_path,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext(),
const std::string& json_cast_function =
UNKNOW_CAST_FUNCTION_NAME);
CreateJsonIndex(
IndexType index_type,
JsonCastType cast_dtype,
const std::string& nested_path,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext(),
const std::string& json_cast_function = UNKNOW_CAST_FUNCTION_NAME);
IndexBasePtr
CreateScalarIndex(const CreateIndexInfo& create_index_info,

View File

@ -72,6 +72,10 @@ class IndexFactory {
case DataType::VECTOR_INT8:
return std::make_unique<VecIndexCreator>(type, config, context);
case DataType::VECTOR_ARRAY:
PanicInfo(DataTypeInvalid,
fmt::format("VECTOR_ARRAY is not implemented"));
default:
PanicInfo(DataTypeInvalid,
fmt::format("invalid type is {}", invalid_dtype_msg));

View File

@ -20,6 +20,7 @@
#include "mmap/ChunkData.h"
#include "storage/MmapManager.h"
#include "segcore/SegcoreConfig.h"
#include "common/TypeTraits.h"
namespace milvus {
template <typename Type>
@ -131,6 +132,13 @@ class ThreadSafeChunkVector : public ChunkVectorBase<Type> {
src.byte_size(),
src.get_element_type(),
src.get_offsets_data());
} else if constexpr (std::is_same_v<VectorArray, Type>) {
auto& src = chunk[chunk_offset];
return VectorArrayView(const_cast<char*>(src.data()),
src.dim(),
src.length(),
src.byte_size(),
src.get_element_type());
} else {
return chunk[chunk_offset];
}

View File

@ -35,7 +35,6 @@
#include "common/EasyAssert.h"
#include "common/FieldMeta.h"
#include "common/Span.h"
#include "common/Array.h"
#include "segcore/storagev1translator/ChunkTranslator.h"
#include "cachinglayer/Translator.h"
#include "mmap/ChunkedColumnInterface.h"
@ -190,6 +189,13 @@ class ChunkedColumnBase : public ChunkedColumnInterface {
"ArrayViews only supported for ArrayChunkedColumn");
}
PinWrapper<std::vector<VectorArrayView>>
VectorArrayViews(int64_t chunk_id) const override {
PanicInfo(
ErrorCode::Unsupported,
"VectorArrayViews only supported for ChunkedVectorArrayColumn");
}
PinWrapper<std::pair<std::vector<std::string_view>, FixedVector<bool>>>
ViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override {
@ -375,7 +381,7 @@ class ChunkedArrayColumn : public ChunkedColumnBase {
}
void
BulkArrayAt(std::function<void(ScalarArray&&, size_t)> fn,
BulkArrayAt(std::function<void(ScalarFieldProto&&, size_t)> fn,
const int64_t* offsets,
int64_t count) const override {
auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count);
@ -400,6 +406,39 @@ class ChunkedArrayColumn : public ChunkedColumnBase {
}
};
class ChunkedVectorArrayColumn : public ChunkedColumnBase {
public:
explicit ChunkedVectorArrayColumn(
std::unique_ptr<Translator<Chunk>> translator,
const FieldMeta& field_meta)
: ChunkedColumnBase(std::move(translator), field_meta) {
}
void
BulkVectorArrayAt(std::function<void(VectorFieldProto&&, size_t)> fn,
const int64_t* offsets,
int64_t count) const override {
auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count);
auto ca = SemiInlineGet(slot_->PinCells(cids));
for (int64_t i = 0; i < count; i++) {
auto array =
static_cast<VectorArrayChunk*>(ca->get_cell_of(cids[i]))
->View(offsets_in_chunk[i])
.output_data();
fn(std::move(array), i);
}
}
PinWrapper<std::vector<VectorArrayView>>
VectorArrayViews(int64_t chunk_id) const override {
auto ca =
SemiInlineGet(slot_->PinCells({static_cast<cid_t>(chunk_id)}));
auto chunk = ca->get_cell_of(chunk_id);
return PinWrapper<std::vector<VectorArrayView>>(
ca, static_cast<VectorArrayChunk*>(chunk)->Views());
}
};
inline std::shared_ptr<ChunkedColumnInterface>
MakeChunkedColumnBase(DataType data_type,
std::unique_ptr<Translator<milvus::Chunk>> translator,
@ -421,6 +460,12 @@ MakeChunkedColumnBase(DataType data_type,
field_meta));
}
if (ChunkedColumnInterface::IsChunkedVectorArrayColumnDataType(data_type)) {
return std::static_pointer_cast<ChunkedColumnInterface>(
std::make_shared<ChunkedVectorArrayColumn>(std::move(translator),
field_meta));
}
return std::static_pointer_cast<ChunkedColumnInterface>(
std::make_shared<ChunkedColumn>(std::move(translator), field_meta));
}

View File

@ -29,12 +29,10 @@
#include "cachinglayer/Translator.h"
#include "cachinglayer/Utils.h"
#include "common/Array.h"
#include "common/Chunk.h"
#include "common/GroupChunk.h"
#include "common/EasyAssert.h"
#include "common/Span.h"
#include "common/Array.h"
#include "mmap/ChunkedColumnInterface.h"
#include "segcore/storagev2translator/GroupCTMeta.h"
@ -261,6 +259,20 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
static_cast<ArrayChunk*>(chunk.get())->Views(offset_len));
}
PinWrapper<std::vector<VectorArrayView>>
VectorArrayViews(int64_t chunk_id) const override {
if (!IsChunkedVectorArrayColumnDataType(data_type_)) {
PanicInfo(
ErrorCode::Unsupported,
"VectorArrayViews only supported for ChunkedVectorArrayColumn");
}
auto chunk_wrapper = group_->GetGroupChunk(chunk_id);
auto chunk = chunk_wrapper.get()->GetChunk(field_id_);
return PinWrapper<std::vector<VectorArrayView>>(
chunk_wrapper,
static_cast<VectorArrayChunk*>(chunk.get())->Views());
}
PinWrapper<std::pair<std::vector<std::string_view>, FixedVector<bool>>>
ViewsByOffsets(int64_t chunk_id,
const FixedVector<int32_t>& offsets) const override {
@ -402,7 +414,7 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
}
void
BulkArrayAt(std::function<void(ScalarArray&&, size_t)> fn,
BulkArrayAt(std::function<void(ScalarFieldProto&&, size_t)> fn,
const int64_t* offsets,
int64_t count) const override {
if (!IsChunkedArrayColumnDataType(data_type_)) {
@ -421,6 +433,27 @@ class ProxyChunkColumn : public ChunkedColumnInterface {
}
}
void
BulkVectorArrayAt(std::function<void(VectorFieldProto&&, size_t)> fn,
const int64_t* offsets,
int64_t count) const override {
if (!IsChunkedVectorArrayColumnDataType(data_type_)) {
PanicInfo(ErrorCode::Unsupported,
"BulkVectorArrayAt only supported for "
"ChunkedVectorArrayColumn");
}
auto [cids, offsets_in_chunk] = ToChunkIdAndOffset(offsets, count);
auto ca = group_->GetGroupChunks(cids);
for (int64_t i = 0; i < count; i++) {
auto* group_chunk = ca->get_cell_of(cids[i]);
auto chunk = group_chunk->GetChunk(field_id_);
auto array = static_cast<VectorArrayChunk*>(chunk.get())
->View(offsets_in_chunk[i])
.output_data();
fn(std::move(array), i);
}
}
private:
std::shared_ptr<ChunkedColumnGroup> group_;
FieldId field_id_;

View File

@ -81,6 +81,9 @@ class ChunkedColumnInterface {
ArrayViews(int64_t chunk_id,
std::optional<std::pair<int64_t, int64_t>> offset_len) const = 0;
virtual PinWrapper<std::vector<VectorArrayView>>
VectorArrayViews(int64_t chunk_id) const = 0;
virtual PinWrapper<
std::pair<std::vector<std::string_view>, FixedVector<bool>>>
ViewsByOffsets(int64_t chunk_id,
@ -129,13 +132,22 @@ class ChunkedColumnInterface {
}
virtual void
BulkArrayAt(std::function<void(ScalarArray&&, size_t)> fn,
BulkArrayAt(std::function<void(ScalarFieldProto&&, size_t)> fn,
const int64_t* offsets,
int64_t count) const {
PanicInfo(ErrorCode::Unsupported,
"BulkArrayAt only supported for ChunkedArrayColumn");
}
virtual void
BulkVectorArrayAt(std::function<void(VectorFieldProto&&, size_t)> fn,
const int64_t* offsets,
int64_t count) const {
PanicInfo(
ErrorCode::Unsupported,
"BulkVectorArrayAt only supported for ChunkedVectorArrayColumn");
}
static bool
IsChunkedVariableColumnDataType(DataType data_type) {
return data_type == DataType::STRING ||
@ -148,6 +160,11 @@ class ChunkedColumnInterface {
return data_type == DataType::ARRAY;
}
static bool
IsChunkedVectorArrayColumnDataType(DataType data_type) {
return data_type == DataType::VECTOR_ARRAY;
}
static bool
IsChunkedColumnDataType(DataType data_type) {
return !IsChunkedVariableColumnDataType(data_type) &&

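Since the base-class defaults above panic, callers are expected to gate on the static type predicates before taking the vector-array path. A minimal caller-side sketch (assumed glue code, not part of this PR; VectorFieldProto is segcore's alias for proto::schema::VectorField):
// Hypothetical helper: read `count` vector-array rows addressed by `offsets`.
// Returns false for non-VECTOR_ARRAY columns instead of hitting the
// panicking default implementation above.
bool
TryReadVectorArrayRows(const ChunkedColumnInterface* column,
                       DataType data_type,
                       const int64_t* offsets,
                       int64_t count,
                       std::vector<VectorFieldProto>& out) {
    if (!ChunkedColumnInterface::IsChunkedVectorArrayColumnDataType(data_type)) {
        return false;
    }
    out.resize(count);
    column->BulkVectorArrayAt(
        [&out](VectorFieldProto&& row, size_t i) { out[i] = std::move(row); },
        offsets,
        count);
    return true;
}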
View File

@ -90,6 +90,10 @@ ChunkedSegmentSealedImpl::LoadIndex(const LoadIndexInfo& info) {
auto field_id = FieldId(info.field_id);
auto& field_meta = schema_->operator[](field_id);
if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
PanicInfo(DataTypeInvalid, "VECTOR_ARRAY is not implemented");
}
if (field_meta.is_vector()) {
LoadVecIndex(info);
} else {
@ -461,6 +465,11 @@ ChunkedSegmentSealedImpl::AddFieldDataInfoForSealed(
int64_t
ChunkedSegmentSealedImpl::num_chunk_index(FieldId field_id) const {
auto& field_meta = schema_->operator[](field_id);
if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
PanicInfo(DataTypeInvalid, "VECTOR_ARRAY is not implemented");
}
if (field_meta.is_vector()) {
return int64_t(vector_indexings_.is_ready(field_id));
}
@ -1043,8 +1052,23 @@ ChunkedSegmentSealedImpl::bulk_subscript_array_impl(
const int64_t* seg_offsets,
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst) {
column->BulkArrayAt(
[dst](ScalarArray&& array, size_t i) { dst->at(i) = std::move(array); },
column->BulkArrayAt([dst](ScalarFieldProto&& array,
size_t i) { dst->at(i) = std::move(array); },
seg_offsets,
count);
}
template <typename T>
void
ChunkedSegmentSealedImpl::bulk_subscript_vector_array_impl(
const ChunkedColumnInterface* column,
const int64_t* seg_offsets,
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst) {
column->BulkVectorArrayAt(
[dst](VectorFieldProto&& array, size_t i) {
dst->at(i) = std::move(array);
},
seg_offsets,
count);
}
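Note the contract here: the callback writes by index, so `dst` must already hold `count` entries. In this PR that is arranged by get_raw_data via CreateEmptyVectorDataArray (both appear below), which pre-populates `count` empty VectorField rows. A condensed sketch of that wiring, assumed glue that mirrors the VECTOR_ARRAY case of get_raw_data:
// Allocate `count` placeholder rows, then let the bulk accessor
// overwrite each one in place.
std::unique_ptr<DataArray>
ReadVectorArrayField(const ChunkedColumnInterface* column,
                     const FieldMeta& field_meta,
                     const int64_t* seg_offsets,
                     int64_t count) {
    auto ret = CreateEmptyVectorDataArray(count, field_meta);
    auto* dst = ret->mutable_vectors()->mutable_vector_array()->mutable_data();
    column->BulkVectorArrayAt(
        [dst](VectorFieldProto&& row, size_t i) { dst->at(i) = std::move(row); },
        seg_offsets,
        count);
    return ret;
}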
@ -1090,9 +1114,9 @@ ChunkedSegmentSealedImpl::fill_with_empty(FieldId field_id,
int64_t count) const {
auto& field_meta = schema_->operator[](field_id);
if (IsVectorDataType(field_meta.get_data_type())) {
return CreateVectorDataArray(count, field_meta);
return CreateEmptyVectorDataArray(count, field_meta);
}
return CreateScalarDataArray(count, field_meta);
return CreateEmptyScalarDataArray(count, field_meta);
}
void
@ -1367,6 +1391,14 @@ ChunkedSegmentSealedImpl::get_raw_data(FieldId field_id,
ret->mutable_vectors()->mutable_int8_vector()->data());
break;
}
case DataType::VECTOR_ARRAY: {
bulk_subscript_vector_array_impl(
column.get(),
seg_offsets,
count,
ret->mutable_vectors()->mutable_vector_array()->mutable_data());
break;
}
default: {
PanicInfo(DataTypeInvalid,
fmt::format("unsupported data type {}",
@ -1962,6 +1994,11 @@ ChunkedSegmentSealedImpl::fill_empty_field(const FieldMeta& field_meta) {
field_meta);
break;
}
case milvus::DataType::VECTOR_ARRAY: {
column = std::make_shared<ChunkedVectorArrayColumn>(
std::move(translator), field_meta);
break;
}
default: {
column = std::make_shared<ChunkedColumn>(std::move(translator),
field_meta);

View File

@ -318,6 +318,14 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst);
template <typename T>
static void
bulk_subscript_vector_array_impl(
const ChunkedColumnInterface* column,
const int64_t* seg_offsets,
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst);
static void
bulk_subscript_impl(int64_t element_sizeof,
ChunkedColumnInterface* field,

View File

@ -47,6 +47,14 @@ VectorBase::set_data_raw(ssize_t element_offset,
} else if (field_meta.get_data_type() == DataType::VECTOR_INT8) {
return set_data_raw(
element_offset, VEC_FIELD_DATA(data, int8), element_count);
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
auto& vector_array = data->vectors().vector_array().data();
std::vector<VectorArray> data_raw{};
data_raw.reserve(vector_array.size());
for (auto& e : vector_array) {
data_raw.emplace_back(VectorArray(e));
}
return set_data_raw(element_offset, data_raw.data(), element_count);
} else {
PanicInfo(DataTypeInvalid, "unsupported vector type");
}

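The payload this branch consumes is a DataArray whose vectors().vector_array().data() holds one VectorField per row, each a flattened list of fixed-dim vectors. A minimal sketch of building one such row (hypothetical helper, float element type only, matching what this PR supports):
// Build one VECTOR_ARRAY row: `num_vecs` float vectors of dimension `dim`,
// flattened so vector j occupies [j * dim, (j + 1) * dim).
milvus::proto::schema::VectorField
MakeFloatVectorArrayRow(int64_t num_vecs, int64_t dim, float fill) {
    milvus::proto::schema::VectorField row;
    row.set_dim(dim);
    auto* data = row.mutable_float_vector()->mutable_data();
    for (int64_t i = 0; i < num_vecs * dim; ++i) {
        data->Add(fill);
    }
    return row;
}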
View File

@ -200,7 +200,9 @@ class ConcurrentVectorImpl : public VectorBase {
SpanBase
get_span_base(int64_t chunk_id) const override {
if constexpr (is_type_entire_row) {
if constexpr (std::is_same_v<Type, VectorArray>) {
PanicInfo(NotImplemented, "get_span_base not implemented for VectorArray");
} else if constexpr (is_type_entire_row) {
return chunks_ptr_->get_span(chunk_id);
} else if constexpr (std::is_same_v<Type, int64_t> || // NOLINT
std::is_same_v<Type, int>) {
@ -272,7 +274,9 @@ class ConcurrentVectorImpl : public VectorBase {
int64_t
get_element_size() const override {
if constexpr (is_type_entire_row) {
if constexpr (std::is_same_v<Type, VectorArray>) {
PanicInfo(NotImplemented, "get_element_size not implemented for VectorArray");
} else if constexpr (is_type_entire_row) {
return chunks_ptr_->get_element_size();
} else if constexpr (std::is_same_v<Type, int64_t> || // NOLINT
std::is_same_v<Type, int>) {
@ -484,6 +488,20 @@ class ConcurrentVector<Array> : public ConcurrentVectorImpl<Array, true> {
}
};
template <>
class ConcurrentVector<VectorArray>
: public ConcurrentVectorImpl<VectorArray, true> {
public:
explicit ConcurrentVector(
int64_t dim /* not used */,
int64_t size_per_chunk,
storage::MmapChunkDescriptorPtr mmap_descriptor = nullptr,
ThreadSafeValidDataPtr valid_data_ptr = nullptr)
: ConcurrentVectorImpl<VectorArray, true>::ConcurrentVectorImpl(
1, size_per_chunk, std::move(mmap_descriptor), valid_data_ptr) {
}
};
template <>
class ConcurrentVector<SparseFloatVector>
: public ConcurrentVectorImpl<knowhere::sparse::SparseRow<float>, true> {

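The VectorArray specialization deliberately ignores `dim` (each row carries its own dim) and, as shown just above, panics on the fixed-width get_span_base/get_element_size accessors, since rows are variable-sized objects rather than flat elements. A usage sketch under those assumptions (it skips the reservation a real segment performs via PreInsert):
void
AppendVectorArrayRows() {
    // dim argument is ignored for this specialization; kept for signature parity
    ConcurrentVector<VectorArray> column(/*dim=*/1, /*size_per_chunk=*/1024);

    VectorFieldProto row_proto;
    row_proto.set_dim(4);
    for (int i = 0; i < 2 * 4; ++i) {  // two vectors of dim 4, flattened
        row_proto.mutable_float_vector()->mutable_data()->Add(0.5f);
    }
    std::vector<VectorArray> rows;
    rows.emplace_back(row_proto);
    column.set_data_raw(/*element_offset=*/0, rows.data(), rows.size());
}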
View File

@ -563,6 +563,12 @@ struct InsertRecord<false> : public InsertRecord<true> {
size_per_chunk,
dense_vec_mmap_descriptor);
return;
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
this->append_data<VectorArray>(field_id,
field_meta.get_dim(),
size_per_chunk,
dense_vec_mmap_descriptor);
return;
} else {
PanicInfo(DataTypeInvalid,
fmt::format("unsupported vector type",

View File

@ -42,6 +42,7 @@
#include "storage/RemoteChunkManagerSingleton.h"
#include "storage/Util.h"
#include "storage/ThreadPools.h"
#include "common/TypeTraits.h"
#include "milvus-storage/format/parquet/file_reader.h"
#include "milvus-storage/filesystem/fs.h"
@ -644,7 +645,7 @@ SegmentGrowingImpl::bulk_subscript(
Assert(!dynamic_field_names.empty());
auto& field_meta = schema_->operator[](field_id);
auto vec_ptr = insert_record_.get_data_base(field_id);
auto result = CreateScalarDataArray(count, field_meta);
auto result = CreateEmptyScalarDataArray(count, field_meta);
if (field_meta.is_nullable()) {
auto valid_data_ptr = insert_record_.get_valid_data(field_id);
auto res = result->mutable_valid_data()->mutable_data();
@ -671,7 +672,7 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
auto& field_meta = schema_->operator[](field_id);
auto vec_ptr = insert_record_.get_data_base(field_id);
if (field_meta.is_vector()) {
auto result = CreateVectorDataArray(count, field_meta);
auto result = CreateEmptyVectorDataArray(count, field_meta);
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
bulk_subscript_impl<FloatVector>(field_id,
field_meta.get_sizeof(),
@ -724,6 +725,13 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
seg_offsets,
count,
result->mutable_vectors()->mutable_int8_vector()->data());
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
bulk_subscript_vector_array_impl(*vec_ptr,
seg_offsets,
count,
result->mutable_vectors()
->mutable_vector_array()
->mutable_data());
} else {
PanicInfo(DataTypeInvalid, "logical error");
}
@ -733,7 +741,7 @@ SegmentGrowingImpl::bulk_subscript(FieldId field_id,
AssertInfo(!field_meta.is_vector(),
"Scalar field meta type is vector type");
auto result = CreateScalarDataArray(count, field_meta);
auto result = CreateEmptyScalarDataArray(count, field_meta);
if (field_meta.is_nullable()) {
auto valid_data_ptr = insert_record_.get_valid_data(field_id);
auto res = result->mutable_valid_data()->mutable_data();
@ -992,6 +1000,24 @@ SegmentGrowingImpl::bulk_subscript_array_impl(
}
}
template <typename T>
void
SegmentGrowingImpl::bulk_subscript_vector_array_impl(
const VectorBase& vec_raw,
const int64_t* seg_offsets,
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst) const {
auto vec_ptr = dynamic_cast<const ConcurrentVector<VectorArray>*>(&vec_raw);
AssertInfo(vec_ptr, "vec_raw is not a ConcurrentVector<VectorArray>");
auto& vec = *vec_ptr;
for (int64_t i = 0; i < count; ++i) {
auto offset = seg_offsets[i];
if (offset != INVALID_SEG_OFFSET) {
dst->at(i) = vec[offset].output_data();
}
}
}
void
SegmentGrowingImpl::bulk_subscript(SystemFieldType system_type,
const int64_t* seg_offsets,

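The gather in bulk_subscript_vector_array_impl above relies on a simple identity: a VectorArray built from a VectorFieldProto reproduces that proto via output_data() (rows whose offset is INVALID_SEG_OFFSET are simply left as empty placeholders). A standalone sketch of the round trip, using only accessors exercised by the new test_vector_array.cpp tests later in this diff:
#include <cassert>

void
VectorArrayRoundTrip() {
    VectorFieldProto proto;
    proto.set_dim(2);
    for (float v : {1.0f, 2.0f, 3.0f, 4.0f}) {  // two vectors of dim 2
        proto.mutable_float_vector()->mutable_data()->Add(v);
    }
    VectorArray arr(proto);
    assert(arr.is_same_array(proto));  // same dim / length / content
    VectorFieldProto round_trip = arr.output_data();  // reproduces `proto`
}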
View File

@ -218,6 +218,15 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst) const;
// for vector array fields
template <typename T>
void
bulk_subscript_vector_array_impl(
const VectorBase& vec_raw,
const int64_t* seg_offsets,
int64_t count,
google::protobuf::RepeatedPtrField<T>* dst) const;
template <typename T>
void
bulk_subscript_impl(FieldId field_id,

View File

@ -214,6 +214,7 @@ SegmentInternalInterface::FillTargetEntry(
} else {
col = bulk_subscript(field_id, offsets, size);
}
// todo(SpadeA): consider vector array?
if (field_meta.get_data_type() == DataType::ARRAY) {
col->mutable_scalars()->mutable_array_data()->set_element_type(
proto::schema::DataType(field_meta.get_element_type()));
@ -435,7 +436,7 @@ SegmentInternalInterface::bulk_subscript_not_exist_field(
fmt::format("unsupported added field type {}",
field_meta.get_data_type()));
}
auto result = CreateScalarDataArray(count, field_meta);
auto result = CreateEmptyScalarDataArray(count, field_meta);
if (field_meta.default_value().has_value()) {
auto res = result->mutable_valid_data()->mutable_data();
for (int64_t i = 0; i < count; ++i) {

View File

@ -216,6 +216,23 @@ GetRawDataSizeOfDataArray(const DataArray* data,
result += data->vectors().sparse_float_vector().ByteSizeLong();
break;
}
case DataType::VECTOR_ARRAY: {
auto& obj = data->vectors().vector_array().data();
switch (field_meta.get_element_type()) {
case DataType::VECTOR_FLOAT: {
for (auto& e : obj) {
result += e.float_vector().ByteSizeLong();
}
break;
}
default: {
PanicInfo(NotImplemented,
fmt::format("not implemented vector type {}",
field_meta.get_element_type()));
}
}
break;
}
default: {
PanicInfo(
DataTypeInvalid,
@ -231,7 +248,7 @@ GetRawDataSizeOfDataArray(const DataArray* data,
// modify the bulk subscript implementation to make the process clearer
std::unique_ptr<DataArray>
CreateScalarDataArray(int64_t count, const FieldMeta& field_meta) {
CreateEmptyScalarDataArray(int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
data_array->set_field_id(field_meta.get_id().get());
@ -317,7 +334,7 @@ CreateScalarDataArray(int64_t count, const FieldMeta& field_meta) {
}
std::unique_ptr<DataArray>
CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) {
CreateEmptyVectorDataArray(int64_t count, const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
auto data_array = std::make_unique<DataArray>();
data_array->set_field_id(field_meta.get_id().get());
@ -367,6 +384,16 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) {
obj->resize(length);
break;
}
case DataType::VECTOR_ARRAY: {
auto obj = vector_array->mutable_vector_array();
obj->set_element_type(static_cast<milvus::proto::schema::DataType>(
field_meta.get_element_type()));
obj->mutable_data()->Reserve(count);
for (int i = 0; i < count; i++) {
*(obj->mutable_data()->Add()) = proto::schema::VectorField();
}
break;
}
default: {
PanicInfo(DataTypeInvalid,
fmt::format("unsupported datatype {}", data_type));
@ -453,7 +480,7 @@ CreateScalarDataArrayFrom(const void* data_raw,
break;
}
case DataType::ARRAY: {
auto data = reinterpret_cast<const ScalarArray*>(data_raw);
auto data = reinterpret_cast<const ScalarFieldProto*>(data_raw);
auto obj = scalar_array->mutable_array_data();
obj->set_element_type(static_cast<milvus::proto::schema::DataType>(
field_meta.get_element_type()));
@ -538,6 +565,28 @@ CreateVectorDataArrayFrom(const void* data_raw,
obj->assign(data, length * sizeof(int8));
break;
}
case DataType::VECTOR_ARRAY: {
auto data = reinterpret_cast<const VectorFieldProto*>(data_raw);
auto vector_type = field_meta.get_element_type();
switch (vector_type) {
case DataType::VECTOR_FLOAT: {
auto obj = vector_array->mutable_vector_array();
obj->set_element_type(
milvus::proto::schema::DataType::FloatVector);
obj->set_dim(dim);
for (auto i = 0; i < count; i++) {
*(obj->mutable_data()->Add()) = data[i];
}
break;
}
default: {
PanicInfo(NotImplemented,
fmt::format("not implemented vector type {}",
vector_type));
}
}
break;
}
default: {
PanicInfo(DataTypeInvalid,
fmt::format("unsupported datatype {}", data_type));
@ -553,7 +602,7 @@ CreateDataArrayFrom(const void* data_raw,
const FieldMeta& field_meta) {
auto data_type = field_meta.get_data_type();
if (!IsVectorDataType(data_type)) {
if (!IsVectorDataType(data_type) && data_type != DataType::VECTOR_ARRAY) {
return CreateScalarDataArrayFrom(
data_raw, valid_data, count, field_meta);
}
@ -619,6 +668,8 @@ MergeDataArray(std::vector<MergeBase>& merge_bases,
auto data = VEC_FIELD_DATA(src_field_data, int8);
auto obj = vector_array->mutable_int8_vector();
obj->assign(data, dim * sizeof(int8));
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
PanicInfo(DataTypeInvalid, "VECTOR_ARRAY is not implemented");
} else {
PanicInfo(DataTypeInvalid,
fmt::format("unsupported datatype {}", data_type));

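In isolation, the VECTOR_ARRAY accounting added to GetRawDataSizeOfDataArray above is just a per-row sum of the float payloads. A sketch, assuming the float element type this PR supports:
int64_t
RawSizeOfFloatVectorArray(
    const google::protobuf::RepeatedPtrField<milvus::proto::schema::VectorField>&
        rows) {
    int64_t result = 0;
    for (const auto& row : rows) {
        result += row.float_vector().ByteSizeLong();  // per-row float payload
    }
    return result;
}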
View File

@ -48,10 +48,10 @@ GetRawDataSizeOfDataArray(const DataArray* data,
// Note: this is a temporary solution.
// modify the bulk subscript implementation to make the process clearer
std::unique_ptr<DataArray>
CreateScalarDataArray(int64_t count, const FieldMeta& field_meta);
CreateEmptyScalarDataArray(int64_t count, const FieldMeta& field_meta);
std::unique_ptr<DataArray>
CreateVectorDataArray(int64_t count, const FieldMeta& field_meta);
CreateEmptyVectorDataArray(int64_t count, const FieldMeta& field_meta);
std::unique_ptr<DataArray>
CreateScalarDataArrayFrom(const void* data_raw,

View File

@ -439,6 +439,8 @@ ReduceHelper::GetSearchResultDataSlice(int slice_index) {
->mutable_array_data()
->set_element_type(
proto::schema::DataType(field_meta.get_element_type()));
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
PanicInfo(NotImplemented, "VECTOR_ARRAY is not implemented");
}
search_result_data->mutable_fields_data()->AddAllocated(
field_data.release());

View File

@ -154,7 +154,10 @@ StreamReducerHelper::AssembleMergedResult() {
->mutable_array_data()
->set_element_type(
proto::schema::DataType(field_meta.get_element_type()));
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
PanicInfo(NotImplemented, "VECTOR_ARRAY is not implemented");
}
new_merged_result->output_fields_data_[field_id] =
std::move(field_data);
}
@ -670,6 +673,8 @@ StreamReducerHelper::GetSearchResultDataSlice(int slice_index) {
->mutable_array_data()
->set_element_type(
proto::schema::DataType(field_meta.get_element_type()));
} else if (field_meta.get_data_type() == DataType::VECTOR_ARRAY) {
PanicInfo(NotImplemented, "VECTOR_ARRAY is not implemented");
}
search_result_data->mutable_fields_data()->AddAllocated(
field_data.release());

View File

@ -247,6 +247,8 @@ BaseEventData::Serialize() {
auto data_type = field_data->get_data_type();
std::shared_ptr<PayloadWriter> payload_writer;
if (IsVectorDataType(data_type) &&
// each element is serialized as bytes, so no dim info is needed
data_type != DataType::VECTOR_ARRAY &&
!IsSparseFloatVectorDataType(data_type)) {
payload_writer = std::make_unique<PayloadWriter>(
data_type, field_data->get_dim(), field_data->IsNullable());
@ -310,6 +312,22 @@ BaseEventData::Serialize() {
}
break;
}
case DataType::VECTOR_ARRAY: {
for (size_t offset = 0; offset < field_data->get_num_rows();
++offset) {
auto array = static_cast<const VectorArray*>(
field_data->RawValue(offset));
auto array_string =
array->output_data().SerializeAsString();
auto size = array_string.size();
// todo(SpadeA): it may be better to serialize vectors one by one
payload_writer->add_one_binary_payload(
reinterpret_cast<const uint8_t*>(array_string.c_str()),
size);
}
break;
}
default: {
auto payload =
Payload{data_type,

View File
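The event-data format above stores each row as one opaque binary payload: a serialized VectorField. Reading it back is therefore a plain protobuf parse. A standalone round-trip sketch using only the protobuf API:
void
EventPayloadRoundTrip() {
    VectorFieldProto row;
    row.set_dim(8);
    for (int i = 0; i < 8; ++i) {  // one vector of dim 8
        row.mutable_float_vector()->mutable_data()->Add(0.25f);
    }
    std::string bytes = row.SerializeAsString();  // fed to add_one_binary_payload

    VectorFieldProto parsed;  // reader side: a plain protobuf parse
    bool ok = parsed.ParseFromString(bytes);
    (void)ok;  // ok == true; `parsed` reproduces `row`
}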

@ -73,7 +73,8 @@ PayloadReader::init(const uint8_t* data, int length, bool is_field_data) {
// dim is unused for sparse float vector and vector array
dim_ =
(IsVectorDataType(column_type_) &&
!IsSparseFloatVectorDataType(column_type_))
!IsSparseFloatVectorDataType(column_type_)) &&
!IsVectorArrayDataType(column_type_)
? GetDimensionFromFileMetaData(
file_meta->schema()->Column(column_index), column_type_)
: 1;

View File

@ -284,6 +284,7 @@ CreateArrowBuilder(DataType data_type) {
return std::make_shared<arrow::StringBuilder>();
}
case DataType::ARRAY:
case DataType::VECTOR_ARRAY:
case DataType::JSON: {
return std::make_shared<arrow::BinaryBuilder>();
}
@ -411,6 +412,7 @@ CreateArrowSchema(DataType data_type, bool nullable) {
{arrow::field("val", arrow::utf8(), nullable)});
}
case DataType::ARRAY:
case DataType::VECTOR_ARRAY:
case DataType::JSON: {
return arrow::schema(
{arrow::field("val", arrow::binary(), nullable)});
@ -938,6 +940,9 @@ CreateFieldData(const DataType& type,
case DataType::VECTOR_INT8:
return std::make_shared<FieldData<Int8Vector>>(
dim, type, total_num_rows);
case DataType::VECTOR_ARRAY:
return std::make_shared<FieldData<VectorArray>>(type,
total_num_rows);
default:
PanicInfo(DataTypeInvalid,
"CreateFieldData not support data type " +

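So at the Arrow layer a VECTOR_ARRAY column looks exactly like ARRAY and JSON: a binary column holding one serialized blob per row. A minimal sketch of what the builder above receives (assuming Arrow's C++ BinaryBuilder API; not this PR's code):
#include <arrow/api.h>

arrow::Status
AppendVectorArrayRow(arrow::BinaryBuilder& builder,
                     const milvus::proto::schema::VectorField& row) {
    // one serialized VectorField blob per row, exactly like ARRAY and JSON
    return builder.Append(row.SerializeAsString());
}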
View File

@ -105,6 +105,7 @@ set(MILVUS_TEST_FILES
test_chunked_segment_storage_v2.cpp
test_thread_pool.cpp
test_json_flat_index.cpp
test_vector_array.cpp
)
if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -65,7 +65,7 @@ GenerateArrayData(proto::schema::DataType element_type,
int cardinality,
int size,
int array_len) {
std::vector<ScalarArray> data(size);
std::vector<ScalarFieldProto> data(size);
switch (element_type) {
case proto::schema::DataType::Bool: {
for (int i = 0; i < size; i++) {

View File

@ -564,16 +564,18 @@ TEST(Expr, TestArrayRange) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::map<std::string, std::vector<ScalarArray>> array_cols;
std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_bool_array_col = raw_data.get_col<ScalarArray>(bool_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
auto new_bool_array_col =
raw_data.get_col<ScalarFieldProto>(bool_array_fid);
auto new_string_array_col =
raw_data.get_col<ScalarArray>(string_array_fid);
raw_data.get_col<ScalarFieldProto>(string_array_fid);
auto new_float_array_col =
raw_data.get_col<ScalarArray>(float_array_fid);
raw_data.get_col<ScalarFieldProto>(float_array_fid);
array_cols["long"].insert(array_cols["long"].end(),
new_long_array_col.begin(),
@ -717,11 +719,12 @@ TEST(Expr, TestArrayEqual) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<ScalarArray> long_array_col;
std::vector<ScalarFieldProto> long_array_col;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter, 0, 1, 3);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
long_array_col.insert(long_array_col.end(),
new_long_array_col.begin(),
new_long_array_col.end());
@ -820,13 +823,14 @@ TEST(Expr, TestArrayNullExpr) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::vector<ScalarArray> long_array_col;
std::vector<ScalarFieldProto> long_array_col;
int num_iters = 1;
FixedVector<bool> valid_data;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter, 0, 1, 3);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
long_array_col.insert(long_array_col.end(),
new_long_array_col.begin(),
new_long_array_col.end());
@ -989,19 +993,22 @@ TEST(Expr, TestArrayContains) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::map<std::string, std::vector<ScalarArray>> array_cols;
std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_int_array_col = raw_data.get_col<ScalarArray>(int_array_fid);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_bool_array_col = raw_data.get_col<ScalarArray>(bool_array_fid);
auto new_int_array_col =
raw_data.get_col<ScalarFieldProto>(int_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
auto new_bool_array_col =
raw_data.get_col<ScalarFieldProto>(bool_array_fid);
auto new_float_array_col =
raw_data.get_col<ScalarArray>(float_array_fid);
raw_data.get_col<ScalarFieldProto>(float_array_fid);
auto new_double_array_col =
raw_data.get_col<ScalarArray>(double_array_fid);
raw_data.get_col<ScalarFieldProto>(double_array_fid);
auto new_string_array_col =
raw_data.get_col<ScalarArray>(string_array_fid);
raw_data.get_col<ScalarFieldProto>(string_array_fid);
array_cols["int"].insert(array_cols["int"].end(),
new_int_array_col.begin(),
@ -1512,16 +1519,18 @@ TEST(Expr, TestArrayBinaryArith) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::map<std::string, std::vector<ScalarArray>> array_cols;
std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_int_array_col = raw_data.get_col<ScalarArray>(int_array_fid);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_int_array_col =
raw_data.get_col<ScalarFieldProto>(int_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
auto new_float_array_col =
raw_data.get_col<ScalarArray>(float_array_fid);
raw_data.get_col<ScalarFieldProto>(float_array_fid);
auto new_double_array_col =
raw_data.get_col<ScalarArray>(double_array_fid);
raw_data.get_col<ScalarFieldProto>(double_array_fid);
array_cols["int"].insert(array_cols["int"].end(),
new_int_array_col.begin(),
@ -2477,12 +2486,12 @@ TEST(Expr, TestArrayStringMatch) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::map<std::string, std::vector<ScalarArray>> array_cols;
std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_string_array_col =
raw_data.get_col<ScalarArray>(string_array_fid);
raw_data.get_col<ScalarFieldProto>(string_array_fid);
array_cols["string"].insert(array_cols["string"].end(),
new_string_array_col.begin(),
new_string_array_col.end());
@ -2581,16 +2590,18 @@ TEST(Expr, TestArrayInTerm) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::map<std::string, std::vector<ScalarArray>> array_cols;
std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_bool_array_col = raw_data.get_col<ScalarArray>(bool_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
auto new_bool_array_col =
raw_data.get_col<ScalarFieldProto>(bool_array_fid);
auto new_float_array_col =
raw_data.get_col<ScalarArray>(float_array_fid);
raw_data.get_col<ScalarFieldProto>(float_array_fid);
auto new_string_array_col =
raw_data.get_col<ScalarArray>(string_array_fid);
raw_data.get_col<ScalarFieldProto>(string_array_fid);
array_cols["long"].insert(array_cols["long"].end(),
new_long_array_col.begin(),
new_long_array_col.end());
@ -2798,11 +2809,12 @@ TEST(Expr, TestTermInArray) {
auto seg = CreateGrowingSegment(schema, empty_index_meta);
int N = 1000;
std::map<std::string, std::vector<ScalarArray>> array_cols;
std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
int num_iters = 1;
for (int iter = 0; iter < num_iters; ++iter) {
auto raw_data = DataGen(schema, N, iter);
auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
auto new_long_array_col =
raw_data.get_col<ScalarFieldProto>(long_array_fid);
array_cols["long"].insert(array_cols["long"].end(),
new_long_array_col.begin(),
new_long_array_col.end());

View File

@ -380,16 +380,16 @@ TEST_F(ChunkVectorTest, QueryWithMmap) {
// auto seg = CreateGrowingSegment(schema, empty_index_meta, 22, config);
// int N = 1000;
// std::map<std::string, std::vector<ScalarArray>> array_cols;
// std::map<std::string, std::vector<ScalarFieldProto>> array_cols;
// int num_iters = 1;
// for (int iter = 0; iter < num_iters; ++iter) {
// auto raw_data = DataGen(schema, N, iter);
// auto new_long_array_col = raw_data.get_col<ScalarArray>(long_array_fid);
// auto new_bool_array_col = raw_data.get_col<ScalarArray>(bool_array_fid);
// auto new_long_array_col = raw_data.get_col<ScalarFieldProto>(long_array_fid);
// auto new_bool_array_col = raw_data.get_col<ScalarFieldProto>(bool_array_fid);
// auto new_float_array_col =
// raw_data.get_col<ScalarArray>(float_array_fid);
// raw_data.get_col<ScalarFieldProto>(float_array_fid);
// auto new_string_array_col =
// raw_data.get_col<ScalarArray>(string_array_fid);
// raw_data.get_col<ScalarFieldProto>(string_array_fid);
// array_cols["long"].insert(array_cols["long"].end(),
// new_long_array_col.begin(),
// new_long_array_col.end());

View File

@ -17,6 +17,7 @@
#include "segcore/SegmentGrowingImpl.h"
#include "pb/schema.pb.h"
#include "test_utils/DataGen.h"
#include "test_utils/storage_test_utils.h"
using namespace milvus::segcore;
using namespace milvus;
@ -414,3 +415,129 @@ TEST(Growing, FillNullableData) {
EXPECT_EQ(float_array_result->valid_data_size(), num_inserted);
}
}
TEST_P(GrowingTest, FillVectorArrayData) {
auto schema = std::make_shared<Schema>();
auto int64_field = schema->AddDebugField("int64", DataType::INT64);
auto array_float_vector = schema->AddDebugVectorArrayField(
"array_float_vector", DataType::VECTOR_FLOAT, 128, metric_type);
schema->set_primary_field_id(int64_field);
auto config = SegcoreConfig::default_config();
config.set_chunk_rows(1024);
config.set_enable_interim_segment_index(true);
std::map<FieldId, FieldIndexMeta> fieldMap = {};
IndexMetaPtr metaPtr =
std::make_shared<CollectionIndexMeta>(100000, std::move(fieldMap));
auto segment_growing = CreateGrowingSegment(schema, metaPtr, 1, config);
auto segment = dynamic_cast<SegmentGrowingImpl*>(segment_growing.get());
int64_t per_batch = 1000;
int64_t n_batch = 3;
int64_t dim = 128;
for (int64_t i = 0; i < n_batch; i++) {
auto dataset = DataGen(schema, per_batch);
auto offset = segment->PreInsert(per_batch);
segment->Insert(offset,
per_batch,
dataset.row_ids_.data(),
dataset.timestamps_.data(),
dataset.raw_);
auto num_inserted = (i + 1) * per_batch;
auto ids_ds = GenRandomIds(num_inserted);
auto int64_result = segment->bulk_subscript(
int64_field, ids_ds->GetIds(), num_inserted);
auto array_float_vector_result = segment->bulk_subscript(
array_float_vector, ids_ds->GetIds(), num_inserted);
EXPECT_EQ(int64_result->scalars().long_data().data_size(),
num_inserted);
EXPECT_EQ(
array_float_vector_result->vectors().vector_array().data_size(),
num_inserted);
if (i == 0) {
// Verify vector array data
auto verify_float_vectors = [](auto arr1, auto arr2) {
static constexpr float EPSILON = 1e-6;
EXPECT_EQ(arr1.size(), arr2.size());
for (int64_t i = 0; i < arr1.size(); ++i) {
EXPECT_NEAR(arr1[i], arr2[i], EPSILON);
}
};
auto array_vec_values =
dataset.get_col<VectorFieldProto>(array_float_vector);
for (int64_t i = 0; i < per_batch; ++i) {
auto arrow_array = array_float_vector_result->vectors()
.vector_array()
.data()[i]
.float_vector()
.data();
auto expected_array =
array_vec_values[ids_ds->GetIds()[i]].float_vector().data();
verify_float_vectors(arrow_array, expected_array);
}
}
EXPECT_EQ(int64_result->valid_data_size(), 0);
EXPECT_EQ(array_float_vector_result->valid_data_size(), 0);
}
}
TEST(GrowingTest, LoadVectorArrayData) {
auto schema = std::make_shared<Schema>();
auto metric_type = knowhere::metric::L2;
auto int64_field = schema->AddDebugField("int64", DataType::INT64);
auto array_float_vector = schema->AddDebugVectorArrayField(
"array_vec", DataType::VECTOR_FLOAT, 128, metric_type);
schema->set_primary_field_id(int64_field);
auto config = SegcoreConfig::default_config();
config.set_chunk_rows(1024);
config.set_enable_interim_segment_index(true);
std::map<FieldId, FieldIndexMeta> fieldMap = {};
IndexMetaPtr metaPtr =
std::make_shared<CollectionIndexMeta>(100000, std::move(fieldMap));
int64_t dataset_size = 1000;
int64_t dim = 128;
auto dataset = DataGen(schema, dataset_size);
auto segment_growing =
CreateGrowingWithFieldDataLoaded(schema, metaPtr, config, dataset);
auto segment = segment_growing.get();
// Verify data
auto int64_values = dataset.get_col<int64_t>(int64_field);
auto array_vec_values =
dataset.get_col<VectorFieldProto>(array_float_vector);
auto ids_ds = GenRandomIds(dataset_size);
auto int64_result =
segment->bulk_subscript(int64_field, ids_ds->GetIds(), dataset_size);
auto array_float_vector_result = segment->bulk_subscript(
array_float_vector, ids_ds->GetIds(), dataset_size);
EXPECT_EQ(int64_result->scalars().long_data().data_size(), dataset_size);
EXPECT_EQ(array_float_vector_result->vectors().vector_array().data_size(),
dataset_size);
auto verify_float_vectors = [](auto arr1, auto arr2) {
static constexpr float EPSILON = 1e-6;
EXPECT_EQ(arr1.size(), arr2.size());
for (int64_t i = 0; i < arr1.size(); ++i) {
EXPECT_NEAR(arr1[i], arr2[i], EPSILON);
}
};
for (int64_t i = 0; i < dataset_size; ++i) {
auto arrow_array = array_float_vector_result->vectors()
.vector_array()
.data()[i]
.float_vector()
.data();
auto expected_array =
array_vec_values[ids_ds->GetIds()[i]].float_vector().data();
verify_float_vectors(arrow_array, expected_array);
}
}

View File

@ -1992,12 +1992,17 @@ TEST(Sealed, QueryAllFields) {
auto double_values = dataset.get_col<double>(double_field);
auto varchar_values = dataset.get_col<std::string>(varchar_field);
auto json_values = dataset.get_col<std::string>(json_field);
auto int_array_values = dataset.get_col<ScalarArray>(int_array_field);
auto long_array_values = dataset.get_col<ScalarArray>(long_array_field);
auto bool_array_values = dataset.get_col<ScalarArray>(bool_array_field);
auto string_array_values = dataset.get_col<ScalarArray>(string_array_field);
auto double_array_values = dataset.get_col<ScalarArray>(double_array_field);
auto float_array_values = dataset.get_col<ScalarArray>(float_array_field);
auto int_array_values = dataset.get_col<ScalarFieldProto>(int_array_field);
auto long_array_values =
dataset.get_col<ScalarFieldProto>(long_array_field);
auto bool_array_values =
dataset.get_col<ScalarFieldProto>(bool_array_field);
auto string_array_values =
dataset.get_col<ScalarFieldProto>(string_array_field);
auto double_array_values =
dataset.get_col<ScalarFieldProto>(double_array_field);
auto float_array_values =
dataset.get_col<ScalarFieldProto>(float_array_field);
auto vector_values = dataset.get_col<float>(vec);
auto float16_vector_values = dataset.get_col<uint8_t>(float16_vec);
auto bfloat16_vector_values = dataset.get_col<uint8_t>(bfloat16_vec);
@ -2149,12 +2154,17 @@ TEST(Sealed, QueryAllNullableFields) {
auto double_values = dataset.get_col<double>(double_field);
auto varchar_values = dataset.get_col<std::string>(varchar_field);
auto json_values = dataset.get_col<std::string>(json_field);
auto int_array_values = dataset.get_col<ScalarArray>(int_array_field);
auto long_array_values = dataset.get_col<ScalarArray>(long_array_field);
auto bool_array_values = dataset.get_col<ScalarArray>(bool_array_field);
auto string_array_values = dataset.get_col<ScalarArray>(string_array_field);
auto double_array_values = dataset.get_col<ScalarArray>(double_array_field);
auto float_array_values = dataset.get_col<ScalarArray>(float_array_field);
auto int_array_values = dataset.get_col<ScalarFieldProto>(int_array_field);
auto long_array_values =
dataset.get_col<ScalarFieldProto>(long_array_field);
auto bool_array_values =
dataset.get_col<ScalarFieldProto>(bool_array_field);
auto string_array_values =
dataset.get_col<ScalarFieldProto>(string_array_field);
auto double_array_values =
dataset.get_col<ScalarFieldProto>(double_array_field);
auto float_array_values =
dataset.get_col<ScalarFieldProto>(float_array_field);
auto vector_values = dataset.get_col<float>(vec);
auto bool_valid_values = dataset.get_col_valid(bool_field);
@ -2269,3 +2279,57 @@ TEST(Sealed, SearchSortedPk) {
EXPECT_EQ(6, offsets2.size());
EXPECT_EQ(100, offsets2[0].get());
}
TEST(Sealed, QueryVectorArrayAllFields) {
auto schema = std::make_shared<Schema>();
auto metric_type = knowhere::metric::L2;
auto int64_field = schema->AddDebugField("int64", DataType::INT64);
auto array_vec = schema->AddDebugVectorArrayField(
"array_vec", DataType::VECTOR_FLOAT, 128, metric_type);
schema->set_primary_field_id(int64_field);
std::map<FieldId, FieldIndexMeta> fieldMap{};
IndexMetaPtr metaPtr =
std::make_shared<CollectionIndexMeta>(100000, std::move(fieldMap));
int64_t dataset_size = 1000;
int64_t dim = 128;
auto dataset = DataGen(schema, dataset_size);
auto segment_sealed = CreateSealedWithFieldDataLoaded(schema, dataset);
auto segment =
dynamic_cast<ChunkedSegmentSealedImpl*>(segment_sealed.get());
auto int64_values = dataset.get_col<int64_t>(int64_field);
auto array_vec_values = dataset.get_col<VectorFieldProto>(array_vec);
auto ids_ds = GenRandomIds(dataset_size);
auto int64_result =
segment->bulk_subscript(int64_field, ids_ds->GetIds(), dataset_size);
auto array_float_vector_result =
segment->bulk_subscript(array_vec, ids_ds->GetIds(), dataset_size);
EXPECT_EQ(int64_result->scalars().long_data().data_size(), dataset_size);
EXPECT_EQ(array_float_vector_result->vectors().vector_array().data_size(),
dataset_size);
auto verify_float_vectors = [](auto arr1, auto arr2) {
static constexpr float EPSILON = 1e-6;
EXPECT_EQ(arr1.size(), arr2.size());
for (int64_t i = 0; i < arr1.size(); ++i) {
EXPECT_NEAR(arr1[i], arr2[i], EPSILON);
}
};
for (int64_t i = 0; i < dataset_size; ++i) {
auto arrow_array = array_float_vector_result->vectors()
.vector_array()
.data()[i]
.float_vector()
.data();
auto expected_array =
array_vec_values[ids_ds->GetIds()[i]].float_vector().data();
verify_float_vectors(arrow_array, expected_array);
}
EXPECT_EQ(int64_result->valid_data_size(), 0);
EXPECT_EQ(array_float_vector_result->valid_data_size(), 0);
}

View File

@ -105,129 +105,150 @@ struct GeneratedData {
}
auto& field_meta = schema_->operator[](field_id);
if (field_meta.is_vector() &&
field_meta.get_data_type() != DataType::VECTOR_SPARSE_FLOAT) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data =
reinterpret_cast<const T*>(target_field_data.vectors()
.float_vector()
.data()
.data());
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_BINARY) {
int len = raw_->num_rows() * (field_meta.get_dim() / 8);
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.vectors().binary_vector().data());
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_FLOAT16) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.vectors().float16_vector().data());
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_BFLOAT16) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.vectors().bfloat16_vector().data());
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_INT8) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.vectors().int8_vector().data());
std::copy_n(src_data, len, ret.data());
} else {
PanicInfo(Unsupported, "unsupported");
}
return std::move(ret);
}
if constexpr (std::is_same_v<T,
knowhere::sparse::SparseRow<float>>) {
auto sparse_float_array =
target_field_data.vectors().sparse_float_vector();
auto rows = SparseBytesToRows(sparse_float_array.contents());
std::copy_n(rows.get(), raw_->num_rows(), ret.data());
} else if constexpr (std::is_same_v<T, ScalarArray>) {
auto ret_data = reinterpret_cast<ScalarArray*>(ret.data());
auto src_data = target_field_data.scalars().array_data().data();
if constexpr (std::is_same_v<T, VectorFieldProto>) {
auto ret_data = reinterpret_cast<VectorFieldProto*>(ret.data());
auto src_data =
target_field_data.vectors().vector_array().data();
std::copy(src_data.begin(), src_data.end(), ret_data);
return std::move(ret);
} else {
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
if (field_meta.is_vector() &&
field_meta.get_data_type() !=
DataType::VECTOR_SPARSE_FLOAT) {
if (field_meta.get_data_type() == DataType::VECTOR_FLOAT) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.bool_data()
target_field_data.vectors()
.float_vector()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::INT8:
case DataType::INT16:
case DataType::INT32: {
auto src_data = reinterpret_cast<const int32_t*>(
target_field_data.scalars()
.int_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::INT64: {
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_BINARY) {
int len = raw_->num_rows() * (field_meta.get_dim() / 8);
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.long_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::FLOAT: {
target_field_data.vectors().binary_vector().data());
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_FLOAT16) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.float_data()
.data()
target_field_data.vectors()
.float16_vector()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::DOUBLE: {
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_BFLOAT16) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.double_data()
.data()
target_field_data.vectors()
.bfloat16_vector()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::VARCHAR: {
auto ret_data =
reinterpret_cast<std::string*>(ret.data());
auto src_data =
target_field_data.scalars().string_data().data();
std::copy(src_data.begin(), src_data.end(), ret_data);
break;
}
case DataType::JSON: {
auto ret_data =
reinterpret_cast<std::string*>(ret.data());
auto src_data =
target_field_data.scalars().json_data().data();
std::copy(src_data.begin(), src_data.end(), ret_data);
break;
}
default: {
std::copy_n(src_data, len, ret.data());
} else if (field_meta.get_data_type() ==
DataType::VECTOR_INT8) {
int len = raw_->num_rows() * field_meta.get_dim();
ret.resize(len);
auto src_data = reinterpret_cast<const T*>(
target_field_data.vectors().int8_vector().data());
std::copy_n(src_data, len, ret.data());
} else {
PanicInfo(Unsupported, "unsupported");
}
return std::move(ret);
}
if constexpr (std::is_same_v<
T,
knowhere::sparse::SparseRow<float>>) {
auto sparse_float_array =
target_field_data.vectors().sparse_float_vector();
auto rows =
SparseBytesToRows(sparse_float_array.contents());
std::copy_n(rows.get(), raw_->num_rows(), ret.data());
} else if constexpr (std::is_same_v<T, ScalarFieldProto>) {
auto ret_data =
reinterpret_cast<ScalarFieldProto*>(ret.data());
auto src_data =
target_field_data.scalars().array_data().data();
std::copy(src_data.begin(), src_data.end(), ret_data);
} else {
switch (field_meta.get_data_type()) {
case DataType::BOOL: {
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.bool_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::INT8:
case DataType::INT16:
case DataType::INT32: {
auto src_data = reinterpret_cast<const int32_t*>(
target_field_data.scalars()
.int_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::INT64: {
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.long_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::FLOAT: {
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.float_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::DOUBLE: {
auto src_data = reinterpret_cast<const T*>(
target_field_data.scalars()
.double_data()
.data()
.data());
std::copy_n(src_data, raw_->num_rows(), ret.data());
break;
}
case DataType::VARCHAR: {
auto ret_data =
reinterpret_cast<std::string*>(ret.data());
auto src_data = target_field_data.scalars()
.string_data()
.data();
std::copy(
src_data.begin(), src_data.end(), ret_data);
break;
}
case DataType::JSON: {
auto ret_data =
reinterpret_cast<std::string*>(ret.data());
auto src_data =
target_field_data.scalars().json_data().data();
std::copy(
src_data.begin(), src_data.end(), ret_data);
break;
}
default: {
PanicInfo(Unsupported, "unsupported");
}
}
}
}
}
@ -412,55 +433,96 @@ DataGen(SchemaPtr schema,
insert_data->mutable_fields_data()->AddAllocated(array.release());
};
auto generate_float_vector = [&seed, &offset, &random, &distr](
auto& field_meta, int64_t N) {
auto dim = field_meta.get_dim();
vector<float> final(dim * N);
bool is_ip = starts_with(field_meta.get_name().get(), "normalized");
#pragma omp parallel for
for (int n = 0; n < N; ++n) {
vector<float> data(dim);
float sum = 0;
std::default_random_engine er2(seed + n);
std::normal_distribution<> distr2(0, 1);
for (auto& x : data) {
x = distr2(er2) + offset;
sum += x * x;
}
if (is_ip) {
sum = sqrt(sum);
for (auto& x : data) {
x /= sum;
}
}
std::copy(data.begin(), data.end(), final.begin() + dim * n);
}
return final;
};
auto generate_binary_vector = [&seed, &offset, &random](auto& field_meta,
int64_t N) {
auto dim = field_meta.get_dim();
Assert(dim % 8 == 0);
vector<uint8_t> data(dim / 8 * N);
for (auto& x : data) {
x = random();
}
return data;
};
auto generate_float16_vector = [&seed, &offset, &random, &distr](
auto& field_meta, int64_t N) {
auto dim = field_meta.get_dim();
vector<float16> data(dim * N);
for (auto& x : data) {
x = float16(distr(random) + offset);
}
return data;
};
auto generate_bfloat16_vector = [&seed, &offset, &random, &distr](
auto& field_meta, int64_t N) {
auto dim = field_meta.get_dim();
vector<bfloat16> data(dim * N);
for (auto& x : data) {
x = bfloat16(distr(random) + offset);
}
return data;
};
auto generate_int8_vector = [&seed, &offset, &random](auto& field_meta,
int64_t N) {
auto dim = field_meta.get_dim();
vector<int8_t> data(dim * N);
for (auto& x : data) {
x = int8_t(random() % 256 - 128);
}
return data;
};
for (auto field_id : schema->get_field_ids()) {
auto field_meta = schema->operator[](field_id);
switch (field_meta.get_data_type()) {
case DataType::VECTOR_FLOAT: {
auto dim = field_meta.get_dim();
vector<float> final(dim * N);
bool is_ip =
starts_with(field_meta.get_name().get(), "normalized");
#pragma omp parallel for
for (int n = 0; n < N; ++n) {
vector<float> data(dim);
float sum = 0;
std::default_random_engine er2(seed + n);
std::normal_distribution<> distr2(0, 1);
for (auto& x : data) {
x = distr2(er2) + offset;
sum += x * x;
}
if (is_ip) {
sum = sqrt(sum);
for (auto& x : data) {
x /= sum;
}
}
std::copy(
data.begin(), data.end(), final.begin() + dim * n);
}
insert_cols(final, N, field_meta, random_valid);
auto data = generate_float_vector(field_meta, N);
insert_cols(data, N, field_meta, random_valid);
break;
}
case DataType::VECTOR_BINARY: {
auto dim = field_meta.get_dim();
Assert(dim % 8 == 0);
vector<uint8_t> data(dim / 8 * N);
for (auto& x : data) {
x = random();
}
auto data = generate_binary_vector(field_meta, N);
insert_cols(data, N, field_meta, random_valid);
break;
}
case DataType::VECTOR_FLOAT16: {
auto dim = field_meta.get_dim();
vector<float16> final(dim * N);
for (auto& x : final) {
x = float16(distr(random) + offset);
}
insert_cols(final, N, field_meta, random_valid);
auto data = generate_float16_vector(field_meta, N);
insert_cols(data, N, field_meta, random_valid);
break;
}
case DataType::VECTOR_BFLOAT16: {
auto data = generate_bfloat16_vector(field_meta, N);
insert_cols(data, N, field_meta, random_valid);
break;
}
case DataType::VECTOR_SPARSE_FLOAT: {
@ -472,23 +534,81 @@ DataGen(SchemaPtr schema,
array.release());
break;
}
case DataType::VECTOR_BFLOAT16: {
auto dim = field_meta.get_dim();
vector<bfloat16> final(dim * N);
for (auto& x : final) {
x = bfloat16(distr(random) + offset);
}
insert_cols(final, N, field_meta, random_valid);
case DataType::VECTOR_INT8: {
auto data = generate_int8_vector(field_meta, N);
insert_cols(data, N, field_meta, random_valid);
break;
}
case DataType::VECTOR_INT8: {
case DataType::VECTOR_ARRAY: {
auto dim = field_meta.get_dim();
vector<int8> final(dim * N);
srand(seed);
for (auto& x : final) {
x = int8_t(rand() % 256 - 128);
vector<VectorFieldProto> vector_array(N);
for (int i = 0; i < N / repeat_count; ++i) {
VectorFieldProto field_data;
field_data.set_dim(dim);
switch (field_meta.get_element_type()) {
case DataType::VECTOR_FLOAT: {
auto data =
generate_float_vector(field_meta, array_len);
field_data.mutable_float_vector()
->mutable_data()
->Add(data.begin(), data.end());
break;
}
case DataType::VECTOR_BINARY: {
auto num_bytes = array_len * dim / 8;
auto data_raw =
generate_binary_vector(field_meta, array_len);
auto data =
reinterpret_cast<const char*>(data_raw.data());
auto obj = field_data.mutable_binary_vector();
obj->assign(data, num_bytes);
break;
}
case DataType::VECTOR_FLOAT16: {
auto length = array_len * dim;
auto data_raw =
generate_float16_vector(field_meta, array_len);
auto data =
reinterpret_cast<const char*>(data_raw.data());
auto obj = field_data.mutable_float16_vector();
obj->assign(data, length * sizeof(float16));
break;
}
case DataType::VECTOR_SPARSE_FLOAT:
PanicInfo(DataTypeInvalid, "not implemented");
break;
case DataType::VECTOR_BFLOAT16: {
auto length = array_len * dim;
auto data_raw =
generate_bfloat16_vector(field_meta, array_len);
auto data =
reinterpret_cast<const char*>(data_raw.data());
auto obj = field_data.mutable_bfloat16_vector();
obj->assign(data, length * sizeof(bfloat16));
break;
}
case DataType::VECTOR_INT8: {
auto length = array_len * dim;
auto data_raw =
generate_int8_vector(field_meta, array_len);
auto data =
reinterpret_cast<const char*>(data_raw.data());
auto obj = field_data.mutable_int8_vector();
obj->assign(data, length * sizeof(int8_t));
break;
}
default: {
PanicInfo(DataTypeInvalid, "not implemented");
}
}
for (int j = 0; j < repeat_count; ++j) {
vector_array[i * repeat_count + j] = field_data;
}
}
insert_cols(final, N, field_meta, random_valid);
insert_cols(vector_array, N, field_meta, random_valid);
break;
}
case DataType::BOOL: {
@ -594,7 +714,7 @@ DataGen(SchemaPtr schema,
break;
}
case DataType::ARRAY: {
vector<ScalarArray> data(N);
vector<ScalarFieldProto> data(N);
switch (field_meta.get_element_type()) {
case DataType::BOOL: {
for (int i = 0; i < N / repeat_count; i++) {
@ -1050,6 +1170,16 @@ CreateFieldDataFromDataArray(ssize_t raw_count,
createFieldData(raw_data, DataType::VECTOR_INT8, dim);
break;
}
case DataType::VECTOR_ARRAY: {
auto src_data = data->vectors().vector_array().data();
auto dim = field_meta.get_dim();
std::vector<VectorArray> data_raw(src_data.size());
for (int i = 0; i < src_data.size(); i++) {
data_raw[i] = VectorArray(src_data.at(i));
}
createFieldData(data_raw.data(), DataType::VECTOR_ARRAY, dim);
break;
}
default: {
PanicInfo(Unsupported, "unsupported");
}

View File

@ -184,7 +184,7 @@ PrepareSingleFieldInsertBinlog(int64_t collection_id,
inline void
LoadGeneratedDataIntoSegment(const GeneratedData& dataset,
milvus::segcore::SegmentSealed* segment,
milvus::segcore::SegmentInternalInterface* segment,
bool with_mmap = false,
std::vector<int64_t> excluded_field_ids = {}) {
std::string mmap_dir_path = with_mmap ? "./data/mmap-test" : "";
@ -215,6 +215,19 @@ CreateSealedWithFieldDataLoaded(milvus::SchemaPtr schema,
return segment;
}
inline std::unique_ptr<milvus::segcore::SegmentGrowing>
CreateGrowingWithFieldDataLoaded(milvus::SchemaPtr schema,
milvus::IndexMetaPtr indexMeta,
const milvus::segcore::SegcoreConfig& config,
const GeneratedData& dataset,
bool with_mmap = false,
std::vector<int64_t> excluded_field_ids = {}) {
auto segment_growing = CreateGrowingSegment(schema, indexMeta, 1, config);
LoadGeneratedDataIntoSegment(
dataset, segment_growing.get(), with_mmap, excluded_field_ids);
return segment_growing;
}
inline std::vector<int64_t>
GetExcludedFieldIds(milvus::SchemaPtr schema,
std::vector<int64_t> field_ids_to_load) {

View File

@ -0,0 +1,135 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <random>
#include <vector>
#include <string>
#include "common/VectorArray.h"
#include "pb/schema.pb.h"
#include "common/Schema.h"
using namespace milvus;
TEST(VectorArray, TestSchema) {
namespace pb = milvus::proto;
pb::schema::CollectionSchema proto;
proto.set_name("col");
proto.set_description("asdfhsalkgfhsadg");
auto dim = 16;
bool bool_default_value = true;
int32_t int_default_value = 20;
int64_t long_default_value = 20;
float float_default_value = 20;
double double_default_value = 20;
std::string varchar_default_value = "20";
{
auto field = proto.add_fields();
field->set_name("key");
field->set_nullable(false);
field->set_fieldid(100);
field->set_is_primary_key(true);
field->set_description("asdgfsagf");
field->set_data_type(pb::schema::DataType::Int64);
}
{
auto struct_field = proto.add_struct_array_fields();
struct_field->set_name("struct");
struct_field->set_fieldid(101);
auto field = struct_field->add_fields();
field->set_name("struct_key");
field->set_nullable(false);
field->set_fieldid(102);
field->set_data_type(pb::schema::DataType::Array);
field->set_element_type(pb::schema::DataType::Int64);
auto field2 = struct_field->add_fields();
field2->set_name("struct_float_vec");
field2->set_fieldid(103);
field2->set_data_type(pb::schema::DataType::ArrayOfVector);
field2->set_element_type(pb::schema::DataType::FloatVector);
auto param = field2->add_type_params();
param->set_key("dim");
param->set_value("16");
auto iparam = field2->add_index_params();
iparam->set_key("metric_type");
iparam->set_value("L2");
}
auto schema = Schema::ParseFrom(proto);
auto field = schema->operator[](FieldId(102));
ASSERT_EQ(field.get_data_type(), DataType::ARRAY);
ASSERT_EQ(field.get_element_type(), DataType::INT64);
auto field2 = schema->operator[](FieldId(103));
ASSERT_EQ(field2.get_data_type(), DataType::VECTOR_ARRAY);
ASSERT_EQ(field2.get_element_type(), DataType::VECTOR_FLOAT);
ASSERT_EQ(field2.get_dim(), 16);
}
std::vector<float>
generate_float_vector(int64_t seed, int64_t N, int64_t dim) {
std::vector<float> final(dim * N);
for (int n = 0; n < N; ++n) {
// generate random float vector
std::vector<float> data(dim);
std::default_random_engine er2(seed + n);
std::normal_distribution<> distr2(0, 1);
for (auto& x : data) {
x = distr2(er2);
}
std::copy(data.begin(), data.end(), final.begin() + dim * n);
}
return final;
}
TEST(VectorArray, TestConstructVectorArray) {
using namespace milvus;
int N = 10;
// 1. test float vector
int64_t dim = 128;
milvus::proto::schema::VectorField field_float_vector_array;
field_float_vector_array.set_dim(dim);
auto data = generate_float_vector(100, N, dim);
field_float_vector_array.mutable_float_vector()->mutable_data()->Add(
data.begin(), data.end());
auto float_vector_array = VectorArray(field_float_vector_array);
ASSERT_EQ(float_vector_array.length(), N);
ASSERT_EQ(float_vector_array.dim(), dim);
ASSERT_EQ(float_vector_array.get_element_type(), DataType::VECTOR_FLOAT);
ASSERT_EQ(float_vector_array.byte_size(), N * dim * sizeof(float));
ASSERT_TRUE(float_vector_array.is_same_array(field_float_vector_array));
auto float_vector_array_tmp = VectorArray(float_vector_array);
ASSERT_TRUE(float_vector_array_tmp.is_same_array(field_float_vector_array));
auto float_vector_array_view =
VectorArrayView(const_cast<char*>(float_vector_array.data()),
float_vector_array.length(),
float_vector_array.dim(),
float_vector_array.byte_size(),
float_vector_array.get_element_type());
ASSERT_TRUE(
float_vector_array_view.is_same_array(field_float_vector_array));
// todo: add other vector types
}

View File

@ -729,7 +729,7 @@ func Test_createCollectionTask_prepareSchema(t *testing.T) {
Fields: []*schemapb.FieldSchema{
{
Name: field1,
DataType: 200,
DataType: 300,
},
},
}

View File

@ -20,7 +20,7 @@ require (
github.com/jolestar/go-commons-pool/v2 v2.1.2
github.com/json-iterator/go v1.1.12
github.com/klauspost/compress v1.17.9
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250527033021-e6b398e94ee6
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847
github.com/minio/minio-go/v7 v7.0.73
github.com/panjf2000/ants/v2 v2.11.3
github.com/prometheus/client_golang v1.14.0

View File

@ -559,6 +559,8 @@ github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZz
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250527033021-e6b398e94ee6 h1:/m5rWCbnFL0Pg2KLS6Lw/lzQfdEb4qa2dLEqE4QjpkU=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250527033021-e6b398e94ee6/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847 h1:6MFC+EIDe+n1aSX0BZ4s1/XbbPaEzQCx6JBoM1NTTz0=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/pulsar-client-go v0.12.1 h1:O2JZp1tsYiO7C0MQ4hrUY/aJXnn2Gry6hpm7UodghmE=
github.com/milvus-io/pulsar-client-go v0.12.1/go.mod h1:dkutuH4oS2pXiGm+Ti7fQZ4MRjrMPZ8IJeEGAWMeckk=
github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=

View File

@ -51,7 +51,7 @@ require (
github.com/kr/text v0.2.0 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250520065019-02ce2e62a9fd // indirect
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/opencontainers/runtime-spec v1.0.2 // indirect

View File

@ -318,8 +318,8 @@ github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfr
github.com/mediocregopher/radix/v3 v3.4.2/go.mod h1:8FL3F6UQRXHXIBSPUs5h0RybMF8i4n7wVopoX3x7Bv8=
github.com/microcosm-cc/bluemonday v1.0.2/go.mod h1:iVP4YcDBq+n/5fb23BhYFvIMq/leAFZyRl6bYmGDlGc=
github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250520065019-02ce2e62a9fd h1:nriBHIny3LiSU56kP2FiIVtxh/0EEFXBpZk4WirsR7s=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250520065019-02ce2e62a9fd/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847 h1:6MFC+EIDe+n1aSX0BZ4s1/XbbPaEzQCx6JBoM1NTTz0=
github.com/milvus-io/milvus-proto/go-api/v2 v2.5.0-beta.0.20250604032224-16218b12b847/go.mod h1:/6UT4zZl6awVeXLeE7UGDWZvXj3IWkRsh3mqsn0DiAs=
github.com/milvus-io/milvus/pkg/v2 v2.0.0-20250319085209-5a6b4e56d59e h1:VCr43pG4efacDbM4au70fh8/5hNTftoWzm1iEumvDWM=
github.com/milvus-io/milvus/pkg/v2 v2.0.0-20250319085209-5a6b4e56d59e/go.mod h1:37AWzxVs2NS4QUJrkcbeLUwi+4Av0h5mEdjLI62EANU=
github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc=