mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
enhance: speed up array-equal operator via inverted index (#33633)
fix: #33632 --------- Signed-off-by: longjiquan <jiquan.long@zilliz.com>
This commit is contained in:
parent
fa26953168
commit
ecf2bcee42
@ -51,6 +51,15 @@ class Schema {
|
||||
return field_id;
|
||||
}
|
||||
|
||||
// Adds an ARRAY field called `name` whose elements have type `element_type`.
// The field id is taken from the running `debug_id` counter, which is advanced
// by one; the id that was assigned is returned.
FieldId
AddDebugArrayField(const std::string& name, DataType element_type) {
    auto assigned_id = FieldId(debug_id);
    ++debug_id;
    this->AddField(FieldName(name), assigned_id, DataType::ARRAY, element_type);
    return assigned_id;
}
|
||||
|
||||
// auto gen field_id for convenience
|
||||
FieldId
|
||||
AddDebugField(const std::string& name,
|
||||
|
||||
@ -280,6 +280,22 @@ class SegmentExpr : public Expr {
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename T, typename FUNC, typename... ValTypes>
|
||||
void
|
||||
ProcessIndexChunksV2(FUNC func, ValTypes... values) {
|
||||
typedef std::
|
||||
conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
|
||||
IndexInnerType;
|
||||
using Index = index::ScalarIndex<IndexInnerType>;
|
||||
|
||||
for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
|
||||
const Index& index =
|
||||
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
|
||||
auto* index_ptr = const_cast<Index*>(&index);
|
||||
func(index_ptr, values...);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
CanUseIndex(OpType op) const {
|
||||
|
||||
@ -20,6 +20,66 @@
|
||||
namespace milvus {
|
||||
namespace exec {
|
||||
|
||||
template <typename T>
|
||||
VectorPtr
|
||||
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex() {
|
||||
return ExecRangeVisitorImplArray<T>();
|
||||
}
|
||||
|
||||
template <>
|
||||
VectorPtr
|
||||
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex<
|
||||
proto::plan::Array>() {
|
||||
switch (expr_->op_type_) {
|
||||
case proto::plan::Equal:
|
||||
case proto::plan::NotEqual: {
|
||||
switch (expr_->column_.element_type_) {
|
||||
case DataType::BOOL: {
|
||||
return ExecArrayEqualForIndex<bool>(expr_->op_type_ ==
|
||||
proto::plan::NotEqual);
|
||||
}
|
||||
case DataType::INT8: {
|
||||
return ExecArrayEqualForIndex<int8_t>(
|
||||
expr_->op_type_ == proto::plan::NotEqual);
|
||||
}
|
||||
case DataType::INT16: {
|
||||
return ExecArrayEqualForIndex<int16_t>(
|
||||
expr_->op_type_ == proto::plan::NotEqual);
|
||||
}
|
||||
case DataType::INT32: {
|
||||
return ExecArrayEqualForIndex<int32_t>(
|
||||
expr_->op_type_ == proto::plan::NotEqual);
|
||||
}
|
||||
case DataType::INT64: {
|
||||
return ExecArrayEqualForIndex<int64_t>(
|
||||
expr_->op_type_ == proto::plan::NotEqual);
|
||||
}
|
||||
case DataType::FLOAT:
|
||||
case DataType::DOUBLE: {
|
||||
// not accurate on floating point number, rollback to bruteforce.
|
||||
return ExecRangeVisitorImplArray<proto::plan::Array>();
|
||||
}
|
||||
case DataType::VARCHAR: {
|
||||
if (segment_->type() == SegmentType::Growing) {
|
||||
return ExecArrayEqualForIndex<std::string>(
|
||||
expr_->op_type_ == proto::plan::NotEqual);
|
||||
} else {
|
||||
return ExecArrayEqualForIndex<std::string_view>(
|
||||
expr_->op_type_ == proto::plan::NotEqual);
|
||||
}
|
||||
}
|
||||
default:
|
||||
PanicInfo(DataTypeInvalid,
|
||||
"unsupported element type when execute array "
|
||||
"equal for index: {}",
|
||||
expr_->column_.element_type_);
|
||||
}
|
||||
}
|
||||
default:
|
||||
return ExecRangeVisitorImplArray<proto::plan::Array>();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
|
||||
switch (expr_->column_.data_type_) {
|
||||
@ -99,7 +159,13 @@ PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
|
||||
result = ExecRangeVisitorImplArray<std::string>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kArrayVal:
|
||||
result = ExecRangeVisitorImplArray<proto::plan::Array>();
|
||||
if (is_index_mode_) {
|
||||
result = ExecRangeVisitorImplArrayForIndex<
|
||||
proto::plan::Array>();
|
||||
} else {
|
||||
result =
|
||||
ExecRangeVisitorImplArray<proto::plan::Array>();
|
||||
}
|
||||
break;
|
||||
default:
|
||||
PanicInfo(
|
||||
@ -196,6 +262,104 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray() {
|
||||
return res_vec;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
VectorPtr
|
||||
PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) {
|
||||
typedef std::
|
||||
conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
|
||||
IndexInnerType;
|
||||
using Index = index::ScalarIndex<IndexInnerType>;
|
||||
auto real_batch_size = GetNextBatchSize();
|
||||
if (real_batch_size == 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// get all elements.
|
||||
auto val = GetValueFromProto<proto::plan::Array>(expr_->val_);
|
||||
if (val.array_size() == 0) {
|
||||
// rollback to bruteforce. no candidates will be filtered out via index.
|
||||
return ExecRangeVisitorImplArray<proto::plan::Array>();
|
||||
}
|
||||
|
||||
// cache the result to suit the framework.
|
||||
auto batch_res =
|
||||
ProcessIndexChunks<IndexInnerType>([this, &val, reverse](Index* _) {
|
||||
boost::container::vector<IndexInnerType> elems;
|
||||
for (auto const& element : val.array()) {
|
||||
auto e = GetValueFromProto<IndexInnerType>(element);
|
||||
if (std::find(elems.begin(), elems.end(), e) == elems.end()) {
|
||||
elems.push_back(e);
|
||||
}
|
||||
}
|
||||
|
||||
// filtering by index, get candidates.
|
||||
auto size_per_chunk = segment_->size_per_chunk();
|
||||
auto retrieve = [ size_per_chunk, this ](int64_t offset) -> auto {
|
||||
auto chunk_idx = offset / size_per_chunk;
|
||||
auto chunk_offset = offset % size_per_chunk;
|
||||
const auto& chunk =
|
||||
segment_->template chunk_data<milvus::ArrayView>(field_id_,
|
||||
chunk_idx);
|
||||
return chunk.data() + chunk_offset;
|
||||
};
|
||||
|
||||
// compare the array via the raw data.
|
||||
auto filter = [&retrieve, &val, reverse](size_t offset) -> bool {
|
||||
auto data_ptr = retrieve(offset);
|
||||
return data_ptr->is_same_array(val) ^ reverse;
|
||||
};
|
||||
|
||||
// collect all candidates.
|
||||
std::unordered_set<size_t> candidates;
|
||||
std::unordered_set<size_t> tmp_candidates;
|
||||
auto first_callback = [&candidates](size_t offset) -> void {
|
||||
candidates.insert(offset);
|
||||
};
|
||||
auto callback = [&candidates,
|
||||
&tmp_candidates](size_t offset) -> void {
|
||||
if (candidates.find(offset) != candidates.end()) {
|
||||
tmp_candidates.insert(offset);
|
||||
}
|
||||
};
|
||||
auto execute_sub_batch =
|
||||
[](Index* index_ptr,
|
||||
const IndexInnerType& val,
|
||||
const std::function<void(size_t /* offset */)>& callback) {
|
||||
index_ptr->InApplyCallback(1, &val, callback);
|
||||
};
|
||||
|
||||
// run in-filter.
|
||||
for (size_t idx = 0; idx < elems.size(); idx++) {
|
||||
if (idx == 0) {
|
||||
ProcessIndexChunksV2<IndexInnerType>(
|
||||
execute_sub_batch, elems[idx], first_callback);
|
||||
} else {
|
||||
ProcessIndexChunksV2<IndexInnerType>(
|
||||
execute_sub_batch, elems[idx], callback);
|
||||
candidates = std::move(tmp_candidates);
|
||||
}
|
||||
// the size of candidates is small enough.
|
||||
if (candidates.size() * 100 < active_count_) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
TargetBitmap res(active_count_);
|
||||
// run post-filter. The filter will only be executed once in the framework.
|
||||
for (const auto& candidate : candidates) {
|
||||
res[candidate] = filter(candidate);
|
||||
}
|
||||
return res;
|
||||
});
|
||||
AssertInfo(batch_res.size() == real_batch_size,
|
||||
"internal error: expr processed rows {} not equal "
|
||||
"expect batch size {}",
|
||||
batch_res.size(),
|
||||
real_batch_size);
|
||||
|
||||
// return the result.
|
||||
return std::make_shared<ColumnVector>(std::move(batch_res));
|
||||
}
|
||||
|
||||
template <typename ExprValueType>
|
||||
VectorPtr
|
||||
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson() {
|
||||
|
||||
@ -310,6 +310,14 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
|
||||
VectorPtr
|
||||
ExecRangeVisitorImplArray();
|
||||
|
||||
template <typename T>
|
||||
VectorPtr
|
||||
ExecRangeVisitorImplArrayForIndex();
|
||||
|
||||
template <typename T>
|
||||
VectorPtr
|
||||
ExecArrayEqualForIndex(bool reverse);
|
||||
|
||||
// Check overflow and cache result for performance
|
||||
template <typename T>
|
||||
ColumnVectorPtr
|
||||
|
||||
@ -204,6 +204,25 @@ apply_hits(TargetBitmap& bitset, const RustArrayWrapper& w, bool v) {
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
apply_hits_with_filter(TargetBitmap& bitset,
|
||||
const RustArrayWrapper& w,
|
||||
const std::function<bool(size_t /* offset */)>& filter) {
|
||||
for (size_t j = 0; j < w.array_.len; j++) {
|
||||
auto the_offset = w.array_.array[j];
|
||||
bitset[the_offset] = filter(the_offset);
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
apply_hits_with_callback(
|
||||
const RustArrayWrapper& w,
|
||||
const std::function<void(size_t /* offset */)>& callback) {
|
||||
for (size_t j = 0; j < w.array_.len; j++) {
|
||||
callback(w.array_.array[j]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::In(size_t n, const T* values) {
|
||||
@ -215,6 +234,28 @@ InvertedIndexTantivy<T>::In(size_t n, const T* values) {
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::InApplyFilter(
|
||||
size_t n, const T* values, const std::function<bool(size_t)>& filter) {
|
||||
TargetBitmap bitset(Count());
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto array = wrapper_->term_query(values[i]);
|
||||
apply_hits_with_filter(bitset, array, filter);
|
||||
}
|
||||
return bitset;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
InvertedIndexTantivy<T>::InApplyCallback(
|
||||
size_t n, const T* values, const std::function<void(size_t)>& callback) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto array = wrapper_->term_query(values[i]);
|
||||
apply_hits_with_callback(array, callback);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
|
||||
@ -311,6 +352,9 @@ void
|
||||
InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
||||
const void* values,
|
||||
const Config& config) {
|
||||
if constexpr (std::is_same_v<bool, T>) {
|
||||
schema_.set_data_type(proto::schema::DataType::Bool);
|
||||
}
|
||||
if constexpr (std::is_same_v<int8_t, T>) {
|
||||
schema_.set_data_type(proto::schema::DataType::Int8);
|
||||
}
|
||||
@ -341,7 +385,15 @@ InvertedIndexTantivy<T>::BuildWithRawData(size_t n,
|
||||
std::string field = "test_inverted_index";
|
||||
wrapper_ = std::make_shared<TantivyIndexWrapper>(
|
||||
field.c_str(), d_type_, path_.c_str());
|
||||
wrapper_->add_data<T>(static_cast<const T*>(values), n);
|
||||
if (config.find("is_array") != config.end()) {
|
||||
// only used in ut.
|
||||
auto arr = static_cast<const boost::container::vector<T>*>(values);
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
wrapper_->template add_multi_data(arr[i].data(), arr[i].size());
|
||||
}
|
||||
} else {
|
||||
wrapper_->add_data<T>(static_cast<const T*>(values), n);
|
||||
}
|
||||
finish();
|
||||
}
|
||||
|
||||
|
||||
@ -111,6 +111,18 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||
const TargetBitmap
|
||||
In(size_t n, const T* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
InApplyFilter(
|
||||
size_t n,
|
||||
const T* values,
|
||||
const std::function<bool(size_t /* offset */)>& filter) override;
|
||||
|
||||
void
|
||||
InApplyCallback(
|
||||
size_t n,
|
||||
const T* values,
|
||||
const std::function<void(size_t /* offset */)>& callback) override;
|
||||
|
||||
const TargetBitmap
|
||||
NotIn(size_t n, const T* values) override;
|
||||
|
||||
|
||||
@ -50,6 +50,20 @@ class ScalarIndex : public IndexBase {
|
||||
virtual const TargetBitmap
|
||||
In(size_t n, const T* values) = 0;
|
||||
|
||||
// Default implementation of the "In + per-offset filter" query.
// Index types that do not override this method report Unsupported.
virtual const TargetBitmap
InApplyFilter(size_t n,
              const T* values,
              const std::function<bool(size_t /* offset */)>& filter) {
    PanicInfo(ErrorCode::Unsupported, "InApplyFilter is not implemented");
}
|
||||
|
||||
virtual void
|
||||
InApplyCallback(size_t n,
|
||||
const T* values,
|
||||
const std::function<void(size_t /* offset */)>& callback) {
|
||||
PanicInfo(ErrorCode::Unsupported, "InApplyCallback is not implemented");
|
||||
}
|
||||
|
||||
virtual const TargetBitmap
|
||||
NotIn(size_t n, const T* values) = 0;
|
||||
|
||||
|
||||
@ -51,15 +51,6 @@ struct RustArrayWrapper {
|
||||
std::cout << ss.str() << std::endl;
|
||||
}
|
||||
|
||||
std::set<uint32_t>
|
||||
to_set() {
|
||||
std::set<uint32_t> s;
|
||||
for (int i = 0; i < array_.len; i++) {
|
||||
s.insert(array_.array[i]);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
RustArray array_;
|
||||
|
||||
private:
|
||||
|
||||
10
internal/core/thirdparty/tantivy/test.cpp
vendored
10
internal/core/thirdparty/tantivy/test.cpp
vendored
@ -200,6 +200,12 @@ test_32717() {
|
||||
}
|
||||
}
|
||||
|
||||
std::set<uint32_t>
|
||||
to_set(const RustArrayWrapper& w) {
|
||||
std::set<uint32_t> s(w.array_.array, w.array_.array + w.array_.len);
|
||||
return s;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::map<T, std::set<uint32_t>>
|
||||
build_inverted_index(const std::vector<std::vector<T>>& vec_of_array) {
|
||||
@ -236,7 +242,7 @@ test_array_int() {
|
||||
|
||||
auto inverted_index = build_inverted_index(vec_of_array);
|
||||
for (const auto& [term, posting_list] : inverted_index) {
|
||||
auto hits = w.term_query(term).to_set();
|
||||
auto hits = to_set(w.term_query(term));
|
||||
assert(posting_list == hits);
|
||||
}
|
||||
}
|
||||
@ -266,7 +272,7 @@ test_array_string() {
|
||||
|
||||
auto inverted_index = build_inverted_index(vec_of_array);
|
||||
for (const auto& [term, posting_list] : inverted_index) {
|
||||
auto hits = w.term_query(term).to_set();
|
||||
auto hits = to_set(w.term_query(term));
|
||||
assert(posting_list == hits);
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,6 +68,7 @@ set(MILVUS_TEST_FILES
|
||||
test_regex_query_util.cpp
|
||||
test_regex_query.cpp
|
||||
test_futures.cpp
|
||||
test_array_inverted_index.cpp
|
||||
)
|
||||
|
||||
if ( INDEX_ENGINE STREQUAL "cardinal" )
|
||||
|
||||
297
internal/core/unittest/test_array_inverted_index.cpp
Normal file
297
internal/core/unittest/test_array_inverted_index.cpp
Normal file
@ -0,0 +1,297 @@
|
||||
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <regex>
|
||||
|
||||
#include "pb/plan.pb.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "common/Schema.h"
|
||||
#include "segcore/SegmentSealedImpl.h"
|
||||
#include "test_utils/DataGen.h"
|
||||
#include "test_utils/GenExprProto.h"
|
||||
#include "query/PlanProto.h"
|
||||
#include "query/generated/ExecPlanNodeVisitor.h"
|
||||
|
||||
using namespace milvus;
|
||||
using namespace milvus::query;
|
||||
using namespace milvus::segcore;
|
||||
|
||||
template <typename T>
|
||||
SchemaPtr
|
||||
GenTestSchema() {
|
||||
auto schema_ = std::make_shared<Schema>();
|
||||
schema_->AddDebugField(
|
||||
"fvec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
|
||||
auto pk = schema_->AddDebugField("pk", DataType::INT64);
|
||||
schema_->set_primary_field_id(pk);
|
||||
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
schema_->AddDebugArrayField("array", DataType::BOOL);
|
||||
} else if constexpr (std::is_same_v<T, int8_t>) {
|
||||
schema_->AddDebugArrayField("array", DataType::INT8);
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
schema_->AddDebugArrayField("array", DataType::INT16);
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
schema_->AddDebugArrayField("array", DataType::INT32);
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
schema_->AddDebugArrayField("array", DataType::INT64);
|
||||
} else if constexpr (std::is_same_v<T, float>) {
|
||||
schema_->AddDebugArrayField("array", DataType::FLOAT);
|
||||
} else if constexpr (std::is_same_v<T, double>) {
|
||||
schema_->AddDebugArrayField("array", DataType::DOUBLE);
|
||||
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||
schema_->AddDebugArrayField("array", DataType::VARCHAR);
|
||||
}
|
||||
|
||||
return schema_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class ArrayInvertedIndexTest : public ::testing::Test {
|
||||
public:
|
||||
void
|
||||
SetUp() override {
|
||||
schema_ = GenTestSchema<T>();
|
||||
seg_ = CreateSealedSegment(schema_);
|
||||
N_ = 3000;
|
||||
uint64_t seed = 19190504;
|
||||
auto raw_data = DataGen(schema_, N_, seed);
|
||||
auto array_col =
|
||||
raw_data.get_col(schema_->get_field_id(FieldName("array")))
|
||||
->scalars()
|
||||
.array_data()
|
||||
.data();
|
||||
for (size_t i = 0; i < N_; i++) {
|
||||
boost::container::vector<T> array;
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
for (size_t j = 0; j < array_col[i].bool_data().data_size();
|
||||
j++) {
|
||||
array.push_back(array_col[i].bool_data().data(j));
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
for (size_t j = 0; j < array_col[i].long_data().data_size();
|
||||
j++) {
|
||||
array.push_back(array_col[i].long_data().data(j));
|
||||
}
|
||||
} else if constexpr (std::is_integral_v<T>) {
|
||||
for (size_t j = 0; j < array_col[i].int_data().data_size();
|
||||
j++) {
|
||||
array.push_back(array_col[i].int_data().data(j));
|
||||
}
|
||||
} else if constexpr (std::is_floating_point_v<T>) {
|
||||
for (size_t j = 0; j < array_col[i].float_data().data_size();
|
||||
j++) {
|
||||
array.push_back(array_col[i].float_data().data(j));
|
||||
}
|
||||
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||
for (size_t j = 0; j < array_col[i].string_data().data_size();
|
||||
j++) {
|
||||
array.push_back(array_col[i].string_data().data(j));
|
||||
}
|
||||
}
|
||||
vec_of_array_.push_back(array);
|
||||
}
|
||||
SealedLoadFieldData(raw_data, *seg_);
|
||||
LoadInvertedIndex();
|
||||
}
|
||||
|
||||
void
|
||||
TearDown() override {
|
||||
}
|
||||
|
||||
void
|
||||
LoadInvertedIndex() {
|
||||
auto index = std::make_unique<index::InvertedIndexTantivy<T>>();
|
||||
Config cfg;
|
||||
cfg["is_array"] = true;
|
||||
index->BuildWithRawData(N_, vec_of_array_.data(), cfg);
|
||||
LoadIndexInfo info{
|
||||
.field_id = schema_->get_field_id(FieldName("array")).get(),
|
||||
.index = std::move(index),
|
||||
};
|
||||
seg_->LoadIndex(info);
|
||||
}
|
||||
|
||||
public:
|
||||
SchemaPtr schema_;
|
||||
SegmentSealedUPtr seg_;
|
||||
int64_t N_;
|
||||
std::vector<boost::container::vector<T>> vec_of_array_;
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE_P(ArrayInvertedIndexTest);
|
||||
|
||||
// Filters with "array contains any of <elements of row 0>" and checks every
// row of the result against a reference computed from vec_of_array_.
TYPED_TEST_P(ArrayInvertedIndexTest, ArrayContainsAny) {
    const auto& field_meta = (*this->schema_)[FieldName("array")];
    const auto& probe = this->vec_of_array_[0];

    // Build the contains-any expression over the array column.
    auto contains = std::make_unique<proto::plan::JSONContainsExpr>();
    contains->set_allocated_column_info(test::GenColumnInfo(
        field_meta.get_id().get(),
        static_cast<proto::schema::DataType>(field_meta.get_data_type()),
        false,
        false,
        static_cast<proto::schema::DataType>(field_meta.get_element_type())));
    contains->set_op(proto::plan::JSONContainsExpr_JSONOp::
                         JSONContainsExpr_JSONOp_ContainsAny);
    contains->set_elements_same_type(true);
    for (const auto& elem : probe) {
        contains->mutable_elements()->AddAllocated(
            test::GenGenericValue(elem));
    }
    auto expr = test::GenExpr();
    expr->set_allocated_json_contains_expr(contains.release());

    // Parse and execute the filter on the sealed segment.
    auto parser = ProtoParser(*this->schema_);
    auto typed_expr = parser.ParseExprs(*expr);
    auto plan_node =
        std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);

    auto sealed = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
    query::ExecPlanNodeVisitor visitor(*sealed, MAX_TIMESTAMP);
    BitsetType final;
    visitor.ExecuteExprNode(plan_node, sealed, this->N_, final);

    // Reference: a row matches when it shares at least one element with row 0.
    std::unordered_set<TypeParam> elems(probe.begin(), probe.end());
    auto ref = [this, &elems](size_t offset) -> bool {
        const auto& values = this->vec_of_array_[offset];
        std::unordered_set<TypeParam> row(values.begin(), values.end());
        for (const auto& wanted : elems) {
            if (row.count(wanted) > 0) {
                return true;
            }
        }
        return false;
    };

    ASSERT_EQ(final.size(), this->N_);
    for (size_t i = 0; i < this->N_; i++) {
        ASSERT_EQ(final[i], ref(i)) << "i: " << i << ", final[i]: " << final[i]
                                    << ", ref(i): " << ref(i);
    }
}
|
||||
|
||||
// Filters with "array contains all of <elements of row 0>" and checks every
// row of the result against a reference computed from vec_of_array_.
TYPED_TEST_P(ArrayInvertedIndexTest, ArrayContainsAll) {
    const auto& field_meta = (*this->schema_)[FieldName("array")];
    const auto& probe = this->vec_of_array_[0];

    // Build the contains-all expression over the array column.
    auto contains = std::make_unique<proto::plan::JSONContainsExpr>();
    contains->set_allocated_column_info(test::GenColumnInfo(
        field_meta.get_id().get(),
        static_cast<proto::schema::DataType>(field_meta.get_data_type()),
        false,
        false,
        static_cast<proto::schema::DataType>(field_meta.get_element_type())));
    contains->set_op(proto::plan::JSONContainsExpr_JSONOp::
                         JSONContainsExpr_JSONOp_ContainsAll);
    contains->set_elements_same_type(true);
    for (const auto& elem : probe) {
        contains->mutable_elements()->AddAllocated(
            test::GenGenericValue(elem));
    }
    auto expr = test::GenExpr();
    expr->set_allocated_json_contains_expr(contains.release());

    // Parse and execute the filter on the sealed segment.
    auto parser = ProtoParser(*this->schema_);
    auto typed_expr = parser.ParseExprs(*expr);
    auto plan_node =
        std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);

    auto sealed = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
    query::ExecPlanNodeVisitor visitor(*sealed, MAX_TIMESTAMP);
    BitsetType final;
    visitor.ExecuteExprNode(plan_node, sealed, this->N_, final);

    // Reference: a row matches when it holds every element of row 0.
    std::unordered_set<TypeParam> elems(probe.begin(), probe.end());
    auto ref = [this, &elems](size_t offset) -> bool {
        const auto& values = this->vec_of_array_[offset];
        std::unordered_set<TypeParam> row(values.begin(), values.end());
        for (const auto& wanted : elems) {
            if (row.count(wanted) == 0) {
                return false;
            }
        }
        return true;
    };

    ASSERT_EQ(final.size(), this->N_);
    for (size_t i = 0; i < this->N_; i++) {
        ASSERT_EQ(final[i], ref(i)) << "i: " << i << ", final[i]: " << final[i]
                                    << ", ref(i): " << ref(i);
    }
}
|
||||
|
||||
// Filters with "array == <row 0>" and checks every row of the result against
// an element-by-element comparison on vec_of_array_.
// Skipped for floating point element types.
TYPED_TEST_P(ArrayInvertedIndexTest, ArrayEqual) {
    if (std::is_floating_point_v<TypeParam>) {
        GTEST_SKIP() << "not accurate to perform equal comparison on floating "
                        "point number";
    }

    const auto& field_meta = (*this->schema_)[FieldName("array")];
    const auto element_type =
        static_cast<proto::schema::DataType>(field_meta.get_element_type());

    // Build `array == row 0` as a unary range expression.
    auto range_expr = std::make_unique<proto::plan::UnaryRangeExpr>();
    range_expr->set_allocated_column_info(test::GenColumnInfo(
        field_meta.get_id().get(),
        static_cast<proto::schema::DataType>(field_meta.get_data_type()),
        false,
        false,
        element_type));
    range_expr->set_op(proto::plan::OpType::Equal);

    // Ownership of `target` is handed to range_expr via set_allocated_value.
    auto target = new proto::plan::GenericValue;
    auto* target_array = target->mutable_array_val();
    target_array->set_element_type(element_type);
    target_array->set_same_type(true);
    for (const auto& elem : this->vec_of_array_[0]) {
        target_array->mutable_array()->AddAllocated(
            test::GenGenericValue(elem));
    }
    range_expr->set_allocated_value(target);

    auto expr = test::GenExpr();
    expr->set_allocated_unary_range_expr(range_expr.release());

    // Parse and execute the filter on the sealed segment.
    auto parser = ProtoParser(*this->schema_);
    auto typed_expr = parser.ParseExprs(*expr);
    auto plan_node =
        std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);

    auto sealed = dynamic_cast<SegmentSealedImpl*>(this->seg_.get());
    query::ExecPlanNodeVisitor visitor(*sealed, MAX_TIMESTAMP);
    BitsetType final;
    visitor.ExecuteExprNode(plan_node, sealed, this->N_, final);

    // Reference: same length and identical elements at every position.
    auto ref = [this](size_t offset) -> bool {
        const auto& expected = this->vec_of_array_[0];
        const auto& actual = this->vec_of_array_[offset];
        if (expected.size() != actual.size()) {
            return false;
        }
        for (size_t pos = 0; pos < expected.size(); pos++) {
            if (expected[pos] != actual[pos]) {
                return false;
            }
        }
        return true;
    };

    ASSERT_EQ(final.size(), this->N_);
    for (size_t i = 0; i < this->N_; i++) {
        ASSERT_EQ(final[i], ref(i)) << "i: " << i << ", final[i]: " << final[i]
                                    << ", ref(i): " << ref(i);
    }
}
|
||||
|
||||
using ElementType = testing::
|
||||
Types<bool, int8_t, int16_t, int32_t, int64_t, float, double, std::string>;
|
||||
|
||||
REGISTER_TYPED_TEST_CASE_P(ArrayInvertedIndexTest,
|
||||
ArrayContainsAny,
|
||||
ArrayContainsAll,
|
||||
ArrayEqual);
|
||||
|
||||
INSTANTIATE_TYPED_TEST_SUITE_P(Naive, ArrayInvertedIndexTest, ElementType);
|
||||
@ -25,8 +25,6 @@
|
||||
|
||||
using namespace milvus;
|
||||
|
||||
// TODO: I would suggest that all our indexes use this test to simulate the real production environment.
|
||||
|
||||
namespace milvus::test {
|
||||
auto
|
||||
gen_field_meta(int64_t collection_id = 1,
|
||||
|
||||
@ -491,8 +491,30 @@ inline GeneratedData DataGen(SchemaPtr schema,
|
||||
}
|
||||
break;
|
||||
}
|
||||
case DataType::INT8:
|
||||
case DataType::INT16:
|
||||
case DataType::INT8: {
|
||||
for (int i = 0; i < N / repeat_count; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_int_data()->add_data(
|
||||
static_cast<int8_t>(random()));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case DataType::INT16: {
|
||||
for (int i = 0; i < N / repeat_count; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_int_data()->add_data(
|
||||
static_cast<int16_t>(random()));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case DataType::INT32: {
|
||||
for (int i = 0; i < N / repeat_count; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
@ -15,15 +15,18 @@
|
||||
|
||||
namespace milvus::test {
|
||||
inline auto
|
||||
GenColumnInfo(int64_t field_id,
|
||||
proto::schema::DataType field_type,
|
||||
bool auto_id,
|
||||
bool is_pk) {
|
||||
GenColumnInfo(
|
||||
int64_t field_id,
|
||||
proto::schema::DataType field_type,
|
||||
bool auto_id,
|
||||
bool is_pk,
|
||||
proto::schema::DataType element_type = proto::schema::DataType::None) {
|
||||
auto column_info = new proto::plan::ColumnInfo();
|
||||
column_info->set_field_id(field_id);
|
||||
column_info->set_data_type(field_type);
|
||||
column_info->set_is_autoid(auto_id);
|
||||
column_info->set_is_primary_key(is_pk);
|
||||
column_info->set_element_type(element_type);
|
||||
return column_info;
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user