enhance: add skip using array index when some situation (#33947)

#32900

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2024-06-23 21:26:02 +08:00 committed by GitHub
parent 0d7ea8ec42
commit 03a3f50892
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 206 additions and 88 deletions

View File

@ -119,7 +119,9 @@ class SegmentExpr : public Expr {
is_index_mode_ = segment_->HasIndex(field_id_);
if (is_index_mode_) {
num_index_chunk_ = segment_->num_chunk_index(field_id_);
} else {
}
// if index not include raw data, also need load data
if (segment_->HasFieldData(field_id_)) {
num_data_chunk_ = upper_div(active_count_, size_per_chunk_);
}
}
@ -166,6 +168,9 @@ class SegmentExpr : public Expr {
MoveCursor() override {
if (is_index_mode_) {
MoveCursorForIndex();
if (segment_->HasFieldData(field_id_)) {
MoveCursorForData();
}
} else {
MoveCursorForData();
}
@ -173,10 +178,11 @@ class SegmentExpr : public Expr {
int64_t
GetNextBatchSize() {
auto current_chunk =
is_index_mode_ ? current_index_chunk_ : current_data_chunk_;
auto current_chunk_pos =
is_index_mode_ ? current_index_chunk_pos_ : current_data_chunk_pos_;
auto current_chunk = is_index_mode_ && use_index_ ? current_index_chunk_
: current_data_chunk_;
auto current_chunk_pos = is_index_mode_ && use_index_
? current_index_chunk_pos_
: current_data_chunk_pos_;
auto current_rows = current_chunk * size_per_chunk_ + current_chunk_pos;
return current_rows + batch_size_ >= active_count_
? active_count_ - current_rows
@ -330,14 +336,17 @@ class SegmentExpr : public Expr {
DataType pk_type_;
int64_t batch_size_;
// State indicate position that expr computing at
// because expr maybe called for every batch.
bool is_index_mode_{false};
bool is_data_mode_{false};
// sometimes need to skip index and using raw data
// default true means use index as much as possible
bool use_index_{true};
int64_t active_count_{0};
int64_t num_data_chunk_{0};
int64_t num_index_chunk_{0};
// State indicate position that expr computing at
// because expr maybe called for every batch.
int64_t current_data_chunk_{0};
int64_t current_data_chunk_pos_{0};
int64_t current_index_chunk_{0};

View File

@ -20,6 +20,68 @@
namespace milvus {
namespace exec {
template <typename T>
bool
PhyUnaryRangeFilterExpr::CanUseIndexForArray() {
typedef std::
conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
IndexInnerType;
using Index = index::ScalarIndex<IndexInnerType>;
for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
const Index& index =
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
if (index.GetIndexType() == milvus::index::ScalarIndexType::HYBRID) {
return false;
}
}
return true;
}
template <>
bool
PhyUnaryRangeFilterExpr::CanUseIndexForArray<milvus::Array>() {
bool res;
if (!is_index_mode_) {
use_index_ = res = false;
return res;
}
switch (expr_->column_.element_type_) {
case DataType::BOOL:
res = CanUseIndexForArray<bool>();
break;
case DataType::INT8:
res = CanUseIndexForArray<int8_t>();
break;
case DataType::INT16:
res = CanUseIndexForArray<int16_t>();
break;
case DataType::INT32:
res = CanUseIndexForArray<int32_t>();
break;
case DataType::INT64:
res = CanUseIndexForArray<int64_t>();
break;
case DataType::FLOAT:
case DataType::DOUBLE:
// not accurate on floating point number, rollback to bruteforce.
res = false;
break;
case DataType::VARCHAR:
case DataType::STRING:
res = CanUseIndexForArray<std::string_view>();
break;
default:
PanicInfo(DataTypeInvalid,
"unsupported element type when execute array "
"equal for index: {}",
expr_->column_.element_type_);
}
use_index_ = res;
return res;
}
template <typename T>
VectorPtr
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex() {
@ -162,7 +224,7 @@ PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
result = ExecRangeVisitorImplArray<std::string>();
break;
case proto::plan::GenericValue::ValCase::kArrayVal:
if (is_index_mode_) {
if (CanUseIndexForArray<milvus::Array>()) {
result = ExecRangeVisitorImplArrayForIndex<
proto::plan::Array>();
} else {
@ -297,7 +359,7 @@ PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) {
// filtering by index, get candidates.
auto size_per_chunk = segment_->size_per_chunk();
auto retrieve = [ size_per_chunk, this ](int64_t offset) -> auto{
auto retrieve = [size_per_chunk, this](int64_t offset) -> auto {
auto chunk_idx = offset / size_per_chunk;
auto chunk_offset = offset % size_per_chunk;
const auto& chunk =
@ -784,11 +846,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData() {
template <typename T>
bool
PhyUnaryRangeFilterExpr::CanUseIndex() const {
if (!is_index_mode_) {
return false;
}
return SegmentExpr::CanUseIndex<T>(expr_->op_type_);
PhyUnaryRangeFilterExpr::CanUseIndex() {
bool res = is_index_mode_ && SegmentExpr::CanUseIndex<T>(expr_->op_type_);
use_index_ = res;
return res;
}
} // namespace exec

View File

@ -25,6 +25,7 @@
#include "common/Vector.h"
#include "exec/expression/Expr.h"
#include "index/Meta.h"
#include "index/ScalarIndex.h"
#include "segcore/SegmentInterface.h"
#include "query/Utils.h"
#include "common/RegexQuery.h"
@ -325,7 +326,11 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
template <typename T>
bool
CanUseIndex() const;
CanUseIndex();
template <typename T>
bool
CanUseIndexForArray();
private:
std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr> expr_;

View File

@ -69,6 +69,11 @@ class BitmapIndex : public ScalarIndex<T> {
return total_num_rows_;
}
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::BITMAP;
}
void
Build(size_t n, const T* values) override;

View File

@ -40,7 +40,7 @@ HybridScalarIndex<T>::HybridScalarIndex(
AssertInfo(mem_file_manager_ != nullptr, "create file manager failed!");
}
field_type_ = file_manager_context.fieldDataMeta.field_schema.data_type();
internal_index_type_ = InternalIndexType::NONE;
internal_index_type_ = ScalarIndexType::NONE;
}
template <typename T>
@ -57,11 +57,11 @@ HybridScalarIndex<T>::HybridScalarIndex(
AssertInfo(mem_file_manager_ != nullptr, "create file manager failed!");
}
field_type_ = file_manager_context.fieldDataMeta.field_schema.data_type();
internal_index_type_ = InternalIndexType::NONE;
internal_index_type_ = ScalarIndexType::NONE;
}
template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectIndexBuildType(size_t n, const T* values) {
std::set<T> distinct_vals;
for (size_t i = 0; i < n; i++) {
@ -70,15 +70,15 @@ HybridScalarIndex<T>::SelectIndexBuildType(size_t n, const T* values) {
// Decide whether to select bitmap index or stl sort
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::STLSORT;
internal_index_type_ = ScalarIndexType::STLSORT;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}
template <>
InternalIndexType
ScalarIndexType
HybridScalarIndex<std::string>::SelectIndexBuildType(
size_t n, const std::string* values) {
std::set<std::string> distinct_vals;
@ -91,15 +91,15 @@ HybridScalarIndex<std::string>::SelectIndexBuildType(
// Decide whether to select bitmap index or marisa index
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::MARISA;
internal_index_type_ = ScalarIndexType::MARISA;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}
template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectBuildTypeForPrimitiveType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<T> distinct_vals;
@ -116,15 +116,15 @@ HybridScalarIndex<T>::SelectBuildTypeForPrimitiveType(
// Decide whether to select bitmap index or stl sort
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::STLSORT;
internal_index_type_ = ScalarIndexType::STLSORT;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}
template <>
InternalIndexType
ScalarIndexType
HybridScalarIndex<std::string>::SelectBuildTypeForPrimitiveType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<std::string> distinct_vals;
@ -141,15 +141,15 @@ HybridScalarIndex<std::string>::SelectBuildTypeForPrimitiveType(
// Decide whether to select bitmap index or marisa sort
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::MARISA;
internal_index_type_ = ScalarIndexType::MARISA;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}
template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectBuildTypeForArrayType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<T> distinct_vals;
@ -171,15 +171,15 @@ HybridScalarIndex<T>::SelectBuildTypeForArrayType(
}
// Decide whether to select bitmap index or inverted index
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
internal_index_type_ = InternalIndexType::INVERTED;
internal_index_type_ = ScalarIndexType::INVERTED;
} else {
internal_index_type_ = InternalIndexType::BITMAP;
internal_index_type_ = ScalarIndexType::BITMAP;
}
return internal_index_type_;
}
template <typename T>
InternalIndexType
ScalarIndexType
HybridScalarIndex<T>::SelectIndexBuildType(
const std::vector<FieldDataPtr>& field_datas) {
std::set<T> distinct_vals;
@ -200,13 +200,13 @@ HybridScalarIndex<T>::GetInternalIndex() {
if (internal_index_ != nullptr) {
return internal_index_;
}
if (internal_index_type_ == InternalIndexType::BITMAP) {
if (internal_index_type_ == ScalarIndexType::BITMAP) {
internal_index_ =
std::make_shared<BitmapIndex<T>>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::STLSORT) {
} else if (internal_index_type_ == ScalarIndexType::STLSORT) {
internal_index_ =
std::make_shared<ScalarIndexSort<T>>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::INVERTED) {
} else if (internal_index_type_ == ScalarIndexType::INVERTED) {
internal_index_ =
std::make_shared<InvertedIndexTantivy<T>>(file_manager_context_);
} else {
@ -223,13 +223,13 @@ HybridScalarIndex<std::string>::GetInternalIndex() {
return internal_index_;
}
if (internal_index_type_ == InternalIndexType::BITMAP) {
if (internal_index_type_ == ScalarIndexType::BITMAP) {
internal_index_ =
std::make_shared<BitmapIndex<std::string>>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::MARISA) {
} else if (internal_index_type_ == ScalarIndexType::MARISA) {
internal_index_ =
std::make_shared<StringIndexMarisa>(file_manager_context_);
} else if (internal_index_type_ == InternalIndexType::INVERTED) {
} else if (internal_index_type_ == ScalarIndexType::INVERTED) {
internal_index_ = std::make_shared<InvertedIndexTantivy<std::string>>(
file_manager_context_);
} else {
@ -374,7 +374,7 @@ HybridScalarIndex<T>::DeserializeIndexType(const BinarySet& binary_set) {
uint8_t index_type;
auto index_type_buffer = binary_set.GetByName(INDEX_TYPE);
memcpy(&index_type, index_type_buffer->data.get(), index_type_buffer->size);
internal_index_type_ = static_cast<InternalIndexType>(index_type);
internal_index_type_ = static_cast<ScalarIndexType>(index_type);
}
template <typename T>

View File

@ -33,14 +33,6 @@
namespace milvus {
namespace index {
enum class InternalIndexType {
NONE = 0,
BITMAP,
STLSORT,
MARISA,
INVERTED,
};
/*
* @brief Implementation of hybrid index
* @details This index only for scalar type.
@ -77,6 +69,11 @@ class HybridScalarIndex : public ScalarIndex<T> {
return internal_index_->Count();
}
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::HYBRID;
}
void
Build(size_t n, const T* values) override {
SelectIndexBuildType(n, values);
@ -140,17 +137,17 @@ class HybridScalarIndex : public ScalarIndex<T> {
UploadV2(const Config& config = {}) override;
private:
InternalIndexType
ScalarIndexType
SelectBuildTypeForPrimitiveType(
const std::vector<FieldDataPtr>& field_datas);
InternalIndexType
ScalarIndexType
SelectBuildTypeForArrayType(const std::vector<FieldDataPtr>& field_datas);
InternalIndexType
ScalarIndexType
SelectIndexBuildType(const std::vector<FieldDataPtr>& field_datas);
InternalIndexType
ScalarIndexType
SelectIndexBuildType(size_t n, const T* values);
BinarySet
@ -172,7 +169,7 @@ class HybridScalarIndex : public ScalarIndex<T> {
bool is_built_{false};
int32_t bitmap_index_cardinality_limit_;
proto::schema::DataType field_type_;
InternalIndexType internal_index_type_;
ScalarIndexType internal_index_type_;
std::shared_ptr<ScalarIndex<T>> internal_index_{nullptr};
storage::FileManagerContext file_manager_context_;
std::shared_ptr<storage::MemFileManagerImpl> mem_file_manager_{nullptr};

View File

@ -70,6 +70,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
"BuildWithDataset should be deprecated");
}
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::INVERTED;
}
void
Build(const Config& config = {}) override;

View File

@ -28,6 +28,15 @@
namespace milvus::index {
enum class ScalarIndexType {
NONE = 0,
BITMAP,
STLSORT,
MARISA,
INVERTED,
HYBRID,
};
template <typename T>
class ScalarIndex : public IndexBase {
public:
@ -44,6 +53,9 @@ class ScalarIndex : public IndexBase {
};
public:
virtual ScalarIndexType
GetIndexType() const = 0;
virtual void
Build(size_t n, const T* values) = 0;

View File

@ -58,6 +58,11 @@ class ScalarIndexSort : public ScalarIndex<T> {
return data_.size();
}
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::STLSORT;
}
void
Build(size_t n, const T* values) override;

View File

@ -57,6 +57,11 @@ class StringIndexMarisa : public StringIndex {
return str_ids_.size();
}
ScalarIndexType
GetIndexType() const override {
return ScalarIndexType::MARISA;
}
void
Build(size_t n, const std::string* values) override;

View File

@ -266,15 +266,18 @@ SegmentSealedImpl::LoadFieldData(const LoadFieldDataInfo& load_info) {
LOG_INFO("segment {} submits load field {} task to thread pool",
this->get_segment_id(),
field_id.get());
bool use_mmap = false;
if (!info.enable_mmap ||
SystemProperty::Instance().IsSystem(field_id)) {
LoadFieldData(field_id, field_data_info);
} else {
MapFieldData(field_id, field_data_info);
use_mmap = true;
}
LOG_INFO("segment {} loads field {} done",
LOG_INFO("segment {} loads field {} mmap {} done",
this->get_segment_id(),
field_id.get());
field_id.get(),
use_mmap);
}
}

View File

@ -277,18 +277,28 @@ class ArrayBitmapIndexTest : public testing::Test {
public:
void
TestInFunc() {
// boost::container::vector<T> test_data;
// std::unordered_set<T> s;
// size_t nq = 10;
// for (size_t i = 0; i < nq; i++) {
// test_data.push_back(data_[i]);
// s.insert(data_[i]);
// }
// auto index_ptr = dynamic_cast<index::ScalarIndex<T>*>(index_.get());
// auto bitset = index_ptr->In(test_data.size(), test_data.data());
// for (size_t i = 0; i < bitset.size(); i++) {
// ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
// }
boost::container::vector<T> test_data;
std::unordered_set<T> s;
size_t nq = 10;
for (size_t i = 0; i < nq; i++) {
test_data.push_back(data_[i]);
s.insert(data_[i]);
}
auto index_ptr = dynamic_cast<index::ScalarIndex<T>*>(index_.get());
auto bitset = index_ptr->In(test_data.size(), test_data.data());
for (size_t i = 0; i < bitset.size(); i++) {
auto ref = [&]() -> bool {
milvus::Array array = data_[i];
for (size_t j = 0; j < array.length(); ++j) {
auto val = array.template get_data<T>(j);
if (s.find(val) != s.end()) {
return true;
}
}
return false;
};
ASSERT_EQ(bitset[i], ref());
}
}
private:

View File

@ -479,27 +479,28 @@ TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnInvertedIndexStringField) {
ASSERT_TRUE(final[4]);
}
TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnUnsupportedIndex) {
std::string operand = "a%";
const auto& str_meta = schema->operator[](FieldName("str"));
auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
proto::schema::DataType::VarChar,
false,
false);
auto unary_range_expr = test::GenUnaryRangeExpr(OpType::Match, operand);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
// TODO: optimize this case
// TEST_F(SealedSegmentRegexQueryTest, RegexQueryOnUnsupportedIndex) {
// std::string operand = "a%";
// const auto& str_meta = schema->operator[](FieldName("str"));
// auto column_info = test::GenColumnInfo(str_meta.get_id().get(),
// proto::schema::DataType::VarChar,
// false,
// false);
// auto unary_range_expr = test::GenUnaryRangeExpr(OpType::Match, operand);
// unary_range_expr->set_allocated_column_info(column_info);
// auto expr = test::GenExpr();
// expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(*schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed =
std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
// auto parser = ProtoParser(*schema);
// auto typed_expr = parser.ParseExprs(*expr);
// auto parsed =
// std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, typed_expr);
LoadMockIndex();
// LoadMockIndex();
auto segpromote = dynamic_cast<SegmentSealedImpl*>(seg.get());
query::ExecPlanNodeVisitor visitor(*segpromote, MAX_TIMESTAMP);
BitsetType final;
ASSERT_ANY_THROW(visitor.ExecuteExprNode(parsed, segpromote, N, final));
}
// auto segpromote = dynamic_cast<SegmentSealedImpl*>(seg.get());
// query::ExecPlanNodeVisitor visitor(*segpromote, MAX_TIMESTAMP);
// BitsetType final;
// ASSERT_ANY_THROW(visitor.ExecuteExprNode(parsed, segpromote, N, final));
// }