From 589d4dfd82b4539961b61b275216964f2eaaca2c Mon Sep 17 00:00:00 2001 From: zhagnlu <1542303831@qq.com> Date: Thu, 30 May 2024 13:09:43 +0800 Subject: [PATCH] enhance: optimize bitmap index (#33358) #32900 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/common/Consts.h | 2 + internal/core/src/index/BitmapIndex.cpp | 368 ++++++++++++---- internal/core/src/index/BitmapIndex.h | 66 ++- internal/core/src/index/CMakeLists.txt | 1 + internal/core/src/index/HybridScalarIndex.cpp | 402 ++++++++++++++++++ internal/core/src/index/HybridScalarIndex.h | 166 ++++++++ internal/core/src/index/Index.h | 1 + internal/core/src/index/IndexFactory.cpp | 14 +- .../core/src/index/InvertedIndexTantivy.cpp | 30 +- internal/core/src/index/Meta.h | 2 + internal/core/src/index/ScalarIndex.h | 10 + internal/core/src/index/ScalarIndexSort.cpp | 51 ++- internal/core/src/index/ScalarIndexSort.h | 17 +- internal/core/src/index/StringIndexMarisa.cpp | 7 + internal/core/src/index/StringIndexMarisa.h | 17 +- internal/core/src/index/Utils.cpp | 9 + internal/core/src/index/Utils.h | 3 + internal/core/unittest/CMakeLists.txt | 2 +- internal/core/unittest/test_expr.cpp | 26 +- ...bitmap_index.cpp => test_hybrid_index.cpp} | 160 ++++++- internal/core/unittest/test_scalar_index.cpp | 267 +++++++++++- internal/proxy/task_index.go | 7 + pkg/common/common.go | 2 + .../indexparamcheck/bitmap_checker_test.go | 4 +- .../indexparamcheck/bitmap_index_checker.go | 6 +- pkg/util/paramtable/component_param.go | 24 +- 26 files changed, 1508 insertions(+), 156 deletions(-) create mode 100644 internal/core/src/index/HybridScalarIndex.cpp create mode 100644 internal/core/src/index/HybridScalarIndex.h rename internal/core/unittest/{test_bitmap_index.cpp => test_hybrid_index.cpp} (63%) diff --git a/internal/core/src/common/Consts.h b/internal/core/src/common/Consts.h index 65e6795b16..44d7d5559c 100644 --- a/internal/core/src/common/Consts.h +++ b/internal/core/src/common/Consts.h @@ -61,3 +61,5 @@ constexpr const char* RANGE_FILTER = knowhere::meta::RANGE_FILTER; const int64_t DEFAULT_MAX_OUTPUT_SIZE = 67108864; // bytes, 64MB const int64_t DEFAULT_CHUNK_MANAGER_REQUEST_TIMEOUT_MS = 10000; + +const int64_t DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND = 500; diff --git a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index 5d0a4aabec..3e63763dd2 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -15,10 +15,12 @@ // limitations under the License. #include +#include #include "index/BitmapIndex.h" #include "common/Slice.h" +#include "common/Common.h" #include "index/Meta.h" #include "index/ScalarIndex.h" #include "index/Utils.h" @@ -105,8 +107,13 @@ BitmapIndex::Build(size_t n, const T* data) { } total_num_rows_ = n; - for (auto it = data_.begin(); it != data_.end(); ++it) { - bitsets_[it->first] = ConvertRoaringToBitset(it->second); + if (data_.size() < DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) { + for (auto it = data_.begin(); it != data_.end(); ++it) { + bitsets_[it->first] = ConvertRoaringToBitset(it->second); + } + build_mode_ = BitmapIndexBuildMode::BITSET; + } else { + build_mode_ = BitmapIndexBuildMode::ROARING; } is_built_ = true; @@ -134,6 +141,13 @@ BitmapIndex::BuildV2(const Config& config) { field_datas.push_back(field_data); } + BuildWithFieldData(field_datas); +} + +template +void +BitmapIndex::BuildWithFieldData( + const std::vector& field_datas) { int total_num_rows = 0; for (auto& field_data : field_datas) { total_num_rows += field_data->get_num_rows(); @@ -142,7 +156,6 @@ BitmapIndex::BuildV2(const Config& config) { throw SegcoreError(DataIsEmpty, "scalar bitmap index can not build null values"); } - total_num_rows_ = total_num_rows; int64_t offset = 0; @@ -154,6 +167,7 @@ BitmapIndex::BuildV2(const Config& config) { offset++; } } + is_built_ = true; } @@ -190,6 +204,22 @@ BitmapIndex::SerializeIndexData(uint8_t* data_ptr) { } } +template +std::pair, size_t> +BitmapIndex::SerializeIndexMeta() { + YAML::Node node; + node[BITMAP_INDEX_LENGTH] = data_.size(); + node[BITMAP_INDEX_NUM_ROWS] = total_num_rows_; + + std::stringstream ss; + ss << node; + auto json_string = ss.str(); + auto str_size = json_string.size(); + std::shared_ptr res(new uint8_t[str_size]); + memcpy(res.get(), json_string.data(), str_size); + return std::make_pair(res, str_size); +} + template <> void BitmapIndex::SerializeIndexData(uint8_t* data_ptr) { @@ -217,21 +247,17 @@ BitmapIndex::Serialize(const Config& config) { uint8_t* data_ptr = index_data.get(); SerializeIndexData(data_ptr); - std::shared_ptr index_length(new uint8_t[sizeof(size_t)]); - auto index_size = data_.size(); - memcpy(index_length.get(), &index_size, sizeof(size_t)); - - std::shared_ptr num_rows(new uint8_t[sizeof(size_t)]); - memcpy(num_rows.get(), &total_num_rows_, sizeof(size_t)); + auto index_meta = SerializeIndexMeta(); BinarySet ret_set; ret_set.Append(BITMAP_INDEX_DATA, index_data, index_data_size); - ret_set.Append(BITMAP_INDEX_LENGTH, index_length, sizeof(size_t)); - ret_set.Append(BITMAP_INDEX_NUM_ROWS, num_rows, sizeof(size_t)); + ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second); LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}", - index_size, + Cardinality(), total_num_rows_); + + Disassemble(ret_set); return ret_set; } @@ -283,6 +309,29 @@ BitmapIndex::ConvertRoaringToBitset(const roaring::Roaring& values) { return res; } +template +std::pair +BitmapIndex::DeserializeIndexMeta(const uint8_t* data_ptr, + size_t data_size) { + YAML::Node node = YAML::Load( + std::string(reinterpret_cast(data_ptr), data_size)); + + auto index_length = node[BITMAP_INDEX_LENGTH].as(); + auto index_num_rows = node[BITMAP_INDEX_NUM_ROWS].as(); + + return std::make_pair(index_length, index_num_rows); +} + +template +void +BitmapIndex::ChooseIndexBuildMode() { + if (data_.size() <= DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) { + build_mode_ = BitmapIndexBuildMode::BITSET; + } else { + build_mode_ = BitmapIndexBuildMode::ROARING; + } +} + template void BitmapIndex::DeserializeIndexData(const uint8_t* data_ptr, @@ -296,7 +345,12 @@ BitmapIndex::DeserializeIndexData(const uint8_t* data_ptr, value = roaring::Roaring::read(reinterpret_cast(data_ptr)); data_ptr += value.getSizeInBytes(); - bitsets_[key] = ConvertRoaringToBitset(value); + ChooseIndexBuildMode(); + + if (build_mode_ == BitmapIndexBuildMode::BITSET) { + bitsets_[key] = ConvertRoaringToBitset(value); + data_.erase(key); + } } } @@ -324,21 +378,14 @@ template void BitmapIndex::LoadWithoutAssemble(const BinarySet& binary_set, const Config& config) { - size_t index_length; - auto index_length_buffer = binary_set.GetByName(BITMAP_INDEX_LENGTH); - memcpy(&index_length, - index_length_buffer->data.get(), - (size_t)index_length_buffer->size); - - auto num_rows_buffer = binary_set.GetByName(BITMAP_INDEX_NUM_ROWS); - memcpy(&total_num_rows_, - num_rows_buffer->data.get(), - (size_t)num_rows_buffer->size); + auto index_meta_buffer = binary_set.GetByName(BITMAP_INDEX_META); + auto index_meta = DeserializeIndexMeta(index_meta_buffer->data.get(), + index_meta_buffer->size); + auto index_length = index_meta.first; + total_num_rows_ = index_meta.second; auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA); - const uint8_t* data_ptr = index_data_buffer->data.get(); - - DeserializeIndexData(data_ptr, index_length); + DeserializeIndexData(index_data_buffer->data.get(), index_length); LOG_INFO("load bitmap index with cardinality = {}, num_rows = {}", Cardinality(), @@ -416,26 +463,24 @@ BitmapIndex::In(const size_t n, const T* values) { AssertInfo(is_built_, "index has not been built"); TargetBitmap res(total_num_rows_, false); -#if 0 - roaring::Roaring result; - for (size_t i = 0; i < n; ++i) { - auto val = values[i]; - auto it = data_.find(val); - if (it != data_.end()) { - result |= it->second; + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + for (size_t i = 0; i < n; ++i) { + auto val = values[i]; + auto it = data_.find(val); + if (it != data_.end()) { + for (const auto& v : it->second) { + res.set(v); + } + } + } + } else { + for (size_t i = 0; i < n; ++i) { + auto val = values[i]; + if (bitsets_.find(val) != bitsets_.end()) { + res |= bitsets_.at(val); + } } } - for (auto& val : result) { - res.set(val); - } -#else - for (size_t i = 0; i < n; ++i) { - auto val = values[i]; - if (bitsets_.find(val) != bitsets_.end()) { - res |= bitsets_.at(val); - } - } -#endif return res; } @@ -443,36 +488,35 @@ template const TargetBitmap BitmapIndex::NotIn(const size_t n, const T* values) { AssertInfo(is_built_, "index has not been built"); - TargetBitmap res(total_num_rows_, false); -#if 0 - roaring::Roaring result; - for (int i = 0; i < n; ++i) { - auto val = values[i]; - auto it = data_.find(val); - if (it != data_.end()) { - result |= it->second; + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + TargetBitmap res(total_num_rows_, true); + for (int i = 0; i < n; ++i) { + auto val = values[i]; + auto it = data_.find(val); + if (it != data_.end()) { + for (const auto& v : it->second) { + res.reset(v); + } + } } - } - - for (auto& val : result) { - bitset.reset(val); - } -#else - for (size_t i = 0; i < n; ++i) { - auto val = values[i]; - if (bitsets_.find(val) != bitsets_.end()) { - res |= bitsets_.at(val); + return res; + } else { + TargetBitmap res(total_num_rows_, false); + for (size_t i = 0; i < n; ++i) { + auto val = values[i]; + if (bitsets_.find(val) != bitsets_.end()) { + res |= bitsets_.at(val); + } } + res.flip(); + return res; } -#endif - res.flip(); - return res; } template -const TargetBitmap -BitmapIndex::Range(const T value, const OpType op) { +TargetBitmap +BitmapIndex::RangeForBitset(const T value, const OpType op) { AssertInfo(is_built_, "index has not been built"); TargetBitmap res(total_num_rows_, false); if (ShouldSkip(value, value, op)) { @@ -532,10 +576,82 @@ BitmapIndex::Range(const T value, const OpType op) { template const TargetBitmap -BitmapIndex::Range(const T lower_value, - bool lb_inclusive, - const T upper_value, - bool ub_inclusive) { +BitmapIndex::Range(const T value, OpType op) { + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + return std::move(RangeForRoaring(value, op)); + } else { + return std::move(RangeForBitset(value, op)); + } +} + +template +TargetBitmap +BitmapIndex::RangeForRoaring(const T value, const OpType op) { + AssertInfo(is_built_, "index has not been built"); + TargetBitmap res(total_num_rows_, false); + if (ShouldSkip(value, value, op)) { + return res; + } + auto lb = data_.begin(); + auto ub = data_.end(); + + switch (op) { + case OpType::LessThan: { + ub = std::lower_bound(data_.begin(), + data_.end(), + std::make_pair(value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + break; + } + case OpType::LessEqual: { + ub = std::upper_bound(data_.begin(), + data_.end(), + std::make_pair(value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + break; + } + case OpType::GreaterThan: { + lb = std::upper_bound(data_.begin(), + data_.end(), + std::make_pair(value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + break; + } + case OpType::GreaterEqual: { + lb = std::lower_bound(data_.begin(), + data_.end(), + std::make_pair(value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + break; + } + default: { + throw SegcoreError(OpTypeInvalid, + fmt::format("Invalid OperatorType: {}", op)); + } + } + + for (; lb != ub; lb++) { + for (const auto& v : lb->second) { + res.set(v); + } + } + return res; +} + +template +TargetBitmap +BitmapIndex::RangeForBitset(const T lower_value, + bool lb_inclusive, + const T upper_value, + bool ub_inclusive) { AssertInfo(is_built_, "index has not been built"); TargetBitmap res(total_num_rows_, false); if (lower_value > upper_value || @@ -587,15 +703,99 @@ BitmapIndex::Range(const T lower_value, return res; } +template +const TargetBitmap +BitmapIndex::Range(const T lower_value, + bool lb_inclusive, + const T upper_value, + bool ub_inclusive) { + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + return RangeForRoaring( + lower_value, lb_inclusive, upper_value, ub_inclusive); + } else { + return RangeForBitset( + lower_value, lb_inclusive, upper_value, ub_inclusive); + } +} + +template +TargetBitmap +BitmapIndex::RangeForRoaring(const T lower_value, + bool lb_inclusive, + const T upper_value, + bool ub_inclusive) { + AssertInfo(is_built_, "index has not been built"); + TargetBitmap res(total_num_rows_, false); + if (lower_value > upper_value || + (lower_value == upper_value && !(lb_inclusive && ub_inclusive))) { + return res; + } + if (ShouldSkip(lower_value, upper_value, OpType::Range)) { + return res; + } + + auto lb = data_.begin(); + auto ub = data_.end(); + + if (lb_inclusive) { + lb = std::lower_bound(data_.begin(), + data_.end(), + std::make_pair(lower_value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + } else { + lb = std::upper_bound(data_.begin(), + data_.end(), + std::make_pair(lower_value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + } + + if (ub_inclusive) { + ub = std::upper_bound(data_.begin(), + data_.end(), + std::make_pair(upper_value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + } else { + ub = std::lower_bound(data_.begin(), + data_.end(), + std::make_pair(upper_value, TargetBitmap()), + [](const auto& lhs, const auto& rhs) { + return lhs.first < rhs.first; + }); + } + + for (; lb != ub; lb++) { + for (const auto& v : lb->second) { + res.set(v); + } + } + return res; +} + template T BitmapIndex::Reverse_Lookup(size_t idx) const { AssertInfo(is_built_, "index has not been built"); AssertInfo(idx < total_num_rows_, "out of range of total coun"); - for (auto it = bitsets_.begin(); it != bitsets_.end(); it++) { - if (it->second[idx]) { - return it->first; + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + for (auto it = data_.begin(); it != data_.end(); it++) { + for (const auto& v : it->second) { + if (v == idx) { + return it->first; + } + } + } + } else { + for (auto it = bitsets_.begin(); it != bitsets_.end(); it++) { + if (it->second[idx]) { + return it->first; + } } } throw SegcoreError( @@ -610,9 +810,7 @@ bool BitmapIndex::ShouldSkip(const T lower_value, const T upper_value, const OpType op) { - if (!bitsets_.empty()) { - auto lower_bound = bitsets_.begin()->first; - auto upper_bound = bitsets_.rbegin()->first; + auto skip = [&](OpType op, T lower_bound, T upper_bound) -> bool { bool should_skip = false; switch (op) { case OpType::LessThan: { @@ -649,6 +847,22 @@ BitmapIndex::ShouldSkip(const T lower_value, op)); } return should_skip; + }; + + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + if (!data_.empty()) { + auto lower_bound = data_.begin()->first; + auto upper_bound = data_.rbegin()->first; + bool should_skip = skip(op, lower_bound, upper_bound); + return should_skip; + } + } else { + if (!bitsets_.empty()) { + auto lower_bound = bitsets_.begin()->first; + auto upper_bound = bitsets_.rbegin()->first; + bool should_skip = skip(op, lower_bound, upper_bound); + return should_skip; + } } return true; } diff --git a/internal/core/src/index/BitmapIndex.h b/internal/core/src/index/BitmapIndex.h index 38ea600449..2ead42d5de 100644 --- a/internal/core/src/index/BitmapIndex.h +++ b/internal/core/src/index/BitmapIndex.h @@ -30,6 +30,11 @@ namespace milvus { namespace index { +enum class BitmapIndexBuildMode { + ROARING, + BITSET, +}; + /* * @brief Implementation of Bitmap Index * @details This index only for scalar Integral type. @@ -45,6 +50,17 @@ class BitmapIndex : public ScalarIndex { const storage::FileManagerContext& file_manager_context, std::shared_ptr space); + explicit BitmapIndex( + const std::shared_ptr& file_manager) + : file_manager_(file_manager) { + } + + explicit BitmapIndex( + const std::shared_ptr& file_manager, + std::shared_ptr space) + : file_manager_(file_manager), space_(space) { + } + ~BitmapIndex() override = default; BinarySet @@ -61,7 +77,7 @@ class BitmapIndex : public ScalarIndex { int64_t Count() override { - return bitsets_.begin()->second.size(); + return total_num_rows_; } void @@ -70,6 +86,9 @@ class BitmapIndex : public ScalarIndex { void Build(const Config& config = {}) override; + void + BuildWithFieldData(const std::vector& datas) override; + void BuildV2(const Config& config = {}) override; @@ -108,9 +127,17 @@ class BitmapIndex : public ScalarIndex { int64_t Cardinality() { - return bitsets_.size(); + if (build_mode_ == BitmapIndexBuildMode::ROARING) { + return data_.size(); + } else { + return bitsets_.size(); + } } + void + LoadWithoutAssemble(const BinarySet& binary_set, + const Config& config) override; + private: size_t GetIndexDataSize(); @@ -118,24 +145,49 @@ class BitmapIndex : public ScalarIndex { void SerializeIndexData(uint8_t* index_data_ptr); + std::pair, size_t> + SerializeIndexMeta(); + + std::pair + DeserializeIndexMeta(const uint8_t* data_ptr, size_t data_size); + void DeserializeIndexData(const uint8_t* data_ptr, size_t index_length); + void + ChooseIndexBuildMode(); + bool ShouldSkip(const T lower_value, const T upper_value, const OpType op); TargetBitmap ConvertRoaringToBitset(const roaring::Roaring& values); - void - LoadWithoutAssemble(const BinarySet& binary_set, const Config& config); + TargetBitmap + RangeForRoaring(T value, OpType op); - private: - bool is_built_; + TargetBitmap + RangeForBitset(T value, OpType op); + + TargetBitmap + RangeForRoaring(T lower_bound_value, + bool lb_inclusive, + T upper_bound_value, + bool ub_inclusive); + + TargetBitmap + RangeForBitset(T lower_bound_value, + bool lb_inclusive, + T upper_bound_value, + bool ub_inclusive); + + public: + bool is_built_{false}; Config config_; + BitmapIndexBuildMode build_mode_; std::map data_; std::map bitsets_; - size_t total_num_rows_; + size_t total_num_rows_{0}; std::shared_ptr file_manager_; std::shared_ptr space_; }; diff --git a/internal/core/src/index/CMakeLists.txt b/internal/core/src/index/CMakeLists.txt index ed0f600587..3256ab63a0 100644 --- a/internal/core/src/index/CMakeLists.txt +++ b/internal/core/src/index/CMakeLists.txt @@ -20,6 +20,7 @@ set(INDEX_FILES SkipIndex.cpp InvertedIndexTantivy.cpp BitmapIndex.cpp + HybridScalarIndex.cpp ) milvus_add_pkg_config("milvus_index") diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp new file mode 100644 index 0000000000..518828ea7b --- /dev/null +++ b/internal/core/src/index/HybridScalarIndex.cpp @@ -0,0 +1,402 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "index/HybridScalarIndex.h" +#include "common/Slice.h" +#include "common/Common.h" +#include "index/Meta.h" +#include "index/ScalarIndex.h" +#include "index/Utils.h" +#include "storage/Util.h" +#include "storage/space.h" + +namespace milvus { +namespace index { + +template +HybridScalarIndex::HybridScalarIndex( + const storage::FileManagerContext& file_manager_context) + : is_built_(false), + bitmap_index_cardinality_limit_(DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) { + if (file_manager_context.Valid()) { + file_manager_ = + std::make_shared(file_manager_context); + AssertInfo(file_manager_ != nullptr, "create file manager failed!"); + } + internal_index_type_ = InternalIndexType::NONE; +} + +template +HybridScalarIndex::HybridScalarIndex( + const storage::FileManagerContext& file_manager_context, + std::shared_ptr space) + : is_built_(false), + bitmap_index_cardinality_limit_(DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND), + space_(space) { + if (file_manager_context.Valid()) { + file_manager_ = std::make_shared( + file_manager_context, space); + AssertInfo(file_manager_ != nullptr, "create file manager failed!"); + } + internal_index_type_ = InternalIndexType::NONE; +} + +template +InternalIndexType +HybridScalarIndex::SelectIndexBuildType(size_t n, const T* values) { + std::set distinct_vals; + for (size_t i = 0; i < n; i++) { + distinct_vals.insert(values[i]); + } + + // Decide whether to select bitmap index or stl sort + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + internal_index_type_ = InternalIndexType::STLSORT; + } else { + internal_index_type_ = InternalIndexType::BITMAP; + } + return internal_index_type_; +} + +template <> +InternalIndexType +HybridScalarIndex::SelectIndexBuildType( + size_t n, const std::string* values) { + std::set distinct_vals; + for (size_t i = 0; i < n; i++) { + distinct_vals.insert(values[i]); + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + break; + } + } + + // Decide whether to select bitmap index or marisa index + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + internal_index_type_ = InternalIndexType::MARISA; + } else { + internal_index_type_ = InternalIndexType::BITMAP; + } + return internal_index_type_; +} + +template +InternalIndexType +HybridScalarIndex::SelectIndexBuildType( + const std::vector& field_datas) { + std::set distinct_vals; + for (const auto& data : field_datas) { + auto slice_row_num = data->get_num_rows(); + for (size_t i = 0; i < slice_row_num; ++i) { + auto val = reinterpret_cast(data->RawValue(i)); + distinct_vals.insert(*val); + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + break; + } + } + } + + // Decide whether to select bitmap index or stl sort + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + internal_index_type_ = InternalIndexType::STLSORT; + } else { + internal_index_type_ = InternalIndexType::BITMAP; + } + return internal_index_type_; +} + +template <> +InternalIndexType +HybridScalarIndex::SelectIndexBuildType( + const std::vector& field_datas) { + std::set distinct_vals; + for (const auto& data : field_datas) { + auto slice_row_num = data->get_num_rows(); + for (size_t i = 0; i < slice_row_num; ++i) { + auto val = reinterpret_cast(data->RawValue(i)); + distinct_vals.insert(*val); + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + break; + } + } + } + + // Decide whether to select bitmap index or marisa sort + if (distinct_vals.size() >= bitmap_index_cardinality_limit_) { + internal_index_type_ = InternalIndexType::MARISA; + } else { + internal_index_type_ = InternalIndexType::BITMAP; + } + return internal_index_type_; +} + +template +std::shared_ptr> +HybridScalarIndex::GetInternalIndex() { + if (internal_index_ != nullptr) { + return internal_index_; + } + if (internal_index_type_ == InternalIndexType::BITMAP) { + internal_index_ = std::make_shared>(file_manager_); + } else if (internal_index_type_ == InternalIndexType::STLSORT) { + internal_index_ = std::make_shared>(file_manager_); + } else { + PanicInfo(UnexpectedError, + "unknown index type when get internal index"); + } + return internal_index_; +} + +template <> +std::shared_ptr> +HybridScalarIndex::GetInternalIndex() { + if (internal_index_ != nullptr) { + return internal_index_; + } + + if (internal_index_type_ == InternalIndexType::BITMAP) { + internal_index_ = + std::make_shared>(file_manager_); + } else if (internal_index_type_ == InternalIndexType::MARISA) { + internal_index_ = std::make_shared(file_manager_); + } else { + PanicInfo(UnexpectedError, + "unknown index type when get internal index"); + } + return internal_index_; +} + +template +void +HybridScalarIndex::BuildInternal( + const std::vector& field_datas) { + auto index = GetInternalIndex(); + index->BuildWithFieldData(field_datas); +} + +template +void +HybridScalarIndex::Build(const Config& config) { + if (is_built_) { + return; + } + + bitmap_index_cardinality_limit_ = + GetBitmapCardinalityLimitFromConfig(config); + LOG_INFO("config bitmap cardinality limit to {}", + bitmap_index_cardinality_limit_); + + auto insert_files = + GetValueFromConfig>(config, "insert_files"); + AssertInfo(insert_files.has_value(), + "insert file paths is empty when build index"); + + auto field_datas = + file_manager_->CacheRawDataToMemory(insert_files.value()); + + SelectIndexBuildType(field_datas); + BuildInternal(field_datas); + is_built_ = true; +} + +template +void +HybridScalarIndex::BuildV2(const Config& config) { + if (is_built_) { + return; + } + bitmap_index_cardinality_limit_ = + GetBitmapCardinalityLimitFromConfig(config); + LOG_INFO("config bitmap cardinality limit to {}", + bitmap_index_cardinality_limit_); + + auto field_name = file_manager_->GetIndexMeta().field_name; + auto reader = space_->ScanData(); + std::vector field_datas; + for (auto rec = reader->Next(); rec != nullptr; rec = reader->Next()) { + if (!rec.ok()) { + PanicInfo(DataFormatBroken, "failed to read data"); + } + auto data = rec.ValueUnsafe(); + auto total_num_rows = data->num_rows(); + auto col_data = data->GetColumnByName(field_name); + auto field_data = storage::CreateFieldData( + DataType(GetDType()), 0, total_num_rows); + field_data->FillFieldData(col_data); + field_datas.push_back(field_data); + } + + SelectIndexBuildType(field_datas); + BuildInternal(field_datas); + is_built_ = true; +} + +template +BinarySet +HybridScalarIndex::Serialize(const Config& config) { + AssertInfo(is_built_, "index has not been built yet"); + + auto ret_set = internal_index_->Serialize(config); + + // Add index type info to storage for future restruct index + std::shared_ptr index_type_buf(new uint8_t[sizeof(uint8_t)]); + index_type_buf[0] = static_cast(internal_index_type_); + ret_set.Append(INDEX_TYPE, index_type_buf, sizeof(uint8_t)); + + return ret_set; +} + +template +BinarySet +HybridScalarIndex::Upload(const Config& config) { + auto binary_set = Serialize(config); + file_manager_->AddFile(binary_set); + + auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); + BinarySet ret; + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + +template +BinarySet +HybridScalarIndex::UploadV2(const Config& config) { + auto binary_set = Serialize(config); + file_manager_->AddFileV2(binary_set); + + auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize(); + BinarySet ret; + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + +template +void +HybridScalarIndex::DeserializeIndexType(const BinarySet& binary_set) { + uint8_t index_type; + auto index_type_buffer = binary_set.GetByName(INDEX_TYPE); + memcpy(&index_type, index_type_buffer->data.get(), index_type_buffer->size); + internal_index_type_ = static_cast(index_type); +} + +template +void +HybridScalarIndex::LoadInternal(const BinarySet& binary_set, + const Config& config) { + auto index = GetInternalIndex(); + index->LoadWithoutAssemble(binary_set, config); +} + +template +void +HybridScalarIndex::Load(const BinarySet& binary_set, const Config& config) { + milvus::Assemble(const_cast(binary_set)); + DeserializeIndexType(binary_set); + + LoadInternal(binary_set, config); + is_built_ = true; +} + +template +void +HybridScalarIndex::LoadV2(const Config& config) { + auto blobs = space_->StatisticsBlobs(); + std::vector index_files; + auto prefix = file_manager_->GetRemoteIndexObjectPrefixV2(); + for (auto& b : blobs) { + if (b.name.rfind(prefix, 0) == 0) { + index_files.push_back(b.name); + } + } + std::map index_datas{}; + for (auto& file_name : index_files) { + auto res = space_->GetBlobByteSize(file_name); + if (!res.ok()) { + PanicInfo(S3Error, "unable to read index blob"); + } + auto index_blob_data = + std::shared_ptr(new uint8_t[res.value()]); + auto status = space_->ReadBlob(file_name, index_blob_data.get()); + if (!status.ok()) { + PanicInfo(S3Error, "unable to read index blob"); + } + auto raw_index_blob = + storage::DeserializeFileData(index_blob_data, res.value()); + auto key = file_name.substr(file_name.find_last_of('/') + 1); + index_datas[key] = raw_index_blob->GetFieldData(); + } + AssembleIndexDatas(index_datas); + + BinarySet binary_set; + for (auto& [key, data] : index_datas) { + auto size = data->Size(); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto buf = std::shared_ptr( + (uint8_t*)const_cast(data->Data()), deleter); + binary_set.Append(key, buf, size); + } + + DeserializeIndexType(binary_set); + + LoadInternal(binary_set, config); + + is_built_ = true; +} + +template +void +HybridScalarIndex::Load(milvus::tracer::TraceContext ctx, + const Config& config) { + auto index_files = + GetValueFromConfig>(config, "index_files"); + AssertInfo(index_files.has_value(), + "index file paths is empty when load bitmap index"); + auto index_datas = file_manager_->LoadIndexToMemory(index_files.value()); + AssembleIndexDatas(index_datas); + BinarySet binary_set; + for (auto& [key, data] : index_datas) { + auto size = data->Size(); + auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction + auto buf = std::shared_ptr( + (uint8_t*)const_cast(data->Data()), deleter); + binary_set.Append(key, buf, size); + } + + DeserializeIndexType(binary_set); + + LoadInternal(binary_set, config); + + is_built_ = true; +} + +template class HybridScalarIndex; +template class HybridScalarIndex; +template class HybridScalarIndex; +template class HybridScalarIndex; +template class HybridScalarIndex; +template class HybridScalarIndex; +template class HybridScalarIndex; +template class HybridScalarIndex; + +} // namespace index +} // namespace milvus \ No newline at end of file diff --git a/internal/core/src/index/HybridScalarIndex.h b/internal/core/src/index/HybridScalarIndex.h new file mode 100644 index 0000000000..c3c44630bf --- /dev/null +++ b/internal/core/src/index/HybridScalarIndex.h @@ -0,0 +1,166 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "index/ScalarIndex.h" +#include "index/BitmapIndex.h" +#include "index/ScalarIndexSort.h" +#include "index/StringIndexMarisa.h" +#include "storage/FileManager.h" +#include "storage/DiskFileManagerImpl.h" +#include "storage/MemFileManagerImpl.h" +#include "storage/space.h" + +namespace milvus { +namespace index { + +enum class InternalIndexType { + NONE = 0, + BITMAP, + STLSORT, + MARISA, +}; + +/* +* @brief Implementation of hybrid index +* @details This index only for scalar type. +* dynamically choose bitmap/stlsort/marisa type index +* according to data distribution +*/ +template +class HybridScalarIndex : public ScalarIndex { + public: + explicit HybridScalarIndex( + const storage::FileManagerContext& file_manager_context = + storage::FileManagerContext()); + + explicit HybridScalarIndex( + const storage::FileManagerContext& file_manager_context, + std::shared_ptr space); + + ~HybridScalarIndex() override = default; + + BinarySet + Serialize(const Config& config) override; + + void + Load(const BinarySet& index_binary, const Config& config = {}) override; + + void + Load(milvus::tracer::TraceContext ctx, const Config& config = {}) override; + + void + LoadV2(const Config& config = {}) override; + + int64_t + Count() override { + return internal_index_->Count(); + } + + void + Build(size_t n, const T* values) override { + SelectIndexBuildType(n, values); + auto index = GetInternalIndex(); + index->Build(n, values); + is_built_ = true; + } + + void + Build(const Config& config = {}) override; + + void + BuildV2(const Config& config = {}) override; + + const TargetBitmap + In(size_t n, const T* values) override { + return internal_index_->In(n, values); + } + + const TargetBitmap + NotIn(size_t n, const T* values) override { + return internal_index_->NotIn(n, values); + } + + const TargetBitmap + Range(T value, OpType op) override { + return internal_index_->Range(value, op); + } + + const TargetBitmap + Range(T lower_bound_value, + bool lb_inclusive, + T upper_bound_value, + bool ub_inclusive) override { + return internal_index_->Range( + lower_bound_value, lb_inclusive, upper_bound_value, ub_inclusive); + } + + T + Reverse_Lookup(size_t offset) const override { + return internal_index_->Reverse_Lookup(offset); + } + + int64_t + Size() override { + return internal_index_->Size(); + } + + const bool + HasRawData() const override { + return internal_index_->HasRawData(); + } + + BinarySet + Upload(const Config& config = {}) override; + + BinarySet + UploadV2(const Config& config = {}) override; + + private: + InternalIndexType + SelectIndexBuildType(const std::vector& field_datas); + + InternalIndexType + SelectIndexBuildType(size_t n, const T* values); + + void + DeserializeIndexType(const BinarySet& binary_set); + + void + BuildInternal(const std::vector& field_datas); + + void + LoadInternal(const BinarySet& binary_set, const Config& config); + + std::shared_ptr> + GetInternalIndex(); + + public: + bool is_built_{false}; + int32_t bitmap_index_cardinality_limit_; + InternalIndexType internal_index_type_; + std::shared_ptr> internal_index_{nullptr}; + std::shared_ptr file_manager_{nullptr}; + std::shared_ptr space_{nullptr}; +}; + +} // namespace index +} // namespace milvus \ No newline at end of file diff --git a/internal/core/src/index/Index.h b/internal/core/src/index/Index.h index 2f3da4be14..7567bf63e3 100644 --- a/internal/core/src/index/Index.h +++ b/internal/core/src/index/Index.h @@ -18,6 +18,7 @@ #include #include +#include "common/FieldData.h" #include "common/EasyAssert.h" #include "knowhere/comp/index_param.h" #include "knowhere/dataset.h" diff --git a/internal/core/src/index/IndexFactory.cpp b/internal/core/src/index/IndexFactory.cpp index 6d133adc96..79409056d9 100644 --- a/internal/core/src/index/IndexFactory.cpp +++ b/internal/core/src/index/IndexFactory.cpp @@ -27,7 +27,7 @@ #include "index/StringIndexMarisa.h" #include "index/BoolIndex.h" #include "index/InvertedIndexTantivy.h" -#include "index/BitmapIndex.h" +#include "index/HybridScalarIndex.h" namespace milvus::index { @@ -44,7 +44,7 @@ IndexFactory::CreateScalarIndex( file_manager_context); } if (index_type == BITMAP_INDEX_TYPE) { - return std::make_unique>(file_manager_context); + return std::make_unique>(file_manager_context); } return CreateScalarIndexSort(file_manager_context); } @@ -70,7 +70,8 @@ IndexFactory::CreateScalarIndex( cfg, file_manager_context); } if (index_type == BITMAP_INDEX_TYPE) { - return std::make_unique>(file_manager_context); + return std::make_unique>( + file_manager_context); } return CreateStringIndexMarisa(file_manager_context); #else @@ -92,7 +93,8 @@ IndexFactory::CreateScalarIndex( cfg, file_manager_context, space); } if (index_type == BITMAP_INDEX_TYPE) { - return std::make_unique>(file_manager_context, space); + return std::make_unique>(file_manager_context, + space); } return CreateScalarIndexSort(file_manager_context, space); } @@ -112,8 +114,8 @@ IndexFactory::CreateScalarIndex( cfg, file_manager_context, space); } if (index_type == BITMAP_INDEX_TYPE) { - return std::make_unique>(file_manager_context, - space); + return std::make_unique>( + file_manager_context, space); } return CreateStringIndexMarisa(file_manager_context, space); #else diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index 5bb8ba3b16..2c212704aa 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -426,8 +426,34 @@ InvertedIndexTantivy::BuildWithRawData(size_t n, const void* values, const Config& config) { if constexpr (!std::is_same_v) { - PanicInfo(Unsupported, - "InvertedIndex.BuildWithRawData only support string"); + TantivyConfig cfg; + if constexpr (std::is_same_v) { + cfg.data_type_ = DataType::INT8; + } + if constexpr (std::is_same_v) { + cfg.data_type_ = DataType::INT16; + } + if constexpr (std::is_same_v) { + cfg.data_type_ = DataType::INT32; + } + if constexpr (std::is_same_v) { + cfg.data_type_ = DataType::INT64; + } + if constexpr (std::is_same_v) { + cfg.data_type_ = DataType::VARCHAR; + } + boost::uuids::random_generator generator; + auto uuid = generator(); + auto prefix = boost::uuids::to_string(uuid); + path_ = fmt::format("/tmp/{}", prefix); + boost::filesystem::create_directories(path_); + cfg_ = cfg; + d_type_ = cfg_.to_tantivy_data_type(); + std::string field = "test_inverted_index"; + wrapper_ = std::make_shared( + field.c_str(), d_type_, path_.c_str()); + wrapper_->add_data(static_cast(values), n); + finish(); } else { boost::uuids::random_generator generator; auto uuid = generator(); diff --git a/internal/core/src/index/Meta.h b/internal/core/src/index/Meta.h index e44eb6d87a..f1a01231b8 100644 --- a/internal/core/src/index/Meta.h +++ b/internal/core/src/index/Meta.h @@ -54,6 +54,8 @@ constexpr const char* INDEX_BUILD_ID = "index_build_id"; constexpr const char* INDEX_ID = "index_id"; constexpr const char* INDEX_VERSION = "index_version"; constexpr const char* INDEX_ENGINE_VERSION = "index_engine_version"; +constexpr const char* BITMAP_INDEX_CARDINALITY_LIMIT = + "bitmap_cardinality_limit"; // VecIndex file metas constexpr const char* DISK_ANN_PREFIX_PATH = "index_prefix"; diff --git a/internal/core/src/index/ScalarIndex.h b/internal/core/src/index/ScalarIndex.h index aacef521f5..97a8b63c3e 100644 --- a/internal/core/src/index/ScalarIndex.h +++ b/internal/core/src/index/ScalarIndex.h @@ -80,6 +80,16 @@ class ScalarIndex : public IndexBase { RegexQuery(const std::string& pattern) { PanicInfo(Unsupported, "regex query is not supported"); } + + virtual void + BuildWithFieldData(const std::vector& field_datas) { + PanicInfo(Unsupported, "BuildwithFieldData is not supported"); + } + + virtual void + LoadWithoutAssemble(const BinarySet& binary_set, const Config& config) { + PanicInfo(Unsupported, "LoadWithoutAssemble is not supported"); + } }; template diff --git a/internal/core/src/index/ScalarIndexSort.cpp b/internal/core/src/index/ScalarIndexSort.cpp index bcb401ea5b..1f494e5c5a 100644 --- a/internal/core/src/index/ScalarIndexSort.cpp +++ b/internal/core/src/index/ScalarIndexSort.cpp @@ -117,6 +117,35 @@ ScalarIndexSort::Build(const Config& config) { auto field_datas = file_manager_->CacheRawDataToMemory(insert_files.value()); + BuildWithFieldData(field_datas); +} + +template +void +ScalarIndexSort::Build(size_t n, const T* values) { + if (is_built_) + return; + if (n == 0) { + throw SegcoreError(DataIsEmpty, + "ScalarIndexSort cannot build null values!"); + } + data_.reserve(n); + idx_to_offsets_.resize(n); + T* p = const_cast(values); + for (size_t i = 0; i < n; ++i) { + data_.emplace_back(IndexStructure(*p++, i)); + } + std::sort(data_.begin(), data_.end()); + for (size_t i = 0; i < data_.size(); ++i) { + idx_to_offsets_[data_[i].idx_] = i; + } + is_built_ = true; +} + +template +void +ScalarIndexSort::BuildWithFieldData( + const std::vector& field_datas) { int64_t total_num_rows = 0; for (const auto& data : field_datas) { total_num_rows += data->get_num_rows(); @@ -145,28 +174,6 @@ ScalarIndexSort::Build(const Config& config) { is_built_ = true; } -template -void -ScalarIndexSort::Build(size_t n, const T* values) { - if (is_built_) - return; - if (n == 0) { - throw SegcoreError(DataIsEmpty, - "ScalarIndexSort cannot build null values!"); - } - data_.reserve(n); - idx_to_offsets_.resize(n); - T* p = const_cast(values); - for (size_t i = 0; i < n; ++i) { - data_.emplace_back(IndexStructure(*p++, i)); - } - std::sort(data_.begin(), data_.end()); - for (size_t i = 0; i < data_.size(); ++i) { - idx_to_offsets_[data_[i].idx_] = i; - } - is_built_ = true; -} - template BinarySet ScalarIndexSort::Serialize(const Config& config) { diff --git a/internal/core/src/index/ScalarIndexSort.h b/internal/core/src/index/ScalarIndexSort.h index e938b16418..96402017c9 100644 --- a/internal/core/src/index/ScalarIndexSort.h +++ b/internal/core/src/index/ScalarIndexSort.h @@ -41,6 +41,17 @@ class ScalarIndexSort : public ScalarIndex { const storage::FileManagerContext& file_manager_context, std::shared_ptr space); + explicit ScalarIndexSort( + const std::shared_ptr& file_manager) + : file_manager_(file_manager) { + } + + explicit ScalarIndexSort( + const std::shared_ptr& file_manager, + std::shared_ptr space) + : file_manager_(file_manager), space_(space) { + } + BinarySet Serialize(const Config& config) override; @@ -100,6 +111,9 @@ class ScalarIndexSort : public ScalarIndex { return true; } + void + BuildWithFieldData(const std::vector& datas) override; + private: bool ShouldSkip(const T lower_value, const T upper_value, const OpType op); @@ -116,7 +130,8 @@ class ScalarIndexSort : public ScalarIndex { } void - LoadWithoutAssemble(const BinarySet& binary_set, const Config& config); + LoadWithoutAssemble(const BinarySet& binary_set, + const Config& config) override; private: bool is_built_; diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index aa41438e2b..3e4aa85c52 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -132,6 +132,13 @@ StringIndexMarisa::Build(const Config& config) { "insert file paths is empty when build index"); auto field_datas = file_manager_->CacheRawDataToMemory(insert_files.value()); + + BuildWithFieldData(field_datas); +} + +void +StringIndexMarisa::BuildWithFieldData( + const std::vector& field_datas) { int64_t total_num_rows = 0; // fill key set. diff --git a/internal/core/src/index/StringIndexMarisa.h b/internal/core/src/index/StringIndexMarisa.h index 7b96f06124..e787a7e63b 100644 --- a/internal/core/src/index/StringIndexMarisa.h +++ b/internal/core/src/index/StringIndexMarisa.h @@ -37,6 +37,17 @@ class StringIndexMarisa : public StringIndex { const storage::FileManagerContext& file_manager_context, std::shared_ptr space); + explicit StringIndexMarisa( + const std::shared_ptr& file_manager) + : file_manager_(file_manager) { + } + + explicit StringIndexMarisa( + const std::shared_ptr& file_manager, + std::shared_ptr space) + : file_manager_(file_manager), space_(space) { + } + int64_t Size() override; @@ -63,6 +74,9 @@ class StringIndexMarisa : public StringIndex { void Build(const Config& config = {}) override; + void + BuildWithFieldData(const std::vector& field_datas) override; + void BuildV2(const Config& Config = {}) override; @@ -113,7 +127,8 @@ class StringIndexMarisa : public StringIndex { prefix_match(const std::string_view prefix); void - LoadWithoutAssemble(const BinarySet& binary_set, const Config& config); + LoadWithoutAssemble(const BinarySet& binary_set, + const Config& config) override; private: Config config_; diff --git a/internal/core/src/index/Utils.cpp b/internal/core/src/index/Utils.cpp index a9ad1cf1a0..d931684d91 100644 --- a/internal/core/src/index/Utils.cpp +++ b/internal/core/src/index/Utils.cpp @@ -154,6 +154,15 @@ GetIndexEngineVersionFromConfig(const Config& config) { return (std::stoi(index_engine_version.value())); } +int32_t +GetBitmapCardinalityLimitFromConfig(const Config& config) { + auto bitmap_limit = GetValueFromConfig( + config, index::BITMAP_INDEX_CARDINALITY_LIMIT); + AssertInfo(bitmap_limit.has_value(), + "bitmap cardinality limit not exist in config"); + return (std::stoi(bitmap_limit.value())); +} + // TODO :: too ugly storage::FieldDataMeta GetFieldDataMetaFromConfig(const Config& config) { diff --git a/internal/core/src/index/Utils.h b/internal/core/src/index/Utils.h index 53670dcba2..50c70d8d52 100644 --- a/internal/core/src/index/Utils.h +++ b/internal/core/src/index/Utils.h @@ -103,6 +103,9 @@ GetIndexTypeFromConfig(const Config& config); IndexVersion GetIndexEngineVersionFromConfig(const Config& config); +int32_t +GetBitmapCardinalityLimitFromConfig(const Config& config); + storage::FieldDataMeta GetFieldDataMetaFromConfig(const Config& config); diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index be78b2b36c..7abde651f3 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -32,7 +32,7 @@ set(MILVUS_TEST_FILES test_growing.cpp test_growing_index.cpp test_indexing.cpp - test_bitmap_index.cpp + test_hybrid_index.cpp test_index_c_api.cpp test_index_wrapper.cpp test_init.cpp diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index efeae58f78..339c92955b 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -10,12 +10,14 @@ // or implied. See the License for the specific language governing permissions and limitations under the License #include +#include #include #include #include #include #include #include +#include #include "common/Json.h" #include "common/Types.h" @@ -35,6 +37,8 @@ #include "exec/expression/Expr.h" #include "exec/Task.h" #include "expr/ITypeExpr.h" +#include "index/BitmapIndex.h" +#include "index/InvertedIndexTantivy.h" using namespace milvus; using namespace milvus::query; @@ -1271,7 +1275,7 @@ TEST(Expr, TestExprPerformance) { {DataType::DOUBLE, double_fid}}; auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -1678,7 +1682,7 @@ TEST_P(ExprTest, TestSealedSegmentGetBatchSize) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 100000; auto raw_data = DataGen(schema, N); // load field data auto fields = schema->get_fields(); @@ -1739,7 +1743,7 @@ TEST_P(ExprTest, TestGrowingSegmentGetBatchSize) { schema->set_primary_field_id(str1_fid); auto seg = CreateGrowingSegment(schema, empty_index_meta); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); seg->PreInsert(N); seg->Insert(0, @@ -1804,7 +1808,7 @@ TEST_P(ExprTest, TestConjuctExpr) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data auto fields = schema->get_fields(); @@ -1871,7 +1875,7 @@ TEST_P(ExprTest, TestUnaryBenchTest) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -1942,7 +1946,7 @@ TEST_P(ExprTest, TestBinaryRangeBenchTest) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -2022,7 +2026,7 @@ TEST_P(ExprTest, TestLogicalUnaryBenchTest) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -2096,7 +2100,7 @@ TEST_P(ExprTest, TestBinaryLogicalBenchTest) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -2180,7 +2184,7 @@ TEST_P(ExprTest, TestBinaryArithOpEvalRangeBenchExpr) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -2263,7 +2267,7 @@ TEST_P(ExprTest, TestCompareExprBenchTest) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data @@ -2333,7 +2337,7 @@ TEST_P(ExprTest, TestRefactorExprs) { schema->set_primary_field_id(str1_fid); auto seg = CreateSealedSegment(schema); - int N = 1000000; + int N = 10000; auto raw_data = DataGen(schema, N); // load field data diff --git a/internal/core/unittest/test_bitmap_index.cpp b/internal/core/unittest/test_hybrid_index.cpp similarity index 63% rename from internal/core/unittest/test_bitmap_index.cpp rename to internal/core/unittest/test_hybrid_index.cpp index 99d877d744..4208719930 100644 --- a/internal/core/unittest/test_bitmap_index.cpp +++ b/internal/core/unittest/test_hybrid_index.cpp @@ -17,6 +17,7 @@ #include "common/Tracer.h" #include "index/BitmapIndex.h" +#include "index/HybridScalarIndex.h" #include "storage/Util.h" #include "storage/InsertData.h" #include "indexbuilder/IndexFactory.h" @@ -60,7 +61,7 @@ GenerateData(const size_t size, const size_t cardinality) { } template -class BitmapIndexTest : public testing::Test { +class HybridIndexTestV1 : public testing::Test { protected: void Init(int64_t collection_id, @@ -88,7 +89,8 @@ class BitmapIndexTest : public testing::Test { auto serialized_bytes = insert_data.Serialize(storage::Remote); - auto log_path = fmt::format("{}/{}/{}/{}/{}", + auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}", + "/tmp/test_hybrid/", collection_id, partition_id, segment_id, @@ -103,6 +105,7 @@ class BitmapIndexTest : public testing::Test { Config config; config["index_type"] = milvus::index::BITMAP_INDEX_TYPE; config["insert_files"] = std::vector{log_path}; + config["bitmap_cardinality_limit"] = "1000"; auto build_index = indexbuilder::IndexFactory::GetInstance().CreateIndex( @@ -125,10 +128,14 @@ class BitmapIndexTest : public testing::Test { index_->Load(milvus::tracer::TraceContext{}, config); } - void - SetUp() override { + virtual void + SetParam() { nb_ = 10000; cardinality_ = 30; + } + void + SetUp() override { + SetParam(); if constexpr (std::is_same_v) { type_ = DataType::INT8; @@ -162,7 +169,7 @@ class BitmapIndexTest : public testing::Test { index_version); } - virtual ~BitmapIndexTest() override { + virtual ~HybridIndexTestV1() override { boost::filesystem::remove_all(chunk_manager_->GetRootPath()); } @@ -176,7 +183,8 @@ class BitmapIndexTest : public testing::Test { test_data.push_back(data_[i]); s.insert(data_[i]); } - auto index_ptr = dynamic_cast*>(index_.get()); + auto index_ptr = + dynamic_cast*>(index_.get()); auto bitset = index_ptr->In(test_data.size(), test_data.data()); for (size_t i = 0; i < bitset.size(); i++) { ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end()); @@ -192,7 +200,8 @@ class BitmapIndexTest : public testing::Test { test_data.push_back(data_[i]); s.insert(data_[i]); } - auto index_ptr = dynamic_cast*>(index_.get()); + auto index_ptr = + dynamic_cast*>(index_.get()); auto bitset = index_ptr->NotIn(test_data.size(), test_data.data()); for (size_t i = 0; i < bitset.size(); i++) { ASSERT_EQ(bitset[i], s.find(data_[i]) == s.end()); @@ -219,7 +228,7 @@ class BitmapIndexTest : public testing::Test { }; for (const auto& [test_value, op, ref] : test_cases) { auto index_ptr = - dynamic_cast*>(index_.get()); + dynamic_cast*>(index_.get()); auto bitset = index_ptr->Range(test_value, op); for (size_t i = 0; i < bitset.size(); i++) { auto ans = bitset[i]; @@ -232,8 +241,65 @@ class BitmapIndexTest : public testing::Test { } } - private: - std::shared_ptr chunk_manager_; + void + TestRangeCompareFunc() { + if constexpr (!std::is_same_v) { + using RefFunc = std::function; + struct TestParam { + int64_t lower_val; + int64_t upper_val; + bool lower_inclusive; + bool upper_inclusive; + RefFunc ref; + }; + std::vector test_cases = { + { + 10, + 30, + false, + false, + [&](int64_t i) { return 10 < data_[i] && data_[i] < 30; }, + }, + { + 10, + 30, + true, + false, + [&](int64_t i) { return 10 <= data_[i] && data_[i] < 30; }, + }, + { + 10, + 30, + true, + true, + [&](int64_t i) { return 10 <= data_[i] && data_[i] <= 30; }, + }, + { + 10, + 30, + false, + true, + [&](int64_t i) { return 10 < data_[i] && data_[i] <= 30; }, + }}; + + for (const auto& test_case : test_cases) { + auto index_ptr = + dynamic_cast*>(index_.get()); + auto bitset = index_ptr->Range(test_case.lower_val, + test_case.lower_inclusive, + test_case.upper_val, + test_case.upper_inclusive); + for (size_t i = 0; i < bitset.size(); i++) { + auto ans = bitset[i]; + auto should = test_case.ref(i); + ASSERT_EQ(ans, should) + << "lower:" << test_case.lower_val + << "upper:" << test_case.upper_val << ", @" << i + << ", ans: " << ans << ", ref: " << should; + } + } + } + } public: IndexBasePtr index_; @@ -241,34 +307,92 @@ class BitmapIndexTest : public testing::Test { size_t nb_; size_t cardinality_; boost::container::vector data_; + std::shared_ptr chunk_manager_; }; -TYPED_TEST_SUITE_P(BitmapIndexTest); +TYPED_TEST_SUITE_P(HybridIndexTestV1); -TYPED_TEST_P(BitmapIndexTest, CountFuncTest) { +TYPED_TEST_P(HybridIndexTestV1, CountFuncTest) { auto count = this->index_->Count(); EXPECT_EQ(count, this->nb_); } -TYPED_TEST_P(BitmapIndexTest, INFuncTest) { +TYPED_TEST_P(HybridIndexTestV1, INFuncTest) { this->TestInFunc(); } -TYPED_TEST_P(BitmapIndexTest, NotINFuncTest) { +TYPED_TEST_P(HybridIndexTestV1, NotINFuncTest) { this->TestNotInFunc(); } -TYPED_TEST_P(BitmapIndexTest, CompareValFuncTest) { +TYPED_TEST_P(HybridIndexTestV1, CompareValFuncTest) { this->TestCompareValueFunc(); } +TYPED_TEST_P(HybridIndexTestV1, TestRangeCompareFuncTest) { + this->TestRangeCompareFunc(); +} + using BitmapType = testing::Types; -REGISTER_TYPED_TEST_SUITE_P(BitmapIndexTest, +REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV1, CountFuncTest, INFuncTest, NotINFuncTest, - CompareValFuncTest); + CompareValFuncTest, + TestRangeCompareFuncTest); -INSTANTIATE_TYPED_TEST_SUITE_P(BitmapE2ECheck, BitmapIndexTest, BitmapType); +INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_LowCardinality, + HybridIndexTestV1, + BitmapType); + +template +class HybridIndexTestV2 : public HybridIndexTestV1 { + public: + virtual void + SetParam() override { + this->nb_ = 10000; + this->cardinality_ = 2000; + } + + virtual ~HybridIndexTestV2() { + } +}; + +TYPED_TEST_SUITE_P(HybridIndexTestV2); + +TYPED_TEST_P(HybridIndexTestV2, CountFuncTest) { + auto count = this->index_->Count(); + EXPECT_EQ(count, this->nb_); +} + +TYPED_TEST_P(HybridIndexTestV2, INFuncTest) { + this->TestInFunc(); +} + +TYPED_TEST_P(HybridIndexTestV2, NotINFuncTest) { + this->TestNotInFunc(); +} + +TYPED_TEST_P(HybridIndexTestV2, CompareValFuncTest) { + this->TestCompareValueFunc(); +} + +TYPED_TEST_P(HybridIndexTestV2, TestRangeCompareFuncTest) { + this->TestRangeCompareFunc(); +} + +using BitmapType = + testing::Types; + +REGISTER_TYPED_TEST_SUITE_P(HybridIndexTestV2, + CountFuncTest, + INFuncTest, + NotINFuncTest, + CompareValFuncTest, + TestRangeCompareFuncTest); + +INSTANTIATE_TYPED_TEST_SUITE_P(HybridIndexE2ECheck_HighCardinality, + HybridIndexTestV2, + BitmapType); diff --git a/internal/core/unittest/test_scalar_index.cpp b/internal/core/unittest/test_scalar_index.cpp index 2fc943b57b..2967523daf 100644 --- a/internal/core/unittest/test_scalar_index.cpp +++ b/internal/core/unittest/test_scalar_index.cpp @@ -15,7 +15,11 @@ #include "gtest/gtest-typed-test.h" #include "index/IndexFactory.h" +#include "index/BitmapIndex.h" +#include "index/InvertedIndexTantivy.h" +#include "index/ScalarIndex.h" #include "common/CDataType.h" +#include "common/Types.h" #include "knowhere/comp/index_param.h" #include "test_utils/indexbuilder_test_utils.h" #include "test_utils/AssertUtils.h" @@ -373,7 +377,11 @@ TYPED_TEST_P(TypedScalarIndexTestV2, Base) { create_index_info, file_manager_context, space); auto scalar_index = dynamic_cast*>(index.get()); - scalar_index->BuildV2(); + milvus::Config config; + if (index_type == "BITMAP") { + config["bitmap_cardinality_limit"] = "1000"; + } + scalar_index->BuildV2(config); scalar_index->UploadV2(); auto new_index = @@ -391,3 +399,260 @@ REGISTER_TYPED_TEST_SUITE_P(TypedScalarIndexTestV2, Base); INSTANTIATE_TYPED_TEST_SUITE_P(ArithmeticCheck, TypedScalarIndexTestV2, ScalarT); + +using namespace milvus::index; +template +std::vector +GenerateRawData(int N, int cardinality) { + using std::vector; + std::default_random_engine random(60); + std::normal_distribution<> distr(0, 1); + vector data(N); + for (auto& x : data) { + x = random() % (cardinality); + } + return data; +} + +template <> +std::vector +GenerateRawData(int N, int cardinality) { + using std::vector; + std::default_random_engine random(60); + std::normal_distribution<> distr(0, 1); + vector data(N); + for (auto& x : data) { + x = std::to_string(random() % (cardinality)); + } + return data; +} + +template +IndexBasePtr +TestBuildIndex(int N, int cardinality, int index_type) { + auto raw_data = GenerateRawData(N, cardinality); + if (index_type == 0) { + auto index = std::make_unique>(); + index->Build(N, raw_data.data()); + return std::move(index); + } else if (index_type == 1) { + if constexpr (std::is_same_v) { + auto index = std::make_unique(); + index->Build(N, raw_data.data()); + return std::move(index); + } + auto index = milvus::index::CreateScalarIndexSort(); + index->Build(N, raw_data.data()); + return std::move(index); + } +} + +template +void +TestIndexSearchIn() { + // low data cardinality + { + int N = 1000; + std::vector data_cardinality = {10, 20, 100}; + for (auto& card : data_cardinality) { + auto bitmap_index = TestBuildIndex(N, card, 0); + auto bitmap_index_ptr = + dynamic_cast*>(bitmap_index.get()); + auto sort_index = TestBuildIndex(N, card, 1); + auto sort_index_ptr = + dynamic_cast*>(sort_index.get()); + std::vector terms; + for (int i = 0; i < 10; i++) { + terms.push_back(static_cast(i)); + } + auto final1 = bitmap_index_ptr->In(10, terms.data()); + auto final2 = sort_index_ptr->In(10, terms.data()); + EXPECT_EQ(final1.size(), final2.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final1[i], final2[i]); + } + + auto final3 = bitmap_index_ptr->NotIn(10, terms.data()); + auto final4 = sort_index_ptr->NotIn(10, terms.data()); + EXPECT_EQ(final4.size(), final3.size()); + for (int i = 0; i < final3.size(); i++) { + EXPECT_EQ(final3[i], final4[i]); + } + } + } + + // high data cardinality + { + int N = 10000; + std::vector data_cardinality = {1001, 2000}; + for (auto& card : data_cardinality) { + auto bitmap_index = TestBuildIndex(N, card, 0); + auto bitmap_index_ptr = + dynamic_cast*>(bitmap_index.get()); + auto sort_index = TestBuildIndex(N, card, 1); + auto sort_index_ptr = + dynamic_cast*>(sort_index.get()); + std::vector terms; + for (int i = 0; i < 10; i++) { + terms.push_back(static_cast(i)); + } + auto final1 = bitmap_index_ptr->In(10, terms.data()); + auto final2 = sort_index_ptr->In(10, terms.data()); + EXPECT_EQ(final1.size(), final2.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final1[i], final2[i]); + } + + auto final3 = bitmap_index_ptr->NotIn(10, terms.data()); + auto final4 = sort_index_ptr->NotIn(10, terms.data()); + EXPECT_EQ(final4.size(), final3.size()); + for (int i = 0; i < final3.size(); i++) { + EXPECT_EQ(final3[i], final4[i]); + } + } + } +} + +template <> +void +TestIndexSearchIn() { + // low data cardinality + { + int N = 1000; + std::vector data_cardinality = {10, 20, 100}; + for (auto& card : data_cardinality) { + auto bitmap_index = TestBuildIndex(N, card, 0); + auto bitmap_index_ptr = + dynamic_cast*>(bitmap_index.get()); + auto sort_index = TestBuildIndex(N, card, 1); + auto sort_index_ptr = + dynamic_cast*>(sort_index.get()); + std::vector terms; + for (int i = 0; i < 10; i++) { + terms.push_back(std::to_string(i)); + } + auto final1 = bitmap_index_ptr->In(10, terms.data()); + auto final2 = sort_index_ptr->In(10, terms.data()); + EXPECT_EQ(final1.size(), final2.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final1[i], final2[i]); + } + + auto final3 = bitmap_index_ptr->NotIn(10, terms.data()); + auto final4 = sort_index_ptr->NotIn(10, terms.data()); + EXPECT_EQ(final4.size(), final3.size()); + for (int i = 0; i < final3.size(); i++) { + EXPECT_EQ(final3[i], final4[i]); + } + } + } + // high data cardinality + { + int N = 10000; + std::vector data_cardinality = {1001, 2000}; + for (auto& card : data_cardinality) { + auto bitmap_index = TestBuildIndex(N, card, 0); + auto bitmap_index_ptr = + dynamic_cast*>(bitmap_index.get()); + auto sort_index = TestBuildIndex(N, card, 1); + auto sort_index_ptr = + dynamic_cast*>(sort_index.get()); + std::vector terms; + for (int i = 0; i < 10; i++) { + terms.push_back(std::to_string(i)); + } + auto final1 = bitmap_index_ptr->In(10, terms.data()); + auto final2 = sort_index_ptr->In(10, terms.data()); + EXPECT_EQ(final1.size(), final2.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final1[i], final2[i]); + } + + auto final3 = bitmap_index_ptr->NotIn(10, terms.data()); + auto final4 = sort_index_ptr->NotIn(10, terms.data()); + EXPECT_EQ(final4.size(), final3.size()); + for (int i = 0; i < final3.size(); i++) { + EXPECT_EQ(final3[i], final4[i]); + } + } + } +} + +TEST(ScalarTest, test_function_In) { + TestIndexSearchIn(); + TestIndexSearchIn(); + TestIndexSearchIn(); + TestIndexSearchIn(); + TestIndexSearchIn(); + TestIndexSearchIn(); + TestIndexSearchIn(); +} + +template +void +TestIndexSearchRange() { + // low data cordinality + { + int N = 1000; + std::vector data_cardinality = {10, 20, 100}; + for (auto& card : data_cardinality) { + auto bitmap_index = TestBuildIndex(N, card, 0); + auto bitmap_index_ptr = + dynamic_cast*>(bitmap_index.get()); + auto sort_index = TestBuildIndex(N, card, 1); + auto sort_index_ptr = + dynamic_cast*>(sort_index.get()); + + auto final1 = bitmap_index_ptr->Range(10, milvus::OpType::LessThan); + auto final2 = sort_index_ptr->Range(10, milvus::OpType::LessThan); + EXPECT_EQ(final1.size(), final2.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final1[i], final2[i]); + } + + auto final3 = bitmap_index_ptr->Range(10, true, 100, false); + auto final4 = sort_index_ptr->Range(10, true, 100, false); + EXPECT_EQ(final3.size(), final4.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final3[i], final4[i]); + } + } + } + + // high data cordinality + { + int N = 10000; + std::vector data_cardinality = {1001, 2000}; + for (auto& card : data_cardinality) { + auto bitmap_index = TestBuildIndex(N, card, 0); + auto bitmap_index_ptr = + dynamic_cast*>(bitmap_index.get()); + auto sort_index = TestBuildIndex(N, card, 1); + auto sort_index_ptr = + dynamic_cast*>(sort_index.get()); + + auto final1 = bitmap_index_ptr->Range(10, milvus::OpType::LessThan); + auto final2 = sort_index_ptr->Range(10, milvus::OpType::LessThan); + EXPECT_EQ(final1.size(), final2.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final1[i], final2[i]); + } + + auto final3 = bitmap_index_ptr->Range(10, true, 100, false); + auto final4 = sort_index_ptr->Range(10, true, 100, false); + EXPECT_EQ(final3.size(), final4.size()); + for (int i = 0; i < final1.size(); i++) { + EXPECT_EQ(final3[i], final4[i]); + } + } + } +} + +TEST(ScalarTest, test_function_range) { + TestIndexSearchRange(); + TestIndexSearchRange(); + TestIndexSearchRange(); + TestIndexSearchRange(); + TestIndexSearchRange(); + TestIndexSearchRange(); +} diff --git a/internal/proxy/task_index.go b/internal/proxy/task_index.go index 5925391c20..149a13605a 100644 --- a/internal/proxy/task_index.go +++ b/internal/proxy/task_index.go @@ -332,6 +332,13 @@ func fillDimension(field *schemapb.FieldSchema, indexParams map[string]string) e func checkTrain(field *schemapb.FieldSchema, indexParams map[string]string) error { indexType := indexParams[common.IndexTypeKey] + if indexType == indexparamcheck.IndexBitmap { + _, exist := indexParams[common.BitmapCardinalityLimitKey] + if !exist { + indexParams[common.BitmapCardinalityLimitKey] = paramtable.Get().CommonCfg.BitmapIndexCardinalityBound.GetValue() + } + } + checker, err := indexparamcheck.GetIndexCheckerMgrInstance().GetChecker(indexType) if err != nil { log.Warn("Failed to get index checker", zap.String(common.IndexTypeKey, indexType)) diff --git a/pkg/common/common.go b/pkg/common/common.go index ea148b03b7..723f231718 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -112,6 +112,8 @@ const ( MaxCapacityKey = "max_capacity" DropRatioBuildKey = "drop_ratio_build" + + BitmapCardinalityLimitKey = "bitmap_cardinality_limit" ) // Collection properties key diff --git a/pkg/util/indexparamcheck/bitmap_checker_test.go b/pkg/util/indexparamcheck/bitmap_checker_test.go index 4b0cca2bf3..aa1baa8963 100644 --- a/pkg/util/indexparamcheck/bitmap_checker_test.go +++ b/pkg/util/indexparamcheck/bitmap_checker_test.go @@ -11,7 +11,7 @@ import ( func Test_BitmapIndexChecker(t *testing.T) { c := newBITMAPChecker() - assert.NoError(t, c.CheckTrain(map[string]string{})) + assert.NoError(t, c.CheckTrain(map[string]string{"bitmap_cardinality_limit": "100"})) assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Int64)) assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Float)) @@ -19,4 +19,6 @@ func Test_BitmapIndexChecker(t *testing.T) { assert.Error(t, c.CheckValidDataType(schemapb.DataType_JSON)) assert.Error(t, c.CheckValidDataType(schemapb.DataType_Array)) + assert.Error(t, c.CheckTrain(map[string]string{})) + assert.Error(t, c.CheckTrain(map[string]string{"bitmap_cardinality_limit": "0"})) } diff --git a/pkg/util/indexparamcheck/bitmap_index_checker.go b/pkg/util/indexparamcheck/bitmap_index_checker.go index da90a7d06d..d41267987d 100644 --- a/pkg/util/indexparamcheck/bitmap_index_checker.go +++ b/pkg/util/indexparamcheck/bitmap_index_checker.go @@ -2,17 +2,21 @@ package indexparamcheck import ( "fmt" + "math" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" + "github.com/milvus-io/milvus/pkg/common" "github.com/milvus-io/milvus/pkg/util/typeutil" ) -// STLSORTChecker checks if a STL_SORT index can be built. type BITMAPChecker struct { scalarIndexChecker } func (c *BITMAPChecker) CheckTrain(params map[string]string) error { + if !CheckIntByRange(params, common.BitmapCardinalityLimitKey, 1, math.MaxInt) { + return fmt.Errorf("failed to check bitmap cardinality limit, should be larger than 0 and smaller than math.MaxInt") + } return c.scalarIndexChecker.CheckTrain(params) } diff --git a/pkg/util/paramtable/component_param.go b/pkg/util/paramtable/component_param.go index 9d3da19aff..5dbd86f009 100644 --- a/pkg/util/paramtable/component_param.go +++ b/pkg/util/paramtable/component_param.go @@ -42,13 +42,14 @@ const ( DefaultSessionTTL = 30 // s DefaultSessionRetryTimes = 30 - DefaultMaxDegree = 56 - DefaultSearchListSize = 100 - DefaultPQCodeBudgetGBRatio = 0.125 - DefaultBuildNumThreadsRatio = 1.0 - DefaultSearchCacheBudgetGBRatio = 0.10 - DefaultLoadNumThreadRatio = 8.0 - DefaultBeamWidthRatio = 4.0 + DefaultMaxDegree = 56 + DefaultSearchListSize = 100 + DefaultPQCodeBudgetGBRatio = 0.125 + DefaultBuildNumThreadsRatio = 1.0 + DefaultSearchCacheBudgetGBRatio = 0.10 + DefaultLoadNumThreadRatio = 8.0 + DefaultBeamWidthRatio = 4.0 + DefaultBitmapIndexCardinalityBound = 500 ) // ComponentParam is used to quickly and easily access all components' configurations. @@ -212,6 +213,7 @@ type commonConfig struct { BeamWidthRatio ParamItem `refreshable:"true"` GracefulTime ParamItem `refreshable:"true"` GracefulStopTimeout ParamItem `refreshable:"true"` + BitmapIndexCardinalityBound ParamItem `refreshable:"false"` StorageType ParamItem `refreshable:"false"` SimdType ParamItem `refreshable:"false"` @@ -443,6 +445,14 @@ This configuration is only used by querynode and indexnode, it selects CPU instr } p.IndexSliceSize.Init(base.mgr) + p.BitmapIndexCardinalityBound = ParamItem{ + Key: "common.bitmapIndexCardinalityBound", + Version: "2.5.0", + DefaultValue: strconv.Itoa(DefaultBitmapIndexCardinalityBound), + Export: true, + } + p.BitmapIndexCardinalityBound.Init(base.mgr) + p.EnableMaterializedView = ParamItem{ Key: "common.materializedView.enabled", Version: "2.5.0",