mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
enhance: support bitmap index (#35336)
pr: #32902 cherry-pick bitmap index from master Signed-off-by: luzhang <luzhang@zilliz.com> Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
parent
b22f3a6276
commit
db9e4b898a
@ -41,6 +41,7 @@ class MilvusConan(ConanFile):
|
||||
"google-cloud-cpp/2.5.0@milvus/2.4#b8dda0943d40adee69d7adc5fafc317d",
|
||||
"opentelemetry-cpp/1.8.3@milvus/2.4#3b8139532791a163c8ff2819c55eb4ac",
|
||||
"librdkafka/1.9.1",
|
||||
"roaring/3.0.0",
|
||||
"abseil/20230125.3",
|
||||
"grpc/1.54.3@milvus/dev#5dfb5e1477b22c6d1e6d6b90ab5501d8",
|
||||
)
|
||||
|
||||
@ -66,3 +66,5 @@ constexpr const char* RANGE_FILTER = knowhere::meta::RANGE_FILTER;
|
||||
const int64_t DEFAULT_MAX_OUTPUT_SIZE = 67108864; // bytes, 64MB
|
||||
|
||||
const int64_t DEFAULT_CHUNK_MANAGER_REQUEST_TIMEOUT_MS = 10000;
|
||||
|
||||
const int64_t DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND = 500;
|
||||
|
||||
@ -78,10 +78,12 @@ PhyBinaryArithOpEvalRangeExpr::Eval(EvalCtx& context, VectorPtr& result) {
|
||||
auto value_type = expr_->value_.val_case();
|
||||
switch (value_type) {
|
||||
case proto::plan::GenericValue::ValCase::kInt64Val: {
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplForArray<int64_t>();
|
||||
break;
|
||||
}
|
||||
case proto::plan::GenericValue::ValCase::kFloatVal: {
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplForArray<double>();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -91,14 +91,17 @@ PhyBinaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
|
||||
auto value_type = expr_->lower_val_.val_case();
|
||||
switch (value_type) {
|
||||
case proto::plan::GenericValue::ValCase::kInt64Val: {
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplForArray<int64_t>();
|
||||
break;
|
||||
}
|
||||
case proto::plan::GenericValue::ValCase::kFloatVal: {
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplForArray<double>();
|
||||
break;
|
||||
}
|
||||
case proto::plan::GenericValue::ValCase::kStringVal: {
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplForArray<std::string>();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -364,6 +364,11 @@ class SegmentExpr : public Expr {
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
SetNotUseIndex() {
|
||||
use_index_ = false;
|
||||
}
|
||||
|
||||
protected:
|
||||
const segcore::SegmentInternalInterface* segment_;
|
||||
const FieldId field_id_;
|
||||
|
||||
@ -91,21 +91,26 @@ PhyTermFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
|
||||
}
|
||||
case DataType::ARRAY: {
|
||||
if (expr_->vals_.size() == 0) {
|
||||
SetNotUseIndex();
|
||||
result = ExecVisitorImplTemplateArray<bool>();
|
||||
break;
|
||||
}
|
||||
auto type = expr_->vals_[0].val_case();
|
||||
switch (type) {
|
||||
case proto::plan::GenericValue::ValCase::kBoolVal:
|
||||
SetNotUseIndex();
|
||||
result = ExecVisitorImplTemplateArray<bool>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kInt64Val:
|
||||
SetNotUseIndex();
|
||||
result = ExecVisitorImplTemplateArray<int64_t>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kFloatVal:
|
||||
SetNotUseIndex();
|
||||
result = ExecVisitorImplTemplateArray<double>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kStringVal:
|
||||
SetNotUseIndex();
|
||||
result = ExecVisitorImplTemplateArray<std::string>();
|
||||
break;
|
||||
default:
|
||||
|
||||
@ -20,6 +20,69 @@
|
||||
namespace milvus {
|
||||
namespace exec {
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
PhyUnaryRangeFilterExpr::CanUseIndexForArray() {
|
||||
typedef std::
|
||||
conditional_t<std::is_same_v<T, std::string_view>, std::string, T>
|
||||
IndexInnerType;
|
||||
using Index = index::ScalarIndex<IndexInnerType>;
|
||||
|
||||
for (size_t i = current_index_chunk_; i < num_index_chunk_; i++) {
|
||||
const Index& index =
|
||||
segment_->chunk_scalar_index<IndexInnerType>(field_id_, i);
|
||||
|
||||
if (index.GetIndexType() == milvus::index::ScalarIndexType::BITMAP) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool
|
||||
PhyUnaryRangeFilterExpr::CanUseIndexForArray<milvus::Array>() {
|
||||
bool res;
|
||||
if (!is_index_mode_) {
|
||||
use_index_ = res = false;
|
||||
return res;
|
||||
}
|
||||
switch (expr_->column_.element_type_) {
|
||||
case DataType::BOOL:
|
||||
res = CanUseIndexForArray<bool>();
|
||||
break;
|
||||
case DataType::INT8:
|
||||
res = CanUseIndexForArray<int8_t>();
|
||||
break;
|
||||
case DataType::INT16:
|
||||
res = CanUseIndexForArray<int16_t>();
|
||||
break;
|
||||
case DataType::INT32:
|
||||
res = CanUseIndexForArray<int32_t>();
|
||||
break;
|
||||
case DataType::INT64:
|
||||
res = CanUseIndexForArray<int64_t>();
|
||||
break;
|
||||
case DataType::FLOAT:
|
||||
case DataType::DOUBLE:
|
||||
// not accurate on floating point number, rollback to bruteforce.
|
||||
res = false;
|
||||
break;
|
||||
case DataType::VARCHAR:
|
||||
case DataType::STRING:
|
||||
res = CanUseIndexForArray<std::string_view>();
|
||||
break;
|
||||
default:
|
||||
PanicInfo(DataTypeInvalid,
|
||||
|
||||
"unsupported element type when execute array "
|
||||
"equal for index: {}",
|
||||
expr_->column_.element_type_);
|
||||
}
|
||||
use_index_ = res;
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
VectorPtr
|
||||
PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArrayForIndex() {
|
||||
@ -150,19 +213,23 @@ PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
|
||||
auto val_type = expr_->val_.val_case();
|
||||
switch (val_type) {
|
||||
case proto::plan::GenericValue::ValCase::kBoolVal:
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplArray<bool>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kInt64Val:
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplArray<int64_t>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kFloatVal:
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplArray<double>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kStringVal:
|
||||
SetNotUseIndex();
|
||||
result = ExecRangeVisitorImplArray<std::string>();
|
||||
break;
|
||||
case proto::plan::GenericValue::ValCase::kArrayVal:
|
||||
if (is_index_mode_) {
|
||||
if (CanUseIndexForArray<milvus::Array>()) {
|
||||
result = ExecRangeVisitorImplArrayForIndex<
|
||||
proto::plan::Array>();
|
||||
} else {
|
||||
@ -784,11 +851,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData() {
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
PhyUnaryRangeFilterExpr::CanUseIndex() const {
|
||||
if (!is_index_mode_) {
|
||||
return false;
|
||||
}
|
||||
return SegmentExpr::CanUseIndex<T>(expr_->op_type_);
|
||||
PhyUnaryRangeFilterExpr::CanUseIndex() {
|
||||
bool res = is_index_mode_ && SegmentExpr::CanUseIndex<T>(expr_->op_type_);
|
||||
use_index_ = res;
|
||||
return res;
|
||||
}
|
||||
|
||||
} // namespace exec
|
||||
|
||||
@ -206,12 +206,8 @@ struct UnaryIndexFuncForMatch {
|
||||
!std::is_same_v<T, std::string>) {
|
||||
PanicInfo(Unsupported, "regex query is only supported on string");
|
||||
} else {
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(val);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
|
||||
if (index->SupportRegexQuery()) {
|
||||
return index->RegexQuery(regex_pattern);
|
||||
return index->PatternMatch(val);
|
||||
}
|
||||
if (!index->HasRawData()) {
|
||||
PanicInfo(Unsupported,
|
||||
@ -222,6 +218,9 @@ struct UnaryIndexFuncForMatch {
|
||||
// retrieve raw data to do brute force query, may be very slow.
|
||||
auto cnt = index->Count();
|
||||
TargetBitmap res(cnt);
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(val);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
for (int64_t i = 0; i < cnt; i++) {
|
||||
auto raw = index->Reverse_Lookup(i);
|
||||
res[i] = matcher(raw);
|
||||
@ -325,7 +324,11 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
CanUseIndex() const;
|
||||
CanUseIndex();
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
CanUseIndexForArray();
|
||||
|
||||
private:
|
||||
std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr> expr_;
|
||||
|
||||
988
internal/core/src/index/BitmapIndex.cpp
Normal file
988
internal/core/src/index/BitmapIndex.cpp
Normal file
@ -0,0 +1,988 @@
|
||||
// Licensed to the LF AI & Data foundation under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <algorithm>
|
||||
#include <yaml-cpp/yaml.h>
|
||||
|
||||
#include "index/BitmapIndex.h"
|
||||
|
||||
#include "common/Slice.h"
|
||||
#include "common/Common.h"
|
||||
#include "index/Meta.h"
|
||||
#include "index/ScalarIndex.h"
|
||||
#include "index/Utils.h"
|
||||
#include "storage/Util.h"
|
||||
#include "query/Utils.h"
|
||||
|
||||
namespace milvus {
|
||||
namespace index {
|
||||
|
||||
template <typename T>
|
||||
BitmapIndex<T>::BitmapIndex(
|
||||
const storage::FileManagerContext& file_manager_context)
|
||||
: is_built_(false), schema_(file_manager_context.fieldDataMeta.schema) {
|
||||
if (file_manager_context.Valid()) {
|
||||
file_manager_ =
|
||||
std::make_shared<storage::MemFileManagerImpl>(file_manager_context);
|
||||
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BitmapIndex<T>::BitmapIndex(
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: is_built_(false),
|
||||
schema_(file_manager_context.fieldDataMeta.schema),
|
||||
space_(space) {
|
||||
if (file_manager_context.Valid()) {
|
||||
file_manager_ = std::make_shared<storage::MemFileManagerImpl>(
|
||||
file_manager_context, space);
|
||||
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::Build(const Config& config) {
|
||||
if (is_built_) {
|
||||
return;
|
||||
}
|
||||
auto insert_files =
|
||||
GetValueFromConfig<std::vector<std::string>>(config, "insert_files");
|
||||
AssertInfo(insert_files.has_value(),
|
||||
"insert file paths is empty when build index");
|
||||
|
||||
auto field_datas =
|
||||
file_manager_->CacheRawDataToMemory(insert_files.value());
|
||||
|
||||
BuildWithFieldData(field_datas);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::Build(size_t n, const T* data) {
|
||||
if (is_built_) {
|
||||
return;
|
||||
}
|
||||
if (n == 0) {
|
||||
PanicInfo(DataIsEmpty, "BitmapIndex can not build null values");
|
||||
}
|
||||
|
||||
T* p = const_cast<T*>(data);
|
||||
for (int i = 0; i < n; ++i, ++p) {
|
||||
data_[*p].add(i);
|
||||
}
|
||||
total_num_rows_ = n;
|
||||
|
||||
if (data_.size() < DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) {
|
||||
for (auto it = data_.begin(); it != data_.end(); ++it) {
|
||||
bitsets_[it->first] = ConvertRoaringToBitset(it->second);
|
||||
}
|
||||
build_mode_ = BitmapIndexBuildMode::BITSET;
|
||||
} else {
|
||||
build_mode_ = BitmapIndexBuildMode::ROARING;
|
||||
}
|
||||
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildV2(const Config& config) {
|
||||
if (is_built_) {
|
||||
return;
|
||||
}
|
||||
auto field_name = file_manager_->GetIndexMeta().field_name;
|
||||
auto reader = space_->ScanData();
|
||||
std::vector<FieldDataPtr> field_datas;
|
||||
for (auto rec = reader->Next(); rec != nullptr; rec = reader->Next()) {
|
||||
if (!rec.ok()) {
|
||||
PanicInfo(DataFormatBroken, "failed to read data");
|
||||
}
|
||||
auto data = rec.ValueUnsafe();
|
||||
auto total_num_rows = data->num_rows();
|
||||
auto col_data = data->GetColumnByName(field_name);
|
||||
// todo: support nullable index
|
||||
auto field_data = storage::CreateFieldData(
|
||||
DataType(GetDType<T>()), 0, total_num_rows);
|
||||
field_data->FillFieldData(col_data);
|
||||
field_datas.push_back(field_data);
|
||||
}
|
||||
|
||||
BuildWithFieldData(field_datas);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildPrimitiveField(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
int64_t offset = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_row_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_row_num; ++i) {
|
||||
auto val = reinterpret_cast<const T*>(data->RawValue(i));
|
||||
data_[*val].add(offset);
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildWithFieldData(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
int total_num_rows = 0;
|
||||
for (auto& field_data : field_datas) {
|
||||
total_num_rows += field_data->get_num_rows();
|
||||
}
|
||||
if (total_num_rows == 0) {
|
||||
PanicInfo(DataIsEmpty, "scalar bitmap index can not build null values");
|
||||
}
|
||||
total_num_rows_ = total_num_rows;
|
||||
|
||||
switch (schema_.data_type()) {
|
||||
case proto::schema::DataType::Bool:
|
||||
case proto::schema::DataType::Int8:
|
||||
case proto::schema::DataType::Int16:
|
||||
case proto::schema::DataType::Int32:
|
||||
case proto::schema::DataType::Int64:
|
||||
case proto::schema::DataType::Float:
|
||||
case proto::schema::DataType::Double:
|
||||
case proto::schema::DataType::String:
|
||||
case proto::schema::DataType::VarChar:
|
||||
BuildPrimitiveField(field_datas);
|
||||
break;
|
||||
case proto::schema::DataType::Array:
|
||||
BuildArrayField(field_datas);
|
||||
break;
|
||||
default:
|
||||
PanicInfo(
|
||||
DataTypeInvalid,
|
||||
fmt::format("Invalid data type: {} for build bitmap index",
|
||||
proto::schema::DataType_Name(schema_.data_type())));
|
||||
}
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildArrayField(const std::vector<FieldDataPtr>& field_datas) {
|
||||
int64_t offset = 0;
|
||||
using GetType = std::conditional_t<std::is_same_v<T, int8_t> ||
|
||||
std::is_same_v<T, int16_t> ||
|
||||
std::is_same_v<T, int32_t>,
|
||||
int32_t,
|
||||
T>;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_row_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_row_num; ++i) {
|
||||
auto array =
|
||||
reinterpret_cast<const milvus::Array*>(data->RawValue(i));
|
||||
|
||||
for (size_t j = 0; j < array->length(); ++j) {
|
||||
auto val = static_cast<T>(array->template get_data<GetType>(j));
|
||||
data_[val].add(offset);
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t
|
||||
BitmapIndex<T>::GetIndexDataSize() {
|
||||
auto index_data_size = 0;
|
||||
for (auto& pair : data_) {
|
||||
index_data_size += pair.second.getSizeInBytes() + sizeof(T);
|
||||
}
|
||||
return index_data_size;
|
||||
}
|
||||
|
||||
template <>
|
||||
size_t
|
||||
BitmapIndex<std::string>::GetIndexDataSize() {
|
||||
auto index_data_size = 0;
|
||||
for (auto& pair : data_) {
|
||||
index_data_size +=
|
||||
pair.second.getSizeInBytes() + pair.first.size() + sizeof(size_t);
|
||||
}
|
||||
return index_data_size;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::SerializeIndexData(uint8_t* data_ptr) {
|
||||
for (auto& pair : data_) {
|
||||
memcpy(data_ptr, &pair.first, sizeof(T));
|
||||
data_ptr += sizeof(T);
|
||||
|
||||
pair.second.write(reinterpret_cast<char*>(data_ptr));
|
||||
data_ptr += pair.second.getSizeInBytes();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::pair<std::shared_ptr<uint8_t[]>, size_t>
|
||||
BitmapIndex<T>::SerializeIndexMeta() {
|
||||
YAML::Node node;
|
||||
node[BITMAP_INDEX_LENGTH] = data_.size();
|
||||
node[BITMAP_INDEX_NUM_ROWS] = total_num_rows_;
|
||||
|
||||
std::stringstream ss;
|
||||
ss << node;
|
||||
auto json_string = ss.str();
|
||||
auto str_size = json_string.size();
|
||||
std::shared_ptr<uint8_t[]> res(new uint8_t[str_size]);
|
||||
memcpy(res.get(), json_string.data(), str_size);
|
||||
return std::make_pair(res, str_size);
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
BitmapIndex<std::string>::SerializeIndexData(uint8_t* data_ptr) {
|
||||
for (auto& pair : data_) {
|
||||
size_t key_size = pair.first.size();
|
||||
memcpy(data_ptr, &key_size, sizeof(size_t));
|
||||
data_ptr += sizeof(size_t);
|
||||
|
||||
memcpy(data_ptr, pair.first.data(), key_size);
|
||||
data_ptr += key_size;
|
||||
|
||||
pair.second.write(reinterpret_cast<char*>(data_ptr));
|
||||
data_ptr += pair.second.getSizeInBytes();
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
BitmapIndex<T>::Serialize(const Config& config) {
|
||||
AssertInfo(is_built_, "index has not been built yet");
|
||||
|
||||
auto index_data_size = GetIndexDataSize();
|
||||
|
||||
std::shared_ptr<uint8_t[]> index_data(new uint8_t[index_data_size]);
|
||||
uint8_t* data_ptr = index_data.get();
|
||||
SerializeIndexData(data_ptr);
|
||||
|
||||
auto index_meta = SerializeIndexMeta();
|
||||
|
||||
BinarySet ret_set;
|
||||
ret_set.Append(BITMAP_INDEX_DATA, index_data, index_data_size);
|
||||
ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second);
|
||||
|
||||
LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}",
|
||||
Cardinality(),
|
||||
total_num_rows_);
|
||||
|
||||
Disassemble(ret_set);
|
||||
return ret_set;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
BitmapIndex<T>::Upload(const Config& config) {
|
||||
auto binary_set = Serialize(config);
|
||||
|
||||
file_manager_->AddFile(binary_set);
|
||||
|
||||
auto remote_path_to_size = file_manager_->GetRemotePathsToFileSize();
|
||||
BinarySet ret;
|
||||
for (auto& file : remote_path_to_size) {
|
||||
ret.Append(file.first, nullptr, file.second);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
BitmapIndex<T>::UploadV2(const Config& config) {
|
||||
auto binary_set = Serialize(config);
|
||||
|
||||
file_manager_->AddFileV2(binary_set);
|
||||
|
||||
auto remote_path_to_size = file_manager_->GetRemotePathsToFileSize();
|
||||
BinarySet ret;
|
||||
for (auto& file : remote_path_to_size) {
|
||||
ret.Append(file.first, nullptr, file.second);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::Load(const BinarySet& binary_set, const Config& config) {
|
||||
milvus::Assemble(const_cast<BinarySet&>(binary_set));
|
||||
LoadWithoutAssemble(binary_set, config);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TargetBitmap
|
||||
BitmapIndex<T>::ConvertRoaringToBitset(const roaring::Roaring& values) {
|
||||
AssertInfo(total_num_rows_ != 0, "total num rows should not be 0");
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
for (const auto& val : values) {
|
||||
res.set(val);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::pair<size_t, size_t>
|
||||
BitmapIndex<T>::DeserializeIndexMeta(const uint8_t* data_ptr,
|
||||
size_t data_size) {
|
||||
YAML::Node node = YAML::Load(
|
||||
std::string(reinterpret_cast<const char*>(data_ptr), data_size));
|
||||
|
||||
auto index_length = node[BITMAP_INDEX_LENGTH].as<size_t>();
|
||||
auto index_num_rows = node[BITMAP_INDEX_NUM_ROWS].as<size_t>();
|
||||
|
||||
return std::make_pair(index_length, index_num_rows);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::ChooseIndexLoadMode(int64_t index_length) {
|
||||
if (index_length <= DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) {
|
||||
LOG_DEBUG("load bitmap index with bitset mode");
|
||||
build_mode_ = BitmapIndexBuildMode::BITSET;
|
||||
} else {
|
||||
LOG_DEBUG("load bitmap index with raw roaring mode");
|
||||
build_mode_ = BitmapIndexBuildMode::ROARING;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
|
||||
size_t index_length) {
|
||||
ChooseIndexLoadMode(index_length);
|
||||
for (size_t i = 0; i < index_length; ++i) {
|
||||
T key;
|
||||
memcpy(&key, data_ptr, sizeof(T));
|
||||
data_ptr += sizeof(T);
|
||||
|
||||
roaring::Roaring value;
|
||||
value = roaring::Roaring::read(reinterpret_cast<const char*>(data_ptr));
|
||||
data_ptr += value.getSizeInBytes();
|
||||
|
||||
if (build_mode_ == BitmapIndexBuildMode::BITSET) {
|
||||
bitsets_[key] = ConvertRoaringToBitset(value);
|
||||
} else {
|
||||
data_[key] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void
|
||||
BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
|
||||
size_t index_length) {
|
||||
ChooseIndexLoadMode(index_length);
|
||||
for (size_t i = 0; i < index_length; ++i) {
|
||||
size_t key_size;
|
||||
memcpy(&key_size, data_ptr, sizeof(size_t));
|
||||
data_ptr += sizeof(size_t);
|
||||
|
||||
std::string key(reinterpret_cast<const char*>(data_ptr), key_size);
|
||||
data_ptr += key_size;
|
||||
|
||||
roaring::Roaring value;
|
||||
value = roaring::Roaring::read(reinterpret_cast<const char*>(data_ptr));
|
||||
data_ptr += value.getSizeInBytes();
|
||||
|
||||
if (build_mode_ == BitmapIndexBuildMode::BITSET) {
|
||||
bitsets_[key] = ConvertRoaringToBitset(value);
|
||||
} else {
|
||||
data_[key] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) {
|
||||
auto index_meta_buffer = binary_set.GetByName(BITMAP_INDEX_META);
|
||||
auto index_meta = DeserializeIndexMeta(index_meta_buffer->data.get(),
|
||||
index_meta_buffer->size);
|
||||
auto index_length = index_meta.first;
|
||||
total_num_rows_ = index_meta.second;
|
||||
|
||||
auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
|
||||
DeserializeIndexData(index_data_buffer->data.get(), index_length);
|
||||
|
||||
LOG_INFO("load bitmap index with cardinality = {}, num_rows = {}",
|
||||
Cardinality(),
|
||||
total_num_rows_);
|
||||
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::Load(milvus::tracer::TraceContext ctx, const Config& config) {
|
||||
auto index_files =
|
||||
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
|
||||
AssertInfo(index_files.has_value(),
|
||||
"index file paths is empty when load bitmap index");
|
||||
auto index_datas = file_manager_->LoadIndexToMemory(index_files.value());
|
||||
AssembleIndexDatas(index_datas);
|
||||
BinarySet binary_set;
|
||||
for (auto& [key, data] : index_datas) {
|
||||
auto size = data->Size();
|
||||
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
|
||||
auto buf = std::shared_ptr<uint8_t[]>(
|
||||
(uint8_t*)const_cast<void*>(data->Data()), deleter);
|
||||
binary_set.Append(key, buf, size);
|
||||
}
|
||||
|
||||
LoadWithoutAssemble(binary_set, config);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::LoadV2(const Config& config) {
|
||||
auto blobs = space_->StatisticsBlobs();
|
||||
std::vector<std::string> index_files;
|
||||
auto prefix = file_manager_->GetRemoteIndexObjectPrefixV2();
|
||||
for (auto& b : blobs) {
|
||||
if (b.name.rfind(prefix, 0) == 0) {
|
||||
index_files.push_back(b.name);
|
||||
}
|
||||
}
|
||||
std::map<std::string, FieldDataPtr> index_datas{};
|
||||
for (auto& file_name : index_files) {
|
||||
auto res = space_->GetBlobByteSize(file_name);
|
||||
if (!res.ok()) {
|
||||
PanicInfo(S3Error, "unable to read index blob");
|
||||
}
|
||||
auto index_blob_data =
|
||||
std::shared_ptr<uint8_t[]>(new uint8_t[res.value()]);
|
||||
auto status = space_->ReadBlob(file_name, index_blob_data.get());
|
||||
if (!status.ok()) {
|
||||
PanicInfo(S3Error, "unable to read index blob");
|
||||
}
|
||||
auto raw_index_blob =
|
||||
storage::DeserializeFileData(index_blob_data, res.value());
|
||||
auto key = file_name.substr(file_name.find_last_of('/') + 1);
|
||||
index_datas[key] = raw_index_blob->GetFieldData();
|
||||
}
|
||||
AssembleIndexDatas(index_datas);
|
||||
|
||||
BinarySet binary_set;
|
||||
for (auto& [key, data] : index_datas) {
|
||||
auto size = data->Size();
|
||||
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
|
||||
auto buf = std::shared_ptr<uint8_t[]>(
|
||||
(uint8_t*)const_cast<void*>(data->Data()), deleter);
|
||||
binary_set.Append(key, buf, size);
|
||||
}
|
||||
|
||||
LoadWithoutAssemble(binary_set, config);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::In(const size_t n, const T* values) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto val = values[i];
|
||||
auto it = data_.find(val);
|
||||
if (it != data_.end()) {
|
||||
for (const auto& v : it->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto val = values[i];
|
||||
if (bitsets_.find(val) != bitsets_.end()) {
|
||||
res |= bitsets_.at(val);
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::NotIn(const size_t n, const T* values) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
TargetBitmap res(total_num_rows_, true);
|
||||
for (int i = 0; i < n; ++i) {
|
||||
auto val = values[i];
|
||||
auto it = data_.find(val);
|
||||
if (it != data_.end()) {
|
||||
for (const auto& v : it->second) {
|
||||
res.reset(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
} else {
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto val = values[i];
|
||||
if (bitsets_.find(val) != bitsets_.end()) {
|
||||
res |= bitsets_.at(val);
|
||||
}
|
||||
}
|
||||
res.flip();
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TargetBitmap
|
||||
BitmapIndex<T>::RangeForBitset(const T value, const OpType op) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (ShouldSkip(value, value, op)) {
|
||||
return res;
|
||||
}
|
||||
auto lb = bitsets_.begin();
|
||||
auto ub = bitsets_.end();
|
||||
|
||||
switch (op) {
|
||||
case OpType::LessThan: {
|
||||
ub = std::lower_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
case OpType::LessEqual: {
|
||||
ub = std::upper_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
case OpType::GreaterThan: {
|
||||
lb = std::upper_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
case OpType::GreaterEqual: {
|
||||
lb = std::lower_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
PanicInfo(OpTypeInvalid,
|
||||
fmt::format("Invalid OperatorType: {}", op));
|
||||
}
|
||||
}
|
||||
|
||||
for (; lb != ub; lb++) {
|
||||
res |= lb->second;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::Range(const T value, OpType op) {
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
return std::move(RangeForRoaring(value, op));
|
||||
} else {
|
||||
return std::move(RangeForBitset(value, op));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TargetBitmap
|
||||
BitmapIndex<T>::RangeForRoaring(const T value, const OpType op) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (ShouldSkip(value, value, op)) {
|
||||
return res;
|
||||
}
|
||||
auto lb = data_.begin();
|
||||
auto ub = data_.end();
|
||||
|
||||
switch (op) {
|
||||
case OpType::LessThan: {
|
||||
ub = std::lower_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
case OpType::LessEqual: {
|
||||
ub = std::upper_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
case OpType::GreaterThan: {
|
||||
lb = std::upper_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
case OpType::GreaterEqual: {
|
||||
lb = std::lower_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
PanicInfo(OpTypeInvalid,
|
||||
fmt::format("Invalid OperatorType: {}", op));
|
||||
}
|
||||
}
|
||||
|
||||
for (; lb != ub; lb++) {
|
||||
for (const auto& v : lb->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TargetBitmap
|
||||
BitmapIndex<T>::RangeForBitset(const T lower_value,
|
||||
bool lb_inclusive,
|
||||
const T upper_value,
|
||||
bool ub_inclusive) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (lower_value > upper_value ||
|
||||
(lower_value == upper_value && !(lb_inclusive && ub_inclusive))) {
|
||||
return res;
|
||||
}
|
||||
if (ShouldSkip(lower_value, upper_value, OpType::Range)) {
|
||||
return res;
|
||||
}
|
||||
|
||||
auto lb = bitsets_.begin();
|
||||
auto ub = bitsets_.end();
|
||||
|
||||
if (lb_inclusive) {
|
||||
lb = std::lower_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(lower_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
} else {
|
||||
lb = std::upper_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(lower_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
}
|
||||
|
||||
if (ub_inclusive) {
|
||||
ub = std::upper_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(upper_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
} else {
|
||||
ub = std::lower_bound(bitsets_.begin(),
|
||||
bitsets_.end(),
|
||||
std::make_pair(upper_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
}
|
||||
|
||||
for (; lb != ub; lb++) {
|
||||
res |= lb->second;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::Range(const T lower_value,
|
||||
bool lb_inclusive,
|
||||
const T upper_value,
|
||||
bool ub_inclusive) {
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
return RangeForRoaring(
|
||||
lower_value, lb_inclusive, upper_value, ub_inclusive);
|
||||
} else {
|
||||
return RangeForBitset(
|
||||
lower_value, lb_inclusive, upper_value, ub_inclusive);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
TargetBitmap
|
||||
BitmapIndex<T>::RangeForRoaring(const T lower_value,
|
||||
bool lb_inclusive,
|
||||
const T upper_value,
|
||||
bool ub_inclusive) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (lower_value > upper_value ||
|
||||
(lower_value == upper_value && !(lb_inclusive && ub_inclusive))) {
|
||||
return res;
|
||||
}
|
||||
if (ShouldSkip(lower_value, upper_value, OpType::Range)) {
|
||||
return res;
|
||||
}
|
||||
|
||||
auto lb = data_.begin();
|
||||
auto ub = data_.end();
|
||||
|
||||
if (lb_inclusive) {
|
||||
lb = std::lower_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(lower_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
} else {
|
||||
lb = std::upper_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(lower_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
}
|
||||
|
||||
if (ub_inclusive) {
|
||||
ub = std::upper_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(upper_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
} else {
|
||||
ub = std::lower_bound(data_.begin(),
|
||||
data_.end(),
|
||||
std::make_pair(upper_value, TargetBitmap()),
|
||||
[](const auto& lhs, const auto& rhs) {
|
||||
return lhs.first < rhs.first;
|
||||
});
|
||||
}
|
||||
|
||||
for (; lb != ub; lb++) {
|
||||
for (const auto& v : lb->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T
|
||||
BitmapIndex<T>::Reverse_Lookup(size_t idx) const {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
AssertInfo(idx < total_num_rows_, "out of range of total coun");
|
||||
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
for (auto it = data_.begin(); it != data_.end(); it++) {
|
||||
for (const auto& v : it->second) {
|
||||
if (v == idx) {
|
||||
return it->first;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto it = bitsets_.begin(); it != bitsets_.end(); it++) {
|
||||
if (it->second[idx]) {
|
||||
return it->first;
|
||||
}
|
||||
}
|
||||
}
|
||||
PanicInfo(UnexpectedError,
|
||||
fmt::format(
|
||||
"scalar bitmap index can not lookup target value of index {}",
|
||||
idx));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
BitmapIndex<T>::ShouldSkip(const T lower_value,
|
||||
const T upper_value,
|
||||
const OpType op) {
|
||||
auto skip = [&](OpType op, T lower_bound, T upper_bound) -> bool {
|
||||
bool should_skip = false;
|
||||
switch (op) {
|
||||
case OpType::LessThan: {
|
||||
// lower_value == upper_value
|
||||
should_skip = lower_bound >= lower_value;
|
||||
break;
|
||||
}
|
||||
case OpType::LessEqual: {
|
||||
// lower_value == upper_value
|
||||
should_skip = lower_bound > lower_value;
|
||||
break;
|
||||
}
|
||||
case OpType::GreaterThan: {
|
||||
// lower_value == upper_value
|
||||
should_skip = upper_bound <= lower_value;
|
||||
break;
|
||||
}
|
||||
case OpType::GreaterEqual: {
|
||||
// lower_value == upper_value
|
||||
should_skip = upper_bound < lower_value;
|
||||
break;
|
||||
}
|
||||
case OpType::Range: {
|
||||
// lower_value == upper_value
|
||||
should_skip =
|
||||
lower_bound > upper_value || upper_bound < lower_value;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
PanicInfo(OpTypeInvalid,
|
||||
fmt::format("Invalid OperatorType for "
|
||||
"checking scalar index optimization: {}",
|
||||
op));
|
||||
}
|
||||
return should_skip;
|
||||
};
|
||||
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
if (!data_.empty()) {
|
||||
auto lower_bound = data_.begin()->first;
|
||||
auto upper_bound = data_.rbegin()->first;
|
||||
bool should_skip = skip(op, lower_bound, upper_bound);
|
||||
return should_skip;
|
||||
}
|
||||
} else {
|
||||
if (!bitsets_.empty()) {
|
||||
auto lower_bound = bitsets_.begin()->first;
|
||||
auto upper_bound = bitsets_.rbegin()->first;
|
||||
bool should_skip = skip(op, lower_bound, upper_bound);
|
||||
return should_skip;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::Query(const DatasetPtr& dataset) {
|
||||
return ScalarIndex<T>::Query(dataset);
|
||||
}
|
||||
|
||||
template <>
|
||||
const TargetBitmap
|
||||
BitmapIndex<std::string>::Query(const DatasetPtr& dataset) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
|
||||
auto op = dataset->Get<OpType>(OPERATOR_TYPE);
|
||||
if (op == OpType::PrefixMatch) {
|
||||
auto prefix = dataset->Get<std::string>(PREFIX_VALUE);
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
for (auto it = data_.begin(); it != data_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (milvus::query::Match(key, prefix, op)) {
|
||||
for (const auto& v : it->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (milvus::query::Match(key, prefix, op)) {
|
||||
res |= it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
} else {
|
||||
PanicInfo(OpTypeInvalid,
|
||||
fmt::format("unsupported op_type:{} for bitmap query", op));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
BitmapIndex<T>::RegexQuery(const std::string& regex_pattern) {
|
||||
return ScalarIndex<T>::RegexQuery(regex_pattern);
|
||||
}
|
||||
|
||||
template <>
|
||||
const TargetBitmap
|
||||
BitmapIndex<std::string>::RegexQuery(const std::string& regex_pattern) {
|
||||
AssertInfo(is_built_, "index has not been built");
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
TargetBitmap res(total_num_rows_, false);
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
for (auto it = data_.begin(); it != data_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (matcher(key)) {
|
||||
for (const auto& v : it->second) {
|
||||
res.set(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) {
|
||||
const auto& key = it->first;
|
||||
if (matcher(key)) {
|
||||
res |= it->second;
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template class BitmapIndex<bool>;
|
||||
template class BitmapIndex<int8_t>;
|
||||
template class BitmapIndex<int16_t>;
|
||||
template class BitmapIndex<int32_t>;
|
||||
template class BitmapIndex<int64_t>;
|
||||
template class BitmapIndex<float>;
|
||||
template class BitmapIndex<double>;
|
||||
template class BitmapIndex<std::string>;
|
||||
|
||||
} // namespace index
|
||||
} // namespace milvus
|
||||
220
internal/core/src/index/BitmapIndex.h
Normal file
220
internal/core/src/index/BitmapIndex.h
Normal file
@ -0,0 +1,220 @@
|
||||
// Licensed to the LF AI & Data foundation under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <roaring/roaring.hh>
|
||||
|
||||
#include "common/RegexQuery.h"
|
||||
#include "index/ScalarIndex.h"
|
||||
#include "storage/FileManager.h"
|
||||
#include "storage/DiskFileManagerImpl.h"
|
||||
#include "storage/MemFileManagerImpl.h"
|
||||
|
||||
namespace milvus {
|
||||
namespace index {
|
||||
|
||||
// Selects which per-key container of BitmapIndex the query paths read.
enum class BitmapIndexBuildMode {
    // Queries read the std::map<T, roaring::Roaring> container.
    ROARING = 0,
    // Queries read the std::map<T, TargetBitmap> container.
    BITSET = 1,
};
|
||||
|
||||
/*
|
||||
* @brief Implementation of Bitmap Index
|
||||
* @details Instantiated for bool, int8/16/32/64, float, double and std::string scalar fields.
|
||||
*/
|
||||
template <typename T>
|
||||
class BitmapIndex : public ScalarIndex<T> {
|
||||
public:
|
||||
explicit BitmapIndex(
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
explicit BitmapIndex(
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
|
||||
~BitmapIndex() override = default;
|
||||
|
||||
BinarySet
|
||||
Serialize(const Config& config) override;
|
||||
|
||||
void
|
||||
Load(const BinarySet& index_binary, const Config& config = {}) override;
|
||||
|
||||
void
|
||||
Load(milvus::tracer::TraceContext ctx, const Config& config = {}) override;
|
||||
|
||||
void
|
||||
LoadV2(const Config& config = {}) override;
|
||||
|
||||
int64_t
|
||||
Count() override {
|
||||
return total_num_rows_;
|
||||
}
|
||||
|
||||
ScalarIndexType
|
||||
GetIndexType() const override {
|
||||
return ScalarIndexType::BITMAP;
|
||||
}
|
||||
|
||||
void
|
||||
Build(size_t n, const T* values) override;
|
||||
|
||||
void
|
||||
Build(const Config& config = {}) override;
|
||||
|
||||
void
|
||||
BuildV2(const Config& config = {}) override;
|
||||
|
||||
void
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
|
||||
|
||||
const TargetBitmap
|
||||
In(size_t n, const T* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
NotIn(size_t n, const T* values) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(T value, OpType op) override;
|
||||
|
||||
const TargetBitmap
|
||||
Range(T lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
T upper_bound_value,
|
||||
bool ub_inclusive) override;
|
||||
|
||||
T
|
||||
Reverse_Lookup(size_t offset) const override;
|
||||
|
||||
int64_t
|
||||
Size() override {
|
||||
return Count();
|
||||
}
|
||||
|
||||
BinarySet
|
||||
Upload(const Config& config = {}) override;
|
||||
|
||||
BinarySet
|
||||
UploadV2(const Config& config = {}) override;
|
||||
|
||||
const bool
|
||||
HasRawData() const override {
|
||||
if (schema_.data_type() == proto::schema::DataType::Array) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
const TargetBitmap
|
||||
Query(const DatasetPtr& dataset) override;
|
||||
|
||||
const TargetBitmap
|
||||
PatternMatch(const std::string& pattern) override {
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(pattern);
|
||||
return RegexQuery(regex_pattern);
|
||||
}
|
||||
|
||||
bool
|
||||
SupportRegexQuery() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
const TargetBitmap
|
||||
RegexQuery(const std::string& regex_pattern) override;
|
||||
|
||||
public:
|
||||
int64_t
|
||||
Cardinality() {
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
return data_.size();
|
||||
} else {
|
||||
return bitsets_.size();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void
|
||||
BuildPrimitiveField(const std::vector<FieldDataPtr>& datas);
|
||||
|
||||
void
|
||||
BuildArrayField(const std::vector<FieldDataPtr>& datas);
|
||||
|
||||
size_t
|
||||
GetIndexDataSize();
|
||||
|
||||
void
|
||||
SerializeIndexData(uint8_t* index_data_ptr);
|
||||
|
||||
std::pair<std::shared_ptr<uint8_t[]>, size_t>
|
||||
SerializeIndexMeta();
|
||||
|
||||
std::pair<size_t, size_t>
|
||||
DeserializeIndexMeta(const uint8_t* data_ptr, size_t data_size);
|
||||
|
||||
void
|
||||
DeserializeIndexData(const uint8_t* data_ptr, size_t index_length);
|
||||
|
||||
void
|
||||
ChooseIndexLoadMode(int64_t index_length);
|
||||
|
||||
bool
|
||||
ShouldSkip(const T lower_value, const T upper_value, const OpType op);
|
||||
|
||||
TargetBitmap
|
||||
ConvertRoaringToBitset(const roaring::Roaring& values);
|
||||
|
||||
TargetBitmap
|
||||
RangeForRoaring(T value, OpType op);
|
||||
|
||||
TargetBitmap
|
||||
RangeForBitset(T value, OpType op);
|
||||
|
||||
TargetBitmap
|
||||
RangeForRoaring(T lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
T upper_bound_value,
|
||||
bool ub_inclusive);
|
||||
|
||||
TargetBitmap
|
||||
RangeForBitset(T lower_bound_value,
|
||||
bool lb_inclusive,
|
||||
T upper_bound_value,
|
||||
bool ub_inclusive);
|
||||
|
||||
public:
|
||||
bool is_built_{false};
|
||||
Config config_;
|
||||
BitmapIndexBuildMode build_mode_;
|
||||
std::map<T, roaring::Roaring> data_;
|
||||
std::map<T, TargetBitmap> bitsets_;
|
||||
size_t total_num_rows_{0};
|
||||
proto::schema::FieldSchema schema_;
|
||||
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
|
||||
std::shared_ptr<milvus_storage::Space> space_;
|
||||
};
|
||||
|
||||
} // namespace index
|
||||
} // namespace milvus
|
||||
@ -18,6 +18,7 @@ set(INDEX_FILES
|
||||
ScalarIndex.cpp
|
||||
ScalarIndexSort.cpp
|
||||
SkipIndex.cpp
|
||||
BitmapIndex.cpp
|
||||
InvertedIndexTantivy.cpp
|
||||
)
|
||||
|
||||
|
||||
@ -27,6 +27,7 @@
|
||||
#include "index/StringIndexMarisa.h"
|
||||
#include "index/BoolIndex.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "index/BitmapIndex.h"
|
||||
|
||||
namespace milvus::index {
|
||||
|
||||
@ -38,6 +39,9 @@ IndexFactory::CreateScalarIndex(
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context);
|
||||
}
|
||||
if (index_type == BITMAP_INDEX_TYPE) {
|
||||
return std::make_unique<BitmapIndex<T>>(file_manager_context);
|
||||
}
|
||||
return CreateScalarIndexSort<T>(file_manager_context);
|
||||
}
|
||||
|
||||
@ -58,6 +62,9 @@ IndexFactory::CreateScalarIndex<std::string>(
|
||||
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
||||
file_manager_context);
|
||||
}
|
||||
if (index_type == BITMAP_INDEX_TYPE) {
|
||||
return std::make_unique<BitmapIndex<std::string>>(file_manager_context);
|
||||
}
|
||||
return CreateStringIndexMarisa(file_manager_context);
|
||||
#else
|
||||
PanicInfo(Unsupported, "unsupported platform");
|
||||
@ -74,6 +81,9 @@ IndexFactory::CreateScalarIndex(
|
||||
return std::make_unique<InvertedIndexTantivy<T>>(file_manager_context,
|
||||
space);
|
||||
}
|
||||
if (index_type == BITMAP_INDEX_TYPE) {
|
||||
return std::make_unique<BitmapIndex<T>>(file_manager_context, space);
|
||||
}
|
||||
return CreateScalarIndexSort<T>(file_manager_context, space);
|
||||
}
|
||||
|
||||
@ -88,6 +98,10 @@ IndexFactory::CreateScalarIndex<std::string>(
|
||||
return std::make_unique<InvertedIndexTantivy<std::string>>(
|
||||
file_manager_context, space);
|
||||
}
|
||||
if (index_type == BITMAP_INDEX_TYPE) {
|
||||
return std::make_unique<BitmapIndex<std::string>>(file_manager_context,
|
||||
space);
|
||||
}
|
||||
return CreateStringIndexMarisa(file_manager_context, space);
|
||||
#else
|
||||
PanicInfo(Unsupported, "unsupported platform");
|
||||
|
||||
@ -11,6 +11,7 @@
|
||||
|
||||
#include "tantivy-binding.h"
|
||||
#include "common/Slice.h"
|
||||
#include "common/RegexQuery.h"
|
||||
#include "storage/LocalChunkManagerSingleton.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "log/Log.h"
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
#include "tantivy-wrapper.h"
|
||||
#include "index/StringIndex.h"
|
||||
#include "storage/space.h"
|
||||
#include "common/RegexQuery.h"
|
||||
|
||||
namespace milvus::index {
|
||||
|
||||
@ -157,6 +158,13 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||
const TargetBitmap
|
||||
Query(const DatasetPtr& dataset) override;
|
||||
|
||||
const TargetBitmap
|
||||
PatternMatch(const std::string& pattern) override {
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(pattern);
|
||||
return RegexQuery(regex_pattern);
|
||||
}
|
||||
|
||||
bool
|
||||
SupportRegexQuery() const override {
|
||||
return true;
|
||||
@ -165,6 +173,11 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
|
||||
const TargetBitmap
|
||||
RegexQuery(const std::string& pattern) override;
|
||||
|
||||
ScalarIndexType
|
||||
GetIndexType() const override {
|
||||
return ScalarIndexType::INVERTED;
|
||||
}
|
||||
|
||||
private:
|
||||
void
|
||||
finish();
|
||||
|
||||
@ -37,6 +37,7 @@ constexpr const char* METRIC_TYPE = "metric_type";
|
||||
constexpr const char* ASCENDING_SORT = "STL_SORT";
|
||||
constexpr const char* MARISA_TRIE = "Trie";
|
||||
constexpr const char* INVERTED_INDEX_TYPE = "INVERTED";
|
||||
constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
|
||||
|
||||
// index meta
|
||||
constexpr const char* COLLECTION_ID = "collection_id";
|
||||
@ -48,6 +49,12 @@ constexpr const char* INDEX_ID = "index_id";
|
||||
constexpr const char* INDEX_VERSION = "index_version";
|
||||
constexpr const char* INDEX_ENGINE_VERSION = "index_engine_version";
|
||||
|
||||
// below meta key of store bitmap indexes
|
||||
constexpr const char* BITMAP_INDEX_DATA = "bitmap_index_data";
|
||||
constexpr const char* BITMAP_INDEX_META = "bitmap_index_meta";
|
||||
constexpr const char* BITMAP_INDEX_LENGTH = "bitmap_index_length";
|
||||
constexpr const char* BITMAP_INDEX_NUM_ROWS = "bitmap_index_num_rows";
|
||||
|
||||
// VecIndex file metas
|
||||
constexpr const char* DISK_ANN_PREFIX_PATH = "index_prefix";
|
||||
constexpr const char* DISK_ANN_RAW_DATA_PATH = "data_path";
|
||||
|
||||
@ -23,11 +23,20 @@
|
||||
|
||||
#include "common/Types.h"
|
||||
#include "common/EasyAssert.h"
|
||||
#include "common/FieldData.h"
|
||||
#include "index/Index.h"
|
||||
#include "fmt/format.h"
|
||||
|
||||
namespace milvus::index {
|
||||
|
||||
// Identifies the concrete scalar index implementation; each ScalarIndex
// subclass reports its own value through GetIndexType().
enum class ScalarIndexType {
    NONE = 0,
    BITMAP = 1,
    STLSORT = 2,
    MARISA = 3,
    INVERTED = 4,
};
|
||||
|
||||
template <typename T>
|
||||
class ScalarIndex : public IndexBase {
|
||||
public:
|
||||
@ -44,6 +53,9 @@ class ScalarIndex : public IndexBase {
|
||||
};
|
||||
|
||||
public:
|
||||
virtual ScalarIndexType
|
||||
GetIndexType() const = 0;
|
||||
|
||||
virtual void
|
||||
Build(size_t n, const T* values) = 0;
|
||||
|
||||
@ -94,6 +106,21 @@ class ScalarIndex : public IndexBase {
|
||||
RegexQuery(const std::string& pattern) {
|
||||
PanicInfo(Unsupported, "regex query is not supported");
|
||||
}
|
||||
|
||||
virtual const TargetBitmap
|
||||
PatternMatch(const std::string& pattern) {
|
||||
PanicInfo(Unsupported, "pattern match is not supported");
|
||||
}
|
||||
|
||||
virtual void
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& field_datas) {
|
||||
PanicInfo(Unsupported, "BuildwithFieldData is not supported");
|
||||
}
|
||||
|
||||
virtual void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config) {
|
||||
PanicInfo(Unsupported, "LoadWithoutAssemble is not supported");
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
|
||||
@ -77,31 +77,8 @@ ScalarIndexSort<T>::BuildV2(const Config& config) {
|
||||
field_data->FillFieldData(col_data);
|
||||
field_datas.push_back(field_data);
|
||||
}
|
||||
int64_t total_num_rows = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
total_num_rows += data->get_num_rows();
|
||||
}
|
||||
if (total_num_rows == 0) {
|
||||
PanicInfo(DataIsEmpty, "ScalarIndexSort cannot build null values!");
|
||||
}
|
||||
|
||||
data_.reserve(total_num_rows);
|
||||
int64_t offset = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_num; ++i) {
|
||||
auto value = reinterpret_cast<const T*>(data->RawValue(i));
|
||||
data_.emplace_back(IndexStructure(*value, offset));
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(data_.begin(), data_.end());
|
||||
idx_to_offsets_.resize(total_num_rows);
|
||||
for (size_t i = 0; i < total_num_rows; ++i) {
|
||||
idx_to_offsets_[data_[i].idx_] = i;
|
||||
}
|
||||
is_built_ = true;
|
||||
BuildWithFieldData(field_datas);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -264,6 +241,37 @@ ScalarIndexSort<T>::Load(milvus::tracer::TraceContext ctx,
|
||||
LoadWithoutAssemble(binary_set, config);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
ScalarIndexSort<T>::BuildWithFieldData(
|
||||
const std::vector<milvus::FieldDataPtr>& field_datas) {
|
||||
int64_t total_num_rows = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
total_num_rows += data->get_num_rows();
|
||||
}
|
||||
if (total_num_rows == 0) {
|
||||
PanicInfo(DataIsEmpty, "ScalarIndexSort cannot build null values!");
|
||||
}
|
||||
|
||||
data_.reserve(total_num_rows);
|
||||
int64_t offset = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_num; ++i) {
|
||||
auto value = reinterpret_cast<const T*>(data->RawValue(i));
|
||||
data_.emplace_back(IndexStructure(*value, offset));
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(data_.begin(), data_.end());
|
||||
idx_to_offsets_.resize(total_num_rows);
|
||||
for (size_t i = 0; i < total_num_rows; ++i) {
|
||||
idx_to_offsets_[data_[i].idx_] = i;
|
||||
}
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
ScalarIndexSort<T>::LoadV2(const Config& config) {
|
||||
|
||||
@ -58,6 +58,11 @@ class ScalarIndexSort : public ScalarIndex<T> {
|
||||
return data_.size();
|
||||
}
|
||||
|
||||
ScalarIndexType
|
||||
GetIndexType() const override {
|
||||
return ScalarIndexType::STLSORT;
|
||||
}
|
||||
|
||||
void
|
||||
Build(size_t n, const T* values) override;
|
||||
|
||||
@ -116,7 +121,11 @@ class ScalarIndexSort : public ScalarIndex<T> {
|
||||
}
|
||||
|
||||
void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
void
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
|
||||
|
||||
private:
|
||||
bool is_built_;
|
||||
|
||||
@ -132,6 +132,13 @@ StringIndexMarisa::Build(const Config& config) {
|
||||
"insert file paths is empty when build index");
|
||||
auto field_datas =
|
||||
file_manager_->CacheRawDataToMemory(insert_files.value());
|
||||
|
||||
BuildWithFieldData(field_datas);
|
||||
}
|
||||
|
||||
void
|
||||
StringIndexMarisa::BuildWithFieldData(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
int64_t total_num_rows = 0;
|
||||
|
||||
// fill key set.
|
||||
|
||||
@ -57,6 +57,14 @@ class StringIndexMarisa : public StringIndex {
|
||||
return str_ids_.size();
|
||||
}
|
||||
|
||||
ScalarIndexType
|
||||
GetIndexType() const override {
|
||||
return ScalarIndexType::MARISA;
|
||||
}
|
||||
|
||||
void
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& field_datas) override;
|
||||
|
||||
void
|
||||
Build(size_t n, const std::string* values) override;
|
||||
|
||||
@ -113,7 +121,8 @@ class StringIndexMarisa : public StringIndex {
|
||||
prefix_match(const std::string_view prefix);
|
||||
|
||||
void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set, const Config& config);
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
private:
|
||||
Config config_;
|
||||
|
||||
@ -70,6 +70,7 @@ set(MILVUS_TEST_FILES
|
||||
test_chunk_vector.cpp
|
||||
test_mmap_chunk_manager.cpp
|
||||
test_futures.cpp
|
||||
test_bitmap_index.cpp
|
||||
test_monitor.cpp
|
||||
)
|
||||
|
||||
|
||||
403
internal/core/unittest/test_bitmap_index.cpp
Normal file
403
internal/core/unittest/test_bitmap_index.cpp
Normal file
@ -0,0 +1,403 @@
|
||||
// Copyright(C) 2019 - 2020 Zilliz.All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <functional>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <unordered_set>
|
||||
#include <memory>
|
||||
|
||||
#include "common/Tracer.h"
|
||||
#include "index/BitmapIndex.h"
|
||||
#include "storage/Util.h"
|
||||
#include "storage/InsertData.h"
|
||||
#include "indexbuilder/IndexFactory.h"
|
||||
#include "index/IndexFactory.h"
|
||||
#include "test_utils/indexbuilder_test_utils.h"
|
||||
#include "index/Meta.h"
|
||||
|
||||
using namespace milvus::index;
|
||||
using namespace milvus::indexbuilder;
|
||||
using namespace milvus;
|
||||
using namespace milvus::index;
|
||||
|
||||
// Test-data helpers. All variants draw from rand(), so the sequence depends
// on the current rand() state.

// Returns `size` values, each rand() % cardinality converted to T.
// Precondition: cardinality > 0 (it is used as a modulus).
template <typename T>
static std::vector<T>
GenerateData(const size_t size, const size_t cardinality) {
    std::vector<T> result;
    result.reserve(size);  // single allocation instead of repeated growth
    for (size_t i = 0; i < size; ++i) {
        result.push_back(static_cast<T>(rand() % cardinality));
    }
    return result;
}

// bool variant: each element is true when rand() is even; `cardinality` is
// not used.
template <>
std::vector<bool>
GenerateData<bool>(const size_t size, const size_t cardinality) {
    std::vector<bool> result;
    result.reserve(size);
    for (size_t i = 0; i < size; ++i) {
        result.push_back(rand() % 2 == 0);
    }
    return result;
}

// std::string variant: decimal text of rand() % cardinality.
// Precondition: cardinality > 0.
template <>
std::vector<std::string>
GenerateData<std::string>(const size_t size, const size_t cardinality) {
    std::vector<std::string> result;
    result.reserve(size);
    for (size_t i = 0; i < size; ++i) {
        result.push_back(std::to_string(rand() % cardinality));
    }
    return result;
}
|
||||
|
||||
template <typename T>
|
||||
class BitmapIndexTest : public testing::Test {
|
||||
protected:
|
||||
void
|
||||
Init(int64_t collection_id,
|
||||
int64_t partition_id,
|
||||
int64_t segment_id,
|
||||
int64_t field_id,
|
||||
int64_t index_build_id,
|
||||
int64_t index_version) {
|
||||
proto::schema::FieldSchema field_schema;
|
||||
if constexpr (std::is_same_v<int8_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int8);
|
||||
} else if constexpr (std::is_same_v<int16_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int16);
|
||||
} else if constexpr (std::is_same_v<int32_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int32);
|
||||
} else if constexpr (std::is_same_v<int64_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int64);
|
||||
} else if constexpr (std::is_same_v<float, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Float);
|
||||
} else if constexpr (std::is_same_v<double, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Double);
|
||||
} else if constexpr (std::is_same_v<std::string, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::String);
|
||||
}
|
||||
auto field_meta = storage::FieldDataMeta{
|
||||
collection_id, partition_id, segment_id, field_id, field_schema};
|
||||
auto index_meta = storage::IndexMeta{
|
||||
segment_id, field_id, index_build_id, index_version};
|
||||
|
||||
std::vector<T> data_gen;
|
||||
data_gen = GenerateData<T>(nb_, cardinality_);
|
||||
for (auto x : data_gen) {
|
||||
data_.push_back(x);
|
||||
}
|
||||
|
||||
auto field_data = storage::CreateFieldData(type_);
|
||||
field_data->FillFieldData(data_.data(), data_.size());
|
||||
storage::InsertData insert_data(field_data);
|
||||
insert_data.SetFieldDataMeta(field_meta);
|
||||
insert_data.SetTimestamps(0, 100);
|
||||
|
||||
auto serialized_bytes = insert_data.Serialize(storage::Remote);
|
||||
|
||||
auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}",
|
||||
"/tmp/test_bitmap/",
|
||||
collection_id,
|
||||
partition_id,
|
||||
segment_id,
|
||||
field_id,
|
||||
0);
|
||||
chunk_manager_->Write(
|
||||
log_path, serialized_bytes.data(), serialized_bytes.size());
|
||||
|
||||
storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_);
|
||||
std::vector<std::string> index_files;
|
||||
|
||||
Config config;
|
||||
config["index_type"] = milvus::index::BITMAP_INDEX_TYPE;
|
||||
config["insert_files"] = std::vector<std::string>{log_path};
|
||||
|
||||
auto build_index =
|
||||
indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
||||
type_, config, ctx);
|
||||
build_index->Build();
|
||||
|
||||
auto binary_set = build_index->Upload();
|
||||
for (const auto& [key, _] : binary_set.binary_map_) {
|
||||
index_files.push_back(key);
|
||||
}
|
||||
|
||||
index::CreateIndexInfo index_info{};
|
||||
index_info.index_type = milvus::index::BITMAP_INDEX_TYPE;
|
||||
index_info.field_type = type_;
|
||||
|
||||
config["index_files"] = index_files;
|
||||
|
||||
index_ =
|
||||
index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
|
||||
index_->Load(milvus::tracer::TraceContext{}, config);
|
||||
}
|
||||
|
||||
virtual void
|
||||
SetParam() {
|
||||
nb_ = 10000;
|
||||
cardinality_ = 30;
|
||||
}
|
||||
void
|
||||
SetUp() override {
|
||||
SetParam();
|
||||
|
||||
if constexpr (std::is_same_v<T, int8_t>) {
|
||||
type_ = DataType::INT8;
|
||||
} else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
type_ = DataType::INT16;
|
||||
} else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
type_ = DataType::INT32;
|
||||
} else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
type_ = DataType::INT64;
|
||||
} else if constexpr (std::is_same_v<T, std::string>) {
|
||||
type_ = DataType::VARCHAR;
|
||||
}
|
||||
int64_t collection_id = 1;
|
||||
int64_t partition_id = 2;
|
||||
int64_t segment_id = 3;
|
||||
int64_t field_id = 101;
|
||||
int64_t index_build_id = 1000;
|
||||
int64_t index_version = 10000;
|
||||
std::string root_path = "/tmp/test-bitmap-index/";
|
||||
|
||||
storage::StorageConfig storage_config;
|
||||
storage_config.storage_type = "local";
|
||||
storage_config.root_path = root_path;
|
||||
chunk_manager_ = storage::CreateChunkManager(storage_config);
|
||||
|
||||
Init(collection_id,
|
||||
partition_id,
|
||||
segment_id,
|
||||
field_id,
|
||||
index_build_id,
|
||||
index_version);
|
||||
}
|
||||
|
||||
virtual ~BitmapIndexTest() override {
|
||||
boost::filesystem::remove_all(chunk_manager_->GetRootPath());
|
||||
}
|
||||
|
||||
public:
|
||||
void
|
||||
TestInFunc() {
|
||||
boost::container::vector<T> test_data;
|
||||
std::unordered_set<T> s;
|
||||
size_t nq = 10;
|
||||
for (size_t i = 0; i < nq; i++) {
|
||||
test_data.push_back(data_[i]);
|
||||
s.insert(data_[i]);
|
||||
}
|
||||
auto index_ptr = dynamic_cast<index::BitmapIndex<T>*>(index_.get());
|
||||
auto bitset = index_ptr->In(test_data.size(), test_data.data());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
TestNotInFunc() {
|
||||
boost::container::vector<T> test_data;
|
||||
std::unordered_set<T> s;
|
||||
size_t nq = 10;
|
||||
for (size_t i = 0; i < nq; i++) {
|
||||
test_data.push_back(data_[i]);
|
||||
s.insert(data_[i]);
|
||||
}
|
||||
auto index_ptr = dynamic_cast<index::BitmapIndex<T>*>(index_.get());
|
||||
auto bitset = index_ptr->NotIn(test_data.size(), test_data.data());
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
ASSERT_EQ(bitset[i], s.find(data_[i]) == s.end());
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
TestCompareValueFunc() {
|
||||
if constexpr (!std::is_same_v<T, std::string>) {
|
||||
using RefFunc = std::function<bool(int64_t)>;
|
||||
std::vector<std::tuple<T, OpType, RefFunc>> test_cases{
|
||||
{10,
|
||||
OpType::GreaterThan,
|
||||
[&](int64_t i) -> bool { return data_[i] > 10; }},
|
||||
{10,
|
||||
OpType::GreaterEqual,
|
||||
[&](int64_t i) -> bool { return data_[i] >= 10; }},
|
||||
{10,
|
||||
OpType::LessThan,
|
||||
[&](int64_t i) -> bool { return data_[i] < 10; }},
|
||||
{10,
|
||||
OpType::LessEqual,
|
||||
[&](int64_t i) -> bool { return data_[i] <= 10; }},
|
||||
};
|
||||
for (const auto& [test_value, op, ref] : test_cases) {
|
||||
auto index_ptr =
|
||||
dynamic_cast<index::BitmapIndex<T>*>(index_.get());
|
||||
auto bitset = index_ptr->Range(test_value, op);
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
auto ans = bitset[i];
|
||||
auto should = ref(i);
|
||||
ASSERT_EQ(ans, should)
|
||||
<< "op: " << op << ", @" << i << ", ans: " << ans
|
||||
<< ", ref: " << should;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
TestRangeCompareFunc() {
|
||||
if constexpr (!std::is_same_v<T, std::string>) {
|
||||
using RefFunc = std::function<bool(int64_t)>;
|
||||
struct TestParam {
|
||||
int64_t lower_val;
|
||||
int64_t upper_val;
|
||||
bool lower_inclusive;
|
||||
bool upper_inclusive;
|
||||
RefFunc ref;
|
||||
};
|
||||
std::vector<TestParam> test_cases = {
|
||||
{
|
||||
10,
|
||||
30,
|
||||
false,
|
||||
false,
|
||||
[&](int64_t i) { return 10 < data_[i] && data_[i] < 30; },
|
||||
},
|
||||
{
|
||||
10,
|
||||
30,
|
||||
true,
|
||||
false,
|
||||
[&](int64_t i) { return 10 <= data_[i] && data_[i] < 30; },
|
||||
},
|
||||
{
|
||||
10,
|
||||
30,
|
||||
true,
|
||||
true,
|
||||
[&](int64_t i) { return 10 <= data_[i] && data_[i] <= 30; },
|
||||
},
|
||||
{
|
||||
10,
|
||||
30,
|
||||
false,
|
||||
true,
|
||||
[&](int64_t i) { return 10 < data_[i] && data_[i] <= 30; },
|
||||
}};
|
||||
|
||||
for (const auto& test_case : test_cases) {
|
||||
auto index_ptr =
|
||||
dynamic_cast<index::BitmapIndex<T>*>(index_.get());
|
||||
auto bitset = index_ptr->Range(test_case.lower_val,
|
||||
test_case.lower_inclusive,
|
||||
test_case.upper_val,
|
||||
test_case.upper_inclusive);
|
||||
for (size_t i = 0; i < bitset.size(); i++) {
|
||||
auto ans = bitset[i];
|
||||
auto should = test_case.ref(i);
|
||||
ASSERT_EQ(ans, should)
|
||||
<< "lower:" << test_case.lower_val
|
||||
<< "upper:" << test_case.upper_val << ", @" << i
|
||||
<< ", ans: " << ans << ", ref: " << should;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
IndexBasePtr index_;
|
||||
DataType type_;
|
||||
size_t nb_;
|
||||
size_t cardinality_;
|
||||
boost::container::vector<T> data_;
|
||||
std::shared_ptr<storage::ChunkManager> chunk_manager_;
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE_P(BitmapIndexTest);
|
||||
|
||||
TYPED_TEST_P(BitmapIndexTest, CountFuncTest) {
|
||||
auto count = this->index_->Count();
|
||||
EXPECT_EQ(count, this->nb_);
|
||||
}
|
||||
|
||||
TYPED_TEST_P(BitmapIndexTest, INFuncTest) {
|
||||
this->TestInFunc();
|
||||
}
|
||||
|
||||
TYPED_TEST_P(BitmapIndexTest, NotINFuncTest) {
|
||||
this->TestNotInFunc();
|
||||
}
|
||||
|
||||
TYPED_TEST_P(BitmapIndexTest, CompareValFuncTest) {
|
||||
this->TestCompareValueFunc();
|
||||
}
|
||||
|
||||
using BitmapType =
|
||||
testing::Types<int8_t, int16_t, int32_t, int64_t, std::string>;
|
||||
|
||||
REGISTER_TYPED_TEST_SUITE_P(BitmapIndexTest,
|
||||
CountFuncTest,
|
||||
INFuncTest,
|
||||
NotINFuncTest,
|
||||
CompareValFuncTest);
|
||||
|
||||
INSTANTIATE_TYPED_TEST_SUITE_P(BitmapE2ECheck, BitmapIndexTest, BitmapType);
|
||||
|
||||
template <typename T>
|
||||
class BitmapIndexTestV2 : public BitmapIndexTest<T> {
|
||||
public:
|
||||
virtual void
|
||||
SetParam() override {
|
||||
this->nb_ = 10000;
|
||||
this->cardinality_ = 2000;
|
||||
}
|
||||
|
||||
virtual ~BitmapIndexTestV2() {
|
||||
}
|
||||
};
|
||||
|
||||
// Type-parameterized suite for the high-cardinality fixture. Each test body
// forwards to the helper inherited from BitmapIndexTest.
TYPED_TEST_SUITE_P(BitmapIndexTestV2);

// The loaded index must report as many rows as the fixture generated.
TYPED_TEST_P(BitmapIndexTestV2, CountFuncTest) {
    auto count = this->index_->Count();
    EXPECT_EQ(count, this->nb_);
}

// IN-list lookup.
TYPED_TEST_P(BitmapIndexTestV2, INFuncTest) {
    this->TestInFunc();
}

// NOT-IN lookup.
TYPED_TEST_P(BitmapIndexTestV2, NotINFuncTest) {
    this->TestNotInFunc();
}

// Single-bound comparisons (>, >=, <, <=).
TYPED_TEST_P(BitmapIndexTestV2, CompareValFuncTest) {
    this->TestCompareValueFunc();
}

// Two-bound range comparisons with every inclusive/exclusive combination.
TYPED_TEST_P(BitmapIndexTestV2, TestRangeCompareFuncTest) {
    this->TestRangeCompareFunc();
}

// BitmapType is already declared earlier in this file for the
// BitmapIndexTest instantiation; it is reused here instead of being
// re-declared so the two type lists cannot drift apart.

REGISTER_TYPED_TEST_SUITE_P(BitmapIndexTestV2,
                            CountFuncTest,
                            INFuncTest,
                            NotINFuncTest,
                            CompareValFuncTest,
                            TestRangeCompareFuncTest);

INSTANTIATE_TYPED_TEST_SUITE_P(BitmapIndexE2ECheck_HighCardinality,
                               BitmapIndexTestV2,
                               BitmapType);
|
||||
@ -478,26 +478,30 @@ GenDsFromPB(const google::protobuf::Message& msg) {
|
||||
template <typename T>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypes() {
|
||||
return std::vector<std::string>{"sort"};
|
||||
return std::vector<std::string>{"sort", milvus::index::BITMAP_INDEX_TYPE};
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypes<std::string>() {
|
||||
return std::vector<std::string>{"sort", "marisa"};
|
||||
return std::vector<std::string>{
|
||||
"sort", "marisa", milvus::index::BITMAP_INDEX_TYPE};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypesV2() {
|
||||
return std::vector<std::string>{"sort", milvus::index::INVERTED_INDEX_TYPE};
|
||||
return std::vector<std::string>{"sort",
|
||||
milvus::index::INVERTED_INDEX_TYPE,
|
||||
milvus::index::BITMAP_INDEX_TYPE};
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypesV2<std::string>() {
|
||||
return std::vector<std::string>{milvus::index::INVERTED_INDEX_TYPE,
|
||||
"marisa"};
|
||||
"marisa",
|
||||
milvus::index::BITMAP_INDEX_TYPE};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -409,7 +409,7 @@ func checkTrain(field *schemapb.FieldSchema, indexParams map[string]string) erro
|
||||
indexParams[IsSparseKey] = "true"
|
||||
}
|
||||
|
||||
if err := checker.CheckValidDataType(field.GetDataType()); err != nil {
|
||||
if err := checker.CheckValidDataType(field); err != nil {
|
||||
log.Info("create index with invalid data type", zap.Error(err), zap.String("data_type", field.GetDataType().String()))
|
||||
return err
|
||||
}
|
||||
|
||||
@ -13,7 +13,7 @@ func (c *AUTOINDEXChecker) CheckTrain(params map[string]string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *AUTOINDEXChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
func (c *AUTOINDEXChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ func (c baseChecker) CheckTrain(params map[string]string) error {
|
||||
}
|
||||
|
||||
// CheckValidDataType check whether the field data type is supported for the index type
|
||||
func (c baseChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
func (c baseChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@ -114,7 +114,8 @@ func Test_baseChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newBaseChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
fieldSchema := &schemapb.FieldSchema{DataType: test.dType}
|
||||
err := c.CheckValidDataType(fieldSchema)
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -136,7 +136,8 @@ func Test_binFlatChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newBinFlatChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
fieldSchema := &schemapb.FieldSchema{DataType: test.dType}
|
||||
err := c.CheckValidDataType(fieldSchema)
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -187,7 +187,8 @@ func Test_binIVFFlatChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newBinIVFFlatChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
fieldSchema := &schemapb.FieldSchema{DataType: test.dType}
|
||||
err := c.CheckValidDataType(fieldSchema)
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -27,8 +27,8 @@ func (c binaryVectorBaseChecker) CheckTrain(params map[string]string) error {
|
||||
return c.staticCheck(params)
|
||||
}
|
||||
|
||||
func (c binaryVectorBaseChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if dType != schemapb.DataType_BinaryVector {
|
||||
func (c binaryVectorBaseChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
if field.GetDataType() != schemapb.DataType_BinaryVector {
|
||||
return fmt.Errorf("binary vector is only supported")
|
||||
}
|
||||
return nil
|
||||
|
||||
@ -69,7 +69,8 @@ func Test_binaryVectorBaseChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newBinaryVectorBaseChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
fieldSchema := &schemapb.FieldSchema{DataType: test.dType}
|
||||
err := c.CheckValidDataType(fieldSchema)
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
32
pkg/util/indexparamcheck/bitmap_checker_test.go
Normal file
32
pkg/util/indexparamcheck/bitmap_checker_test.go
Normal file
@ -0,0 +1,32 @@
|
||||
package indexparamcheck
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||||
)
|
||||
|
||||
func Test_BitmapIndexChecker(t *testing.T) {
|
||||
c := newBITMAPChecker()
|
||||
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Bool}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int8}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int16}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int32}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int64}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_String}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Bool}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Int8}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Int16}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Int32}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Int64}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_String}))
|
||||
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_JSON}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Float}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Double}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Float}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Double}))
|
||||
}
|
||||
36
pkg/util/indexparamcheck/bitmap_index_checker.go
Normal file
36
pkg/util/indexparamcheck/bitmap_index_checker.go
Normal file
@ -0,0 +1,36 @@
|
||||
package indexparamcheck
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
||||
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
||||
)
|
||||
|
||||
// BITMAPChecker is the index-parameter checker registered for the BITMAP
// index type. It embeds scalarIndexChecker and overrides CheckTrain and
// CheckValidDataType.
type BITMAPChecker struct {
|
||||
scalarIndexChecker
|
||||
}
|
||||
|
||||
// CheckTrain validates the index build parameters by delegating to the
// embedded scalarIndexChecker; BITMAP adds no checks of its own here.
func (c *BITMAPChecker) CheckTrain(params map[string]string) error {
|
||||
return c.scalarIndexChecker.CheckTrain(params)
|
||||
}
|
||||
|
||||
func (c *BITMAPChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
mainType := field.GetDataType()
|
||||
elemType := field.GetElementType()
|
||||
if !typeutil.IsBoolType(mainType) && !typeutil.IsIntegerType(mainType) &&
|
||||
!typeutil.IsStringType(mainType) && !typeutil.IsArrayType(mainType) {
|
||||
return fmt.Errorf("bitmap index are only supported on bool, int, string and array field")
|
||||
}
|
||||
if typeutil.IsArrayType(mainType) {
|
||||
if !typeutil.IsBoolType(elemType) && !typeutil.IsIntegerType(elemType) &&
|
||||
!typeutil.IsStringType(elemType) {
|
||||
return fmt.Errorf("bitmap index are only supported on bool, int, string for array field")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newBITMAPChecker() *BITMAPChecker {
|
||||
return &BITMAPChecker{}
|
||||
}
|
||||
@ -63,6 +63,7 @@ func (mgr *indexCheckerMgrImpl) registerIndexChecker() {
|
||||
mgr.checkers[IndexINVERTED] = newINVERTEDChecker()
|
||||
mgr.checkers[IndexSTLSORT] = newSTLSORTChecker()
|
||||
mgr.checkers["Asceneding"] = newSTLSORTChecker()
|
||||
mgr.checkers[IndexBitmap] = newBITMAPChecker()
|
||||
mgr.checkers[IndexTRIE] = newTRIEChecker()
|
||||
mgr.checkers[IndexTrie] = newTRIEChecker()
|
||||
mgr.checkers["marisa-trie"] = newTRIEChecker()
|
||||
|
||||
@ -144,7 +144,7 @@ func Test_diskannChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newDiskannChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -28,8 +28,8 @@ func (c floatVectorBaseChecker) CheckTrain(params map[string]string) error {
|
||||
return c.staticCheck(params)
|
||||
}
|
||||
|
||||
func (c floatVectorBaseChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if !typeutil.IsDenseFloatVectorType(dType) {
|
||||
func (c floatVectorBaseChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsDenseFloatVectorType(field.GetDataType()) {
|
||||
return fmt.Errorf("data type should be FloatVector, Float16Vector or BFloat16Vector")
|
||||
}
|
||||
return nil
|
||||
|
||||
@ -69,7 +69,7 @@ func Test_floatVectorBaseChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newFloatVectorBaseChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -32,9 +32,9 @@ func (c hnswChecker) CheckTrain(params map[string]string) error {
|
||||
return c.baseChecker.CheckTrain(params)
|
||||
}
|
||||
|
||||
func (c hnswChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if !typeutil.IsVectorType(dType) {
|
||||
return fmt.Errorf("can't create hnsw in not vector type")
|
||||
func (c hnswChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsVectorType(field.GetDataType()) {
|
||||
return fmt.Errorf("can't build hnsw in not vector type")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -164,7 +164,7 @@ func Test_hnswChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newHnswChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -22,7 +22,7 @@ import (
|
||||
|
||||
type IndexChecker interface {
|
||||
CheckTrain(map[string]string) error
|
||||
CheckValidDataType(dType schemapb.DataType) error
|
||||
CheckValidDataType(field *schemapb.FieldSchema) error
|
||||
SetDefaultMetricTypeIfNotExist(map[string]string, schemapb.DataType)
|
||||
StaticCheck(map[string]string) error
|
||||
}
|
||||
|
||||
@ -37,6 +37,7 @@ const (
|
||||
IndexSTLSORT IndexType = "STL_SORT"
|
||||
IndexTRIE IndexType = "TRIE"
|
||||
IndexTrie IndexType = "Trie"
|
||||
IndexBitmap IndexType = "BITMAP"
|
||||
|
||||
AutoIndex IndexType = "AUTOINDEX"
|
||||
)
|
||||
|
||||
@ -16,7 +16,8 @@ func (c *INVERTEDChecker) CheckTrain(params map[string]string) error {
|
||||
return c.scalarIndexChecker.CheckTrain(params)
|
||||
}
|
||||
|
||||
func (c *INVERTEDChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
func (c *INVERTEDChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
dType := field.GetDataType()
|
||||
if !typeutil.IsBoolType(dType) && !typeutil.IsArithmetic(dType) && !typeutil.IsStringType(dType) &&
|
||||
!typeutil.IsArrayType(dType) {
|
||||
return fmt.Errorf("INVERTED are not supported on %s field", dType.String())
|
||||
|
||||
@ -13,13 +13,13 @@ func Test_INVERTEDIndexChecker(t *testing.T) {
|
||||
|
||||
assert.NoError(t, c.CheckTrain(map[string]string{}))
|
||||
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_VarChar))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_String))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Bool))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Int64))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Float))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Array))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_String}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Bool}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int64}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Float}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array}))
|
||||
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_JSON))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_FloatVector))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_JSON}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_FloatVector}))
|
||||
}
|
||||
|
||||
@ -142,7 +142,7 @@ func Test_ivfBaseChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newIVFBaseChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -213,7 +213,7 @@ func Test_ivfPQChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newIVFPQChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -162,7 +162,7 @@ func Test_ivfSQChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newIVFSQChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -156,7 +156,7 @@ func Test_raftIvfFlatChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newRaftIVFFlatChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -216,7 +216,7 @@ func Test_raftIVFPQChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newRaftIVFPQChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -159,7 +159,7 @@ func Test_scaNNChecker_CheckValidDataType(t *testing.T) {
|
||||
|
||||
c := newScaNNChecker()
|
||||
for _, test := range cases {
|
||||
err := c.CheckValidDataType(test.dType)
|
||||
err := c.CheckValidDataType(&schemapb.FieldSchema{DataType: test.dType})
|
||||
if test.errIsNil {
|
||||
assert.NoError(t, err)
|
||||
} else {
|
||||
|
||||
@ -32,8 +32,8 @@ func (c sparseFloatVectorBaseChecker) CheckTrain(params map[string]string) error
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c sparseFloatVectorBaseChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if !typeutil.IsSparseFloatVectorType(dType) {
|
||||
func (c sparseFloatVectorBaseChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsSparseFloatVectorType(field.GetDataType()) {
|
||||
return fmt.Errorf("only sparse float vector is supported for the specified index tpye")
|
||||
}
|
||||
return nil
|
||||
|
||||
@ -16,8 +16,8 @@ func (c *STLSORTChecker) CheckTrain(params map[string]string) error {
|
||||
return c.scalarIndexChecker.CheckTrain(params)
|
||||
}
|
||||
|
||||
func (c *STLSORTChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if !typeutil.IsArithmetic(dType) {
|
||||
func (c *STLSORTChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsArithmetic(field.GetDataType()) {
|
||||
return fmt.Errorf("STL_SORT are only supported on numeric field")
|
||||
}
|
||||
return nil
|
||||
|
||||
@ -13,10 +13,10 @@ func Test_STLSORTIndexChecker(t *testing.T) {
|
||||
|
||||
assert.NoError(t, c.CheckTrain(map[string]string{}))
|
||||
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Int64))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Float))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int64}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Float}))
|
||||
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_Bool))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_VarChar))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_JSON))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Bool}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_JSON}))
|
||||
}
|
||||
|
||||
@ -16,8 +16,8 @@ func (c *TRIEChecker) CheckTrain(params map[string]string) error {
|
||||
return c.scalarIndexChecker.CheckTrain(params)
|
||||
}
|
||||
|
||||
func (c *TRIEChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if !typeutil.IsStringType(dType) {
|
||||
func (c *TRIEChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
|
||||
if !typeutil.IsStringType(field.GetDataType()) {
|
||||
return fmt.Errorf("TRIE are only supported on varchar field")
|
||||
}
|
||||
return nil
|
||||
|
||||
@ -13,11 +13,11 @@ func Test_TrieIndexChecker(t *testing.T) {
|
||||
|
||||
assert.NoError(t, c.CheckTrain(map[string]string{}))
|
||||
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_VarChar))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_String))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_VarChar}))
|
||||
assert.NoError(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_String}))
|
||||
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_Bool))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_Int64))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_Float))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_JSON))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Bool}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Int64}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Float}))
|
||||
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_JSON}))
|
||||
}
|
||||
|
||||
@ -2685,6 +2685,98 @@ class TestQueryString(TestcaseBase):
|
||||
collection_w.query(expression, output_fields=output_fields,
|
||||
check_task=CheckTasks.check_query_results, check_items={exp_res: res})
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_prefixes_auto_index(self):
    """
    target: test query with prefix string expression and indexed with auto index
    method: run the same prefix query with the varchar index present and
            again after dropping it, then compare the number of hits
    expected: verify query successfully
    """
    collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False,
                                                         primary_field=default_int_field_name)[0:2]

    # The vector index is needed so the collection can be loaded; the varchar
    # index is created without index params.
    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    collection_w.create_index("varchar", index_name="varchar_auto_index")
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "0%"'
    rows_with_index, _ = collection_w.query(expression, output_fields=['varchar'])
    count_with_index = len(rows_with_index)

    # Drop the scalar index and repeat the query.
    collection_w.release()
    collection_w.drop_index(index_name="varchar_auto_index")
    collection_w.load()
    rows_without_index, _ = collection_w.query(expression, output_fields=['varchar'])
    count_without_index = len(rows_without_index)

    assert count_without_index == count_with_index
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_prefixes_bitmap(self):
    """
    target: test query with prefix string expression and indexed with bitmap
    method: run the same prefix query with a BITMAP index on varchar and
            again after dropping it, then compare the number of hits
    expected: verify query successfully
    """
    collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False,
                                                         primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    # Request BITMAP explicitly; without index params this test would not
    # exercise the bitmap index at all. A single name is used for both
    # create and drop so the drop really removes the index created here.
    bitmap_index_name = "varchar_bitmap_index"
    collection_w.create_index("varchar", {"index_type": "BITMAP"}, index_name=bitmap_index_name)
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "0%"'
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len = len(result)

    # Drop the bitmap index and repeat the query.
    collection_w.release()
    collection_w.drop_index(index_name=bitmap_index_name)
    collection_w.load()
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len_1 = len(result)

    assert res_len_1 == res_len
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_match_auto_index(self):
    """
    target: test query with match string expression and indexed with auto index
    method: run the same infix-match query with the varchar index present and
            again after dropping it, then compare the number of hits
    expected: verify query successfully
    """
    collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False,
                                                         primary_field=default_int_field_name)[0:2]

    # The vector index is needed so the collection can be loaded; the varchar
    # index is created without index params.
    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    collection_w.create_index("varchar", index_name="varchar_auto_index")
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "%0%"'
    rows_with_index, _ = collection_w.query(expression, output_fields=['varchar'])
    count_with_index = len(rows_with_index)

    # Drop the scalar index and repeat the query.
    collection_w.release()
    collection_w.drop_index(index_name="varchar_auto_index")
    collection_w.load()
    rows_without_index, _ = collection_w.query(expression, output_fields=['varchar'])
    count_without_index = len(rows_without_index)

    assert count_without_index == count_with_index
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
def test_query_string_expr_with_match_bitmap(self):
    """
    target: test query with match string expression and indexed with bitmap
    method: run the same infix-match query with a BITMAP index on varchar and
            again after dropping it, then compare the number of hits
    expected: verify query successfully
    """
    collection_w, vectors = self.init_collection_general(prefix, insert_data=True, is_index=False,
                                                         primary_field=default_int_field_name)[0:2]

    collection_w.create_index(ct.default_float_vec_field_name, default_index_params,
                              index_name="query_expr_pre_index")
    # Request BITMAP explicitly; without index params this test would not
    # exercise the bitmap index at all. A single name is used for both
    # create and drop so the drop really removes the index created here.
    bitmap_index_name = "varchar_bitmap_index"
    collection_w.create_index("varchar", {"index_type": "BITMAP"}, index_name=bitmap_index_name)
    time.sleep(1)
    collection_w.load()

    expression = 'varchar like "%0%"'
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len = len(result)

    # Drop the bitmap index and repeat the query.
    collection_w.release()
    collection_w.drop_index(index_name=bitmap_index_name)
    collection_w.load()
    result, _ = collection_w.query(expression, output_fields=['varchar'])
    res_len_1 = len(result)

    assert res_len_1 == res_len
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
def test_query_string_with_invalid_prefix_expr(self):
|
||||
"""
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user