mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
enhance: support array bitmap index (#33527)
#32900 --------- Signed-off-by: luzhang <luzhang@zilliz.com> Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
parent
e422168f09
commit
d43ec4db0b
@ -258,6 +258,34 @@ IsBinaryDataType(DataType data_type) {
|
||||
return IsJsonDataType(data_type) || IsArrayDataType(data_type);
|
||||
}
|
||||
|
||||
inline bool
|
||||
IsPrimitiveType(proto::schema::DataType type) {
|
||||
switch (type) {
|
||||
case proto::schema::DataType::Bool:
|
||||
case proto::schema::DataType::Int8:
|
||||
case proto::schema::DataType::Int16:
|
||||
case proto::schema::DataType::Int32:
|
||||
case proto::schema::DataType::Int64:
|
||||
case proto::schema::DataType::Float:
|
||||
case proto::schema::DataType::Double:
|
||||
case proto::schema::DataType::String:
|
||||
case proto::schema::DataType::VarChar:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
inline bool
|
||||
IsJsonType(proto::schema::DataType type) {
|
||||
return type == proto::schema::DataType::JSON;
|
||||
}
|
||||
|
||||
inline bool
|
||||
IsArrayType(proto::schema::DataType type) {
|
||||
return type == proto::schema::DataType::Array;
|
||||
}
|
||||
|
||||
inline bool
|
||||
IsBinaryVectorDataType(DataType data_type) {
|
||||
return data_type == DataType::VECTOR_BINARY;
|
||||
|
||||
@ -33,7 +33,8 @@ namespace index {
|
||||
template <typename T>
|
||||
BitmapIndex<T>::BitmapIndex(
|
||||
const storage::FileManagerContext& file_manager_context)
|
||||
: is_built_(false) {
|
||||
: is_built_(false),
|
||||
schema_(file_manager_context.fieldDataMeta.field_schema) {
|
||||
if (file_manager_context.Valid()) {
|
||||
file_manager_ =
|
||||
std::make_shared<storage::MemFileManagerImpl>(file_manager_context);
|
||||
@ -45,7 +46,9 @@ template <typename T>
|
||||
BitmapIndex<T>::BitmapIndex(
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: is_built_(false), data_(), space_(space) {
|
||||
: is_built_(false),
|
||||
schema_(file_manager_context.fieldDataMeta.field_schema),
|
||||
space_(space) {
|
||||
if (file_manager_context.Valid()) {
|
||||
file_manager_ = std::make_shared<storage::MemFileManagerImpl>(
|
||||
file_manager_context, space);
|
||||
@ -67,27 +70,7 @@ BitmapIndex<T>::Build(const Config& config) {
|
||||
auto field_datas =
|
||||
file_manager_->CacheRawDataToMemory(insert_files.value());
|
||||
|
||||
int total_num_rows = 0;
|
||||
for (const auto& field_data : field_datas) {
|
||||
total_num_rows += field_data->get_num_rows();
|
||||
}
|
||||
if (total_num_rows == 0) {
|
||||
throw SegcoreError(DataIsEmpty,
|
||||
"scalar bitmap index can not build null values");
|
||||
}
|
||||
|
||||
total_num_rows_ = total_num_rows;
|
||||
|
||||
int64_t offset = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_row_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_row_num; ++i) {
|
||||
auto val = reinterpret_cast<const T*>(data->RawValue(i));
|
||||
data_[*val].add(offset);
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
is_built_ = true;
|
||||
BuildWithFieldData(field_datas);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -144,6 +127,21 @@ BitmapIndex<T>::BuildV2(const Config& config) {
|
||||
BuildWithFieldData(field_datas);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildPrimitiveField(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
int64_t offset = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_row_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_row_num; ++i) {
|
||||
auto val = reinterpret_cast<const T*>(data->RawValue(i));
|
||||
data_[*val].add(offset);
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildWithFieldData(
|
||||
@ -158,17 +156,46 @@ BitmapIndex<T>::BuildWithFieldData(
|
||||
}
|
||||
total_num_rows_ = total_num_rows;
|
||||
|
||||
switch (schema_.data_type()) {
|
||||
case proto::schema::DataType::Bool:
|
||||
case proto::schema::DataType::Int8:
|
||||
case proto::schema::DataType::Int16:
|
||||
case proto::schema::DataType::Int32:
|
||||
case proto::schema::DataType::Int64:
|
||||
case proto::schema::DataType::Float:
|
||||
case proto::schema::DataType::Double:
|
||||
case proto::schema::DataType::String:
|
||||
case proto::schema::DataType::VarChar:
|
||||
BuildPrimitiveField(field_datas);
|
||||
break;
|
||||
case proto::schema::DataType::Array:
|
||||
BuildArrayField(field_datas);
|
||||
break;
|
||||
default:
|
||||
PanicInfo(
|
||||
DataTypeInvalid,
|
||||
fmt::format("Invalid data type: {} for build bitmap index",
|
||||
proto::schema::DataType_Name(schema_.data_type())));
|
||||
}
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
BitmapIndex<T>::BuildArrayField(const std::vector<FieldDataPtr>& field_datas) {
|
||||
int64_t offset = 0;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_row_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_row_num; ++i) {
|
||||
auto val = reinterpret_cast<const T*>(data->RawValue(i));
|
||||
data_[*val].add(offset);
|
||||
auto array =
|
||||
reinterpret_cast<const milvus::Array*>(data->RawValue(i));
|
||||
for (size_t j = 0; j < array->length(); ++j) {
|
||||
auto val = array->template get_data<T>(j);
|
||||
data_[val].add(offset);
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -877,4 +904,4 @@ template class BitmapIndex<double>;
|
||||
template class BitmapIndex<std::string>;
|
||||
|
||||
} // namespace index
|
||||
} // namespace milvus
|
||||
} // namespace milvus
|
||||
|
||||
@ -50,17 +50,6 @@ class BitmapIndex : public ScalarIndex<T> {
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
|
||||
explicit BitmapIndex(
|
||||
const std::shared_ptr<storage::MemFileManagerImpl>& file_manager)
|
||||
: file_manager_(file_manager) {
|
||||
}
|
||||
|
||||
explicit BitmapIndex(
|
||||
const std::shared_ptr<storage::MemFileManagerImpl>& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: file_manager_(file_manager), space_(space) {
|
||||
}
|
||||
|
||||
~BitmapIndex() override = default;
|
||||
|
||||
BinarySet
|
||||
@ -117,6 +106,7 @@ class BitmapIndex : public ScalarIndex<T> {
|
||||
|
||||
BinarySet
|
||||
Upload(const Config& config = {}) override;
|
||||
|
||||
BinarySet
|
||||
UploadV2(const Config& config = {}) override;
|
||||
|
||||
@ -125,6 +115,11 @@ class BitmapIndex : public ScalarIndex<T> {
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
public:
|
||||
int64_t
|
||||
Cardinality() {
|
||||
if (build_mode_ == BitmapIndexBuildMode::ROARING) {
|
||||
@ -134,11 +129,13 @@ class BitmapIndex : public ScalarIndex<T> {
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
LoadWithoutAssemble(const BinarySet& binary_set,
|
||||
const Config& config) override;
|
||||
|
||||
private:
|
||||
void
|
||||
BuildPrimitiveField(const std::vector<FieldDataPtr>& datas);
|
||||
|
||||
void
|
||||
BuildArrayField(const std::vector<FieldDataPtr>& datas);
|
||||
|
||||
size_t
|
||||
GetIndexDataSize();
|
||||
|
||||
@ -188,6 +185,7 @@ class BitmapIndex : public ScalarIndex<T> {
|
||||
std::map<T, roaring::Roaring> data_;
|
||||
std::map<T, TargetBitmap> bitsets_;
|
||||
size_t total_num_rows_{0};
|
||||
proto::schema::FieldSchema schema_;
|
||||
std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
|
||||
std::shared_ptr<milvus_storage::Space> space_;
|
||||
};
|
||||
|
||||
@ -32,12 +32,14 @@ template <typename T>
|
||||
HybridScalarIndex<T>::HybridScalarIndex(
|
||||
const storage::FileManagerContext& file_manager_context)
|
||||
: is_built_(false),
|
||||
bitmap_index_cardinality_limit_(DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND) {
|
||||
bitmap_index_cardinality_limit_(DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND),
|
||||
file_manager_context_(file_manager_context) {
|
||||
if (file_manager_context.Valid()) {
|
||||
file_manager_ =
|
||||
mem_file_manager_ =
|
||||
std::make_shared<storage::MemFileManagerImpl>(file_manager_context);
|
||||
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
||||
AssertInfo(mem_file_manager_ != nullptr, "create file manager failed!");
|
||||
}
|
||||
field_type_ = file_manager_context.fieldDataMeta.field_schema.data_type();
|
||||
internal_index_type_ = InternalIndexType::NONE;
|
||||
}
|
||||
|
||||
@ -47,12 +49,14 @@ HybridScalarIndex<T>::HybridScalarIndex(
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: is_built_(false),
|
||||
bitmap_index_cardinality_limit_(DEFAULT_BITMAP_INDEX_CARDINALITY_BOUND),
|
||||
file_manager_context_(file_manager_context),
|
||||
space_(space) {
|
||||
if (file_manager_context.Valid()) {
|
||||
file_manager_ = std::make_shared<storage::MemFileManagerImpl>(
|
||||
mem_file_manager_ = std::make_shared<storage::MemFileManagerImpl>(
|
||||
file_manager_context, space);
|
||||
AssertInfo(file_manager_ != nullptr, "create file manager failed!");
|
||||
AssertInfo(mem_file_manager_ != nullptr, "create file manager failed!");
|
||||
}
|
||||
field_type_ = file_manager_context.fieldDataMeta.field_schema.data_type();
|
||||
internal_index_type_ = InternalIndexType::NONE;
|
||||
}
|
||||
|
||||
@ -96,7 +100,7 @@ HybridScalarIndex<std::string>::SelectIndexBuildType(
|
||||
|
||||
template <typename T>
|
||||
InternalIndexType
|
||||
HybridScalarIndex<T>::SelectIndexBuildType(
|
||||
HybridScalarIndex<T>::SelectBuildTypeForPrimitiveType(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
std::set<T> distinct_vals;
|
||||
for (const auto& data : field_datas) {
|
||||
@ -121,7 +125,7 @@ HybridScalarIndex<T>::SelectIndexBuildType(
|
||||
|
||||
template <>
|
||||
InternalIndexType
|
||||
HybridScalarIndex<std::string>::SelectIndexBuildType(
|
||||
HybridScalarIndex<std::string>::SelectBuildTypeForPrimitiveType(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
std::set<std::string> distinct_vals;
|
||||
for (const auto& data : field_datas) {
|
||||
@ -144,6 +148,52 @@ HybridScalarIndex<std::string>::SelectIndexBuildType(
|
||||
return internal_index_type_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
InternalIndexType
|
||||
HybridScalarIndex<T>::SelectBuildTypeForArrayType(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
std::set<T> distinct_vals;
|
||||
for (const auto& data : field_datas) {
|
||||
auto slice_row_num = data->get_num_rows();
|
||||
for (size_t i = 0; i < slice_row_num; ++i) {
|
||||
auto array =
|
||||
reinterpret_cast<const milvus::Array*>(data->RawValue(i));
|
||||
for (size_t j = 0; j < array->length(); ++j) {
|
||||
auto val = array->template get_data<T>(j);
|
||||
distinct_vals.insert(val);
|
||||
|
||||
// Limit the bitmap index cardinality because of memory usage
|
||||
if (distinct_vals.size() > bitmap_index_cardinality_limit_) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Decide whether to select bitmap index or inverted index
|
||||
if (distinct_vals.size() >= bitmap_index_cardinality_limit_) {
|
||||
internal_index_type_ = InternalIndexType::INVERTED;
|
||||
} else {
|
||||
internal_index_type_ = InternalIndexType::BITMAP;
|
||||
}
|
||||
return internal_index_type_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
InternalIndexType
|
||||
HybridScalarIndex<T>::SelectIndexBuildType(
|
||||
const std::vector<FieldDataPtr>& field_datas) {
|
||||
std::set<T> distinct_vals;
|
||||
if (IsPrimitiveType(field_type_)) {
|
||||
return SelectBuildTypeForPrimitiveType(field_datas);
|
||||
} else if (IsArrayType(field_type_)) {
|
||||
return SelectBuildTypeForArrayType(field_datas);
|
||||
} else {
|
||||
PanicInfo(Unsupported,
|
||||
fmt::format("unsupported build index for type {}",
|
||||
DataType_Name(field_type_)));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::shared_ptr<ScalarIndex<T>>
|
||||
HybridScalarIndex<T>::GetInternalIndex() {
|
||||
@ -151,9 +201,14 @@ HybridScalarIndex<T>::GetInternalIndex() {
|
||||
return internal_index_;
|
||||
}
|
||||
if (internal_index_type_ == InternalIndexType::BITMAP) {
|
||||
internal_index_ = std::make_shared<BitmapIndex<T>>(file_manager_);
|
||||
internal_index_ =
|
||||
std::make_shared<BitmapIndex<T>>(file_manager_context_);
|
||||
} else if (internal_index_type_ == InternalIndexType::STLSORT) {
|
||||
internal_index_ = std::make_shared<ScalarIndexSort<T>>(file_manager_);
|
||||
internal_index_ =
|
||||
std::make_shared<ScalarIndexSort<T>>(file_manager_context_);
|
||||
} else if (internal_index_type_ == InternalIndexType::INVERTED) {
|
||||
internal_index_ =
|
||||
std::make_shared<InvertedIndexTantivy<T>>(file_manager_context_);
|
||||
} else {
|
||||
PanicInfo(UnexpectedError,
|
||||
"unknown index type when get internal index");
|
||||
@ -170,9 +225,13 @@ HybridScalarIndex<std::string>::GetInternalIndex() {
|
||||
|
||||
if (internal_index_type_ == InternalIndexType::BITMAP) {
|
||||
internal_index_ =
|
||||
std::make_shared<BitmapIndex<std::string>>(file_manager_);
|
||||
std::make_shared<BitmapIndex<std::string>>(file_manager_context_);
|
||||
} else if (internal_index_type_ == InternalIndexType::MARISA) {
|
||||
internal_index_ = std::make_shared<StringIndexMarisa>(file_manager_);
|
||||
internal_index_ =
|
||||
std::make_shared<StringIndexMarisa>(file_manager_context_);
|
||||
} else if (internal_index_type_ == InternalIndexType::INVERTED) {
|
||||
internal_index_ = std::make_shared<InvertedIndexTantivy<std::string>>(
|
||||
file_manager_context_);
|
||||
} else {
|
||||
PanicInfo(UnexpectedError,
|
||||
"unknown index type when get internal index");
|
||||
@ -206,7 +265,7 @@ HybridScalarIndex<T>::Build(const Config& config) {
|
||||
"insert file paths is empty when build index");
|
||||
|
||||
auto field_datas =
|
||||
file_manager_->CacheRawDataToMemory(insert_files.value());
|
||||
mem_file_manager_->CacheRawDataToMemory(insert_files.value());
|
||||
|
||||
SelectIndexBuildType(field_datas);
|
||||
BuildInternal(field_datas);
|
||||
@ -224,7 +283,7 @@ HybridScalarIndex<T>::BuildV2(const Config& config) {
|
||||
LOG_INFO("config bitmap cardinality limit to {}",
|
||||
bitmap_index_cardinality_limit_);
|
||||
|
||||
auto field_name = file_manager_->GetIndexMeta().field_name;
|
||||
auto field_name = mem_file_manager_->GetIndexMeta().field_name;
|
||||
auto reader = space_->ScanData();
|
||||
std::vector<FieldDataPtr> field_datas;
|
||||
for (auto rec = reader->Next(); rec != nullptr; rec = reader->Next()) {
|
||||
@ -262,32 +321,51 @@ HybridScalarIndex<T>::Serialize(const Config& config) {
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
HybridScalarIndex<T>::Upload(const Config& config) {
|
||||
auto binary_set = Serialize(config);
|
||||
file_manager_->AddFile(binary_set);
|
||||
HybridScalarIndex<T>::SerializeIndexType() {
|
||||
// Add index type info to storage so the index can be reconstructed later
|
||||
BinarySet index_binary_set;
|
||||
std::shared_ptr<uint8_t[]> index_type_buf(new uint8_t[sizeof(uint8_t)]);
|
||||
index_type_buf[0] = static_cast<uint8_t>(internal_index_type_);
|
||||
index_binary_set.Append(index::INDEX_TYPE, index_type_buf, sizeof(uint8_t));
|
||||
mem_file_manager_->AddFile(index_binary_set);
|
||||
|
||||
auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize();
|
||||
BinarySet ret;
|
||||
auto remote_paths_to_size = mem_file_manager_->GetRemotePathsToFileSize();
|
||||
BinarySet ret_set;
|
||||
Assert(remote_paths_to_size.size() == 1);
|
||||
for (auto& file : remote_paths_to_size) {
|
||||
ret.Append(file.first, nullptr, file.second);
|
||||
ret_set.Append(file.first, nullptr, file.second);
|
||||
}
|
||||
return ret_set;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
HybridScalarIndex<T>::Upload(const Config& config) {
|
||||
auto internal_index = GetInternalIndex();
|
||||
auto index_ret = internal_index->Upload(config);
|
||||
|
||||
auto index_type_ret = SerializeIndexType();
|
||||
|
||||
for (auto& [key, value] : index_type_ret.binary_map_) {
|
||||
index_ret.Append(key, value);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return index_ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
BinarySet
|
||||
HybridScalarIndex<T>::UploadV2(const Config& config) {
|
||||
auto binary_set = Serialize(config);
|
||||
file_manager_->AddFileV2(binary_set);
|
||||
auto internal_index = GetInternalIndex();
|
||||
auto index_ret = internal_index->Upload(config);
|
||||
|
||||
auto remote_paths_to_size = file_manager_->GetRemotePathsToFileSize();
|
||||
BinarySet ret;
|
||||
for (auto& file : remote_paths_to_size) {
|
||||
ret.Append(file.first, nullptr, file.second);
|
||||
auto index_type_ret = SerializeIndexType();
|
||||
|
||||
for (auto& [key, value] : index_type_ret.binary_map_) {
|
||||
index_ret.Append(key, value);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return index_ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -301,64 +379,32 @@ HybridScalarIndex<T>::DeserializeIndexType(const BinarySet& binary_set) {
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
HybridScalarIndex<T>::LoadInternal(const BinarySet& binary_set,
|
||||
const Config& config) {
|
||||
auto index = GetInternalIndex();
|
||||
index->LoadWithoutAssemble(binary_set, config);
|
||||
HybridScalarIndex<T>::LoadV2(const Config& config) {
|
||||
PanicInfo(Unsupported, "HybridScalarIndex LoadV2 not implemented");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::string
|
||||
HybridScalarIndex<T>::GetRemoteIndexTypeFile(
|
||||
const std::vector<std::string>& files) {
|
||||
std::string ret;
|
||||
for (auto& file : files) {
|
||||
auto file_name = file.substr(file.find_last_of('/') + 1);
|
||||
if (file_name == index::INDEX_TYPE) {
|
||||
ret = file;
|
||||
}
|
||||
}
|
||||
AssertInfo(!ret.empty(), "index type file not found for hybrid index");
|
||||
return ret;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
HybridScalarIndex<T>::Load(const BinarySet& binary_set, const Config& config) {
|
||||
milvus::Assemble(const_cast<BinarySet&>(binary_set));
|
||||
DeserializeIndexType(binary_set);
|
||||
|
||||
LoadInternal(binary_set, config);
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
HybridScalarIndex<T>::LoadV2(const Config& config) {
|
||||
auto blobs = space_->StatisticsBlobs();
|
||||
std::vector<std::string> index_files;
|
||||
auto prefix = file_manager_->GetRemoteIndexObjectPrefixV2();
|
||||
for (auto& b : blobs) {
|
||||
if (b.name.rfind(prefix, 0) == 0) {
|
||||
index_files.push_back(b.name);
|
||||
}
|
||||
}
|
||||
std::map<std::string, FieldDataPtr> index_datas{};
|
||||
for (auto& file_name : index_files) {
|
||||
auto res = space_->GetBlobByteSize(file_name);
|
||||
if (!res.ok()) {
|
||||
PanicInfo(S3Error, "unable to read index blob");
|
||||
}
|
||||
auto index_blob_data =
|
||||
std::shared_ptr<uint8_t[]>(new uint8_t[res.value()]);
|
||||
auto status = space_->ReadBlob(file_name, index_blob_data.get());
|
||||
if (!status.ok()) {
|
||||
PanicInfo(S3Error, "unable to read index blob");
|
||||
}
|
||||
auto raw_index_blob =
|
||||
storage::DeserializeFileData(index_blob_data, res.value());
|
||||
auto key = file_name.substr(file_name.find_last_of('/') + 1);
|
||||
index_datas[key] = raw_index_blob->GetFieldData();
|
||||
}
|
||||
AssembleIndexDatas(index_datas);
|
||||
|
||||
BinarySet binary_set;
|
||||
for (auto& [key, data] : index_datas) {
|
||||
auto size = data->Size();
|
||||
auto deleter = [&](uint8_t*) {}; // avoid repeated deconstruction
|
||||
auto buf = std::shared_ptr<uint8_t[]>(
|
||||
(uint8_t*)const_cast<void*>(data->Data()), deleter);
|
||||
binary_set.Append(key, buf, size);
|
||||
}
|
||||
|
||||
DeserializeIndexType(binary_set);
|
||||
|
||||
LoadInternal(binary_set, config);
|
||||
auto index = GetInternalIndex();
|
||||
index->Load(binary_set, config);
|
||||
|
||||
is_built_ = true;
|
||||
}
|
||||
@ -371,7 +417,11 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
|
||||
GetValueFromConfig<std::vector<std::string>>(config, "index_files");
|
||||
AssertInfo(index_files.has_value(),
|
||||
"index file paths is empty when load bitmap index");
|
||||
auto index_datas = file_manager_->LoadIndexToMemory(index_files.value());
|
||||
|
||||
auto index_type_file = GetRemoteIndexTypeFile(index_files.value());
|
||||
|
||||
auto index_datas = mem_file_manager_->LoadIndexToMemory(
|
||||
std::vector<std::string>{index_type_file});
|
||||
AssembleIndexDatas(index_datas);
|
||||
BinarySet binary_set;
|
||||
for (auto& [key, data] : index_datas) {
|
||||
@ -384,7 +434,8 @@ HybridScalarIndex<T>::Load(milvus::tracer::TraceContext ctx,
|
||||
|
||||
DeserializeIndexType(binary_set);
|
||||
|
||||
LoadInternal(binary_set, config);
|
||||
auto index = GetInternalIndex();
|
||||
index->Load(ctx, config);
|
||||
|
||||
is_built_ = true;
|
||||
}
|
||||
|
||||
@ -24,6 +24,7 @@
|
||||
#include "index/BitmapIndex.h"
|
||||
#include "index/ScalarIndexSort.h"
|
||||
#include "index/StringIndexMarisa.h"
|
||||
#include "index/InvertedIndexTantivy.h"
|
||||
#include "storage/FileManager.h"
|
||||
#include "storage/DiskFileManagerImpl.h"
|
||||
#include "storage/MemFileManagerImpl.h"
|
||||
@ -37,6 +38,7 @@ enum class InternalIndexType {
|
||||
BITMAP,
|
||||
STLSORT,
|
||||
MARISA,
|
||||
INVERTED,
|
||||
};
|
||||
|
||||
/*
|
||||
@ -125,6 +127,9 @@ class HybridScalarIndex : public ScalarIndex<T> {
|
||||
|
||||
const bool
|
||||
HasRawData() const override {
|
||||
if (field_type_ == proto::schema::DataType::Array) {
|
||||
return false;
|
||||
}
|
||||
return internal_index_->HasRawData();
|
||||
}
|
||||
|
||||
@ -135,30 +140,42 @@ class HybridScalarIndex : public ScalarIndex<T> {
|
||||
UploadV2(const Config& config = {}) override;
|
||||
|
||||
private:
|
||||
InternalIndexType
|
||||
SelectBuildTypeForPrimitiveType(
|
||||
const std::vector<FieldDataPtr>& field_datas);
|
||||
|
||||
InternalIndexType
|
||||
SelectBuildTypeForArrayType(const std::vector<FieldDataPtr>& field_datas);
|
||||
|
||||
InternalIndexType
|
||||
SelectIndexBuildType(const std::vector<FieldDataPtr>& field_datas);
|
||||
|
||||
InternalIndexType
|
||||
SelectIndexBuildType(size_t n, const T* values);
|
||||
|
||||
BinarySet
|
||||
SerializeIndexType();
|
||||
|
||||
void
|
||||
DeserializeIndexType(const BinarySet& binary_set);
|
||||
|
||||
void
|
||||
BuildInternal(const std::vector<FieldDataPtr>& field_datas);
|
||||
|
||||
void
|
||||
LoadInternal(const BinarySet& binary_set, const Config& config);
|
||||
|
||||
std::shared_ptr<ScalarIndex<T>>
|
||||
GetInternalIndex();
|
||||
|
||||
std::string
|
||||
GetRemoteIndexTypeFile(const std::vector<std::string>& files);
|
||||
|
||||
public:
|
||||
bool is_built_{false};
|
||||
int32_t bitmap_index_cardinality_limit_;
|
||||
proto::schema::DataType field_type_;
|
||||
InternalIndexType internal_index_type_;
|
||||
std::shared_ptr<ScalarIndex<T>> internal_index_{nullptr};
|
||||
std::shared_ptr<storage::MemFileManagerImpl> file_manager_{nullptr};
|
||||
storage::FileManagerContext file_manager_context_;
|
||||
std::shared_ptr<storage::MemFileManagerImpl> mem_file_manager_{nullptr};
|
||||
std::shared_ptr<milvus_storage::Space> space_{nullptr};
|
||||
};
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ namespace milvus::index {
|
||||
|
||||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
IndexFactory::CreateScalarIndex(
|
||||
IndexFactory::CreatePrimitiveScalarIndex(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
if (index_type == INVERTED_INDEX_TYPE) {
|
||||
@ -54,7 +54,7 @@ IndexFactory::CreateScalarIndex(
|
||||
|
||||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex<std::string>(
|
||||
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
#if defined(__linux__) || defined(__APPLE__)
|
||||
@ -74,7 +74,7 @@ IndexFactory::CreateScalarIndex<std::string>(
|
||||
|
||||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
IndexFactory::CreateScalarIndex(
|
||||
IndexFactory::CreatePrimitiveScalarIndex(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space) {
|
||||
@ -91,7 +91,7 @@ IndexFactory::CreateScalarIndex(
|
||||
|
||||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex<std::string>(
|
||||
IndexFactory::CreatePrimitiveScalarIndex<std::string>(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space) {
|
||||
@ -142,25 +142,32 @@ IndexFactory::CreatePrimitiveScalarIndex(
|
||||
switch (data_type) {
|
||||
// create scalar index
|
||||
case DataType::BOOL:
|
||||
return CreateScalarIndex<bool>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<bool>(index_type,
|
||||
file_manager_context);
|
||||
case DataType::INT8:
|
||||
return CreateScalarIndex<int8_t>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<int8_t>(index_type,
|
||||
file_manager_context);
|
||||
case DataType::INT16:
|
||||
return CreateScalarIndex<int16_t>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<int16_t>(index_type,
|
||||
file_manager_context);
|
||||
case DataType::INT32:
|
||||
return CreateScalarIndex<int32_t>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<int32_t>(index_type,
|
||||
file_manager_context);
|
||||
case DataType::INT64:
|
||||
return CreateScalarIndex<int64_t>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<int64_t>(index_type,
|
||||
file_manager_context);
|
||||
case DataType::FLOAT:
|
||||
return CreateScalarIndex<float>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<float>(index_type,
|
||||
file_manager_context);
|
||||
case DataType::DOUBLE:
|
||||
return CreateScalarIndex<double>(index_type, file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<double>(index_type,
|
||||
file_manager_context);
|
||||
|
||||
// create string index
|
||||
case DataType::STRING:
|
||||
case DataType::VARCHAR:
|
||||
return CreateScalarIndex<std::string>(index_type,
|
||||
file_manager_context);
|
||||
return CreatePrimitiveScalarIndex<std::string>(
|
||||
index_type, file_manager_context);
|
||||
default:
|
||||
throw SegcoreError(
|
||||
DataTypeInvalid,
|
||||
@ -168,21 +175,57 @@ IndexFactory::CreatePrimitiveScalarIndex(
|
||||
}
|
||||
}
|
||||
|
||||
// Creates a scalar index for a composite (array) field. For the BITMAP and
// INVERTED index types the index is built over the field's element type,
// taken from the field schema in file_manager_context, by delegating to
// CreatePrimitiveScalarIndex. Any other index type is reported through
// PanicInfo(Unsupported, ...) instead of falling off the end of the function
// without a return value.
IndexBasePtr
IndexFactory::CreateCompositeScalarIndex(
    IndexType index_type,
    const storage::FileManagerContext& file_manager_context) {
    if (index_type == BITMAP_INDEX_TYPE || index_type == INVERTED_INDEX_TYPE) {
        auto element_type = static_cast<DataType>(
            file_manager_context.fieldDataMeta.field_schema.element_type());
        return CreatePrimitiveScalarIndex(
            element_type, index_type, file_manager_context);
    }
    PanicInfo(Unsupported,
              "index type: {} for composite scalar not supported now",
              index_type);
}
|
||||
|
||||
// Placeholder: always reports Unsupported through PanicInfo. Neither
// parameter is read.
IndexBasePtr
IndexFactory::CreateComplexScalarIndex(
    IndexType index_type,
    const storage::FileManagerContext& file_manager_context) {
    // No complex index exists yet, so every call is rejected.
    PanicInfo(Unsupported, "Complex index not supported now");
}
|
||||
|
||||
IndexBasePtr
|
||||
IndexFactory::CreateScalarIndex(
|
||||
const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context) {
|
||||
switch (create_index_info.field_type) {
|
||||
case DataType::ARRAY:
|
||||
auto data_type = create_index_info.field_type;
|
||||
switch (data_type) {
|
||||
case DataType::BOOL:
|
||||
case DataType::INT8:
|
||||
case DataType::INT16:
|
||||
case DataType::INT32:
|
||||
case DataType::INT64:
|
||||
case DataType::FLOAT:
|
||||
case DataType::DOUBLE:
|
||||
case DataType::VARCHAR:
|
||||
case DataType::STRING:
|
||||
return CreatePrimitiveScalarIndex(
|
||||
static_cast<DataType>(
|
||||
file_manager_context.fieldDataMeta.schema.element_type()),
|
||||
create_index_info.index_type,
|
||||
file_manager_context);
|
||||
default:
|
||||
return CreatePrimitiveScalarIndex(create_index_info.field_type,
|
||||
create_index_info.index_type,
|
||||
data_type, create_index_info.index_type, file_manager_context);
|
||||
case DataType::ARRAY: {
|
||||
return CreateCompositeScalarIndex(create_index_info.index_type,
|
||||
file_manager_context);
|
||||
}
|
||||
case DataType::JSON: {
|
||||
return CreateComplexScalarIndex(create_index_info.index_type,
|
||||
file_manager_context);
|
||||
}
|
||||
default:
|
||||
PanicInfo(DataTypeInvalid, "Invalid data type:{}", data_type);
|
||||
}
|
||||
}
|
||||
|
||||
@ -251,43 +294,6 @@ IndexFactory::CreateVectorIndex(
|
||||
}
|
||||
}
|
||||
|
||||
IndexBasePtr
|
||||
IndexFactory::CreateScalarIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space) {
|
||||
auto data_type = create_index_info.field_type;
|
||||
auto index_type = create_index_info.index_type;
|
||||
|
||||
switch (data_type) {
|
||||
// create scalar index
|
||||
case DataType::BOOL:
|
||||
return CreateScalarIndex<bool>(index_type, file_manager, space);
|
||||
case DataType::INT8:
|
||||
return CreateScalarIndex<int8_t>(index_type, file_manager, space);
|
||||
case DataType::INT16:
|
||||
return CreateScalarIndex<int16_t>(index_type, file_manager, space);
|
||||
case DataType::INT32:
|
||||
return CreateScalarIndex<int32_t>(index_type, file_manager, space);
|
||||
case DataType::INT64:
|
||||
return CreateScalarIndex<int64_t>(index_type, file_manager, space);
|
||||
case DataType::FLOAT:
|
||||
return CreateScalarIndex<float>(index_type, file_manager, space);
|
||||
case DataType::DOUBLE:
|
||||
return CreateScalarIndex<double>(index_type, file_manager, space);
|
||||
|
||||
// create string index
|
||||
case DataType::STRING:
|
||||
case DataType::VARCHAR:
|
||||
return CreateScalarIndex<std::string>(
|
||||
index_type, file_manager, space);
|
||||
default:
|
||||
throw SegcoreError(
|
||||
DataTypeInvalid,
|
||||
fmt::format("invalid data type to build mem index: {}",
|
||||
data_type));
|
||||
}
|
||||
}
|
||||
|
||||
IndexBasePtr
|
||||
IndexFactory::CreateVectorIndex(
|
||||
const CreateIndexInfo& create_index_info,
|
||||
|
||||
@ -65,6 +65,7 @@ class IndexFactory {
|
||||
CreateVectorIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context);
|
||||
|
||||
// For base types like int, float, double, string, etc
|
||||
IndexBasePtr
|
||||
CreatePrimitiveScalarIndex(
|
||||
DataType data_type,
|
||||
@ -72,6 +73,20 @@ class IndexFactory {
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
// For types like array, struct, union, etc
|
||||
IndexBasePtr
|
||||
CreateCompositeScalarIndex(
|
||||
IndexType index_type,
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
// For types like Json, XML, etc
|
||||
IndexBasePtr
|
||||
CreateComplexScalarIndex(
|
||||
IndexType index_type,
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
storage::FileManagerContext());
|
||||
|
||||
IndexBasePtr
|
||||
CreateScalarIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context =
|
||||
@ -85,7 +100,10 @@ class IndexFactory {
|
||||
IndexBasePtr
|
||||
CreateScalarIndex(const CreateIndexInfo& create_index_info,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
std::shared_ptr<milvus_storage::Space> space) {
|
||||
PanicInfo(ErrorCode::Unsupported,
|
||||
"CreateScalarIndexV2 not implemented");
|
||||
}
|
||||
|
||||
// IndexBasePtr
|
||||
// CreateIndex(DataType dtype, const IndexType& index_type);
|
||||
@ -94,28 +112,15 @@ class IndexFactory {
|
||||
|
||||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
CreateScalarIndex(const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager =
|
||||
storage::FileManagerContext());
|
||||
CreatePrimitiveScalarIndex(const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager =
|
||||
storage::FileManagerContext());
|
||||
|
||||
template <typename T>
|
||||
ScalarIndexPtr<T>
|
||||
CreateScalarIndex(const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
CreatePrimitiveScalarIndex(const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
};
|
||||
|
||||
// template <>
|
||||
// ScalarIndexPtr<std::string>
|
||||
// IndexFactory::CreateScalarIndex<std::string>(
|
||||
// const IndexType& index_type,
|
||||
// const storage::FileManagerContext& file_manager_context,
|
||||
// DataType d_type);
|
||||
|
||||
template <>
|
||||
ScalarIndexPtr<std::string>
|
||||
IndexFactory::CreateScalarIndex<std::string>(
|
||||
const IndexType& index_type,
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
} // namespace milvus::index
|
||||
|
||||
@ -66,7 +66,7 @@ template <typename T>
|
||||
InvertedIndexTantivy<T>::InvertedIndexTantivy(
|
||||
const storage::FileManagerContext& ctx,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: space_(space), schema_(ctx.fieldDataMeta.schema) {
|
||||
: space_(space), schema_(ctx.fieldDataMeta.field_schema) {
|
||||
mem_file_manager_ = std::make_shared<MemFileManager>(ctx, ctx.space_);
|
||||
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx, ctx.space_);
|
||||
auto field =
|
||||
@ -259,8 +259,7 @@ InvertedIndexTantivy<T>::InApplyCallback(
|
||||
template <typename T>
|
||||
const TargetBitmap
|
||||
InvertedIndexTantivy<T>::NotIn(size_t n, const T* values) {
|
||||
TargetBitmap bitset(Count());
|
||||
bitset.set();
|
||||
TargetBitmap bitset(Count(), true);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
auto array = wrapper_->term_query(values[i]);
|
||||
apply_hits(bitset, array, false);
|
||||
|
||||
@ -41,17 +41,6 @@ class ScalarIndexSort : public ScalarIndex<T> {
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
|
||||
explicit ScalarIndexSort(
|
||||
const std::shared_ptr<storage::MemFileManagerImpl>& file_manager)
|
||||
: file_manager_(file_manager) {
|
||||
}
|
||||
|
||||
explicit ScalarIndexSort(
|
||||
const std::shared_ptr<storage::MemFileManagerImpl>& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: file_manager_(file_manager), space_(space) {
|
||||
}
|
||||
|
||||
BinarySet
|
||||
Serialize(const Config& config) override;
|
||||
|
||||
|
||||
@ -37,17 +37,6 @@ class StringIndexMarisa : public StringIndex {
|
||||
const storage::FileManagerContext& file_manager_context,
|
||||
std::shared_ptr<milvus_storage::Space> space);
|
||||
|
||||
explicit StringIndexMarisa(
|
||||
const std::shared_ptr<storage::MemFileManagerImpl>& file_manager)
|
||||
: file_manager_(file_manager) {
|
||||
}
|
||||
|
||||
explicit StringIndexMarisa(
|
||||
const std::shared_ptr<storage::MemFileManagerImpl>& file_manager,
|
||||
std::shared_ptr<milvus_storage::Space> space)
|
||||
: file_manager_(file_manager), space_(space) {
|
||||
}
|
||||
|
||||
int64_t
|
||||
Size() override;
|
||||
|
||||
|
||||
@ -274,7 +274,8 @@ CreateIndexV2(CIndex* res_index,
|
||||
build_index_info->collectionid(),
|
||||
build_index_info->partitionid(),
|
||||
build_index_info->segmentid(),
|
||||
build_index_info->field_schema().fieldid()};
|
||||
build_index_info->field_schema().fieldid(),
|
||||
build_index_info->field_schema()};
|
||||
milvus::storage::IndexMeta index_meta{
|
||||
build_index_info->segmentid(),
|
||||
build_index_info->field_schema().fieldid(),
|
||||
|
||||
@ -64,7 +64,7 @@ struct FieldDataMeta {
|
||||
int64_t partition_id;
|
||||
int64_t segment_id;
|
||||
int64_t field_id;
|
||||
proto::schema::FieldSchema schema;
|
||||
proto::schema::FieldSchema field_schema;
|
||||
};
|
||||
|
||||
enum CodecType {
|
||||
|
||||
@ -20,7 +20,6 @@ set(MILVUS_TEST_FILES
|
||||
test_bf.cpp
|
||||
test_bf_sparse.cpp
|
||||
test_binary.cpp
|
||||
test_bitmap.cpp
|
||||
test_bool_index.cpp
|
||||
test_common.cpp
|
||||
test_concurrent_vector.cpp
|
||||
@ -33,6 +32,7 @@ set(MILVUS_TEST_FILES
|
||||
test_growing_index.cpp
|
||||
test_indexing.cpp
|
||||
test_hybrid_index.cpp
|
||||
test_array_bitmap_index.cpp
|
||||
test_index_c_api.cpp
|
||||
test_index_wrapper.cpp
|
||||
test_init.cpp
|
||||
|
||||
330
internal/core/unittest/test_array_bitmap_index.cpp
Normal file
330
internal/core/unittest/test_array_bitmap_index.cpp
Normal file
@ -0,0 +1,330 @@
|
||||
// Copyright(C) 2019 - 2020 Zilliz.All rights reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <functional>
|
||||
#include <boost/filesystem.hpp>
|
||||
#include <unordered_set>
|
||||
#include <memory>
|
||||
|
||||
#include "common/Tracer.h"
|
||||
#include "index/BitmapIndex.h"
|
||||
#include "storage/Util.h"
|
||||
#include "storage/InsertData.h"
|
||||
#include "indexbuilder/IndexFactory.h"
|
||||
#include "index/IndexFactory.h"
|
||||
#include "test_utils/indexbuilder_test_utils.h"
|
||||
#include "index/Meta.h"
|
||||
#include "pb/schema.pb.h"
|
||||
|
||||
using namespace milvus::index;
|
||||
using namespace milvus::indexbuilder;
|
||||
using namespace milvus;
|
||||
using namespace milvus::index;
|
||||
|
||||
/// Generates `size` pseudo-random values, each `rand() % cardinality`
/// converted to T.
///
/// @param size         number of values to produce.
/// @param cardinality  modulus applied to every draw; must be non-zero,
///                     otherwise the modulo operation is undefined.
/// @return vector holding exactly `size` elements.
template <typename T>
static std::vector<T>
GenerateData(const size_t size, const size_t cardinality) {
    std::vector<T> result;
    // The final length is known up front: allocate once instead of growing.
    result.reserve(size);
    for (size_t i = 0; i < size; ++i) {
        result.push_back(rand() % cardinality);
    }
    return result;
}

/// bool specialization: every element is the parity of one rand() draw.
/// `cardinality` is ignored because a bool only has two values.
template <>
std::vector<bool>
GenerateData<bool>(const size_t size, const size_t cardinality) {
    std::vector<bool> result;
    result.reserve(size);
    for (size_t i = 0; i < size; ++i) {
        result.push_back(rand() % 2 == 0);
    }
    return result;
}

/// std::string specialization: every element is the decimal text of
/// `rand() % cardinality`, so `cardinality` must be non-zero.
template <>
std::vector<std::string>
GenerateData<std::string>(const size_t size, const size_t cardinality) {
    std::vector<std::string> result;
    result.reserve(size);
    for (size_t i = 0; i < size; ++i) {
        result.push_back(std::to_string(rand() % cardinality));
    }
    return result;
}
|
||||
|
||||
std::vector<milvus::Array>
|
||||
GenerateArrayData(proto::schema::DataType element_type,
|
||||
int cardinality,
|
||||
int size,
|
||||
int array_len) {
|
||||
std::vector<ScalarArray> data(size);
|
||||
switch (element_type) {
|
||||
case proto::schema::DataType::Bool: {
|
||||
for (int i = 0; i < size; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_bool_data()->add_data(
|
||||
static_cast<bool>(random()));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case proto::schema::DataType::Int8:
|
||||
case proto::schema::DataType::Int16:
|
||||
case proto::schema::DataType::Int32: {
|
||||
for (int i = 0; i < size; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_int_data()->add_data(
|
||||
static_cast<int>(random() % cardinality));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case proto::schema::DataType::Int64: {
|
||||
for (int i = 0; i < size; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_long_data()->add_data(
|
||||
static_cast<int64_t>(random() % cardinality));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case proto::schema::DataType::String: {
|
||||
for (int i = 0; i < size; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_string_data()->add_data(
|
||||
std::to_string(random() % cardinality));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case proto::schema::DataType::Float: {
|
||||
for (int i = 0; i < size; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_float_data()->add_data(
|
||||
static_cast<float>(random() % cardinality));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case proto::schema::DataType::Double: {
|
||||
for (int i = 0; i < size; i++) {
|
||||
milvus::proto::schema::ScalarField field_data;
|
||||
|
||||
for (int j = 0; j < array_len; j++) {
|
||||
field_data.mutable_double_data()->add_data(
|
||||
static_cast<double>(random() % cardinality));
|
||||
}
|
||||
data[i] = field_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
throw std::runtime_error("unsupported data type");
|
||||
}
|
||||
}
|
||||
std::vector<milvus::Array> res;
|
||||
for (int i = 0; i < size; i++) {
|
||||
res.push_back(milvus::Array(data[i]));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
class ArrayBitmapIndexTest : public testing::Test {
|
||||
protected:
|
||||
void
|
||||
Init(int64_t collection_id,
|
||||
int64_t partition_id,
|
||||
int64_t segment_id,
|
||||
int64_t field_id,
|
||||
int64_t index_build_id,
|
||||
int64_t index_version) {
|
||||
proto::schema::FieldSchema field_schema;
|
||||
field_schema.set_data_type(proto::schema::DataType::Array);
|
||||
proto::schema::DataType element_type;
|
||||
if constexpr (std::is_same_v<int8_t, T>) {
|
||||
element_type = proto::schema::DataType::Int8;
|
||||
} else if constexpr (std::is_same_v<int16_t, T>) {
|
||||
element_type = proto::schema::DataType::Int16;
|
||||
} else if constexpr (std::is_same_v<int32_t, T>) {
|
||||
element_type = proto::schema::DataType::Int32;
|
||||
} else if constexpr (std::is_same_v<int64_t, T>) {
|
||||
element_type = proto::schema::DataType::Int64;
|
||||
} else if constexpr (std::is_same_v<float, T>) {
|
||||
element_type = proto::schema::DataType::Float;
|
||||
} else if constexpr (std::is_same_v<double, T>) {
|
||||
element_type = proto::schema::DataType::Double;
|
||||
} else if constexpr (std::is_same_v<std::string, T>) {
|
||||
element_type = proto::schema::DataType::String;
|
||||
}
|
||||
field_schema.set_element_type(element_type);
|
||||
auto field_meta = storage::FieldDataMeta{
|
||||
collection_id, partition_id, segment_id, field_id, field_schema};
|
||||
auto index_meta = storage::IndexMeta{
|
||||
segment_id, field_id, index_build_id, index_version};
|
||||
|
||||
data_ = GenerateArrayData(element_type, cardinality_, nb_, 10);
|
||||
|
||||
auto field_data = storage::CreateFieldData(DataType::ARRAY);
|
||||
field_data->FillFieldData(data_.data(), data_.size());
|
||||
storage::InsertData insert_data(field_data);
|
||||
insert_data.SetFieldDataMeta(field_meta);
|
||||
insert_data.SetTimestamps(0, 100);
|
||||
|
||||
auto serialized_bytes = insert_data.Serialize(storage::Remote);
|
||||
|
||||
auto log_path = fmt::format("{}/{}/{}/{}/{}/{}",
|
||||
"test_array_bitmap",
|
||||
collection_id,
|
||||
partition_id,
|
||||
segment_id,
|
||||
field_id,
|
||||
0);
|
||||
chunk_manager_->Write(
|
||||
log_path, serialized_bytes.data(), serialized_bytes.size());
|
||||
|
||||
storage::FileManagerContext ctx(field_meta, index_meta, chunk_manager_);
|
||||
std::vector<std::string> index_files;
|
||||
|
||||
Config config;
|
||||
config["index_type"] = milvus::index::BITMAP_INDEX_TYPE;
|
||||
config["insert_files"] = std::vector<std::string>{log_path};
|
||||
config["bitmap_cardinality_limit"] = "1000";
|
||||
|
||||
auto build_index =
|
||||
indexbuilder::IndexFactory::GetInstance().CreateIndex(
|
||||
DataType::ARRAY, config, ctx);
|
||||
build_index->Build();
|
||||
|
||||
auto binary_set = build_index->Upload();
|
||||
for (const auto& [key, _] : binary_set.binary_map_) {
|
||||
index_files.push_back(key);
|
||||
}
|
||||
|
||||
index::CreateIndexInfo index_info{};
|
||||
index_info.index_type = milvus::index::BITMAP_INDEX_TYPE;
|
||||
index_info.field_type = DataType::ARRAY;
|
||||
|
||||
config["index_files"] = index_files;
|
||||
|
||||
index_ =
|
||||
index::IndexFactory::GetInstance().CreateIndex(index_info, ctx);
|
||||
index_->Load(milvus::tracer::TraceContext{}, config);
|
||||
}
|
||||
|
||||
void
|
||||
SetUp() override {
|
||||
nb_ = 10000;
|
||||
cardinality_ = 30;
|
||||
|
||||
// if constexpr (std::is_same_v<T, int8_t>) {
|
||||
// type_ = DataType::INT8;
|
||||
// } else if constexpr (std::is_same_v<T, int16_t>) {
|
||||
// type_ = DataType::INT16;
|
||||
// } else if constexpr (std::is_same_v<T, int32_t>) {
|
||||
// type_ = DataType::INT32;
|
||||
// } else if constexpr (std::is_same_v<T, int64_t>) {
|
||||
// type_ = DataType::INT64;
|
||||
// } else if constexpr (std::is_same_v<T, std::string>) {
|
||||
// type_ = DataType::VARCHAR;
|
||||
// }
|
||||
int64_t collection_id = 1;
|
||||
int64_t partition_id = 2;
|
||||
int64_t segment_id = 3;
|
||||
int64_t field_id = 101;
|
||||
int64_t index_build_id = 1000;
|
||||
int64_t index_version = 10000;
|
||||
std::string root_path = "/tmp/test-bitmap-index/";
|
||||
|
||||
storage::StorageConfig storage_config;
|
||||
storage_config.storage_type = "local";
|
||||
storage_config.root_path = root_path;
|
||||
chunk_manager_ = storage::CreateChunkManager(storage_config);
|
||||
|
||||
Init(collection_id,
|
||||
partition_id,
|
||||
segment_id,
|
||||
field_id,
|
||||
index_build_id,
|
||||
index_version);
|
||||
}
|
||||
|
||||
virtual ~ArrayBitmapIndexTest() override {
|
||||
boost::filesystem::remove_all(chunk_manager_->GetRootPath());
|
||||
}
|
||||
|
||||
public:
|
||||
void
|
||||
TestInFunc() {
|
||||
// boost::container::vector<T> test_data;
|
||||
// std::unordered_set<T> s;
|
||||
// size_t nq = 10;
|
||||
// for (size_t i = 0; i < nq; i++) {
|
||||
// test_data.push_back(data_[i]);
|
||||
// s.insert(data_[i]);
|
||||
// }
|
||||
// auto index_ptr = dynamic_cast<index::ScalarIndex<T>*>(index_.get());
|
||||
// auto bitset = index_ptr->In(test_data.size(), test_data.data());
|
||||
// for (size_t i = 0; i < bitset.size(); i++) {
|
||||
// ASSERT_EQ(bitset[i], s.find(data_[i]) != s.end());
|
||||
// }
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<storage::ChunkManager> chunk_manager_;
|
||||
|
||||
public:
|
||||
DataType type_;
|
||||
IndexBasePtr index_;
|
||||
size_t nb_;
|
||||
size_t cardinality_;
|
||||
std::vector<milvus::Array> data_;
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE_P(ArrayBitmapIndexTest);
|
||||
|
||||
// The reloaded index must report exactly as many rows as were generated.
TYPED_TEST_P(ArrayBitmapIndexTest, CountFuncTest) {
    EXPECT_EQ(this->index_->Count(), this->nb_);
}
|
||||
|
||||
// Placeholder for the In() query check. It is reported as skipped rather
// than passed, so the missing coverage stays visible in test output.
TYPED_TEST_P(ArrayBitmapIndexTest, INFuncTest) {
    GTEST_SKIP() << "In() check for array bitmap index is not implemented";
    // this->TestInFunc();
}
|
||||
|
||||
// Placeholder for the NotIn() query check. It is reported as skipped rather
// than passed, so the missing coverage stays visible in test output.
// The fixture has no TestNotInFunc() yet, so the call stays disabled.
TYPED_TEST_P(ArrayBitmapIndexTest, NotINFuncTest) {
    GTEST_SKIP() << "NotIn() check for array bitmap index is not implemented";
    //this->TestNotInFunc();
}
|
||||
|
||||
using BitmapType =
|
||||
testing::Types<int8_t, int16_t, int32_t, int64_t, std::string>;
|
||||
|
||||
REGISTER_TYPED_TEST_SUITE_P(ArrayBitmapIndexTest,
|
||||
CountFuncTest,
|
||||
INFuncTest,
|
||||
NotINFuncTest);
|
||||
|
||||
INSTANTIATE_TYPED_TEST_SUITE_P(ArrayBitmapE2ECheck,
|
||||
ArrayBitmapIndexTest,
|
||||
BitmapType);
|
||||
@ -24,6 +24,7 @@
|
||||
#include "index/IndexFactory.h"
|
||||
#include "test_utils/indexbuilder_test_utils.h"
|
||||
#include "index/Meta.h"
|
||||
#include "pb/schema.pb.h"
|
||||
|
||||
using namespace milvus::index;
|
||||
using namespace milvus::indexbuilder;
|
||||
@ -70,8 +71,24 @@ class HybridIndexTestV1 : public testing::Test {
|
||||
int64_t field_id,
|
||||
int64_t index_build_id,
|
||||
int64_t index_version) {
|
||||
proto::schema::FieldSchema field_schema;
|
||||
if constexpr (std::is_same_v<int8_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int8);
|
||||
} else if constexpr (std::is_same_v<int16_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int16);
|
||||
} else if constexpr (std::is_same_v<int32_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int32);
|
||||
} else if constexpr (std::is_same_v<int64_t, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Int64);
|
||||
} else if constexpr (std::is_same_v<float, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Float);
|
||||
} else if constexpr (std::is_same_v<double, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::Double);
|
||||
} else if constexpr (std::is_same_v<std::string, T>) {
|
||||
field_schema.set_data_type(proto::schema::DataType::String);
|
||||
}
|
||||
auto field_meta = storage::FieldDataMeta{
|
||||
collection_id, partition_id, segment_id, field_id};
|
||||
collection_id, partition_id, segment_id, field_id, field_schema};
|
||||
auto index_meta = storage::IndexMeta{
|
||||
segment_id, field_id, index_build_id, index_version};
|
||||
|
||||
|
||||
@ -40,8 +40,9 @@ gen_field_meta(int64_t collection_id = 1,
|
||||
.segment_id = segment_id,
|
||||
.field_id = field_id,
|
||||
};
|
||||
meta.schema.set_data_type(static_cast<proto::schema::DataType>(data_type));
|
||||
meta.schema.set_element_type(
|
||||
meta.field_schema.set_data_type(
|
||||
static_cast<proto::schema::DataType>(data_type));
|
||||
meta.field_schema.set_element_type(
|
||||
static_cast<proto::schema::DataType>(element_type));
|
||||
return meta;
|
||||
}
|
||||
|
||||
@ -56,7 +56,7 @@ TYPED_TEST_P(TypedScalarIndexTest, Dummy) {
|
||||
auto
|
||||
GetTempFileManagerCtx(CDataType data_type) {
|
||||
auto ctx = milvus::storage::FileManagerContext();
|
||||
ctx.fieldDataMeta.schema.set_data_type(
|
||||
ctx.fieldDataMeta.field_schema.set_data_type(
|
||||
static_cast<milvus::proto::schema::DataType>(data_type));
|
||||
return ctx;
|
||||
}
|
||||
@ -356,60 +356,6 @@ struct TypedScalarIndexTestV2<double>::Helper {
|
||||
using C = arrow::DoubleType;
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE_P(TypedScalarIndexTestV2);
|
||||
|
||||
TYPED_TEST_P(TypedScalarIndexTestV2, Base) {
|
||||
using T = TypeParam;
|
||||
auto dtype = milvus::GetDType<T>();
|
||||
auto index_types = GetIndexTypesV2<T>();
|
||||
for (const auto& index_type : index_types) {
|
||||
milvus::index::CreateIndexInfo create_index_info;
|
||||
create_index_info.field_type = milvus::DataType(dtype);
|
||||
create_index_info.index_type = index_type;
|
||||
create_index_info.field_name = "scalar";
|
||||
|
||||
auto storage_config = get_default_local_storage_config();
|
||||
auto chunk_manager =
|
||||
milvus::storage::CreateChunkManager(storage_config);
|
||||
|
||||
milvus::test::TmpPath tmp_path;
|
||||
auto temp_path = tmp_path.get();
|
||||
auto vec_size = DIM * 4;
|
||||
auto dataset = GenDataset(nb, knowhere::metric::L2, false);
|
||||
auto scalars = GenSortedArr<T>(nb);
|
||||
auto space = TestSpace<T>(temp_path, vec_size, dataset, scalars);
|
||||
milvus::storage::FileManagerContext file_manager_context(
|
||||
{}, {.field_name = "scalar"}, chunk_manager, space);
|
||||
file_manager_context.fieldDataMeta.schema.set_data_type(
|
||||
static_cast<milvus::proto::schema::DataType>(dtype));
|
||||
auto index =
|
||||
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
|
||||
create_index_info, file_manager_context, space);
|
||||
auto scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(index.get());
|
||||
milvus::Config config;
|
||||
if (index_type == "BITMAP") {
|
||||
config["bitmap_cardinality_limit"] = "1000";
|
||||
}
|
||||
scalar_index->BuildV2(config);
|
||||
scalar_index->UploadV2();
|
||||
|
||||
auto new_index =
|
||||
milvus::index::IndexFactory::GetInstance().CreateScalarIndex(
|
||||
create_index_info, file_manager_context, space);
|
||||
auto new_scalar_index =
|
||||
dynamic_cast<milvus::index::ScalarIndex<T>*>(new_index.get());
|
||||
new_scalar_index->LoadV2();
|
||||
ASSERT_EQ(nb, new_scalar_index->Count());
|
||||
}
|
||||
}
|
||||
|
||||
REGISTER_TYPED_TEST_SUITE_P(TypedScalarIndexTestV2, Base);
|
||||
|
||||
INSTANTIATE_TYPED_TEST_SUITE_P(ArithmeticCheck,
|
||||
TypedScalarIndexTestV2,
|
||||
ScalarT);
|
||||
|
||||
using namespace milvus::index;
|
||||
template <typename T>
|
||||
std::vector<T>
|
||||
|
||||
@ -123,7 +123,7 @@ TEST_F(StringIndexMarisaTest, Reverse) {
|
||||
auto index_types = GetIndexTypes<std::string>();
|
||||
for (const auto& index_type : index_types) {
|
||||
auto index = milvus::index::IndexFactory::GetInstance()
|
||||
.CreateScalarIndex<std::string>(index_type);
|
||||
.CreatePrimitiveScalarIndex<std::string>(index_type);
|
||||
index->Build(nb, strs.data());
|
||||
assert_reverse<std::string>(index.get(), strs);
|
||||
}
|
||||
|
||||
@ -491,17 +491,14 @@ GetIndexTypes<std::string>() {
|
||||
template <typename T>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypesV2() {
|
||||
return std::vector<std::string>{"sort",
|
||||
milvus::index::INVERTED_INDEX_TYPE,
|
||||
milvus::index::BITMAP_INDEX_TYPE};
|
||||
return std::vector<std::string>{"sort", milvus::index::INVERTED_INDEX_TYPE};
|
||||
}
|
||||
|
||||
template <>
|
||||
inline std::vector<std::string>
|
||||
GetIndexTypesV2<std::string>() {
|
||||
return std::vector<std::string>{"marisa",
|
||||
milvus::index::INVERTED_INDEX_TYPE,
|
||||
milvus::index::BITMAP_INDEX_TYPE};
|
||||
milvus::index::INVERTED_INDEX_TYPE};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -16,9 +16,9 @@ func Test_BitmapIndexChecker(t *testing.T) {
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Int64))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Float))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_String))
|
||||
assert.NoError(t, c.CheckValidDataType(schemapb.DataType_Array))
|
||||
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_JSON))
|
||||
assert.Error(t, c.CheckValidDataType(schemapb.DataType_Array))
|
||||
assert.Error(t, c.CheckTrain(map[string]string{}))
|
||||
assert.Error(t, c.CheckTrain(map[string]string{"bitmap_cardinality_limit": "0"}))
|
||||
}
|
||||
|
||||
@ -21,8 +21,8 @@ func (c *BITMAPChecker) CheckTrain(params map[string]string) error {
|
||||
}
|
||||
|
||||
func (c *BITMAPChecker) CheckValidDataType(dType schemapb.DataType) error {
|
||||
if !typeutil.IsArithmetic(dType) && !typeutil.IsStringType(dType) {
|
||||
return fmt.Errorf("bitmap index are only supported on numeric and string field")
|
||||
if !typeutil.IsArithmetic(dType) && !typeutil.IsStringType(dType) && !typeutil.IsArrayType(dType) {
|
||||
return fmt.Errorf("bitmap index are only supported on numeric, string and array field")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user