enhance: add bitmap offset cache to speed up retrieving raw data (#35498)

#35458

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2024-08-24 01:40:58 +08:00 committed by GitHub
parent 75da36d1aa
commit 42f7800b5b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 118 additions and 21 deletions

View File

@ -255,7 +255,7 @@ BitmapIndex<T>::Serialize(const Config& config) {
ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second); ret_set.Append(BITMAP_INDEX_META, index_meta.first, index_meta.second);
LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}", LOG_INFO("build bitmap index with cardinality = {}, num_rows = {}",
Cardinality(), data_.size(),
total_num_rows_); total_num_rows_);
Disassemble(ret_set); Disassemble(ret_set);
@ -345,6 +345,31 @@ BitmapIndex<T>::DeserializeIndexData(const uint8_t* data_ptr,
} }
} }
// Builds the row-offset -> value cache used for reverse lookups.
//
// For every row offset recorded under a value in the index, the cache slot at
// that offset is set to an iterator pointing at the owning map entry
// (data_ in ROARING mode, bitsets_ otherwise). The cache of the active build
// mode is sized to total_num_rows_, and use_offset_cache_ is switched on once
// the cache has been filled.
//
// NOTE(review): slots of rows that are not recorded under any value keep a
// default-constructed iterator — confirm such rows are never looked up.
template <typename T>
void
BitmapIndex<T>::BuildOffsetCache() {
    if (build_mode_ == BitmapIndexBuildMode::ROARING) {
        data_offsets_cache_.resize(total_num_rows_);
        for (auto it = data_.begin(); it != data_.end(); ++it) {
            for (const auto& v : it->second) {
                data_offsets_cache_[v] = it;
            }
        }
    } else {
        // Size the cache once, before the loop: the resize is loop-invariant,
        // and doing it here keeps the cache sized even when bitsets_ is empty,
        // matching the ROARING branch above.
        bitsets_offsets_cache_.resize(total_num_rows_);
        for (auto it = bitsets_.begin(); it != bitsets_.end(); ++it) {
            const auto& bits = it->second;
            // size_t index avoids a signed/unsigned comparison with size().
            for (size_t i = 0; i < bits.size(); ++i) {
                if (bits[i]) {
                    bitsets_offsets_cache_[i] = it;
                }
            }
        }
    }
    use_offset_cache_ = true;
    LOG_INFO("build offset cache for bitmap index");
}
template <> template <>
void void
BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr, BitmapIndex<std::string>::DeserializeIndexData(const uint8_t* data_ptr,
@ -377,6 +402,9 @@ template <typename T>
void void
BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set, BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
const Config& config) { const Config& config) {
auto enable_offset_cache =
GetValueFromConfig<bool>(config, ENABLE_OFFSET_CACHE);
auto index_meta_buffer = binary_set.GetByName(BITMAP_INDEX_META); auto index_meta_buffer = binary_set.GetByName(BITMAP_INDEX_META);
auto index_meta = DeserializeIndexMeta(index_meta_buffer->data.get(), auto index_meta = DeserializeIndexMeta(index_meta_buffer->data.get(),
index_meta_buffer->size); index_meta_buffer->size);
@ -387,6 +415,10 @@ BitmapIndex<T>::LoadWithoutAssemble(const BinarySet& binary_set,
auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA); auto index_data_buffer = binary_set.GetByName(BITMAP_INDEX_DATA);
DeserializeIndexData(index_data_buffer->data.get(), index_length); DeserializeIndexData(index_data_buffer->data.get(), index_length);
if (enable_offset_cache.has_value() && enable_offset_cache.value()) {
BuildOffsetCache();
}
LOG_INFO("load bitmap index with cardinality = {}, num_rows = {}", LOG_INFO("load bitmap index with cardinality = {}, num_rows = {}",
Cardinality(), Cardinality(),
total_num_rows_); total_num_rows_);
@ -575,7 +607,6 @@ BitmapIndex<T>::RangeForRoaring(const T value, const OpType op) {
} }
auto lb = data_.begin(); auto lb = data_.begin();
auto ub = data_.end(); auto ub = data_.end();
switch (op) { switch (op) {
case OpType::LessThan: { case OpType::LessThan: {
ub = std::lower_bound(data_.begin(), ub = std::lower_bound(data_.begin(),
@ -758,12 +789,26 @@ BitmapIndex<T>::RangeForRoaring(const T lower_value,
return res; return res;
} }
// Returns the value stored for row `idx` by reading the offset cache that
// belongs to the active build mode. No bounds or validity checks are done
// here.
template <typename T>
T
BitmapIndex<T>::Reverse_Lookup_InCache(size_t idx) const {
    const bool roaring_mode = (build_mode_ == BitmapIndexBuildMode::ROARING);
    if (!roaring_mode) {
        return bitsets_offsets_cache_[idx]->first;
    }
    return data_offsets_cache_[idx]->first;
}
template <typename T> template <typename T>
T T
BitmapIndex<T>::Reverse_Lookup(size_t idx) const { BitmapIndex<T>::Reverse_Lookup(size_t idx) const {
AssertInfo(is_built_, "index has not been built"); AssertInfo(is_built_, "index has not been built");
AssertInfo(idx < total_num_rows_, "out of range of total coun"); AssertInfo(idx < total_num_rows_, "out of range of total coun");
if (use_offset_cache_) {
return Reverse_Lookup_InCache(idx);
}
if (build_mode_ == BitmapIndexBuildMode::ROARING) { if (build_mode_ == BitmapIndexBuildMode::ROARING) {
for (auto it = data_.begin(); it != data_.end(); it++) { for (auto it = data_.begin(); it != data_.end(); it++) {
for (const auto& v : it->second) { for (const auto& v : it->second) {

View File

@ -175,6 +175,12 @@ class BitmapIndex : public ScalarIndex<T> {
void void
DeserializeIndexData(const uint8_t* data_ptr, size_t index_length); DeserializeIndexData(const uint8_t* data_ptr, size_t index_length);
// Fills the row-offset -> value cache for the loaded index and enables its
// use for reverse lookups.
void
BuildOffsetCache();
// Returns the value of row `idx` by reading the offset cache.
T
Reverse_Lookup_InCache(size_t idx) const;
void void
ChooseIndexLoadMode(int64_t index_length); ChooseIndexLoadMode(int64_t index_length);
@ -210,6 +216,11 @@ class BitmapIndex : public ScalarIndex<T> {
std::map<T, TargetBitmap> bitsets_; std::map<T, TargetBitmap> bitsets_;
size_t total_num_rows_{0}; size_t total_num_rows_{0};
proto::schema::FieldSchema schema_; proto::schema::FieldSchema schema_;
// True once the offset cache has been built; Reverse_Lookup then reads the
// cache instead of scanning the index.
bool use_offset_cache_{false};
// Offset cache for ROARING build mode: slot i holds an iterator to the
// data_ entry whose bitmap contains row i.
std::vector<typename std::map<T, roaring::Roaring>::iterator>
data_offsets_cache_;
// Offset cache for the non-ROARING build mode: slot i holds an iterator to
// the bitsets_ entry whose bitset has bit i set.
std::vector<typename std::map<T, TargetBitmap>::iterator>
bitsets_offsets_cache_;
std::shared_ptr<storage::MemFileManagerImpl> file_manager_; std::shared_ptr<storage::MemFileManagerImpl> file_manager_;
// generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate // generate valid_bitset to speed up NotIn and IsNull and IsNotNull operate

View File

@ -25,9 +25,6 @@
#include "common/Tracer.h" #include "common/Tracer.h"
#include "common/Types.h" #include "common/Types.h"
const std::string kMmapFilepath = "mmap_filepath";
const std::string kEnableMmap = "enable_mmap";
namespace milvus::index { namespace milvus::index {
class IndexBase { class IndexBase {

View File

@ -58,6 +58,12 @@ constexpr const char* INDEX_ENGINE_VERSION = "index_engine_version";
constexpr const char* BITMAP_INDEX_CARDINALITY_LIMIT = constexpr const char* BITMAP_INDEX_CARDINALITY_LIMIT =
"bitmap_cardinality_limit"; "bitmap_cardinality_limit";
// Index config keys: names of entries read from / written to the index load
// Config.
// Path of the mmap file; when present, vector indexes are loaded from file.
constexpr const char* MMAP_FILE_PATH = "mmap_filepath";
// Flag set in the config to request mmap-based loading.
constexpr const char* ENABLE_MMAP = "enable_mmap";
// List of index files to load.
constexpr const char* INDEX_FILES = "index_files";
// Switch that makes the bitmap index build its offset cache at load time.
constexpr const char* ENABLE_OFFSET_CACHE = "indexoffsetcache.enabled";
// VecIndex file metas // VecIndex file metas
constexpr const char* DISK_ANN_PREFIX_PATH = "index_prefix"; constexpr const char* DISK_ANN_PREFIX_PATH = "index_prefix";
constexpr const char* DISK_ANN_RAW_DATA_PATH = "data_path"; constexpr const char* DISK_ANN_RAW_DATA_PATH = "data_path";

View File

@ -201,7 +201,7 @@ StringIndexMarisa::LoadWithoutAssemble(const BinarySet& set,
} }
file.Seek(0, SEEK_SET); file.Seek(0, SEEK_SET);
if (config.contains(kEnableMmap)) { if (config.contains(ENABLE_MMAP)) {
trie_.mmap(file_name.c_str()); trie_.mmap(file_name.c_str());
} else { } else {
trie_.read(file.Descriptor()); trie_.read(file.Descriptor());

View File

@ -26,6 +26,7 @@
#include <tuple> #include <tuple>
#include <map> #include <map>
#include <string> #include <string>
#include <boost/algorithm/string.hpp>
#include "common/Types.h" #include "common/Types.h"
#include "common/FieldData.h" #include "common/FieldData.h"
@ -79,7 +80,12 @@ void inline CheckParameter(Config& conf,
template <typename T> template <typename T>
inline std::optional<T> inline std::optional<T>
GetValueFromConfig(const Config& cfg, const std::string& key) { GetValueFromConfig(const Config& cfg, const std::string& key) {
// cfg value are all string type
if (cfg.contains(key)) { if (cfg.contains(key)) {
if constexpr (std::is_same_v<T, bool>) {
return boost::algorithm::to_lower_copy(
cfg.at(key).get<std::string>()) == "true";
}
return cfg.at(key).get<T>(); return cfg.at(key).get<T>();
} }
return std::nullopt; return std::nullopt;

View File

@ -406,9 +406,9 @@ VectorDiskAnnIndex<T>::update_load_json(const Config& config) {
} }
} }
if (config.contains(kMmapFilepath)) { if (config.contains(MMAP_FILE_PATH)) {
load_config.erase(kMmapFilepath); load_config.erase(MMAP_FILE_PATH);
load_config[kEnableMmap] = true; load_config[ENABLE_MMAP] = true;
} }
return load_config; return load_config;

View File

@ -32,6 +32,7 @@
#include "index/Index.h" #include "index/Index.h"
#include "index/IndexInfo.h" #include "index/IndexInfo.h"
#include "index/Meta.h"
#include "index/Utils.h" #include "index/Utils.h"
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "config/ConfigKnowhere.h" #include "config/ConfigKnowhere.h"
@ -142,7 +143,7 @@ template <typename T>
void void
VectorMemIndex<T>::Load(milvus::tracer::TraceContext ctx, VectorMemIndex<T>::Load(milvus::tracer::TraceContext ctx,
const Config& config) { const Config& config) {
if (config.contains(kMmapFilepath)) { if (config.contains(MMAP_FILE_PATH)) {
return LoadFromFile(config); return LoadFromFile(config);
} }
@ -483,7 +484,7 @@ VectorMemIndex<T>::GetSparseVector(const DatasetPtr dataset) const {
template <typename T> template <typename T>
void VectorMemIndex<T>::LoadFromFile(const Config& config) { void VectorMemIndex<T>::LoadFromFile(const Config& config) {
auto filepath = GetValueFromConfig<std::string>(config, kMmapFilepath); auto filepath = GetValueFromConfig<std::string>(config, MMAP_FILE_PATH);
AssertInfo(filepath.has_value(), "mmap filepath is empty when load index"); AssertInfo(filepath.has_value(), "mmap filepath is empty when load index");
std::filesystem::create_directories( std::filesystem::create_directories(
@ -598,8 +599,8 @@ void VectorMemIndex<T>::LoadFromFile(const Config& config) {
LOG_INFO("load index into Knowhere..."); LOG_INFO("load index into Knowhere...");
auto conf = config; auto conf = config;
conf.erase(kMmapFilepath); conf.erase(MMAP_FILE_PATH);
conf[kEnableMmap] = true; conf[ENABLE_MMAP] = true;
auto start_deserialize = std::chrono::system_clock::now(); auto start_deserialize = std::chrono::system_clock::now();
auto stat = index_.DeserializeFromFile(filepath.value(), conf); auto stat = index_.DeserializeFromFile(filepath.value(), conf);
auto deserialize_duration = auto deserialize_duration =

View File

@ -221,7 +221,6 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
static_cast<milvus::segcore::LoadIndexInfo*>(c_load_index_info); static_cast<milvus::segcore::LoadIndexInfo*>(c_load_index_info);
auto& index_params = load_index_info->index_params; auto& index_params = load_index_info->index_params;
auto field_type = load_index_info->field_type; auto field_type = load_index_info->field_type;
auto engine_version = load_index_info->index_engine_version; auto engine_version = load_index_info->index_engine_version;
milvus::index::CreateIndexInfo index_info; milvus::index::CreateIndexInfo index_info;
@ -271,7 +270,7 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
auto config = milvus::index::ParseConfigFromIndexParams( auto config = milvus::index::ParseConfigFromIndexParams(
load_index_info->index_params); load_index_info->index_params);
config["index_files"] = load_index_info->index_files; config[milvus::index::INDEX_FILES] = load_index_info->index_files;
milvus::storage::FileManagerContext fileManagerContext( milvus::storage::FileManagerContext fileManagerContext(
field_meta, index_meta, remote_chunk_manager); field_meta, index_meta, remote_chunk_manager);
@ -289,9 +288,10 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
std::to_string(load_index_info->field_id) / std::to_string(load_index_info->field_id) /
std::to_string(load_index_info->index_id); std::to_string(load_index_info->index_id);
config[kMmapFilepath] = filepath.string(); config[milvus::index::MMAP_FILE_PATH] = filepath.string();
} }
LOG_DEBUG("load index with configs: {}", config.dump());
load_index_info->index->Load(ctx, config); load_index_info->index->Load(ctx, config);
span->End(); span->End();

View File

@ -272,13 +272,19 @@ func (s *Server) CreateIndex(ctx context.Context, req *indexpb.CreateIndexReques
func ValidateIndexParams(index *model.Index) error { func ValidateIndexParams(index *model.Index) error {
indexType := GetIndexType(index.IndexParams) indexType := GetIndexType(index.IndexParams)
indexParams := funcutil.KeyValuePair2Map(index.IndexParams) indexParams := funcutil.KeyValuePair2Map(index.IndexParams)
userIndexParams := funcutil.KeyValuePair2Map(index.UserIndexParams)
if err := indexparamcheck.ValidateMmapIndexParams(indexType, indexParams); err != nil { if err := indexparamcheck.ValidateMmapIndexParams(indexType, indexParams); err != nil {
return merr.WrapErrParameterInvalidMsg("invalid mmap index params", err.Error()) return merr.WrapErrParameterInvalidMsg("invalid mmap index params", err.Error())
} }
userIndexParams := funcutil.KeyValuePair2Map(index.UserIndexParams)
if err := indexparamcheck.ValidateMmapIndexParams(indexType, userIndexParams); err != nil { if err := indexparamcheck.ValidateMmapIndexParams(indexType, userIndexParams); err != nil {
return merr.WrapErrParameterInvalidMsg("invalid mmap user index params", err.Error()) return merr.WrapErrParameterInvalidMsg("invalid mmap user index params", err.Error())
} }
if err := indexparamcheck.ValidateOffsetCacheIndexParams(indexType, indexParams); err != nil {
return merr.WrapErrParameterInvalidMsg("invalid offset cache index params", err.Error())
}
if err := indexparamcheck.ValidateOffsetCacheIndexParams(indexType, userIndexParams); err != nil {
return merr.WrapErrParameterInvalidMsg("invalid offset cache index params", err.Error())
}
return nil return nil
} }

View File

@ -171,6 +171,7 @@ const (
LazyLoadEnableKey = "lazyload.enabled" LazyLoadEnableKey = "lazyload.enabled"
PartitionKeyIsolationKey = "partitionkey.isolation" PartitionKeyIsolationKey = "partitionkey.isolation"
FieldSkipLoadKey = "field.skipLoad" FieldSkipLoadKey = "field.skipLoad"
IndexOffsetCacheEnabledKey = "indexoffsetcache.enabled"
) )
const ( const (

View File

@ -29,4 +29,5 @@ func Test_BitmapIndexChecker(t *testing.T) {
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Double})) assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Double}))
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Float})) assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Float}))
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Double})) assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Array, ElementType: schemapb.DataType_Double}))
assert.Error(t, c.CheckValidDataType(&schemapb.FieldSchema{DataType: schemapb.DataType_Double, IsPrimaryKey: true}))
} }

View File

@ -16,6 +16,9 @@ func (c *BITMAPChecker) CheckTrain(params map[string]string) error {
} }
func (c *BITMAPChecker) CheckValidDataType(field *schemapb.FieldSchema) error { func (c *BITMAPChecker) CheckValidDataType(field *schemapb.FieldSchema) error {
if field.IsPrimaryKey {
return fmt.Errorf("create bitmap index on primary key not supported")
}
mainType := field.GetDataType() mainType := field.GetDataType()
elemType := field.GetElementType() elemType := field.GetElementType()
if !typeutil.IsBoolType(mainType) && !typeutil.IsIntegerType(mainType) && if !typeutil.IsBoolType(mainType) && !typeutil.IsIntegerType(mainType) &&

View File

@ -73,6 +73,10 @@ func IsVectorMmapIndex(indexType IndexType) bool {
indexType == IndexSparseWand indexType == IndexSparseWand
} }
// IsOffsetCacheSupported reports whether indexType is IndexBitmap, the only
// index type this check accepts.
func IsOffsetCacheSupported(indexType IndexType) bool {
	switch indexType {
	case IndexBitmap:
		return true
	default:
		return false
	}
}
func IsDiskIndex(indexType IndexType) bool { func IsDiskIndex(indexType IndexType) bool {
return indexType == IndexDISKANN return indexType == IndexDISKANN
} }
@ -96,3 +100,18 @@ func ValidateMmapIndexParams(indexType IndexType, indexParams map[string]string)
} }
return nil return nil
} }
// ValidateOffsetCacheIndexParams validates the offset-cache switch
// (common.IndexOffsetCacheEnabledKey) found in indexParams.
//
// It returns nil when the key is absent. It returns an error when the value
// cannot be parsed as a boolean, or when the cache is enabled for an index
// type for which IsOffsetCacheSupported returns false.
func ValidateOffsetCacheIndexParams(indexType IndexType, indexParams map[string]string) error {
	offsetCacheEnable, ok := indexParams[common.IndexOffsetCacheEnabledKey]
	if !ok {
		return nil
	}
	enable, err := strconv.ParseBool(offsetCacheEnable)
	if err != nil {
		return fmt.Errorf("invalid %s value: %s, expected: true, false", common.IndexOffsetCacheEnabledKey, offsetCacheEnable)
	}
	// Reject only the unsupported index types. The previous condition was
	// missing the negation, so it rejected the supported (bitmap) index and
	// accepted every other type.
	if enable && !IsOffsetCacheSupported(indexType) {
		return fmt.Errorf("only bitmap index support %s now", common.IndexOffsetCacheEnabledKey)
	}
	return nil
}

View File

@ -54,6 +54,7 @@ var configableIndexParams = typeutil.NewSet[string]()
func init() { func init() {
configableIndexParams.Insert(common.MmapEnabledKey) configableIndexParams.Insert(common.MmapEnabledKey)
configableIndexParams.Insert(common.IndexOffsetCacheEnabledKey)
} }
func IsConfigableIndexParam(key string) bool { func IsConfigableIndexParam(key string) bool {