diff --git a/internal/core/src/common/Consts.h b/internal/core/src/common/Consts.h index 1517f4671e..8e4d96c9e2 100644 --- a/internal/core/src/common/Consts.h +++ b/internal/core/src/common/Consts.h @@ -95,6 +95,7 @@ const std::string PARTITION_KEY_ISOLATION_KEY = "partition_key_isolation"; const std::string STORAGE_VERSION_KEY = "storage_version"; const std::string DIM_KEY = "dim"; const std::string DATA_TYPE_KEY = "data_type"; +const std::string INDEX_NUM_ROWS_KEY = "index_num_rows"; // storage version const int64_t STORAGE_V1 = 1; diff --git a/internal/core/src/index/BitmapIndex.cpp b/internal/core/src/index/BitmapIndex.cpp index e18dcc9720..9f3dbfb752 100644 --- a/internal/core/src/index/BitmapIndex.cpp +++ b/internal/core/src/index/BitmapIndex.cpp @@ -71,27 +71,9 @@ BitmapIndex::Build(const Config& config) { if (is_built_) { return; } - auto field_datas = file_manager_->CacheRawDataToMemory(config); - - auto lack_binlog_rows = - GetValueFromConfig(config, "lack_binlog_rows"); - if (lack_binlog_rows.has_value() && lack_binlog_rows.value() > 0) { - auto field_schema = file_manager_->GetFieldDataMeta().field_schema; - auto default_value = [&]() -> std::optional { - if (!field_schema.has_default_value()) { - return std::nullopt; - } - return field_schema.default_value(); - }(); - auto field_data = storage::CreateFieldData( - static_cast(field_schema.data_type()), - true, - 1, - lack_binlog_rows.value()); - field_data->FillFieldData(default_value, lack_binlog_rows.value()); - field_datas.insert(field_datas.begin(), field_data); - } + auto field_datas = + storage::CacheRawDataAndFillMissing(file_manager_, config); BuildWithFieldData(field_datas); } diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp index 78b494a758..fce069000b 100644 --- a/internal/core/src/index/HybridScalarIndex.cpp +++ b/internal/core/src/index/HybridScalarIndex.cpp @@ -244,26 +244,8 @@ HybridScalarIndex::Build(const Config& config) { GetBitmapCardinalityLimitFromConfig(config); LOG_INFO("config bitmap cardinality limit to {}", bitmap_index_cardinality_limit_); - auto field_datas = mem_file_manager_->CacheRawDataToMemory(config); - - auto lack_binlog_rows = - GetValueFromConfig(config, "lack_binlog_rows"); - if (lack_binlog_rows.has_value() && lack_binlog_rows.value() > 0) { - auto field_schema = mem_file_manager_->GetFieldDataMeta().field_schema; - auto default_value = [&]() -> std::optional { - if (!field_schema.has_default_value()) { - return std::nullopt; - } - return field_schema.default_value(); - }(); - auto field_data = storage::CreateFieldData( - static_cast(field_schema.data_type()), - true, - 1, - lack_binlog_rows.value()); - field_data->FillFieldData(default_value, lack_binlog_rows.value()); - field_datas.insert(field_datas.begin(), field_data); - } + auto field_datas = + storage::CacheRawDataAndFillMissing(mem_file_manager_, config); SelectIndexBuildType(field_datas); BuildInternal(field_datas); diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index eeb5466443..173191be4c 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -165,25 +165,8 @@ InvertedIndexTantivy::Upload(const Config& config) { template void InvertedIndexTantivy::Build(const Config& config) { - auto field_datas = mem_file_manager_->CacheRawDataToMemory(config); - auto lack_binlog_rows = - GetValueFromConfig(config, "lack_binlog_rows"); - if (lack_binlog_rows.has_value() && lack_binlog_rows.value() > 0) { - auto field_schema = mem_file_manager_->GetFieldDataMeta().field_schema; - auto default_value = [&]() -> std::optional { - if (!field_schema.has_default_value()) { - return std::nullopt; - } - return field_schema.default_value(); - }(); - auto field_data = storage::CreateFieldData( - static_cast(field_schema.data_type()), - true, - 1, - lack_binlog_rows.value()); - field_data->FillFieldData(default_value, lack_binlog_rows.value()); - field_datas.insert(field_datas.begin(), field_data); - } + auto field_datas = + storage::CacheRawDataAndFillMissing(mem_file_manager_, config); BuildWithFieldData(field_datas); } diff --git a/internal/core/src/index/ScalarIndexSort.cpp b/internal/core/src/index/ScalarIndexSort.cpp index b7126239a1..8073778d80 100644 --- a/internal/core/src/index/ScalarIndexSort.cpp +++ b/internal/core/src/index/ScalarIndexSort.cpp @@ -51,26 +51,8 @@ ScalarIndexSort::Build(const Config& config) { if (is_built_) { return; } - auto field_datas = file_manager_->CacheRawDataToMemory(config); - - auto lack_binlog_rows = - GetValueFromConfig(config, "lack_binlog_rows"); - if (lack_binlog_rows.has_value() && lack_binlog_rows.value() > 0) { - auto field_schema = file_manager_->GetFieldDataMeta().field_schema; - auto default_value = [&]() -> std::optional { - if (!field_schema.has_default_value()) { - return std::nullopt; - } - return field_schema.default_value(); - }(); - auto field_data = storage::CreateFieldData( - static_cast(field_schema.data_type()), - true, - 1, - lack_binlog_rows.value()); - field_data->FillFieldData(default_value, lack_binlog_rows.value()); - field_datas.insert(field_datas.begin(), field_data); - } + auto field_datas = + storage::CacheRawDataAndFillMissing(file_manager_, config); BuildWithFieldData(field_datas); } diff --git a/internal/core/src/index/StringIndexMarisa.cpp b/internal/core/src/index/StringIndexMarisa.cpp index 6f6b997788..aadf61271a 100644 --- a/internal/core/src/index/StringIndexMarisa.cpp +++ b/internal/core/src/index/StringIndexMarisa.cpp @@ -65,26 +65,8 @@ StringIndexMarisa::Build(const Config& config) { if (built_) { PanicInfo(IndexAlreadyBuild, "index has been built"); } - auto field_datas = file_manager_->CacheRawDataToMemory(config); - - auto lack_binlog_rows = - GetValueFromConfig(config, "lack_binlog_rows"); - if (lack_binlog_rows.has_value() && lack_binlog_rows.value() > 0) { - auto field_schema = file_manager_->GetFieldDataMeta().field_schema; - auto default_value = [&]() -> std::optional { - if (!field_schema.has_default_value()) { - return std::nullopt; - } - return field_schema.default_value(); - }(); - auto field_data = storage::CreateFieldData( - static_cast(field_schema.data_type()), - true, - 1, - lack_binlog_rows.value()); - field_data->FillFieldData(default_value, lack_binlog_rows.value()); - field_datas.insert(field_datas.begin(), field_data); - } + auto field_datas = + storage::CacheRawDataAndFillMissing(file_manager_, config); BuildWithFieldData(field_datas); } diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp index 3970e2c560..9706b05210 100644 --- a/internal/core/src/indexbuilder/index_c.cpp +++ b/internal/core/src/indexbuilder/index_c.cpp @@ -164,10 +164,10 @@ get_config(std::unique_ptr& info) { if (info->opt_fields().size()) { config[VEC_OPT_FIELDS] = get_opt_field(info->opt_fields()); } - config["lack_binlog_rows"] = info->lack_binlog_rows(); if (info->partition_key_isolation()) { config[PARTITION_KEY_ISOLATION_KEY] = info->partition_key_isolation(); } + config[INDEX_NUM_ROWS_KEY] = info->num_rows(); config[STORAGE_VERSION_KEY] = info->storage_version(); if (info->storage_version() == STORAGE_V2) { config[SEGMENT_INSERT_FILES_KEY] = diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index e4cebc94e3..49c76a08fd 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -21,6 +21,7 @@ #include "arrow/scalar.h" #include "arrow/type_fwd.h" #include "fmt/format.h" +#include "index/Utils.h" #include "log/Log.h" #include "common/Consts.h" @@ -1026,8 +1027,7 @@ GetFieldDatasFromStorageV2(std::vector>& remote_files, // remote files might not followed the sequence of column group id, // so we need to put into map std::unordered_map> column_group_files; - for (int i = 0; i < remote_files.size(); i++) { - auto& remote_chunk_files = remote_files[i]; + for (auto& remote_chunk_files : remote_files) { AssertInfo(remote_chunk_files.size() > 0, "remote files size is 0"); // find second last of / to get group_id @@ -1127,4 +1127,44 @@ GetFieldDatasFromStorageV2(std::vector>& remote_files, return field_data_list; } +std::vector +CacheRawDataAndFillMissing(const MemFileManagerImplPtr& file_manager, + const Config& config) { + // download field data + auto field_datas = file_manager->CacheRawDataToMemory(config); + + // check storage version + auto storage_version = + index::GetValueFromConfig(config, STORAGE_VERSION_KEY) + .value_or(0); + + int64_t lack_binlog_rows = + index::GetValueFromConfig(config, INDEX_NUM_ROWS_KEY) + .value_or(0); + for (auto& field_data : field_datas) { + lack_binlog_rows -= field_data->get_num_rows(); + } + + if (lack_binlog_rows > 0) { + LOG_INFO("create index lack binlog detected, lock row num: {}", + lack_binlog_rows); + auto field_schema = file_manager->GetFieldDataMeta().field_schema; + auto default_value = [&]() -> std::optional { + if (!field_schema.has_default_value()) { + return std::nullopt; + } + return field_schema.default_value(); + }(); + auto field_data = storage::CreateFieldData( + static_cast(field_schema.data_type()), + true, + 1, + lack_binlog_rows); + field_data->FillFieldData(default_value, lack_binlog_rows); + field_datas.insert(field_datas.begin(), field_data); + } + + return field_datas; +} + } // namespace milvus::storage diff --git a/internal/core/src/storage/Util.h b/internal/core/src/storage/Util.h index 9378e774cf..49e30e9ac0 100644 --- a/internal/core/src/storage/Util.h +++ b/internal/core/src/storage/Util.h @@ -26,6 +26,7 @@ #include "knowhere/comp/index_param.h" #include "parquet/schema.h" #include "storage/Event.h" +#include "storage/MemFileManagerImpl.h" #include "storage/PayloadStream.h" #include "storage/FileManager.h" #include "storage/BinlogReader.h" @@ -233,6 +234,10 @@ SortByPath(std::vector>& paths) { }); } +std::vector +CacheRawDataAndFillMissing(const MemFileManagerImplPtr& file_manager, + const Config& config); + // used only for test inline std::shared_ptr ConvertFieldDataToArrowDataWrapper(const FieldDataPtr& field_data) { diff --git a/internal/core/unittest/test_array_bitmap_index.cpp b/internal/core/unittest/test_array_bitmap_index.cpp index a2a237ef40..0ffe854304 100644 --- a/internal/core/unittest/test_array_bitmap_index.cpp +++ b/internal/core/unittest/test_array_bitmap_index.cpp @@ -231,8 +231,9 @@ class ArrayBitmapIndexTest : public testing::Test { config["index_type"] = milvus::index::HYBRID_INDEX_TYPE; config[INSERT_FILES_KEY] = std::vector{log_path}; config["bitmap_cardinality_limit"] = "100"; + config[INDEX_NUM_ROWS_KEY] = nb_; if (has_lack_binlog_row_) { - config["lack_binlog_rows"] = lack_binlog_row_; + config[INDEX_NUM_ROWS_KEY] = nb_ + lack_binlog_row_; } { diff --git a/internal/core/unittest/test_binlog_index.cpp b/internal/core/unittest/test_binlog_index.cpp index c8c118f876..dccdd88455 100644 --- a/internal/core/unittest/test_binlog_index.cpp +++ b/internal/core/unittest/test_binlog_index.cpp @@ -288,7 +288,8 @@ TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) { EXPECT_TRUE(segment->HasIndex(vec_field_id)); EXPECT_EQ(segment->get_row_count(), data_n); // only INDEX_FAISS_IVFFLAT has raw data, thus it should release the raw field data. - EXPECT_EQ(segment->HasFieldData(vec_field_id), index_type != knowhere::IndexEnum::INDEX_FAISS_IVFFLAT); + EXPECT_EQ(segment->HasFieldData(vec_field_id), + index_type != knowhere::IndexEnum::INDEX_FAISS_IVFFLAT); auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63, 0); auto similary = GetKnnSearchRecall(num_queries, binlog_index_sr->seg_offsets_.data(), @@ -385,7 +386,8 @@ TEST_P(BinlogIndexTest, AccuracyWithMapFieldData) { ASSERT_NO_THROW(segment->LoadIndex(load_info)); EXPECT_TRUE(segment->HasIndex(vec_field_id)); EXPECT_EQ(segment->get_row_count(), data_n); - EXPECT_EQ(segment->HasFieldData(vec_field_id), index_type != knowhere::IndexEnum::INDEX_FAISS_IVFFLAT); + EXPECT_EQ(segment->HasFieldData(vec_field_id), + index_type != knowhere::IndexEnum::INDEX_FAISS_IVFFLAT); auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63); auto similary = GetKnnSearchRecall(num_queries, binlog_index_sr->seg_offsets_.data(), diff --git a/internal/core/unittest/test_bitmap_index.cpp b/internal/core/unittest/test_bitmap_index.cpp index 10cb9f1aea..13c1c0d0ed 100644 --- a/internal/core/unittest/test_bitmap_index.cpp +++ b/internal/core/unittest/test_bitmap_index.cpp @@ -17,6 +17,7 @@ #include #include +#include "common/Consts.h" #include "common/Tracer.h" #include "common/Types.h" #include "index/BitmapIndex.h" @@ -159,8 +160,9 @@ class BitmapIndexTest : public testing::Test { Config config; config["index_type"] = milvus::index::BITMAP_INDEX_TYPE; config[INSERT_FILES_KEY] = std::vector{log_path}; + config[INDEX_NUM_ROWS_KEY] = nb_; if (has_lack_binlog_row_) { - config["lack_binlog_rows"] = lack_binlog_row_; + config[INDEX_NUM_ROWS_KEY] = nb_ + lack_binlog_row_; } auto build_index = diff --git a/internal/core/unittest/test_hybrid_index.cpp b/internal/core/unittest/test_hybrid_index.cpp index 25f5d1d662..2ed1d5cc00 100644 --- a/internal/core/unittest/test_hybrid_index.cpp +++ b/internal/core/unittest/test_hybrid_index.cpp @@ -160,8 +160,9 @@ class HybridIndexTestV1 : public testing::Test { config["index_type"] = milvus::index::HYBRID_INDEX_TYPE; config[INSERT_FILES_KEY] = std::vector{log_path}; config["bitmap_cardinality_limit"] = "1000"; + config[INDEX_NUM_ROWS_KEY] = nb_; if (has_lack_binlog_row_) { - config["lack_binlog_rows"] = lack_binlog_row_; + config[INDEX_NUM_ROWS_KEY] = nb_ + lack_binlog_row_; } { diff --git a/internal/core/unittest/test_inverted_index.cpp b/internal/core/unittest/test_inverted_index.cpp index f196f1b79e..2cc6513628 100644 --- a/internal/core/unittest/test_inverted_index.cpp +++ b/internal/core/unittest/test_inverted_index.cpp @@ -167,8 +167,9 @@ test_run() { Config config; config["index_type"] = milvus::index::INVERTED_INDEX_TYPE; config[INSERT_FILES_KEY] = std::vector{log_path}; + config[INDEX_NUM_ROWS_KEY] = nb; if (has_lack_binlog_row_) { - config["lack_binlog_rows"] = lack_binlog_row; + config[INDEX_NUM_ROWS_KEY] = nb + lack_binlog_row; } auto index = indexbuilder::IndexFactory::GetInstance().CreateIndex( @@ -558,8 +559,9 @@ test_string() { Config config; config["index_type"] = milvus::index::INVERTED_INDEX_TYPE; config[INSERT_FILES_KEY] = std::vector{log_path}; + config[INDEX_NUM_ROWS_KEY] = nb; if (has_lack_binlog_row_) { - config["lack_binlog_rows"] = lack_binlog_row; + config[INDEX_NUM_ROWS_KEY] = nb + lack_binlog_row; } auto index = indexbuilder::IndexFactory::GetInstance().CreateIndex( diff --git a/internal/core/unittest/test_regex_query.cpp b/internal/core/unittest/test_regex_query.cpp index 560ebc92b9..61c8627afa 100644 --- a/internal/core/unittest/test_regex_query.cpp +++ b/internal/core/unittest/test_regex_query.cpp @@ -249,7 +249,8 @@ class SealedSegmentRegexQueryTest : public ::testing::Test { } auto index = index::CreateStringIndexSort(); std::vector buffer(arr.ByteSizeLong()); - ASSERT_TRUE(arr.SerializeToArray(buffer.data(), arr.ByteSizeLong())); + ASSERT_TRUE( + arr.SerializeToArray(buffer.data(), arr.ByteSizeLong())); index->BuildWithRawDataForUT(arr.ByteSizeLong(), buffer.data()); LoadIndexInfo info{ .field_id = schema->get_field_id(FieldName("str")).get(), diff --git a/internal/querynodev2/segments/segment_loader.go b/internal/querynodev2/segments/segment_loader.go index 1a22394b4d..b159113639 100644 --- a/internal/querynodev2/segments/segment_loader.go +++ b/internal/querynodev2/segments/segment_loader.go @@ -1713,7 +1713,7 @@ func (loader *segmentLoader) LoadIndex(ctx context.Context, } // TODO add field info sync between segcore and go segment for storage v2 - if loadInfo.GetStorageVersion() != 2 { + if loadInfo.GetStorageVersion() != storage.StorageV2 { fieldInfo, ok := fieldInfos[info.GetFieldID()] if !ok { return merr.WrapErrParameterInvalid("index info with corresponding field info", "missing field info", strconv.FormatInt(fieldInfo.GetFieldID(), 10))