milvus/internal/core/unittest/test_binlog_index.cpp
Bingyi Sun 0c0630cc38
feat: support dropping index without releasing collection (#42941)
issue: #42942

This PR includes the following changes:
1. Added checks for index checker in querycoord to generate drop index
tasks
2. Added drop index interface to querynode
3. To avoid search failure after dropping the index, the querynode
allows the use of lazy mode (warmup=disable) to load raw data even when
indexes contain raw data.
4. In segcore, loading the index no longer deletes raw data; instead, it
evicts it.
5. In expr, the index is pinned to prevent concurrent errors.

---------

Signed-off-by: sunby <sunbingyi1992@gmail.com>
2025-09-02 16:17:52 +08:00

462 lines
19 KiB
C++

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <boost/format.hpp>
#include <optional>
#include "index/IndexFactory.h"
#include "pb/plan.pb.h"
#include "query/Plan.h"
#include "segcore/segcore_init_c.h"
#include "segcore/SegmentSealed.h"
#include "test_utils/cachinglayer_test_utils.h"
#include "test_utils/DataGen.h"
#include "test_utils/storage_test_utils.h"
using namespace milvus;
using namespace milvus::segcore;
// Allocates rows * dim floats and fills them with pseudo-random values.
//
// Values are drawn from an *integer* uniform distribution over [0, 100] and
// then converted to float, so every element is a whole number. The generator
// is a std::mt19937 seeded with `seed`, so equal arguments produce equal data
// within one build.
//
// @param rows number of vectors
// @param dim  number of components per vector
// @param seed seed for the Mersenne Twister engine (default 42)
// @return owning pointer to a row-major buffer of rows * dim floats
std::unique_ptr<float[]>
GenRandomFloatVecData(int rows, int dim, int seed = 42) {
    const int total = rows * dim;
    auto buffer = std::make_unique<float[]>(total);

    std::mt19937 engine(seed);
    std::uniform_int_distribution<> pick(0, 100);

    // Fill front to back so the sequence of draws matches the element order.
    float* cursor = buffer.get();
    for (int remaining = total; remaining > 0; --remaining, ++cursor) {
        *cursor = static_cast<float>(pick(engine));
    }
    return buffer;
}
// Computes recall@res_k of a result id matrix against a ground-truth id matrix.
//
// Both matrices are row-major with one row per query: `gt_ids` holds `gt_k`
// ids per row and `res_ids` holds `res_k` ids per row. For each query the
// first min(gt_k, res_k) ground-truth ids are intersected with the res_k
// result ids; the total number of common ids is divided by nq * res_k.
//
// @param nq      number of queries (rows in both matrices)
// @param gt_ids  ground-truth ids, nq * gt_k entries
// @param gt_k    ids per ground-truth row
// @param res_ids result ids, nq * res_k entries
// @param res_k   ids per result row
// @return fraction in [0, 1]; 0 when nq or res_k is zero
inline float
GetKnnSearchRecall(
    size_t nq, int64_t* gt_ids, size_t gt_k, int64_t* res_ids, size_t res_k) {
    // Nothing to compare: avoid the 0 / 0 division that would yield NaN.
    if (nq == 0 || res_k == 0) {
        return 0.0f;
    }

    // Only the best res_k ground-truth ids matter, but never read more than
    // the gt_k ids a row actually contains (that would spill into the next
    // row, or past the end of the buffer for the last query).
    const size_t gt_take = std::min(gt_k, res_k);

    size_t matched_num = 0;
    for (size_t i = 0; i < nq; ++i) {
        std::vector<int64_t> expected(gt_ids + i * gt_k,
                                      gt_ids + i * gt_k + gt_take);
        std::vector<int64_t> actual(res_ids + i * res_k,
                                    res_ids + i * res_k + res_k);
        // set_intersection requires sorted ranges.
        std::sort(expected.begin(), expected.end());
        std::sort(actual.begin(), actual.end());

        std::vector<int64_t> common(std::max(expected.size(), actual.size()));
        const auto common_end = std::set_intersection(expected.begin(),
                                                      expected.end(),
                                                      actual.begin(),
                                                      actual.end(),
                                                      common.begin());
        matched_num += static_cast<size_t>(common_end - common.begin());
    }
    return static_cast<float>(matched_num) /
           (static_cast<float>(nq) * res_k);
}
// Parameter tuple for BinlogIndexTest, in the order unpacked by SetUp():
//   1. vector field data type
//   2. metric type used for the field and for index building
//   3. index type of the "real" index built and loaded by the tests
//   4. optional dense-vector interim index type; std::nullopt leaves the
//      segcore default untouched
using Param =
std::tuple<DataType,
knowhere::MetricType,
/* IndexType */ std::string,
/* DenseVectorInterminIndexType */ std::optional<std::string>>;
// Value-parameterized fixture shared by all binlog/interim index tests.
//
// SetUp() unpacks the Param tuple, creates a schema with one vector field
// ("fakevec") and one INT64 primary-key field ("counter"), and generates the
// raw vector data that the tests later load into a sealed segment and use to
// build an index.
class BinlogIndexTest : public ::testing::TestWithParam<Param> {
    // Prepares schema, field data and the raw knowhere dataset for the
    // current parameter. Throws std::runtime_error for unsupported data types.
    void
    SetUp() override {
        std::tie(
            data_type, metric_type, index_type, dense_vec_intermin_index_type) =
            GetParam();

        schema = std::make_shared<Schema>();
        vec_field_id =
            schema->AddDebugField("fakevec", data_type, data_d, metric_type);
        auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
        schema->set_primary_field_id(i64_fid);
        vec_field_data = storage::CreateFieldData(data_type, false, data_d);

        if (data_type == DataType::VECTOR_FLOAT) {
            auto vec_data = GenRandomFloatVecData(data_n, data_d);
            vec_field_data->FillFieldData(vec_data.get(), data_n);
            raw_dataset = knowhere::GenDataSet(data_n, data_d, vec_data.get());
            // The dataset is flagged as owner of the buffer, so the
            // unique_ptr gives it up right afterwards.
            // NOTE(review): assumes SetIsOwner(true) makes the dataset free
            // the tensor on destruction - confirm against knowhere.
            raw_dataset->SetIsOwner(true);
            vec_data.release();

            // Every dense interim index type except SCANN_DVR is treated as
            // keeping the raw vectors.
            const bool interim_is_scann_dvr =
                dense_vec_intermin_index_type.has_value() &&
                dense_vec_intermin_index_type.value() ==
                    knowhere::IndexEnum::INDEX_FAISS_SCANN_DVR;
            intermin_index_has_raw_data = !interim_is_scann_dvr;
        } else if (data_type == DataType::VECTOR_SPARSE_U32_F32) {
            auto sparse_vecs = GenerateRandomSparseFloatVector(data_n);
            vec_field_data->FillFieldData(sparse_vecs.get(), data_n);
            // For sparse data the dimension is taken from the filled field
            // data and replaces the dense default in data_d.
            data_d = std::dynamic_pointer_cast<
                         milvus::FieldData<milvus::SparseFloatVector>>(
                         vec_field_data)
                         ->Dim();
            raw_dataset =
                knowhere::GenDataSet(data_n, data_d, sparse_vecs.get());
            raw_dataset->SetIsOwner(true);
            raw_dataset->SetIsSparse(true);
            sparse_vecs.release();
            intermin_index_has_raw_data = false;
        } else {
            throw std::runtime_error("not implemented");
        }
    }

 public:
    // Builds a CollectionIndexMeta containing a single entry for the vector
    // field, using `index_type`, the fixture's metric type and nlist=1024.
    //
    // Side effect: also sets chunk_rows=1024 and enables the interim segment
    // index on SegcoreConfig::default_config(), so every call re-enables the
    // interim index for subsequent loads.
    // NOTE(review): the "dim" type param is hard-coded to "128" even though
    // data_d is overwritten for sparse parameters - confirm this is intended.
    IndexMetaPtr
    GetCollectionIndexMeta(std::string index_type) {
        std::map<std::string, std::string> index_params = {
            {"index_type", index_type},
            {"metric_type", metric_type},
            {"nlist", "1024"}};
        std::map<std::string, std::string> type_params = {{"dim", "128"}};
        FieldIndexMeta field_index_meta(
            vec_field_id, std::move(index_params), std::move(type_params));

        auto& config = SegcoreConfig::default_config();
        config.set_chunk_rows(1024);
        config.set_enable_interim_segment_index(true);

        std::map<FieldId, FieldIndexMeta> field_map = {
            {vec_field_id, field_index_meta}};
        IndexMetaPtr meta_ptr =
            std::make_shared<CollectionIndexMeta>(226985, std::move(field_map));
        // Return the local by name: wrapping it in std::move would only
        // inhibit copy elision.
        return meta_ptr;
    }

    // Generates data for the whole schema and loads it into `segment` via
    // insert binlogs written through the remote chunk manager.
    // NOTE(review): the trailing {vec_field_id} argument presumably excludes
    // the vector field from this load - confirm against PrepareInsertBinlog.
    void
    LoadOtherFields() {
        auto dataset = DataGen(schema, data_n);
        auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                      .GetRemoteChunkManager();
        auto load_info = PrepareInsertBinlog(kCollectionID,
                                             kPartitionID,
                                             kSegmentID,
                                             dataset,
                                             cm,
                                             "",
                                             {vec_field_id.get()});
        segment->LoadFieldData(load_info);
    }

    // Loads the vector field data prepared in SetUp() into `segment`.
    // `mmap_dir_path` is forwarded to PrepareSingleFieldInsertBinlog; the
    // default empty string is what the non-mmap tests use.
    void
    LoadVectorField(std::string mmap_dir_path = "") {
        auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                      .GetRemoteChunkManager();
        auto load_info = PrepareSingleFieldInsertBinlog(kCollectionID,
                                                        kPartitionID,
                                                        kSegmentID,
                                                        vec_field_id.get(),
                                                        {vec_field_data},
                                                        cm,
                                                        mmap_dir_path);
        segment->LoadFieldData(load_info);
    }

 protected:
    // Schema with the vector field and the INT64 primary key.
    milvus::SchemaPtr schema;
    // Values unpacked from the test parameter.
    knowhere::MetricType metric_type;
    DataType data_type;
    std::optional<std::string> dense_vec_intermin_index_type = std::nullopt;
    std::string index_type;
    // Number of rows, vector dimension (replaced for sparse data) and top-k.
    size_t data_n = 10000;
    size_t data_d = 128;
    size_t topk = 10;
    // Vector column as field data (loaded into the segment) and as a
    // knowhere dataset (used to build the index).
    milvus::FieldDataPtr vec_field_data = nullptr;
    milvus::segcore::SegmentSealedUPtr segment = nullptr;
    milvus::FieldId vec_field_id;
    knowhere::DataSetPtr raw_dataset;
    // Set in SetUp(); false for SCANN_DVR and for sparse vectors.
    bool intermin_index_has_raw_data = false;
};
// Instantiates BinlogIndexTest for four parameter tuples
// (data type, metric, index type, optional dense interim index type):
// two dense float cases that differ only in the interim index type, and two
// sparse cases that leave the interim index type unset.
INSTANTIATE_TEST_SUITE_P(
MetricTypeParameters,
BinlogIndexTest,
::testing::Values(
std::make_tuple(DataType::VECTOR_FLOAT,
knowhere::metric::L2,
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
knowhere::IndexEnum::
INDEX_FAISS_IVFFLAT_CC), // interim index keeps the raw data
std::make_tuple(
DataType::VECTOR_FLOAT,
knowhere::metric::L2,
knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
knowhere::IndexEnum::
INDEX_FAISS_SCANN_DVR), // interim index does not keep the raw data
std::make_tuple(
DataType::VECTOR_SPARSE_U32_F32,
knowhere::metric::IP,
knowhere::IndexEnum::
INDEX_SPARSE_INVERTED_INDEX, // sparse: interim index does not keep the raw data
std::nullopt),
std::make_tuple(DataType::VECTOR_SPARSE_U32_F32,
knowhere::metric::IP,
knowhere::IndexEnum::
INDEX_SPARSE_WAND, // sparse: interim index does not keep the raw data
std::nullopt)));
// Loads the vector field into a sealed segment with the interim index
// enabled, searches through that interim (binlog) index, then loads a fully
// built index and checks that its top-k results overlap the interim results
// by more than 45%.
TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) {
    IndexMetaPtr collection_index_meta = GetCollectionIndexMeta(index_type);

    segment = CreateSealedSegment(schema, collection_index_meta);
    LoadOtherFields();

    auto& segcore_config = milvus::segcore::SegcoreConfig::default_config();
    segcore_config.set_enable_interim_segment_index(true);
    if (dense_vec_intermin_index_type.has_value()) {
        segcore_config.set_dense_vector_intermin_index_type(
            dense_vec_intermin_index_type.value());
    }
    segcore_config.set_nprobe(32);

    // 1. Load the vector field data; this is expected to build the interim
    //    (binlog) index while the raw field data stays available.
    LoadVectorField();

    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));

    // 2. Search through the interim index.
    auto num_queries = 10;

    milvus::proto::plan::PlanNode plan_node;
    auto vector_anns = plan_node.mutable_vector_anns();
    // NOTE(review): FloatVector is set for the sparse parameters as well -
    // confirm the plan parser does not depend on this field.
    vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector);
    vector_anns->set_placeholder_tag("$0");
    vector_anns->set_field_id(vec_field_id.get());
    auto query_info = vector_anns->mutable_query_info();
    query_info->set_topk(topk);
    query_info->set_round_decimal(3);
    query_info->set_metric_type(metric_type);
    query_info->set_search_params(R"({"nprobe": 1024})");
    auto plan_str = plan_node.SerializeAsString();

    // Dense parameters query with freshly generated float vectors, sparse
    // parameters with a generated sparse placeholder group.
    auto ph_group_raw =
        data_type == DataType::VECTOR_FLOAT
            ? CreatePlaceholderGroupFromBlob(
                  num_queries,
                  data_d,
                  GenRandomFloatVecData(num_queries, data_d).get())
            : CreateSparseFloatPlaceholderGroup(num_queries);

    auto plan = milvus::query::CreateSearchPlanByExpr(
        schema, plan_str.data(), plan_str.size());
    auto ph_group =
        ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    // NOTE(review): `1L << 63` relies on a 64-bit `long`.
    auto binlog_index_sr =
        segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
    ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
    EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
    EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);
    EXPECT_EQ(binlog_index_sr->seg_offsets_.size(), num_queries * topk);

    // 3. Build the real index from the raw dataset, load it into the segment
    //    and compare its results against the interim index results.
    {
        milvus::index::CreateIndexInfo create_index_info;
        create_index_info.field_type = data_type;
        create_index_info.metric_type = metric_type;
        create_index_info.index_type = index_type;
        create_index_info.index_engine_version =
            knowhere::Version::GetCurrentVersion().VersionNumber();
        auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(
            create_index_info, milvus::storage::FileManagerContext());

        auto build_conf =
            knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type},
                           {knowhere::meta::DIM, std::to_string(data_d)},
                           {knowhere::indexparam::NLIST, "1024"}};
        indexing->BuildWithDataset(raw_dataset, build_conf);

        LoadIndexInfo load_info;
        load_info.field_id = vec_field_id.get();
        // Index params must be read before `indexing` is moved away below.
        load_info.index_params = GenIndexParams(indexing.get());
        load_info.cache_index =
            CreateTestCacheIndex("test", std::move(indexing));
        load_info.index_params["metric_type"] = metric_type;
        ASSERT_NO_THROW(segment->LoadIndex(load_info));
        EXPECT_TRUE(segment->HasIndex(vec_field_id));
        EXPECT_EQ(segment->get_row_count(), data_n);

        auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
        // The interim results act as the reference set for the overlap.
        auto similarity = GetKnnSearchRecall(num_queries,
                                             binlog_index_sr->seg_offsets_.data(),
                                             topk,
                                             ivf_sr->seg_offsets_.data(),
                                             topk);
        ASSERT_GT(similarity, 0.45);
    }
}
// Same scenario as the plain field-data accuracy test, but the vector field
// is loaded with the mmap directory "./data/mmap-test": search through the
// interim (binlog) index, then load a fully built index and require more
// than 45% overlap between the two top-k result sets.
TEST_P(BinlogIndexTest, AccuracyWithMapFieldData) {
    IndexMetaPtr collection_index_meta = GetCollectionIndexMeta(index_type);

    segment = CreateSealedSegment(schema, collection_index_meta);
    LoadOtherFields();

    auto& segcore_config = milvus::segcore::SegcoreConfig::default_config();
    segcore_config.set_enable_interim_segment_index(true);
    if (dense_vec_intermin_index_type.has_value()) {
        segcore_config.set_dense_vector_intermin_index_type(
            dense_vec_intermin_index_type.value());
    }
    segcore_config.set_nprobe(32);

    // 1. Load the vector field data with an mmap directory; this is expected
    //    to build the interim (binlog) index while the raw data stays
    //    available.
    LoadVectorField("./data/mmap-test");

    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));

    // 2. Search through the interim index.
    auto num_queries = 10;

    milvus::proto::plan::PlanNode plan_node;
    auto vector_anns = plan_node.mutable_vector_anns();
    // NOTE(review): FloatVector is set for the sparse parameters as well -
    // confirm the plan parser does not depend on this field.
    vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector);
    vector_anns->set_placeholder_tag("$0");
    vector_anns->set_field_id(vec_field_id.get());
    auto query_info = vector_anns->mutable_query_info();
    query_info->set_topk(topk);
    query_info->set_round_decimal(3);
    query_info->set_metric_type(metric_type);
    query_info->set_search_params(R"({"nprobe": 1024})");
    auto plan_str = plan_node.SerializeAsString();

    // Dense parameters query with freshly generated float vectors, sparse
    // parameters with a generated sparse placeholder group.
    auto ph_group_raw =
        data_type == DataType::VECTOR_FLOAT
            ? CreatePlaceholderGroupFromBlob(
                  num_queries,
                  data_d,
                  GenRandomFloatVecData(num_queries, data_d).get())
            : CreateSparseFloatPlaceholderGroup(num_queries);

    auto plan = milvus::query::CreateSearchPlanByExpr(
        schema, plan_str.data(), plan_str.size());
    auto ph_group =
        ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    // NOTE(review): `1L << 63` relies on a 64-bit `long`.
    auto binlog_index_sr =
        segment->Search(plan.get(), ph_group.get(), 1L << 63, 0);
    ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
    EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
    EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);
    EXPECT_EQ(binlog_index_sr->seg_offsets_.size(), num_queries * topk);

    // 3. Build the real index from the raw dataset, load it into the segment
    //    and compare its results against the interim index results.
    {
        milvus::index::CreateIndexInfo create_index_info;
        create_index_info.field_type = data_type;
        create_index_info.metric_type = metric_type;
        create_index_info.index_type = index_type;
        create_index_info.index_engine_version =
            knowhere::Version::GetCurrentVersion().VersionNumber();
        auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(
            create_index_info, milvus::storage::FileManagerContext());

        auto build_conf =
            knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type},
                           {knowhere::meta::DIM, std::to_string(data_d)},
                           {knowhere::indexparam::NLIST, "1024"}};
        indexing->BuildWithDataset(raw_dataset, build_conf);

        LoadIndexInfo load_info;
        load_info.field_id = vec_field_id.get();
        // Index params must be read before `indexing` is moved away below.
        load_info.index_params = GenIndexParams(indexing.get());
        load_info.cache_index =
            CreateTestCacheIndex("test", std::move(indexing));
        load_info.index_params["metric_type"] = metric_type;
        ASSERT_NO_THROW(segment->LoadIndex(load_info));
        EXPECT_TRUE(segment->HasIndex(vec_field_id));
        EXPECT_EQ(segment->get_row_count(), data_n);

        // This call uses the three-argument form of Search, unlike the
        // interim search above.
        auto ivf_sr = segment->Search(plan.get(), ph_group.get(), 1L << 63);
        // The interim results act as the reference set for the overlap.
        auto similarity = GetKnnSearchRecall(num_queries,
                                             binlog_index_sr->seg_offsets_.data(),
                                             topk,
                                             ivf_sr->seg_offsets_.data(),
                                             topk);
        ASSERT_GT(similarity, 0.45);
    }
}
// With the interim segment index switched off, loading the vector field must
// not create any index; a subsequently built and loaded index must then be
// reported by the segment.
TEST_P(BinlogIndexTest, DisableInterimIndex) {
    IndexMetaPtr index_meta = GetCollectionIndexMeta(index_type);
    segment = CreateSealedSegment(schema, index_meta);
    LoadOtherFields();

    // Disable the interim index before the vector field is loaded.
    SegcoreSetEnableInterminSegmentIndex(false);
    LoadVectorField();

    EXPECT_FALSE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));

    // Describe and create the index to build.
    milvus::index::CreateIndexInfo index_spec;
    index_spec.index_type = index_type;
    index_spec.metric_type = metric_type;
    index_spec.field_type = data_type;
    index_spec.index_engine_version =
        knowhere::Version::GetCurrentVersion().VersionNumber();
    auto vec_index = milvus::index::IndexFactory::GetInstance().CreateIndex(
        index_spec, milvus::storage::FileManagerContext());

    // Build it from the raw dataset prepared by the fixture.
    auto build_params =
        knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type},
                       {knowhere::meta::DIM, std::to_string(data_d)},
                       {knowhere::indexparam::NLIST, "1024"}};
    vec_index->BuildWithDataset(raw_dataset, build_params);

    // Hand the built index over to the segment. The params are taken from
    // the index before ownership moves into the cache wrapper.
    LoadIndexInfo index_load_info;
    index_load_info.field_id = vec_field_id.get();
    index_load_info.index_params = GenIndexParams(vec_index.get());
    index_load_info.index_params["metric_type"] = metric_type;
    index_load_info.cache_index =
        CreateTestCacheIndex("test", std::move(vec_index));

    ASSERT_NO_THROW(segment->LoadIndex(index_load_info));
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
}
// When the collection index meta asks for a FAISS IDMAP index, loading the
// raw vector field is expected to leave the segment without an index while
// the field data itself is present.
TEST_P(BinlogIndexTest, LoadBingLogWihIDMAP) {
    auto idmap_meta =
        GetCollectionIndexMeta(knowhere::IndexEnum::INDEX_FAISS_IDMAP);
    segment = CreateSealedSegment(schema, idmap_meta);

    // Scalar fields first, then the vector field.
    LoadOtherFields();
    LoadVectorField();

    EXPECT_FALSE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));
}
// Loads only the vector field (the other fields are not loaded) with the
// interim index explicitly enabled and expects no index on the segment.
// NOTE(review): despite the test name, an IDMAP collection index meta is
// passed to the segment - confirm whether a null meta was intended.
TEST_P(BinlogIndexTest, LoadBinlogWithoutIndexMeta) {
    auto idmap_meta =
        GetCollectionIndexMeta(knowhere::IndexEnum::INDEX_FAISS_IDMAP);
    segment = CreateSealedSegment(schema, idmap_meta);

    SegcoreSetEnableInterminSegmentIndex(true);
    LoadVectorField();

    EXPECT_FALSE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));
}