mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
issue: #42942. This PR includes the following changes: 1. Added checks to the index checker in querycoord to generate drop-index tasks. 2. Added a drop-index interface to querynode. 3. To avoid search failures after an index is dropped, the querynode allows lazy mode (warmup=disable) to load raw data even when the index contains raw data. 4. In segcore, loading an index no longer deletes raw data; it evicts it instead. 5. In expr, the index is pinned to prevent concurrency errors. --------- Signed-off-by: sunby <sunbingyi1992@gmail.com>
462 lines
19 KiB
C++
462 lines
19 KiB
C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
|
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
|
// or implied. See the License for the specific language governing permissions and limitations under the License
|
|
|
|
#include <gtest/gtest.h>
|
|
#include <boost/format.hpp>
|
|
#include <optional>
|
|
|
|
#include "index/IndexFactory.h"
|
|
#include "pb/plan.pb.h"
|
|
#include "query/Plan.h"
|
|
#include "segcore/segcore_init_c.h"
|
|
#include "segcore/SegmentSealed.h"
|
|
|
|
#include "test_utils/cachinglayer_test_utils.h"
|
|
#include "test_utils/DataGen.h"
|
|
#include "test_utils/storage_test_utils.h"
|
|
|
|
using namespace milvus;
|
|
using namespace milvus::segcore;
|
|
|
|
// Generates `rows * dim` pseudo-random floats to be used as dense test
// vectors.
//
// Every element is an integer value drawn uniformly from [0, 100] and stored
// as a float. The sequence is fully determined by `seed`, so two calls with
// the same arguments produce identical data.
//
// @param rows number of vectors to generate
// @param dim  number of components per vector
// @param seed seed for the Mersenne Twister engine
// @return heap buffer holding `rows * dim` floats in row-major order; an
//         empty buffer when `rows` or `dim` is not positive
std::unique_ptr<float[]>
GenRandomFloatVecData(int rows, int dim, int seed = 42) {
    // Compute the element count in size_t: the product of two ints can
    // overflow `int` long before it overflows the allocation size type.
    const size_t total =
        (rows > 0 && dim > 0)
            ? static_cast<size_t>(rows) * static_cast<size_t>(dim)
            : 0;

    auto vecs = std::make_unique<float[]>(total);
    std::mt19937 rng(seed);
    // Integer bounds on purpose: the distribution is an integer one, so
    // passing floating-point literals only hid an implicit conversion.
    std::uniform_int_distribution<> distrib(0, 100);
    for (size_t i = 0; i < total; ++i) {
        vecs[i] = static_cast<float>(distrib(rng));
    }
    return vecs;
}
|
|
|
|
// Computes the recall of a k-NN result set against a ground-truth set.
//
// For each of the `nq` queries the first min(gt_k, res_k) ground-truth ids
// are intersected with the `res_k` returned ids; the total number of common
// ids is divided by `nq * res_k`.
//
// @param nq      number of queries (rows in both id arrays)
// @param gt_ids  ground-truth ids, `nq` rows of `gt_k` entries each
// @param gt_k    row stride of `gt_ids`
// @param res_ids result ids, `nq` rows of `res_k` entries each
// @param res_k   row stride of `res_ids`
// @return recall in [0, 1]; 0 when there is nothing to compare
inline float
GetKnnSearchRecall(
    size_t nq, int64_t* gt_ids, size_t gt_k, int64_t* res_ids, size_t res_k) {
    // Nothing to compare: avoid the 0 / 0 division that would yield NaN.
    if (nq == 0 || res_k == 0) {
        return 0.0f;
    }

    // Never read more ground-truth ids than one row actually holds. Using
    // `res_k` unconditionally would run into the next row (or past the end of
    // the buffer for the last row) whenever res_k > gt_k.
    const size_t gt_take = std::min(gt_k, res_k);

    size_t matched_num = 0;
    for (size_t i = 0; i < nq; ++i) {
        const int64_t* gt_row = gt_ids + i * gt_k;
        const int64_t* res_row = res_ids + i * res_k;
        std::vector<int64_t> expected(gt_row, gt_row + gt_take);
        std::vector<int64_t> actual(res_row, res_row + res_k);

        // set_intersection requires both ranges to be sorted.
        std::sort(expected.begin(), expected.end());
        std::sort(actual.begin(), actual.end());

        std::vector<int64_t> common(std::max(expected.size(), actual.size()));
        const auto common_end = std::set_intersection(expected.begin(),
                                                      expected.end(),
                                                      actual.begin(),
                                                      actual.end(),
                                                      common.begin());
        matched_num += static_cast<size_t>(common_end - common.begin());
    }
    return static_cast<float>(matched_num) / (static_cast<float>(nq) * res_k);
}
|
|
|
|
using Param =
|
|
std::tuple<DataType,
|
|
knowhere::MetricType,
|
|
/* IndexType */ std::string,
|
|
/* DenseVectorInterminIndexType*/ std::optional<std::string>>;
|
|
// Parameterized fixture for the binlog (interim) index tests.
//
// SetUp() builds a two-field schema (the vector field "fakevec" plus the
// INT64 primary key "counter") and generates `data_n` vectors of the
// parameterized data type. The same vectors are kept twice: as field data
// (`vec_field_data`) for loading into a sealed segment, and as a knowhere
// dataset (`raw_dataset`) for building an index directly.
class BinlogIndexTest : public ::testing::TestWithParam<Param> {
 protected:
    // Unpacks the test parameters and prepares schema and vector data.
    // Throws std::runtime_error for data types other than VECTOR_FLOAT and
    // VECTOR_SPARSE_U32_F32.
    void
    SetUp() override {
        std::tie(
            data_type, metric_type, index_type, dense_vec_intermin_index_type) =
            GetParam();

        schema = std::make_shared<Schema>();

        vec_field_id =
            schema->AddDebugField("fakevec", data_type, data_d, metric_type);
        auto i64_fid = schema->AddDebugField("counter", DataType::INT64);
        schema->set_primary_field_id(i64_fid);
        vec_field_data = storage::CreateFieldData(data_type, false, data_d);

        if (data_type == DataType::VECTOR_FLOAT) {
            auto vec_data = GenRandomFloatVecData(data_n, data_d);
            vec_field_data->FillFieldData(vec_data.get(), data_n);
            raw_dataset = knowhere::GenDataSet(data_n, data_d, vec_data.get());
            // The dataset is flagged as owner of the buffer, so the
            // unique_ptr must give it up to avoid a double free.
            raw_dataset->SetIsOwner(true);
            vec_data.release();

            // Every dense interim index type except SCANN_DVR is treated as
            // keeping the raw vectors.
            const bool is_scann_dvr =
                dense_vec_intermin_index_type.has_value() &&
                dense_vec_intermin_index_type.value() ==
                    knowhere::IndexEnum::INDEX_FAISS_SCANN_DVR;
            intermin_index_has_raw_data = !is_scann_dvr;
        } else if (data_type == DataType::VECTOR_SPARSE_U32_F32) {
            auto sparse_vecs = GenerateRandomSparseFloatVector(data_n);
            vec_field_data->FillFieldData(sparse_vecs.get(), data_n);
            // For sparse data the dimension is taken from the filled field
            // data rather than from the default `data_d`.
            data_d = std::dynamic_pointer_cast<
                         milvus::FieldData<milvus::SparseFloatVector>>(
                         vec_field_data)
                         ->Dim();
            raw_dataset =
                knowhere::GenDataSet(data_n, data_d, sparse_vecs.get());
            // Same ownership hand-over as in the dense branch.
            raw_dataset->SetIsOwner(true);
            raw_dataset->SetIsSparse(true);
            sparse_vecs.release();
            intermin_index_has_raw_data = false;
        } else {
            throw std::runtime_error("not implemented");
        }
    }

 public:
    // Builds collection index meta that declares `index_type` (with the
    // fixture's metric type and nlist=1024) for the vector field.
    //
    // Side effect: sets chunk rows to 1024 and enables the interim segment
    // index on the process-wide default SegcoreConfig.
    //
    // NOTE: the parameter intentionally shadows the `index_type` member so
    // that tests can request a different index type than the parameterized
    // one.
    IndexMetaPtr
    GetCollectionIndexMeta(std::string index_type) {
        std::map<std::string, std::string> index_params = {
            {"index_type", index_type},
            {"metric_type", metric_type},
            {"nlist", "1024"}};
        std::map<std::string, std::string> type_params = {{"dim", "128"}};
        FieldIndexMeta field_index_meta(
            vec_field_id, std::move(index_params), std::move(type_params));

        auto& config = SegcoreConfig::default_config();
        config.set_chunk_rows(1024);
        config.set_enable_interim_segment_index(true);

        std::map<FieldId, FieldIndexMeta> field_map = {
            {vec_field_id, field_index_meta}};
        // NOTE(review): 226985 is the first CollectionIndexMeta constructor
        // argument; presumably a row-count limit - confirm against its
        // declaration.
        IndexMetaPtr meta_ptr =
            std::make_shared<CollectionIndexMeta>(226985, std::move(field_map));
        // Return the local by name: wrapping it in std::move would only
        // inhibit copy elision.
        return meta_ptr;
    }

    // Generates `data_n` rows for the schema and loads them into `segment`
    // through insert binlogs. The vector field id is passed as the trailing
    // PrepareInsertBinlog argument (presumably a list of fields to leave
    // out - confirm against the helper).
    void
    LoadOtherFields() {
        auto dataset = DataGen(schema, data_n);
        auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                      .GetRemoteChunkManager();
        auto load_info = PrepareInsertBinlog(kCollectionID,
                                             kPartitionID,
                                             kSegmentID,
                                             dataset,
                                             cm,
                                             "",
                                             {vec_field_id.get()});
        segment->LoadFieldData(load_info);
    }

    // Loads the pre-generated vector field data into `segment`.
    //
    // @param mmap_dir_path forwarded to PrepareSingleFieldInsertBinlog; the
    //        default empty string is what the non-mmap tests use
    void
    LoadVectorField(std::string mmap_dir_path = "") {
        auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                      .GetRemoteChunkManager();
        auto load_info = PrepareSingleFieldInsertBinlog(kCollectionID,
                                                        kPartitionID,
                                                        kSegmentID,
                                                        vec_field_id.get(),
                                                        {vec_field_data},
                                                        cm,
                                                        mmap_dir_path);
        segment->LoadFieldData(load_info);
    }

 protected:
    milvus::SchemaPtr schema;
    knowhere::MetricType metric_type;
    DataType data_type;
    std::optional<std::string> dense_vec_intermin_index_type = std::nullopt;
    std::string index_type;
    // Number of generated rows.
    size_t data_n = 10000;
    // Vector dimension; overwritten in SetUp() for sparse data.
    size_t data_d = 128;
    size_t topk = 10;
    milvus::FieldDataPtr vec_field_data = nullptr;
    // Created by each test body, not by SetUp().
    milvus::segcore::SegmentSealedUPtr segment = nullptr;
    milvus::FieldId vec_field_id;
    knowhere::DataSetPtr raw_dataset;
    // Assigned in SetUp(); initialized so it is never read indeterminate.
    bool intermin_index_has_raw_data = false;
};
|
|
|
|
// Parameter sets for BinlogIndexTest:
// (data type, metric type, index type, dense interim index type).
INSTANTIATE_TEST_SUITE_P(
    MetricTypeParameters,
    BinlogIndexTest,
    ::testing::Values(
        // Dense float vectors, interim index keeps the raw data.
        std::make_tuple(DataType::VECTOR_FLOAT,
                        knowhere::metric::L2,
                        knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
                        knowhere::IndexEnum::INDEX_FAISS_IVFFLAT_CC),
        // Dense float vectors, interim index does not keep the raw data.
        std::make_tuple(DataType::VECTOR_FLOAT,
                        knowhere::metric::L2,
                        knowhere::IndexEnum::INDEX_FAISS_IVFFLAT,
                        knowhere::IndexEnum::INDEX_FAISS_SCANN_DVR),
        // Sparse vectors with an inverted index; interim index does not keep
        // the raw data.
        std::make_tuple(DataType::VECTOR_SPARSE_U32_F32,
                        knowhere::metric::IP,
                        knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX,
                        std::nullopt),
        // Sparse vectors with a WAND index; interim index does not keep the
        // raw data.
        std::make_tuple(DataType::VECTOR_SPARSE_U32_F32,
                        knowhere::metric::IP,
                        knowhere::IndexEnum::INDEX_SPARSE_WAND,
                        std::nullopt)));
|
|
|
|
// Loads the vector field into a sealed segment with the interim index
// enabled, searches through that interim (binlog) index, then builds and
// loads the parameterized index and checks that both searches return largely
// the same neighbours.
TEST_P(BinlogIndexTest, AccuracyWithLoadFieldData) {
    IndexMetaPtr collection_index_meta = GetCollectionIndexMeta(index_type);

    segment = CreateSealedSegment(schema, collection_index_meta);
    LoadOtherFields();

    auto& segcore_config = milvus::segcore::SegcoreConfig::default_config();
    segcore_config.set_enable_interim_segment_index(true);
    if (dense_vec_intermin_index_type.has_value()) {
        segcore_config.set_dense_vector_intermin_index_type(
            dense_vec_intermin_index_type.value());
    }
    segcore_config.set_nprobe(32);

    // 1. load field data, and build binlog index for binlog data
    LoadVectorField();

    // assert segment has been built binlog index
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));

    // 2. search binlog index
    const int num_queries = 10;
    // Search timestamp with only the top bit set. The literal is unsigned:
    // `1L << 63` shifts into the sign bit of a signed long, which is
    // implementation-defined at best and undefined where long is 32 bits.
    const auto search_ts = 1ULL << 63;

    milvus::proto::plan::PlanNode plan_node;
    auto vector_anns = plan_node.mutable_vector_anns();
    // NOTE(review): FloatVector is set for the sparse parameter sets too;
    // presumably the field's schema type takes precedence - confirm.
    vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector);
    vector_anns->set_placeholder_tag("$0");
    vector_anns->set_field_id(vec_field_id.get());
    auto query_info = vector_anns->mutable_query_info();
    query_info->set_topk(topk);
    query_info->set_round_decimal(3);
    query_info->set_metric_type(metric_type);
    query_info->set_search_params(R"({"nprobe": 1024})");
    auto plan_str = plan_node.SerializeAsString();

    auto ph_group_raw =
        data_type == DataType::VECTOR_FLOAT
            ? CreatePlaceholderGroupFromBlob(
                  num_queries,
                  data_d,
                  GenRandomFloatVecData(num_queries, data_d).get())
            : CreateSparseFloatPlaceholderGroup(num_queries);

    auto plan = milvus::query::CreateSearchPlanByExpr(
        schema, plan_str.data(), plan_str.size());
    auto ph_group =
        ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    auto binlog_index_sr =
        segment->Search(plan.get(), ph_group.get(), search_ts, 0);
    ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
    EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
    EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);
    EXPECT_EQ(binlog_index_sr->seg_offsets_.size(), num_queries * topk);

    // 3. update vector index
    milvus::index::CreateIndexInfo create_index_info;
    create_index_info.field_type = data_type;
    create_index_info.metric_type = metric_type;
    create_index_info.index_type = index_type;
    create_index_info.index_engine_version =
        knowhere::Version::GetCurrentVersion().VersionNumber();
    auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(
        create_index_info, milvus::storage::FileManagerContext());

    auto build_conf =
        knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type},
                       {knowhere::meta::DIM, std::to_string(data_d)},
                       {knowhere::indexparam::NLIST, "1024"}};
    indexing->BuildWithDataset(raw_dataset, build_conf);

    LoadIndexInfo load_info;
    load_info.field_id = vec_field_id.get();
    // Read the params before `indexing` is moved into the cache wrapper.
    load_info.index_params = GenIndexParams(indexing.get());
    load_info.cache_index = CreateTestCacheIndex("test", std::move(indexing));
    load_info.index_params["metric_type"] = metric_type;
    ASSERT_NO_THROW(segment->LoadIndex(load_info));
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);

    // The interim-index result serves as the reference for the recall of
    // the loaded index.
    auto ivf_sr = segment->Search(plan.get(), ph_group.get(), search_ts, 0);
    auto recall = GetKnnSearchRecall(num_queries,
                                     binlog_index_sr->seg_offsets_.data(),
                                     topk,
                                     ivf_sr->seg_offsets_.data(),
                                     topk);
    ASSERT_GT(recall, 0.45);
}
|
|
|
|
// Same scenario as the plain load-field-data accuracy test, except that the
// vector field is loaded with the mmap directory "./data/mmap-test": search
// through the interim (binlog) index, then build and load the parameterized
// index and compare the two result sets.
TEST_P(BinlogIndexTest, AccuracyWithMapFieldData) {
    IndexMetaPtr collection_index_meta = GetCollectionIndexMeta(index_type);

    segment = CreateSealedSegment(schema, collection_index_meta);
    LoadOtherFields();

    auto& segcore_config = milvus::segcore::SegcoreConfig::default_config();
    segcore_config.set_enable_interim_segment_index(true);
    if (dense_vec_intermin_index_type.has_value()) {
        segcore_config.set_dense_vector_intermin_index_type(
            dense_vec_intermin_index_type.value());
    }
    segcore_config.set_nprobe(32);

    // 1. load field data, and build binlog index for binlog data
    LoadVectorField("./data/mmap-test");

    // assert segment has been built binlog index
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));

    // 2. search binlog index
    const int num_queries = 10;
    // Search timestamp with only the top bit set. The literal is unsigned:
    // `1L << 63` shifts into the sign bit of a signed long, which is
    // implementation-defined at best and undefined where long is 32 bits.
    const auto search_ts = 1ULL << 63;

    milvus::proto::plan::PlanNode plan_node;
    auto vector_anns = plan_node.mutable_vector_anns();
    // NOTE(review): FloatVector is set for the sparse parameter sets too;
    // presumably the field's schema type takes precedence - confirm.
    vector_anns->set_vector_type(milvus::proto::plan::VectorType::FloatVector);
    vector_anns->set_placeholder_tag("$0");
    vector_anns->set_field_id(vec_field_id.get());

    auto query_info = vector_anns->mutable_query_info();
    query_info->set_topk(topk);
    query_info->set_round_decimal(3);
    query_info->set_metric_type(metric_type);
    query_info->set_search_params(R"({"nprobe": 1024})");
    auto plan_str = plan_node.SerializeAsString();

    auto ph_group_raw =
        data_type == DataType::VECTOR_FLOAT
            ? CreatePlaceholderGroupFromBlob(
                  num_queries,
                  data_d,
                  GenRandomFloatVecData(num_queries, data_d).get())
            : CreateSparseFloatPlaceholderGroup(num_queries);

    auto plan = milvus::query::CreateSearchPlanByExpr(
        schema, plan_str.data(), plan_str.size());
    auto ph_group =
        ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());

    auto binlog_index_sr =
        segment->Search(plan.get(), ph_group.get(), search_ts, 0);
    ASSERT_EQ(binlog_index_sr->total_nq_, num_queries);
    EXPECT_EQ(binlog_index_sr->unity_topK_, topk);
    EXPECT_EQ(binlog_index_sr->distances_.size(), num_queries * topk);
    EXPECT_EQ(binlog_index_sr->seg_offsets_.size(), num_queries * topk);

    // 3. update vector index
    milvus::index::CreateIndexInfo create_index_info;
    create_index_info.field_type = data_type;
    create_index_info.metric_type = metric_type;
    create_index_info.index_type = index_type;
    create_index_info.index_engine_version =
        knowhere::Version::GetCurrentVersion().VersionNumber();
    auto indexing = milvus::index::IndexFactory::GetInstance().CreateIndex(
        create_index_info, milvus::storage::FileManagerContext());

    auto build_conf =
        knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type},
                       {knowhere::meta::DIM, std::to_string(data_d)},
                       {knowhere::indexparam::NLIST, "1024"}};
    indexing->BuildWithDataset(raw_dataset, build_conf);

    LoadIndexInfo load_info;
    load_info.field_id = vec_field_id.get();
    // Read the params before `indexing` is moved into the cache wrapper.
    load_info.index_params = GenIndexParams(indexing.get());
    load_info.cache_index = CreateTestCacheIndex("test", std::move(indexing));
    load_info.index_params["metric_type"] = metric_type;
    ASSERT_NO_THROW(segment->LoadIndex(load_info));
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);

    // The interim-index result serves as the reference for the recall of
    // the loaded index.
    auto ivf_sr = segment->Search(plan.get(), ph_group.get(), search_ts);
    auto recall = GetKnnSearchRecall(num_queries,
                                     binlog_index_sr->seg_offsets_.data(),
                                     topk,
                                     ivf_sr->seg_offsets_.data(),
                                     topk);
    ASSERT_GT(recall, 0.45);
}
|
|
|
|
// With the interim segment index switched off, loading the vector field must
// leave the segment without an index; an explicitly built index can still be
// loaded afterwards.
TEST_P(BinlogIndexTest, DisableInterimIndex) {
    auto index_meta = GetCollectionIndexMeta(index_type);
    segment = CreateSealedSegment(schema, index_meta);
    LoadOtherFields();

    // Disable only after the meta helper ran: it switches the flag on.
    SegcoreSetEnableInterminSegmentIndex(false);
    LoadVectorField();

    // Raw field data is present, but no interim index was built.
    EXPECT_FALSE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));

    // Build the parameterized index directly from the raw dataset.
    milvus::index::CreateIndexInfo index_info;
    index_info.field_type = data_type;
    index_info.metric_type = metric_type;
    index_info.index_type = index_type;
    index_info.index_engine_version =
        knowhere::Version::GetCurrentVersion().VersionNumber();
    auto built_index = milvus::index::IndexFactory::GetInstance().CreateIndex(
        index_info, milvus::storage::FileManagerContext());

    auto build_conf =
        knowhere::Json{{knowhere::meta::METRIC_TYPE, metric_type},
                       {knowhere::meta::DIM, std::to_string(data_d)},
                       {knowhere::indexparam::NLIST, "1024"}};
    built_index->BuildWithDataset(raw_dataset, build_conf);

    // Hand the built index over to the segment.
    LoadIndexInfo index_load_info;
    index_load_info.field_id = vec_field_id.get();
    index_load_info.index_params = GenIndexParams(built_index.get());
    index_load_info.cache_index =
        CreateTestCacheIndex("test", std::move(built_index));
    index_load_info.index_params["metric_type"] = metric_type;

    ASSERT_NO_THROW(segment->LoadIndex(index_load_info));
    EXPECT_TRUE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
}
|
|
|
|
// When the collection index meta declares an IDMAP index, loading the vector
// field is expected to keep the raw data without building an interim index.
TEST_P(BinlogIndexTest, LoadBingLogWihIDMAP) {
    auto idmap_meta =
        GetCollectionIndexMeta(knowhere::IndexEnum::INDEX_FAISS_IDMAP);
    segment = CreateSealedSegment(schema, idmap_meta);

    LoadOtherFields();
    LoadVectorField();

    EXPECT_FALSE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));
}
|
|
|
|
// Loads only the vector field (no other fields) into a segment whose index
// meta declares IDMAP, with the interim segment index explicitly enabled, and
// expects raw data but no index afterwards.
TEST_P(BinlogIndexTest, LoadBinlogWithoutIndexMeta) {
    auto idmap_meta =
        GetCollectionIndexMeta(knowhere::IndexEnum::INDEX_FAISS_IDMAP);
    segment = CreateSealedSegment(schema, idmap_meta);

    SegcoreSetEnableInterminSegmentIndex(true);
    LoadVectorField();

    EXPECT_FALSE(segment->HasIndex(vec_field_id));
    EXPECT_EQ(segment->get_row_count(), data_n);
    EXPECT_TRUE(segment->HasFieldData(vec_field_id));
}
|