mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
issue: #43427 This pr's main goal is merge #37417 to milvus 2.5 without conflicts. # Main Goals 1. Create and describe collections with geospatial type 2. Insert geospatial data into the insert binlog 3. Load segments containing geospatial data into memory 4. Enable query and search can display geospatial data 5. Support using GIS funtions like ST_EQUALS in query 6. Support R-Tree index for geometry type # Solution 1. **Add Type**: Modify the Milvus core by adding a Geospatial type in both the C++ and Go code layers, defining the Geospatial data structure and the corresponding interfaces. 2. **Dependency Libraries**: Introduce necessary geospatial data processing libraries. In the C++ source code, use Conan package management to include the GDAL library. In the Go source code, add the go-geom library to the go.mod file. 3. **Protocol Interface**: Revise the Milvus protocol to provide mechanisms for Geospatial message serialization and deserialization. 4. **Data Pipeline**: Facilitate interaction between the client and proxy using the WKT format for geospatial data. The proxy will convert all data into WKB format for downstream processing, providing column data interfaces, segment encapsulation, segment loading, payload writing, and cache block management. 5. **Query Operators**: Implement simple display and support for filter queries. Initially, focus on filtering based on spatial relationships for a single column of geospatial literal values, providing parsing and execution for query expressions.Now only support brutal search 7. **Client Modification**: Enable the client to handle user input for geospatial data and facilitate end-to-end testing.Check the modification in pymilvus. --------- Signed-off-by: Yinwei Li <yinwei.li@zilliz.com> Signed-off-by: Cai Zhang <cai.zhang@zilliz.com> Co-authored-by: ZhuXi <150327960+Yinwei-Yu@users.noreply.github.com>
834 lines
32 KiB
C++
834 lines
32 KiB
C++
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
|
||
//
|
||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
|
||
// with the License. You may obtain a copy of the License at
|
||
//
|
||
// http://www.apache.org/licenses/LICENSE-2.0
|
||
//
|
||
// Unless required by applicable law or agreed to in writing, software distributed under the License
|
||
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
|
||
// or implied. See the License for the specific language governing permissions and limitations under the License
|
||
|
||
#include <gtest/gtest.h>
|
||
#include <boost/filesystem.hpp>
|
||
#include <vector>
|
||
#include <string>
|
||
#include <fstream>
|
||
|
||
#include "index/RTreeIndex.h"
|
||
#include "storage/Util.h"
|
||
#include "storage/FileManager.h"
|
||
#include "common/Types.h"
|
||
#include "test_utils/TmpPath.h"
|
||
#include "pb/schema.pb.h"
|
||
#include "pb/plan.pb.h"
|
||
#include "common/Geometry.h"
|
||
#include "common/EasyAssert.h"
|
||
#include "index/IndexFactory.h"
|
||
#include "storage/InsertData.h"
|
||
#include "storage/PayloadReader.h"
|
||
#include "storage/DiskFileManagerImpl.h"
|
||
#include "test_utils/DataGen.h"
|
||
#include "query/ExecPlanNodeVisitor.h"
|
||
#include "common/Consts.h"
|
||
#include "test_utils/storage_test_utils.h"
|
||
#include "index/Utils.h"
|
||
#include "storage/ThreadPools.h"
|
||
#include "test_utils/cachinglayer_test_utils.h"
|
||
|
||
// Helper: create simple POINT(x,y) WKB (little-endian)
//
// Layout of the returned 21-byte string:
//   [0]      byte-order marker, 0x01 = little endian
//   [1..4]   geometry type, 1 = Point, 32-bit little endian
//   [5..12]  X coordinate, 64-bit little endian
//   [13..20] Y coordinate, 64-bit little endian
//
// @param x  X coordinate of the point.
// @param y  Y coordinate of the point.
// @return   The WKB encoding as a raw byte string.
static std::string
CreatePointWKB(double x, double y) {
    // The byte-order marker below promises little-endian payload bytes, so
    // the payload must not simply mirror the host representation. Probe the
    // host byte order and reverse each field when the host is big endian.
    const uint16_t probe = 1;
    const bool host_is_little_endian =
        *reinterpret_cast<const uint8_t*>(&probe) == 1;

    std::string wkb;
    wkb.reserve(1 + sizeof(uint32_t) + 2 * sizeof(double));

    // Appends `width` bytes of `value` to `wkb`, least significant byte first.
    auto append_le = [&wkb, host_is_little_endian](const void* value,
                                                   size_t width) {
        const char* bytes = static_cast<const char*>(value);
        if (host_is_little_endian) {
            wkb.append(bytes, width);
            return;
        }
        for (size_t i = width; i > 0; --i) {
            wkb.push_back(bytes[i - 1]);
        }
    };

    // Byte order – little endian (1)
    wkb.push_back('\x01');

    // Geometry type – Point (1)
    const uint32_t geom_type = 1;
    append_le(&geom_type, sizeof(geom_type));

    // Coordinates
    append_le(&x, sizeof(x));
    append_le(&y, sizeof(y));

    return wkb;
}
|
||
|
||
// Helper: create simple WKB from WKT
|
||
static std::string
|
||
CreateWkbFromWkt(const std::string& wkt) {
|
||
auto ctx = GEOS_init_r();
|
||
auto wkb = milvus::Geometry(ctx, wkt.c_str()).to_wkb_string();
|
||
GEOS_finish_r(ctx);
|
||
return wkb;
|
||
}
|
||
|
||
// Helper: parse a WKT literal into a milvus::Geometry object.
//
// The geometry is created against a GEOS context that is initialised on first
// use and intentionally never finished. Previously the context was released
// with GEOS_finish_r() before the geometry was returned, so the caller
// received an object built on an already-destroyed context.
// NOTE(review): this assumes milvus::Geometry keeps the context handle it was
// constructed with (e.g. for predicate evaluation or destruction) — confirm
// against common/Geometry.h.
// NOTE(review): the shared handle is not synchronised; the tests in this file
// call this helper from a single thread only.
//
// @param wkt  Geometry in WKT form.
// @return     The parsed geometry.
static milvus::Geometry
CreateGeometryFromWkt(const std::string& wkt) {
    // Function-local static: created once, stays valid until process exit so
    // every returned Geometry outlives nothing it depends on.
    static const auto shared_ctx = GEOS_init_r();
    return milvus::Geometry(shared_ctx, wkt.c_str());
}
|
||
|
||
// Helper: write an InsertData parquet file to "remote" storage managed by chunk_manager_
|
||
static std::string
|
||
WriteGeometryInsertFile(const milvus::storage::ChunkManagerPtr& cm,
|
||
const milvus::storage::FieldDataMeta& field_meta,
|
||
const std::string& remote_path,
|
||
const std::vector<std::string>& wkbs,
|
||
bool nullable = false,
|
||
const uint8_t* valid_bitmap = nullptr) {
|
||
auto field_data =
|
||
milvus::storage::CreateFieldData(milvus::storage::DataType::GEOMETRY,
|
||
milvus::storage::DataType::NONE,
|
||
nullable);
|
||
if (nullable && valid_bitmap != nullptr) {
|
||
field_data->FillFieldData(wkbs.data(), valid_bitmap, wkbs.size(), 0);
|
||
} else {
|
||
field_data->FillFieldData(wkbs.data(), wkbs.size());
|
||
}
|
||
auto payload_reader =
|
||
std::make_shared<milvus::storage::PayloadReader>(field_data);
|
||
milvus::storage::InsertData insert_data(payload_reader);
|
||
insert_data.SetFieldDataMeta(field_meta);
|
||
insert_data.SetTimestamps(0, 100);
|
||
|
||
auto bytes = insert_data.Serialize(milvus::storage::StorageType::Remote);
|
||
std::vector<uint8_t> buf(bytes.begin(), bytes.end());
|
||
cm->Write(remote_path, buf.data(), buf.size());
|
||
return remote_path;
|
||
}
|
||
|
||
class RTreeIndexTest : public ::testing::Test {
|
||
protected:
|
||
void
|
||
SetUp() override {
|
||
temp_path_ = milvus::test::TmpPath{};
|
||
// create storage config that writes to temp dir
|
||
storage_config_.storage_type = "local";
|
||
storage_config_.root_path = temp_path_.get().string();
|
||
chunk_manager_ = milvus::storage::CreateChunkManager(storage_config_);
|
||
|
||
// prepare field & index meta – minimal info for DiskFileManagerImpl
|
||
field_meta_ = milvus::storage::FieldDataMeta{1, 1, 1, 100};
|
||
// set geometry data type in field schema for index schema checks
|
||
field_meta_.field_schema.set_data_type(
|
||
::milvus::proto::schema::DataType::Geometry);
|
||
index_meta_ = milvus::storage::IndexMeta{.segment_id = 1,
|
||
.field_id = 100,
|
||
.build_id = 1,
|
||
.index_version = 1};
|
||
}
|
||
|
||
void
|
||
TearDown() override {
|
||
// Clean up chunk manager files and index directories
|
||
try {
|
||
// Remove all files in the storage root path
|
||
if (chunk_manager_) {
|
||
auto root_path = storage_config_.root_path;
|
||
if (boost::filesystem::exists(root_path)) {
|
||
for (auto& entry :
|
||
boost::filesystem::directory_iterator(root_path)) {
|
||
if (boost::filesystem::is_regular_file(entry)) {
|
||
boost::filesystem::remove(entry);
|
||
} else if (boost::filesystem::is_directory(entry)) {
|
||
boost::filesystem::remove_all(entry);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
boost::filesystem::remove_all("/tmp/milvus/rtree-index/");
|
||
} catch (const std::exception& e) {
|
||
// Log error but don't fail the test
|
||
std::cout << "Warning: Failed to clean up test files: " << e.what()
|
||
<< std::endl;
|
||
}
|
||
// TmpPath destructor will also remove the temp directory
|
||
}
|
||
|
||
// Helper method to clean up index files
|
||
void
|
||
CleanupIndexFiles(const std::vector<std::string>& index_files,
|
||
const std::string& test_name = "") {
|
||
try {
|
||
for (const auto& file : index_files) {
|
||
if (chunk_manager_->Exist(file)) {
|
||
chunk_manager_->Remove(file);
|
||
}
|
||
}
|
||
} catch (const std::exception& e) {
|
||
std::cout << "Warning: Failed to clean up " << test_name
|
||
<< " index files: " << e.what() << std::endl;
|
||
}
|
||
}
|
||
|
||
milvus::storage::StorageConfig storage_config_;
|
||
milvus::storage::ChunkManagerPtr chunk_manager_;
|
||
milvus::storage::FieldDataMeta field_meta_;
|
||
milvus::storage::IndexMeta index_meta_;
|
||
milvus::test::TmpPath temp_path_;
|
||
};
|
||
|
||
// Builds a two-point index from raw WKB, uploads it, then loads the uploaded
// files into a second index instance and checks the entry count survives.
TEST_F(RTreeIndexTest, Build_Upload_Load) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // ---------- Build from raw data ----------
    std::vector<std::string> points = {CreatePointWKB(1.0, 1.0),
                                       CreatePointWKB(2.0, 2.0)};
    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(points.size(), points.data());
    ASSERT_EQ(builder.Count(), 2);

    // ---------- Upload ----------
    auto upload_stats = builder.Upload({});
    ASSERT_NE(upload_stats, nullptr);
    ASSERT_GT(upload_stats->GetIndexFiles().size(), 0);

    // ---------- Load into a fresh instance ----------
    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();

    milvus::tracer::TraceContext empty_trace;  // default-constructed context
    loader.Load(empty_trace, load_cfg);

    ASSERT_EQ(loader.Count(), 2);
}
|
||
|
||
// Verifies that Load() accepts an "index_files" list that contains only file
// names (no directory component).
TEST_F(RTreeIndexTest, Load_WithFileNamesOnly) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // Build and upload a two-point index.
    std::vector<std::string> points = {CreatePointWKB(10.0, 10.0),
                                       CreatePointWKB(20.0, 20.0)};
    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(points.size(), points.data());

    auto upload_stats = builder.Upload({});

    // Keep only the final path component of every uploaded file, and check
    // that the uploaded file exists in storage and is non-empty.
    std::vector<std::string> bare_names;
    for (const auto& remote_path : upload_stats->GetIndexFiles()) {
        const boost::filesystem::path as_path(remote_path);
        bare_names.emplace_back(as_path.filename().string());
        ASSERT_TRUE(chunk_manager_->Exist(remote_path));
        ASSERT_GT(chunk_manager_->Size(remote_path), 0);
    }

    // Load through a fresh instance using the stripped names.
    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = bare_names;  // no directory info

    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);

    ASSERT_EQ(loader.Count(), 2);
}
|
||
|
||
// Building from zero rows is expected to raise milvus::SegcoreError.
TEST_F(RTreeIndexTest, Build_EmptyInput_ShouldThrow) {
    std::vector<std::string> no_rows;

    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    milvus::index::RTreeIndex<std::string> index(build_ctx);

    EXPECT_THROW(index.BuildWithRawDataForUT(0, no_rows.data()),
                 milvus::SegcoreError);
}
|
||
|
||
// Builds from three rows where the middle one is a truncated (invalid) WKB,
// then round-trips through Upload/Load and expects only two entries.
TEST_F(RTreeIndexTest, Build_WithInvalidWKB_Upload_Load) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // Cut a valid point encoding in half to obtain an invalid WKB.
    std::string truncated = CreatePointWKB(0.0, 0.0);
    truncated.resize(truncated.size() / 2);

    std::vector<std::string> rows = {CreateWkbFromWkt("POINT(1 1)"),
                                     truncated,
                                     CreateWkbFromWkt("POINT(2 2)")};

    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(rows.size(), rows.data());

    // Upload, then load back so the count is computed by the loading side.
    auto upload_stats = builder.Upload({});

    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);

    // Only the two valid points are expected to be present.
    ASSERT_EQ(loader.Count(), 2);
}
|
||
|
||
// Indexes a mix of points, a linestring and a polygon and checks that every
// row is counted both after build and after an Upload/Load round trip.
TEST_F(RTreeIndexTest, Build_VariousGeometries) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    std::vector<std::string> shapes = {
        CreateWkbFromWkt("POINT(-1.5 2.5)"),
        CreateWkbFromWkt("LINESTRING(0 0,1 1,2 3)"),
        CreateWkbFromWkt("POLYGON((0 0,2 0,2 2,0 2,0 0))"),
        CreateWkbFromWkt("POINT(1000000 -1000000)"),
        CreateWkbFromWkt("POINT(0 0)")};

    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(shapes.size(), shapes.data());
    ASSERT_EQ(builder.Count(), shapes.size());

    auto upload_stats = builder.Upload({});
    ASSERT_FALSE(upload_stats->GetIndexFiles().empty());

    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);
    ASSERT_EQ(loader.Count(), shapes.size());
}
|
||
|
||
// Builds an index from an insert file, caches the uploaded index files on
// local disk, and checks that the local "<base>.meta.json" reports
// dimension 2.
TEST_F(RTreeIndexTest, Build_ConfigAndMetaJson) {
    // Prepare one insert file via the storage pipeline.
    std::vector<std::string> rows = {CreateWkbFromWkt("POINT(0 0)"),
                                     CreateWkbFromWkt("POINT(1 1)")};
    auto insert_file = (temp_path_.get() / "geom.parquet").string();
    WriteGeometryInsertFile(chunk_manager_, field_meta_, insert_file, rows);

    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    milvus::index::RTreeIndex<std::string> index(build_ctx);

    nlohmann::json build_cfg;
    build_cfg["insert_files"] = std::vector<std::string>{insert_file};

    index.Build(build_cfg);
    auto upload_stats = index.Upload({});

    // Cache the uploaded index files locally.
    milvus::storage::DiskFileManagerImpl disk_manager(
        {field_meta_, index_meta_, chunk_manager_});
    auto uploaded_files = upload_stats->GetIndexFiles();
    auto priority =
        milvus::index::GetValueFromConfig<milvus::proto::common::LoadPriority>(
            build_cfg, milvus::LOAD_PRIORITY)
            .value_or(milvus::proto::common::LoadPriority::HIGH);
    disk_manager.CacheIndexToDisk(uploaded_files, priority);
    auto cached_paths = disk_manager.GetLocalFilePaths();
    ASSERT_FALSE(cached_paths.empty());

    // True when `text` ends with `tail`.
    auto has_suffix = [](const std::string& text, const std::string& tail) {
        if (text.size() < tail.size()) {
            return false;
        }
        return text.compare(text.size() - tail.size(), tail.size(), tail) ==
               0;
    };

    // Returns the first cached path ending in `suffix` with that suffix
    // removed, or an empty string when none matches.
    auto base_for_suffix = [&](const std::string& suffix) -> std::string {
        for (const auto& candidate : cached_paths) {
            if (has_suffix(candidate, suffix)) {
                return candidate.substr(0, candidate.size() - suffix.size());
            }
        }
        return std::string();
    };

    // Determine the base path: prefer ".bgi", then ".meta.json", and finally
    // fall back to the first cached path unchanged.
    std::string base_path = base_for_suffix(".bgi");
    if (base_path.empty()) {
        base_path = base_for_suffix(".meta.json");
    }
    if (base_path.empty()) {
        base_path = cached_paths.front();
    }

    // Parse the local meta json.
    std::ifstream meta_stream(base_path + ".meta.json");
    ASSERT_TRUE(meta_stream.good());
    nlohmann::json meta = nlohmann::json::parse(meta_stream);
    ASSERT_EQ(meta["dimension"], 2);

    // Remove the uploaded index files created by this test.
    CleanupIndexFiles(upload_stats->GetIndexFiles(), "config test");
}
|
||
|
||
// Verifies that Load() copes with an "index_files" list in which the first
// entry is a bare file name and the remaining entries are left as uploaded.
TEST_F(RTreeIndexTest, Load_MixedFileNamesAndPaths) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // Build and upload.
    std::vector<std::string> points = {CreatePointWKB(6.0, 6.0),
                                       CreatePointWKB(7.0, 7.0)};
    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(points.size(), points.data());
    auto upload_stats = builder.Upload({});

    // Start from the uploaded list and strip the directory of entry 0 only.
    auto file_list = upload_stats->GetIndexFiles();
    ASSERT_FALSE(file_list.empty());
    const boost::filesystem::path first_entry(file_list[0]);
    file_list[0] = first_entry.filename().string();

    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = file_list;
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);
    ASSERT_EQ(loader.Count(), points.size());
}
|
||
|
||
// Loading an index file that was never written is expected to raise
// milvus::SegcoreError.
TEST_F(RTreeIndexTest, Load_NonexistentRemote_ShouldThrow) {
    // Path inside the temp directory that no test creates.
    const std::string missing_file =
        (temp_path_.get() / "does_not_exist.bgi_0").string();

    nlohmann::json load_cfg;
    load_cfg["index_files"] = std::vector<std::string>{missing_file};

    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    milvus::index::RTreeIndex<std::string> loader(load_ctx);

    milvus::tracer::TraceContext trace;
    EXPECT_THROW(loader.Load(trace, load_cfg), milvus::SegcoreError);
}
|
||
|
||
// End-to-end path: write an insert file, build the index from it via
// Build(config), upload, load into a new instance and compare counts.
TEST_F(RTreeIndexTest, Build_EndToEnd_FromInsertFiles) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // Prepare the insert file through InsertData serialization.
    std::vector<std::string> rows = {CreateWkbFromWkt("POINT(0 0)"),
                                     CreateWkbFromWkt("POINT(2 2)")};
    auto insert_file = (temp_path_.get() / "geom3.parquet").string();
    WriteGeometryInsertFile(chunk_manager_, field_meta_, insert_file, rows);

    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);

    nlohmann::json build_cfg;
    build_cfg["insert_files"] = std::vector<std::string>{insert_file};

    builder.Build(build_cfg);
    ASSERT_EQ(builder.Count(), rows.size());

    auto upload_stats = builder.Upload({});

    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);
    ASSERT_EQ(loader.Count(), rows.size());
}
|
||
|
||
// Same round trip as the end-to-end test, but with 10000 points POINT(i i)
// written to a single insert file.
TEST_F(RTreeIndexTest, Build_Upload_Load_LargeDataset) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // Generate the POINT(i i) rows.
    const size_t N = 10000;
    std::vector<std::string> rows;
    rows.reserve(N);
    for (size_t i = 0; i < N; ++i) {
        const std::string coord = std::to_string(i);
        rows.emplace_back(
            CreateWkbFromWkt("POINT(" + coord + " " + coord + ")"));
    }

    // Write one insert file through the chunk manager.
    auto insert_file = (temp_path_.get() / "geom_large.parquet").string();
    WriteGeometryInsertFile(chunk_manager_, field_meta_, insert_file, rows);

    // Build from insert_files (not via BuildWithRawDataForUT).
    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);

    nlohmann::json build_cfg;
    build_cfg["insert_files"] = std::vector<std::string>{insert_file};

    builder.Build(build_cfg);
    ASSERT_EQ(builder.Count(), static_cast<int64_t>(N));

    // Upload the index.
    auto upload_stats = builder.Upload({});
    ASSERT_GT(upload_stats->GetIndexFiles().size(), 0);

    // Load it back and verify the count.
    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);

    ASSERT_EQ(loader.Count(), static_cast<int64_t>(N));

    // Remove the uploaded index files so later tests do not see them.
    CleanupIndexFiles(upload_stats->GetIndexFiles(), "large dataset");
}
|
||
|
||
// Builds from an insert file holding five rows, one of which is a truncated
// (invalid) WKB, and expects four indexed entries before and after a
// round trip through Upload/Load.
//
// NOTE(review): despite the test name, no row is marked null here —
// WriteGeometryInsertFile is called without `nullable` or a validity bitmap.
TEST_F(RTreeIndexTest, Build_BulkLoad_Nulls_And_BadWKB) {
    using RTree = milvus::index::RTreeIndex<std::string>;

    // Rows 0, 1, 2 and 4 are valid points; row 3 is truncated below.
    std::vector<std::string> rows = {
        CreateWkbFromWkt("POINT(0 0)"),
        CreateWkbFromWkt("POINT(1 1)"),
        CreateWkbFromWkt("POINT(2 2)"),
        CreatePointWKB(3.0, 3.0),
        CreateWkbFromWkt("POINT(4 4)")};

    // Make row 3 an invalid WKB by dropping half of its bytes.
    rows[3].resize(rows[3].size() / 2);

    // Write the rows to a file under the chunk manager's root directory.
    auto insert_file = (temp_path_.get() / "geom_bulk.parquet").string();
    WriteGeometryInsertFile(chunk_manager_, field_meta_, insert_file, rows);

    // Build with the default configuration.
    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);

    nlohmann::json build_cfg;
    build_cfg["insert_files"] = std::vector<std::string>{insert_file};

    builder.Build(build_cfg);

    // The four valid rows are expected; the truncated row is expected to be
    // skipped.
    ASSERT_EQ(builder.Count(), 4);

    // Upload, load back, and verify the count is unchanged.
    auto upload_stats = builder.Upload({});
    ASSERT_GT(upload_stats->GetIndexFiles().size(), 0);

    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);

    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();

    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);
    ASSERT_EQ(loader.Count(), 4);
}
|
||
|
||
// The following two tests only exercise the coarse R-Tree query; exact geometry predicates are not evaluated here.
|
||
|
||
// Runs Equals / Intersects / Within queries against a loaded index holding
// two points and one square, and checks the returned bitmap per row id.
TEST_F(RTreeIndexTest, Query_CoarseAndExact_Equals_Intersects_Within) {
    using RTree = milvus::index::RTreeIndex<std::string>;
    using GISOp = ::milvus::proto::plan::GISFunctionFilterExpr_GISOp;

    // Simple geometries: two points and a square polygon.
    std::vector<std::string> rows;
    rows.emplace_back(CreateWkbFromWkt("POINT(0 0)"));  // id 0
    rows.emplace_back(CreateWkbFromWkt("POINT(2 2)"));  // id 1
    rows.emplace_back(CreateWkbFromWkt(
        "POLYGON((0 0, 0 3, 3 3, 3 0, 0 0))"));  // id 2 square

    // Build a small index through the UT API.
    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(rows.size(), rows.data(), {});
    ASSERT_EQ(builder.Count(), 3);

    // Upload, then load into a new instance that serves the queries.
    auto upload_stats = builder.Upload({});
    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);
    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);

    // Packs the operator and the query geometry into a Dataset and runs it.
    auto query = [&](GISOp op, const std::string& wkt) {
        auto request = std::make_shared<milvus::Dataset>();
        request->Set(milvus::index::OPERATOR_TYPE, op);
        request->Set(milvus::index::MATCH_VALUE, CreateGeometryFromWkt(wkt));
        return loader.Query(request);
    };

    // Equals against POINT(0 0).
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Equals,
            "POINT(0 0)");
        EXPECT_TRUE(hits[0]);
        EXPECT_FALSE(hits[1]);
        // The square (id 2) is reported as well: POINT(0 0) lies on it and
        // no exact spatial check has been applied at this stage.
        EXPECT_TRUE(hits[2]);
    }

    // Intersects against the unit square (0..1).
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Intersects,
            "POLYGON((0 0, 0 1, 1 1, 1 0, 0 0))");
        EXPECT_TRUE(hits[0]);   // point (0,0)
        EXPECT_FALSE(hits[1]);  // point (2,2)
        EXPECT_TRUE(hits[2]);   // big polygon
    }

    // Within against the big square itself.
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Within,
            "POLYGON((0 0, 0 3, 3 3, 3 0, 0 0))");
        // (0,0) sits on the boundary of the query polygon; this stage is
        // expected to report it.
        EXPECT_TRUE(hits[0]);

        // The same query is issued a second time to check the square
        // against itself.
        auto hits_again = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Within,
            "POLYGON((0 0, 0 3, 3 3, 3 0, 0 0))");
        EXPECT_TRUE(hits_again[2]);
    }
}
|
||
|
||
// Runs Overlaps / Contains / Touches / Crosses queries against a loaded index
// holding two overlapping squares and one disjoint square.
TEST_F(RTreeIndexTest, Query_Touches_Contains_Crosses_Overlaps) {
    using RTree = milvus::index::RTreeIndex<std::string>;
    using GISOp = ::milvus::proto::plan::GISFunctionFilterExpr_GISOp;

    // Two overlapping squares and one disjoint square.
    std::vector<std::string> rows;
    rows.emplace_back(
        CreateWkbFromWkt("POLYGON((0 0, 0 2, 2 2, 2 0, 0 0))"));  // id 0
    rows.emplace_back(CreateWkbFromWkt(
        "POLYGON((1 1, 1 3, 3 3, 3 1, 1 1))"));  // id 1 overlaps with 0
    rows.emplace_back(CreateWkbFromWkt(
        "POLYGON((4 4, 4 5, 5 5, 5 4, 4 4))"));  // id 2 disjoint

    milvus::storage::FileManagerContext build_ctx(
        field_meta_, index_meta_, chunk_manager_);
    RTree builder(build_ctx);
    builder.BuildWithRawDataForUT(rows.size(), rows.data(), {});
    ASSERT_EQ(builder.Count(), 3);

    // Upload, then load into a new instance that serves the queries.
    auto upload_stats = builder.Upload({});
    milvus::storage::FileManagerContext load_ctx(
        field_meta_, index_meta_, chunk_manager_);
    load_ctx.set_for_loading_index(true);
    RTree loader(load_ctx);
    nlohmann::json load_cfg;
    load_cfg["index_files"] = upload_stats->GetIndexFiles();
    milvus::tracer::TraceContext trace;
    loader.Load(trace, load_cfg);

    // Packs the operator and the query geometry into a Dataset and runs it.
    auto query = [&](GISOp op, const std::string& wkt) {
        auto request = std::make_shared<milvus::Dataset>();
        request->Set(milvus::index::OPERATOR_TYPE, op);
        request->Set(milvus::index::MATCH_VALUE, CreateGeometryFromWkt(wkt));
        return loader.Query(request);
    };

    // Overlaps: the query polygon overlaps both id 0 and id 1.
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Overlaps,
            "POLYGON((0.5 0.5, 0.5 2.5, 2.5 2.5, 2.5 0.5, 0.5 0.5))");
        EXPECT_TRUE(hits[0]);
        EXPECT_TRUE(hits[1]);
        EXPECT_FALSE(hits[2]);
    }

    // Contains: the query polygon spans (-1,-1)..(4,4); all three rows are
    // expected to be reported.
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Contains,
            "POLYGON(( -1 -1, -1 4, 4 4, 4 -1, -1 -1))");
        EXPECT_TRUE(hits[0]);
        EXPECT_TRUE(hits[1]);
        EXPECT_TRUE(hits[2]);
    }

    // Touches: the query square has a corner at (2,2). Only require that at
    // least one of id 0 / id 1 is reported.
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Touches,
            "POLYGON((2 2, 2 3, 3 3, 3 2, 2 2))");
        EXPECT_TRUE(hits[0] || hits[1]);
    }

    // Crosses: a horizontal segment running through the first polygon.
    {
        auto hits = query(
            ::milvus::proto::plan::GISFunctionFilterExpr_GISOp_Crosses,
            "LINESTRING( -1 1, 3 1 )");
        EXPECT_TRUE(hits[0]);
    }
}
|
||
|
||
// Loads controlled geometry data and an R-Tree index into a sealed segment,
// then evaluates GIS filter expressions through the query executor and checks
// every row of the resulting bitset.
TEST_F(RTreeIndexTest, GIS_Index_Exact_Filtering) {
    using namespace milvus;
    using namespace milvus::query;
    using namespace milvus::segcore;

    // 1) Schema: id (INT64, primary), vector, geometry.
    auto schema = std::make_shared<Schema>();
    auto pk_id = schema->AddDebugField("id", DataType::INT64);
    auto dim = 16;
    auto vec_id = schema->AddDebugField(
        "vec", DataType::VECTOR_FLOAT, dim, knowhere::metric::L2);
    auto geo_id = schema->AddDebugField("geo", DataType::GEOMETRY);
    schema->set_primary_field_id(pk_id);

    int N = 200;
    int num_iters = 1;
    auto full_ds = DataGen(schema, N * num_iters);
    auto sealed =
        CreateSealedWithFieldDataLoaded(schema, full_ds, false, {geo_id.get()});

    // Controlled geometry rows: the shape of row i is chosen by i % 4.
    const char* const shape_by_remainder[4] = {
        "POINT(0 0)",
        "POLYGON((-1 -1,1 -1,1 1,-1 1,-1 -1))",
        "POLYGON((10 10,20 10,20 20,10 20,10 10))",
        "LINESTRING(-1 0,1 0)"};

    std::vector<std::string> wkbs;
    wkbs.reserve(N * num_iters);
    auto geos_ctx = GEOS_init_r();
    for (int i = 0; i < N * num_iters; ++i) {
        wkbs.emplace_back(
            milvus::Geometry(geos_ctx, shape_by_remainder[i % 4])
                .to_wkb_string());
    }

    // The context is no longer needed once all WKB strings exist.
    GEOS_finish_r(geos_ctx);

    // Load the controlled geometry column into the sealed segment.
    auto geo_field_data =
        milvus::storage::CreateFieldData(milvus::storage::DataType::GEOMETRY,
                                         milvus::storage::DataType::NONE,
                                         false);
    geo_field_data->FillFieldData(wkbs.data(), wkbs.size());

    auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                  .GetRemoteChunkManager();
    auto load_info = PrepareSingleFieldInsertBinlog(
        1, 1, 1, geo_id.get(), {geo_field_data}, cm);
    sealed->LoadFieldData(load_info);

    // 2) Build R-Tree index files from an insert file holding the same rows.
    auto remote_file = (temp_path_.get() / "rtree_e2e.parquet").string();
    WriteGeometryInsertFile(chunk_manager_, field_meta_, remote_file, wkbs);

    milvus::storage::FileManagerContext fm_ctx(
        field_meta_, index_meta_, chunk_manager_);
    auto rtree_index =
        std::make_unique<milvus::index::RTreeIndex<std::string>>(fm_ctx);
    nlohmann::json build_cfg;
    build_cfg["insert_files"] = std::vector<std::string>{remote_file};
    build_cfg["index_type"] = milvus::index::RTREE_INDEX_TYPE;

    rtree_index->Build(build_cfg);
    auto stats = rtree_index->Upload({});

    // Describe the index for the sealed segment.
    milvus::segcore::LoadIndexInfo info{};
    info.collection_id = 1;
    info.partition_id = 1;
    info.segment_id = 1;
    info.field_id = geo_id.get();
    info.field_type = DataType::GEOMETRY;
    info.index_id = 1;
    info.index_build_id = 1;
    info.index_version = 1;
    info.schema = proto::schema::FieldSchema();
    info.schema.set_data_type(proto::schema::DataType::Geometry);
    info.index_params["index_type"] = milvus::index::RTREE_INDEX_TYPE;

    // Load the uploaded files into the same index object, then hand that
    // object to the segment through a test cache entry.
    nlohmann::json cfg_load;
    cfg_load["index_files"] = stats->GetIndexFiles();
    milvus::tracer::TraceContext trace_ctx_load;
    rtree_index->Load(trace_ctx_load, cfg_load);

    info.cache_index =
        CreateTestCacheIndex("rtree_index_key", std::move(rtree_index));
    sealed->LoadIndex(info);

    // 3) Evaluate a GIS filter over the segment and compare each bit with the
    //    expectation for that row.
    auto test_op = [&](const std::string& wkt,
                       proto::plan::GISFunctionFilterExpr_GISOp op,
                       std::function<bool(int)> expected) {
        auto gis_expr = std::make_shared<milvus::expr::GISFunctionFilterExpr>(
            milvus::expr::ColumnInfo(geo_id, DataType::GEOMETRY), op, wkt);
        auto plan = std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID,
                                                           gis_expr);
        BitsetType bits =
            ExecuteQueryExpr(plan, sealed.get(), N * num_iters, MAX_TIMESTAMP);
        ASSERT_EQ(bits.size(), N * num_iters);
        for (int row = 0; row < N * num_iters; ++row) {
            EXPECT_EQ(bool(bits[row]), expected(row)) << "i=" << row;
        }
    };

    // Within the polygon around the origin: every shape except the far
    // polygon (remainder 2).
    test_op("POLYGON((-2 -2,2 -2,2 2,-2 2,-2 -2))",
            proto::plan::GISFunctionFilterExpr_GISOp_Within,
            [](int i) { return i % 4 != 2; });

    // Intersects POINT(0 0): the point itself, the polygon containing it and
    // the line through it.
    test_op("POINT(0 0)",
            proto::plan::GISFunctionFilterExpr_GISOp_Intersects,
            [](int i) { return i % 4 != 2; });

    // Equals POINT(0 0): only the point rows.
    test_op("POINT(0 0)",
            proto::plan::GISFunctionFilterExpr_GISOp_Equals,
            [](int i) { return i % 4 == 0; });

    // Explicit cleanup: release the sealed segment first, then remove the
    // uploaded index files.
    sealed.reset();
    CleanupIndexFiles(stats->GetIndexFiles(), "GIS filtering test");
}