enhance: align the behavior of exist expr between brute force and index (#44030)

https://github.com/milvus-io/milvus/issues/44031

---------

Signed-off-by: sunby <sunbingyi1992@gmail.com>
This commit is contained in:
Bingyi Sun 2025-09-01 15:03:52 +08:00 committed by GitHub
parent e2f34d7b78
commit c420e7bd27
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 153 additions and 4 deletions

View File

@ -40,6 +40,12 @@
#include "rapidjson/stringbuffer.h"
namespace milvus {
// Forward declarations of the JSON emptiness helpers. Their inline
// definitions appear later in this file, after class Json, whose exist()
// method calls them.
//
// isObjectEmpty: true when `value` is JSON null, or an object/array whose
// members are all empty by the same rule; false for any other scalar.
bool
isObjectEmpty(simdjson::ondemand::value value);
// isDocEmpty: the same emptiness rule applied to a whole on-demand document.
bool
isDocEmpty(simdjson::ondemand::document document);
// Function to extract specific keys and convert them to JSON.
// rapidjson is used here because it is suitable for extracting and
// re-serializing values, whereas simdjson is not suitable for serialization.
@ -210,10 +216,12 @@ class Json {
exist(std::string_view pointer) const {
auto doc = this->doc();
if (pointer.empty()) {
return doc.error() == simdjson::SUCCESS && !doc.is_null();
return doc.error() == simdjson::SUCCESS &&
!isDocEmpty(std::move(doc));
} else {
auto res = doc.at_pointer(pointer);
return res.error() == simdjson::SUCCESS && !res.is_null();
return res.error() == simdjson::SUCCESS &&
!isObjectEmpty(res.value());
}
}
@ -308,4 +316,61 @@ class Json {
own_data_{}; // this could be empty, in which case the Json is just a view on bytes
simdjson::padded_string_view data_{};
};
// Shared implementation behind isObjectEmpty() and isDocEmpty().
//
// `Node` is either simdjson::ondemand::value or simdjson::ondemand::document.
// Only is_null(), type(), get_object() and get_array() are used, which are
// exactly the calls the two previously duplicated walks made on those types.
//
// A node is "empty" when it is JSON null, or when it is an object/array whose
// members are all empty by this same rule. An object or array with no members
// is therefore empty, and so is one that nests only empty containers or
// nulls. Any other scalar (number, string, boolean) is non-empty.
//
// Child values are checked through isObjectEmpty(), which is declared near the
// top of this file, so the recursion goes through the public entry point.
//
// NOTE(review): the simdjson results are unwrapped via `.value()` and implicit
// conversions; presumably these throw simdjson::simdjson_error on malformed
// input when exceptions are enabled -- confirm against the simdjson build.
template <typename Node>
inline bool
isJsonNodeEmpty(Node& node) {
    if (node.is_null()) {
        return true;
    }
    // Read the type once; both container branches below depend on it.
    const auto node_type = node.type().value();
    if (node_type == simdjson::ondemand::json_type::object) {
        auto object = node.get_object();
        for (auto field : object) {
            // Keys are irrelevant here: only member values decide emptiness.
            if (!isObjectEmpty(field.value())) {
                return false;
            }
        }
        return true;
    }
    if (node_type == simdjson::ondemand::json_type::array) {
        auto array = node.get_array();
        for (auto element : array) {
            if (!isObjectEmpty(std::move(element))) {
                return false;
            }
        }
        return true;
    }
    // Non-null scalar: counts as present.
    return false;
}

// Returns true when `value` is JSON null or a (possibly nested) object/array
// that contains nothing but empty members; returns false for other scalars.
// `value` is taken by value and is consumed by the walk.
inline bool
isObjectEmpty(simdjson::ondemand::value value) {
    return isJsonNodeEmpty(value);
}

// Document-level counterpart of isObjectEmpty(): applies the same emptiness
// rule to the root of `document`. `document` is taken by value and is consumed
// by the walk.
inline bool
isDocEmpty(simdjson::ondemand::document document) {
    return isJsonNodeEmpty(document);
}
} // namespace milvus

View File

@ -16762,9 +16762,9 @@ TEST_P(JsonIndexExistsTest, TestExistsExpr) {
// bool: exists or not
std::vector<std::tuple<std::vector<std::string>, bool, uint32_t>>
test_cases = {
{{"a"}, true, 0b1111111000000100},
{{"a"}, true, 0b1111101000000100},
{{"a", "b"}, true, 0b0000100000000000},
{{"a"}, false, 0b0000000111111011},
{{"a"}, false, 0b0000010111111011},
{{"a", "b"}, false, 0b1111011111111111},
};
@ -17053,3 +17053,87 @@ TEST_P(JsonIndexBinaryExprTest, TestBinaryRangeExpr) {
EXPECT_TRUE(res == expect_result);
}
}
// Checks `exists` / `not exists` on a JSON field of a sealed segment that has
// only raw field data loaded (no index is built in this test), so the
// expression is evaluated against the JSON documents themselves.
//
// Each test case is (nested path, exists?, expected bitmask). The bitmask is
// written most-significant-bit first: its leftmost of json_strs.size() bits
// corresponds to json_strs[0], its rightmost bit to the last document.
TEST(JsonNonIndexExistsTest, TestExistsExprSealedNoIndex) {
    const std::vector<std::string> json_strs = {
        R"({"a": 1.0})",
        R"({"a": "abc"})",
        R"({"a": 3.0})",
        R"({"a": true})",
        R"({"a": {"b": 1}})",
        R"({"a": []})",
        R"({"a": ["a", "b"]})",
        R"({"a": null})",
        R"(1)",
        R"("abc")",
        R"(1.0)",
        R"(true)",
        R"([1, 2, 3])",
        R"({"a": 1, "b": 2})",
        R"({})",
        R"(null)",
        R"({"a": {}})",
        R"({"a": {"b": {}}})",
        R"({"a": [{}, {}]})",
        R"({"a": [[], []]})",
        R"({"a": [{"b": {}}, {"c": {}}]})",
    };

    // bool: exists or not
    const std::vector<std::tuple<std::vector<std::string>, bool, uint32_t>>
        test_cases = {
            {{"a"}, true, 0b111110100000010000000},
            {{"a", "b"}, true, 0b000010000000000000000},
            {{"a"}, false, 0b000001011111101111111},
            {{"a", "b"}, false, 0b111101111111111111111},
        };

    // The expected results are packed into a uint32_t, one bit per document.
    const size_t row_count = json_strs.size();
    ASSERT_LE(row_count, 32u);

    auto schema = std::make_shared<Schema>();
    auto vec_fid = schema->AddDebugField(
        "fakevec", DataType::VECTOR_FLOAT, 16, knowhere::metric::L2);
    auto i64_fid = schema->AddDebugField("age64", DataType::INT64);
    auto json_fid = schema->AddDebugField("json", DataType::JSON);
    schema->set_primary_field_id(i64_fid);

    auto seg = CreateSealedSegment(schema);

    // Build the JSON column and load it into the segment as raw field data.
    auto json_field =
        std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
    std::vector<milvus::Json> jsons;
    jsons.reserve(row_count);
    for (const auto& json_str : json_strs) {
        jsons.push_back(milvus::Json(simdjson::padded_string(json_str)));
    }
    json_field->add_json_data(jsons);

    auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance()
                  .GetRemoteChunkManager();
    auto load_info = PrepareSingleFieldInsertBinlog(
        1, 1, 1, json_fid.get(), {json_field}, cm);
    seg->LoadFieldData(load_info);

    for (const auto& [nested_path, exists, expect] : test_cases) {
        // Decode the mask by position instead of shifting it in place: the
        // test table stays untouched, every row is written explicitly, and
        // the row index can never run below zero.
        BitsetType expect_res;
        expect_res.resize(row_count);
        for (size_t row = 0; row < row_count; ++row) {
            const uint32_t bit = (expect >> (row_count - 1 - row)) & 1u;
            expect_res.set(row, bit != 0);
        }

        std::shared_ptr<expr::ITypeFilterExpr> exists_expr;
        if (exists) {
            exists_expr = std::make_shared<expr::ExistsExpr>(
                expr::ColumnInfo(json_fid, DataType::JSON, nested_path));
        } else {
            // "not exists" is expressed as LogicalNot over an ExistsExpr.
            auto child_expr = std::make_shared<expr::ExistsExpr>(
                expr::ColumnInfo(json_fid, DataType::JSON, nested_path));
            exists_expr = std::make_shared<expr::LogicalUnaryExpr>(
                expr::LogicalUnaryExpr::OpType::LogicalNot, child_expr);
        }

        auto plan = std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID,
                                                           exists_expr);
        auto result =
            ExecuteQueryExpr(plan, seg.get(), json_strs.size(), MAX_TIMESTAMP);
        EXPECT_TRUE(result == expect_res)
            << "exists=" << exists << ", path depth=" << nested_path.size()
            << ", expected mask=" << expect;
    }
}