From 3dd5deb70aa34cd5dd66d821fb517b399179effa Mon Sep 17 00:00:00 2001 From: zhagnlu Date: Mon, 13 Oct 2025 17:25:59 +0800 Subject: [PATCH] fix:disable using shredding for json_path contains digital (#44724) #44132 Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/common/Utils.h | 14 +++++++ internal/core/src/common/bson_view.h | 7 ++-- .../src/exec/expression/BinaryRangeExpr.cpp | 3 +- .../core/src/exec/expression/ExistsExpr.cpp | 3 +- internal/core/src/exec/expression/Expr.h | 18 ++++++++- .../src/exec/expression/JsonContainsExpr.cpp | 18 ++++++--- .../core/src/exec/expression/TermExpr.cpp | 3 +- .../core/src/exec/expression/UnaryExpr.cpp | 39 +++++++++++++++---- 8 files changed, 83 insertions(+), 22 deletions(-) diff --git a/internal/core/src/common/Utils.h b/internal/core/src/common/Utils.h index eadbd608f8..d1fb6d6c31 100644 --- a/internal/core/src/common/Utils.h +++ b/internal/core/src/common/Utils.h @@ -213,6 +213,20 @@ Join(const std::vector& items, const std::string& delimiter) { return ss.str(); } +inline bool +IsInteger(const std::string& str) { + if (str.empty()) + return false; + + try { + size_t pos; + std::stoi(str, &pos); + return pos == str.length(); + } catch (...) { + return false; + } +} + inline std::string PrintBitsetTypeView(const BitsetTypeView& view) { std::stringstream ss; diff --git a/internal/core/src/common/bson_view.h b/internal/core/src/common/bson_view.h index 050dcfa509..1e7788df0f 100644 --- a/internal/core/src/common/bson_view.h +++ b/internal/core/src/common/bson_view.h @@ -304,10 +304,9 @@ class BsonView { AssertInfo(offset < size_, "bson offset out of range"); const uint8_t* ptr = data_ + offset; - // check type - AssertInfo(static_cast(*ptr) == bsoncxx::type::k_array, - "ParseAsArrayAtOffset expects an array at offset {}", - offset); + if (static_cast(*ptr) != bsoncxx::type::k_array) { + return std::nullopt; + } ptr++; // skip key diff --git a/internal/core/src/exec/expression/BinaryRangeExpr.cpp b/internal/core/src/exec/expression/BinaryRangeExpr.cpp index 41a405cdeb..f1bb32265c 100644 --- a/internal/core/src/exec/expression/BinaryRangeExpr.cpp +++ b/internal/core/src/exec/expression/BinaryRangeExpr.cpp @@ -422,7 +422,8 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) { const auto& bitmap_input = context.get_bitmap_input(); auto* input = context.get_offset_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecRangeVisitorImplForJsonStats(); } auto real_batch_size = diff --git a/internal/core/src/exec/expression/ExistsExpr.cpp b/internal/core/src/exec/expression/ExistsExpr.cpp index 183f7b0ff8..ded3cb3226 100644 --- a/internal/core/src/exec/expression/ExistsExpr.cpp +++ b/internal/core/src/exec/expression/ExistsExpr.cpp @@ -116,7 +116,8 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (CanUseJsonStats(context, field_id) && !has_offset_input_) { + if (CanUseJsonStats(context, field_id, expr_->column_.nested_path_) && + !has_offset_input_) { return EvalJsonExistsForDataSegmentByStats(); } auto real_batch_size = diff --git a/internal/core/src/exec/expression/Expr.h b/internal/core/src/exec/expression/Expr.h index 6d57823d21..a26f17094b 100644 --- a/internal/core/src/exec/expression/Expr.h +++ b/internal/core/src/exec/expression/Expr.h @@ -1404,8 +1404,22 @@ class SegmentExpr : public Expr { } bool - CanUseJsonStats(EvalCtx& context, FieldId field_id) const { - return PlanUseJsonStats(context) && HasJsonStats(field_id); + CanUseJsonStats(EvalCtx& context, + FieldId field_id, + const std::vector& nested_path) const { + // if path contains integer, we can't use json stats such as "a.1.b", "a.1", + // because we can't know the integer is a key or a array indice + auto path_contains_integer = [](const std::vector& path) { + for (auto i = 0; i < path.size(); i++) { + if (milvus::IsInteger(path[i])) { + return true; + } + } + return false; + }; + + return PlanUseJsonStats(context) && HasJsonStats(field_id) && + !path_contains_integer(nested_path); } virtual bool diff --git a/internal/core/src/exec/expression/JsonContainsExpr.cpp b/internal/core/src/exec/expression/JsonContainsExpr.cpp index 5f41b90d42..5b6ad53ca1 100644 --- a/internal/core/src/exec/expression/JsonContainsExpr.cpp +++ b/internal/core/src/exec/expression/JsonContainsExpr.cpp @@ -295,7 +295,8 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) { const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonContainsByStats(); } @@ -509,7 +510,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonContainsArrayByStats(); } auto real_batch_size = @@ -796,7 +798,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) { const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonContainsAllByStats(); } auto real_batch_size = @@ -991,7 +994,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonContainsAllWithDiffTypeByStats(); } auto real_batch_size = @@ -1315,7 +1319,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonContainsAllArrayByStats(); } auto real_batch_size = @@ -1521,7 +1526,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonContainsWithDiffTypeByStats(); } auto real_batch_size = diff --git a/internal/core/src/exec/expression/TermExpr.cpp b/internal/core/src/exec/expression/TermExpr.cpp index 9243fd15c7..518ec18a05 100644 --- a/internal/core/src/exec/expression/TermExpr.cpp +++ b/internal/core/src/exec/expression/TermExpr.cpp @@ -709,7 +709,8 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecJsonInVariableByStats(); } diff --git a/internal/core/src/exec/expression/UnaryExpr.cpp b/internal/core/src/exec/expression/UnaryExpr.cpp index 4bfe9f547f..0a27080ad1 100644 --- a/internal/core/src/exec/expression/UnaryExpr.cpp +++ b/internal/core/src/exec/expression/UnaryExpr.cpp @@ -663,7 +663,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) { const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; - if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { + if (!has_offset_input_ && + CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) { return ExecRangeVisitorImplJsonByStats(); } @@ -992,7 +993,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); - cached_index_chunk_res_ = std::make_shared(active_count_); + cached_index_chunk_res_ = + (op_type == proto::plan::OpType::NotEqual) + ? std::make_shared(active_count_, true) + : std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); @@ -1117,14 +1121,18 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { if (array_index != INVALID_ARRAY_INDEX) { auto array_value = bson.ParseAsArrayAtOffset(value_offset); if (!array_value.has_value()) { - res_view[row_id] = false; + // For NotEqual: path not exists means "not equal", keep true + // For Equal: path not exists means no match, set false + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); return; } auto sub_array = milvus::BsonView::GetNthElementInArray< bsoncxx::array::view>(array_value.value().data(), array_index); if (!sub_array.has_value()) { - res_view[row_id] = false; + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); return; } res_view[row_id] = @@ -1134,7 +1142,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { } else { auto array_value = bson.ParseAsArrayAtOffset(value_offset); if (!array_value.has_value()) { - res_view[row_id] = false; + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); return; } res_view[row_id] = @@ -1147,7 +1156,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { if (array_index != INVALID_ARRAY_INDEX) { auto array_value = bson.ParseAsArrayAtOffset(value_offset); if (!array_value.has_value()) { - res_view[row_id] = false; + // Path not exists: NotEqual->true, others->false + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); return; } get_value = milvus::BsonView::GetNthElementInArray( @@ -1161,6 +1172,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { if (get_value.has_value()) { res_view[row_id] = UnaryCompare( get_value.value(), val, op_type); + } else { + // Type mismatch: NotEqual->true, others->false + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); } return; } @@ -1172,6 +1187,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { if (get_value.has_value()) { res_view[row_id] = UnaryCompare( get_value.value(), val, op_type); + } else { + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); } return; } @@ -1187,6 +1205,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { if (get_value.has_value()) { res_view[row_id] = UnaryCompare( get_value.value(), val, op_type); + } else { + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); } return; } @@ -1197,13 +1218,17 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() { if (get_value.has_value()) { res_view[row_id] = UnaryCompare( get_value.value(), val, op_type); + } else { + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); } return; } } } if (!get_value.has_value()) { - res_view[row_id] = false; + res_view[row_id] = + (op_type == proto::plan::OpType::NotEqual); return; } res_view[row_id] =