fix: disable shredding for json_path that contains digits (#44724)

#44132

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2025-10-13 17:25:59 +08:00 committed by GitHub
parent 53e8f150e8
commit 3dd5deb70a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 83 additions and 22 deletions

View File

@ -213,6 +213,20 @@ Join(const std::vector<T>& items, const std::string& delimiter) {
return ss.str();
}
// Returns true when `str` is spelled like a base-10 integer: optional leading
// whitespace, an optional '+' or '-' sign, then one or more decimal digits and
// nothing else. Empty strings, a lone sign, trailing characters and embedded
// non-digits all yield false.
//
// The check is purely lexical, so digit runs of any length are accepted. A
// std::stoi-based check would reject values that do not fit in `int`
// (std::out_of_range), misclassifying e.g. "99999999999" as a non-integer.
// No exceptions are thrown or caught.
inline bool
IsInteger(const std::string& str) {
    // Leading whitespace is tolerated because std::stoi tolerated it; keeping
    // it means every input that used to be accepted is still accepted.
    const auto first = str.find_first_not_of(" \t\n\v\f\r");
    if (first == std::string::npos) {
        // Empty or whitespace-only input.
        return false;
    }

    auto digits_begin = first;
    if (str[digits_begin] == '+' || str[digits_begin] == '-') {
        ++digits_begin;
    }
    if (digits_begin == str.size()) {
        // A sign with no digits after it.
        return false;
    }

    // Everything from here to the end must be a decimal digit.
    return str.find_first_not_of("0123456789", digits_begin) ==
           std::string::npos;
}
inline std::string
PrintBitsetTypeView(const BitsetTypeView& view) {
std::stringstream ss;

View File

@ -304,10 +304,9 @@ class BsonView {
AssertInfo(offset < size_, "bson offset out of range");
const uint8_t* ptr = data_ + offset;
// check type
AssertInfo(static_cast<bsoncxx::type>(*ptr) == bsoncxx::type::k_array,
"ParseAsArrayAtOffset expects an array at offset {}",
offset);
if (static_cast<bsoncxx::type>(*ptr) != bsoncxx::type::k_array) {
return std::nullopt;
}
ptr++;
// skip key

View File

@ -422,7 +422,8 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) {
const auto& bitmap_input = context.get_bitmap_input();
auto* input = context.get_offset_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecRangeVisitorImplForJsonStats<ValueType>();
}
auto real_batch_size =

View File

@ -116,7 +116,8 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (CanUseJsonStats(context, field_id) && !has_offset_input_) {
if (CanUseJsonStats(context, field_id, expr_->column_.nested_path_) &&
!has_offset_input_) {
return EvalJsonExistsForDataSegmentByStats();
}
auto real_batch_size =

View File

@ -1404,8 +1404,22 @@ class SegmentExpr : public Expr {
}
bool
CanUseJsonStats(EvalCtx& context, FieldId field_id) const {
return PlanUseJsonStats(context) && HasJsonStats(field_id);
CanUseJsonStats(EvalCtx& context,
FieldId field_id,
const std::vector<std::string>& nested_path) const {
// if path contains integer, we can't use json stats such as "a.1.b", "a.1",
// because we can't know the integer is a key or a array indice
auto path_contains_integer = [](const std::vector<std::string>& path) {
for (auto i = 0; i < path.size(); i++) {
if (milvus::IsInteger(path[i])) {
return true;
}
}
return false;
};
return PlanUseJsonStats(context) && HasJsonStats(field_id) &&
!path_contains_integer(nested_path);
}
virtual bool

View File

@ -295,7 +295,8 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonContainsByStats<ExprValueType>();
}
@ -509,7 +510,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonContainsArrayByStats();
}
auto real_batch_size =
@ -796,7 +798,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonContainsAllByStats<ExprValueType>();
}
auto real_batch_size =
@ -991,7 +994,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonContainsAllWithDiffTypeByStats();
}
auto real_batch_size =
@ -1315,7 +1319,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonContainsAllArrayByStats();
}
auto real_batch_size =
@ -1521,7 +1526,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonContainsWithDiffTypeByStats();
}
auto real_batch_size =

View File

@ -709,7 +709,8 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable(EvalCtx& context) {
auto* input = context.get_offset_input();
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecJsonInVariableByStats<ValueType>();
}

View File

@ -663,7 +663,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) {
const auto& bitmap_input = context.get_bitmap_input();
FieldId field_id = expr_->column_.field_id_;
if (!has_offset_input_ && CanUseJsonStats(context, field_id)) {
if (!has_offset_input_ &&
CanUseJsonStats(context, field_id, expr_->column_.nested_path_)) {
return ExecRangeVisitorImplJsonByStats<ExprValueType>();
}
@ -992,7 +993,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id);
auto* index = pinned_json_stats_.get();
Assert(index != nullptr);
cached_index_chunk_res_ = std::make_shared<TargetBitmap>(active_count_);
cached_index_chunk_res_ =
(op_type == proto::plan::OpType::NotEqual)
? std::make_shared<TargetBitmap>(active_count_, true)
: std::make_shared<TargetBitmap>(active_count_);
cached_index_chunk_valid_res_ =
std::make_shared<TargetBitmap>(active_count_, true);
TargetBitmapView res_view(*cached_index_chunk_res_);
@ -1117,14 +1121,18 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
if (array_index != INVALID_ARRAY_INDEX) {
auto array_value = bson.ParseAsArrayAtOffset(value_offset);
if (!array_value.has_value()) {
res_view[row_id] = false;
// For NotEqual: path not exists means "not equal", keep true
// For Equal: path not exists means no match, set false
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
return;
}
auto sub_array = milvus::BsonView::GetNthElementInArray<
bsoncxx::array::view>(array_value.value().data(),
array_index);
if (!sub_array.has_value()) {
res_view[row_id] = false;
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
return;
}
res_view[row_id] =
@ -1134,7 +1142,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
} else {
auto array_value = bson.ParseAsArrayAtOffset(value_offset);
if (!array_value.has_value()) {
res_view[row_id] = false;
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
return;
}
res_view[row_id] =
@ -1147,7 +1156,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
if (array_index != INVALID_ARRAY_INDEX) {
auto array_value = bson.ParseAsArrayAtOffset(value_offset);
if (!array_value.has_value()) {
res_view[row_id] = false;
// Path not exists: NotEqual->true, others->false
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
return;
}
get_value = milvus::BsonView::GetNthElementInArray<GetType>(
@ -1161,6 +1172,10 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
if (get_value.has_value()) {
res_view[row_id] = UnaryCompare(
get_value.value(), val, op_type);
} else {
// Type mismatch: NotEqual->true, others->false
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
}
return;
}
@ -1172,6 +1187,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
if (get_value.has_value()) {
res_view[row_id] = UnaryCompare(
get_value.value(), val, op_type);
} else {
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
}
return;
}
@ -1187,6 +1205,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
if (get_value.has_value()) {
res_view[row_id] = UnaryCompare(
get_value.value(), val, op_type);
} else {
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
}
return;
}
@ -1197,13 +1218,17 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonByStats() {
if (get_value.has_value()) {
res_view[row_id] = UnaryCompare(
get_value.value(), val, op_type);
} else {
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
}
return;
}
}
}
if (!get_value.has_value()) {
res_view[row_id] = false;
res_view[row_id] =
(op_type == proto::plan::OpType::NotEqual);
return;
}
res_view[row_id] =