fix: prevent crash when contains_all/any is used with empty array (#41831)

issue: https://github.com/milvus-io/milvus/issues/41348

Related to, and optimized by, https://github.com/milvus-io/milvus/pull/41347

master pr: https://github.com/milvus-io/milvus/pull/41739
2.5 pr: #41756

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
Co-authored-by: Sangho Park <hoyaspark@gmail.com>
This commit is contained in:
cai.zhang 2025-05-14 14:29:35 +08:00 committed by GitHub
parent 2c8eb28828
commit e57cf1c8b3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 100 additions and 0 deletions

View File

@ -25,6 +25,31 @@ void
PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
auto input = context.get_offset_input();
SetHasOffsetInput((input != nullptr));
if (expr_->vals_.empty()) {
auto next_batch_size = GetNextBatchSize();
auto real_batch_size = has_offset_input_
? context.get_offset_input()->size()
: next_batch_size;
if (real_batch_size == 0) {
result = nullptr;
return;
}
auto res_vec =
std::make_shared<ColumnVector>(TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
res.set();
valid_res.set();
result = res_vec;
current_data_chunk_pos_ += real_batch_size;
return;
}
switch (expr_->column_.data_type_) {
case DataType::ARRAY: {
if (is_index_mode_ && !has_offset_input_) {

View File

@ -1430,6 +1430,73 @@ TEST(Expr, TestArrayContains) {
}
}
// Regression test: a ContainsAny filter built with an empty value list must be
// evaluable over ARRAY fields of each element type declared below, and the
// resulting bitset must have one set bit per inserted row.
TEST(Expr, TestArrayContainsEmptyValues) {
    auto schema = std::make_shared<Schema>();
    auto int_array_fid =
        schema->AddDebugField("int_array", DataType::ARRAY, DataType::INT8);
    auto long_array_fid =
        schema->AddDebugField("long_array", DataType::ARRAY, DataType::INT64);
    auto bool_array_fid =
        schema->AddDebugField("bool_array", DataType::ARRAY, DataType::BOOL);
    auto float_array_fid =
        schema->AddDebugField("float_array", DataType::ARRAY, DataType::FLOAT);
    auto double_array_fid = schema->AddDebugField(
        "double_array", DataType::ARRAY, DataType::DOUBLE);
    auto string_array_fid = schema->AddDebugField(
        "string_array", DataType::ARRAY, DataType::VARCHAR);
    schema->set_primary_field_id(schema->AddDebugField("id", DataType::INT64));

    // Every array field is filtered once with the same empty value list.
    std::vector<FieldId> fields = {
        int_array_fid,
        long_array_fid,
        bool_array_fid,
        float_array_fid,
        double_array_fid,
        string_array_fid,
    };

    // Fill a growing segment with num_iters batches of N generated rows.
    auto dummy_seg = CreateGrowingSegment(schema, empty_index_meta);
    int N = 1000;
    int num_iters = 100;
    for (int iter = 0; iter < num_iters; ++iter) {
        auto raw_data = DataGen(schema, N, iter);
        dummy_seg->PreInsert(N);
        dummy_seg->Insert(iter * N,
                          N,
                          raw_data.row_ids_.data(),
                          raw_data.timestamps_.data(),
                          raw_data.raw_);
    }
    const int total_rows = N * num_iters;

    auto seg_promote = dynamic_cast<SegmentGrowingImpl*>(dummy_seg.get());
    std::vector<proto::plan::GenericValue> empty_values;
    for (auto field_id : fields) {
        auto start = std::chrono::steady_clock::now();
        auto expr = std::make_shared<milvus::expr::JsonContainsExpr>(
            expr::ColumnInfo(field_id, DataType::ARRAY),
            proto::plan::JSONContainsExpr_JSONOp_ContainsAny,
            true,
            empty_values);
        BitsetType final;
        auto plan =
            std::make_shared<plan::FilterBitsNode>(DEFAULT_PLANNODE_ID, expr);
        final = ExecuteQueryExpr(plan, seg_promote, total_rows, MAX_TIMESTAMP);
        std::cout << "cost"
                  << std::chrono::duration_cast<std::chrono::microseconds>(
                         std::chrono::steady_clock::now() - start)
                         .count()
                  << std::endl;

        // Fatal assertion on purpose: the loop below indexes final[i] for
        // every i < total_rows, so a shorter bitset must abort the test here
        // instead of being read out of bounds.
        ASSERT_EQ(final.size(), total_rows);
        for (int i = 0; i < total_rows; ++i) {
            ASSERT_EQ(final[i], true);
        }
    }
}
TEST(Expr, TestArrayBinaryArith) {
auto schema = std::make_shared<Schema>();
auto i64_fid = schema->AddDebugField("id", DataType::INT64);

View File

@ -164,6 +164,10 @@ TYPED_TEST_P(ArrayInvertedIndexTest, ArrayContainsAny) {
auto ref = [this, &elems](size_t offset) -> bool {
std::unordered_set<TypeParam> row(this->vec_of_array_[offset].begin(),
this->vec_of_array_[offset].end());
if (elems.empty()) {
return true;
}
for (const auto& elem : elems) {
if (row.find(elem) != row.end()) {
return true;
@ -212,6 +216,10 @@ TYPED_TEST_P(ArrayInvertedIndexTest, ArrayContainsAll) {
auto ref = [this, &elems](size_t offset) -> bool {
std::unordered_set<TypeParam> row(this->vec_of_array_[offset].begin(),
this->vec_of_array_[offset].end());
if (elems.empty()) {
return true;
}
for (const auto& elem : elems) {
if (row.find(elem) == row.end()) {
return false;