fix: fix json_contains(path, int) bug (#44814)

#44816

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2025-10-14 00:19:59 +08:00 committed by GitHub
parent df6a4dc1a0
commit 2f178f810f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 109 additions and 41 deletions

View File

@ -15,6 +15,7 @@
// limitations under the License.
#include "JsonContainsExpr.h"
#include <cmath>
#include <utility>
#include "common/Types.h"
@ -339,6 +340,17 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
if constexpr (std::is_same_v<GetType, int64_t>) {
auto double_val = it.template get<double>();
if (!double_val.error() &&
double_val.value() ==
std::floor(double_val.value())) {
if (elements->In(static_cast<int64_t>(
double_val.value())) > 0) {
return true;
}
}
}
continue;
}
if (elements->In(val.value()) > 0) {
@ -843,6 +855,18 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
if constexpr (std::is_same_v<GetType, int64_t>) {
auto double_val = it.template get<double>();
if (!double_val.error() &&
double_val.value() ==
std::floor(double_val.value())) {
tmp_elements.erase(
static_cast<int64_t>(double_val.value()));
if (tmp_elements.size() == 0) {
return true;
}
}
}
continue;
}
tmp_elements.erase(val.value());
@ -965,6 +989,22 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllByStats() {
auto value = milvus::BsonView::GetValueFromBsonView<GetType>(
element.get_value());
if (!value.has_value()) {
if constexpr (std::is_same_v<GetType, int64_t>) {
auto double_value =
milvus::BsonView::GetValueFromBsonView<double>(
element.get_value());
if (double_value.has_value()) {
if (double_value.value() ==
std::floor(double_value.value())) {
tmp_elements.erase(
static_cast<int64_t>(double_value.value()));
}
if (tmp_elements.size() == 0) {
res_view[row_offset] = true;
return;
}
}
}
continue;
}
tmp_elements.erase(value.value());
@ -1058,6 +1098,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
auto double_val = it.template get<double>();
if (!double_val.error() &&
double_val.value() == element.int64_val()) {
tmp_elements_index.erase(i);
}
continue;
}
if (val.value() == element.int64_val()) {
@ -1237,8 +1282,9 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByStats() {
break;
}
case proto::plan::GenericValue::kInt64Val: {
// get double/int64 from bson
auto val =
milvus::BsonView::GetValueFromBsonView<int64_t>(
milvus::BsonView::GetValueFromBsonView<double>(
sub_value.get_value());
if (!val.has_value()) {
continue;
@ -1588,6 +1634,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
auto double_val = it.template get<double>();
if (!double_val.error() &&
double_val.value() == element.int64_val()) {
return true;
}
continue;
}
if (val.value() == element.int64_val()) {
@ -1751,7 +1802,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByStats() {
}
case proto::plan::GenericValue::kInt64Val: {
auto val =
milvus::BsonView::GetValueFromBsonView<int64_t>(
milvus::BsonView::GetValueFromBsonView<double>(
sub_value.get_value());
if (!val.has_value()) {
continue;

View File

@ -2610,40 +2610,52 @@ def gen_json_field_expressions_and_templates():
return expressions
def gen_json_field_expressions_all_single_operator():
def gen_json_field_expressions_all_single_operator(json_cast_type=None):
"""
Gen a list of filter in expression-format(as a string)
:param json_cast_type: Optional parameter to specify the JSON cast type (e.g., "ARRAY_DOUBLE")
"""
expressions = ["json_field['a'] <= 1", "json_field['a'] <= 1.0", "json_field['a'] >= 1", "json_field['a'] >= 1.0",
"json_field['a'] < 2", "json_field['a'] < 2.0", "json_field['a'] > 0", "json_field['a'] > 0.0",
"json_field['a'] <= '1'", "json_field['a'] >= '1'", "json_field['a'] < '2'", "json_field['a'] > '0'",
"json_field['a'] == 1", "json_field['a'] == 1.0", "json_field['a'] == True",
"json_field['a'] == 9707199254740993.0", "json_field['a'] == 9707199254740992",
"json_field['a'] == '1'",
"json_field['a'] != '1'", "json_field['a'] like '1%'", "json_field['a'] like '%1'",
"json_field['a'] like '%1%'", "json_field['a'] LIKE '1%'", "json_field['a'] LIKE '%1'",
"json_field['a'] LIKE '%1%'", "EXISTS json_field['a']", "exists json_field['a']",
"EXISTS json_field['a']['b']", "exists json_field['a']['b']", "json_field['a'] + 1 >= 2",
"json_field['a'] - 1 <= 0", "json_field['a'] + 1.0 >= 2", "json_field['a'] - 1.0 <= 0",
"json_field['a'] * 2 == 2", "json_field['a'] * 1.0 == 1.0", "json_field / 1 == 1",
"json_field['a'] / 1.0 == 1", "json_field['a'] % 10 == 1", "json_field['a'] == 1**2",
"json_field['a'][0] == 1 && json_field['a'][1] == 2",
"json_field['a'][0] == 1 and json_field['a'][1] == 2",
"json_field['a'][0]['b'] >=1 && json_field['a'][2] == 3",
"json_field['a'][0]['b'] >=1 and json_field['a'][2] == 3",
"json_field['a'] == 1 || json_field['a'] == '1'", "json_field['a'] == 1 or json_field['a'] == '1'",
"json_field['a'][0]['b'] >=1 || json_field['a']['b'] >=1",
"json_field['a'][0]['b'] >=1 or json_field['a']['b'] >=1",
"json_field['a'] in [1]", "json_contains(json_field['a'], 1)", "JSON_CONTAINS(json_field['a'], 1)",
"json_contains_all(json_field['a'], [2.0, '4'])", "JSON_CONTAINS_ALL(json_field['a'], [2.0, '4'])",
"json_contains_any(json_field['a'], [2.0, '4'])", "JSON_CONTAINS_ANY(json_field['a'], [2.0, '4'])",
"array_contains(json_field['a'], 2)", "ARRAY_CONTAINS(json_field['a'], 2)",
"array_contains_all(json_field['a'], [1.0, 2])", "ARRAY_CONTAINS_ALL(json_field['a'], [1.0, 2])",
"array_contains_any(json_field['a'], [1.0, 2])", "ARRAY_CONTAINS_ANY(json_field['a'], [1.0, 2])",
"array_length(json_field['a']) < 10", "ARRAY_LENGTH(json_field['a']) < 10",
"json_field is null", "json_field IS NULL", "json_field is not null", "json_field IS NOT NULL",
"json_field['a'] is null", "json_field['a'] IS NULL", "json_field['a'] is not null", "json_field['a'] IS NOT NULL"
]
if json_cast_type == "ARRAY_DOUBLE":
# For ARRAY_DOUBLE type, use array-specific expressions
expressions = [
"json_contains(json_field['a'], 1)", "JSON_CONTAINS(json_field['a'], 1)",
"json_contains(json_field['a'], 1.0)", "json_contains(json_field['a'], 2)",
"json_contains_all(json_field['a'], [1, 2])", "JSON_CONTAINS_ALL(json_field['a'], [1, 2])",
"json_contains_all(json_field['a'], [1.0, 2.0])", "json_contains_all(json_field['a'], [2, 4])",
"json_contains_any(json_field['a'], [1, 2])", "JSON_CONTAINS_ANY(json_field['a'], [1, 2])",
"json_contains_any(json_field['a'], [1.0, 2.0])", "json_contains_any(json_field['a'], [2, 4])",
"array_contains(json_field['a'], 1)", "ARRAY_CONTAINS(json_field['a'], 1)",
"array_contains(json_field['a'], 1.0)", "array_contains(json_field['a'], 2)",
"array_contains_all(json_field['a'], [1, 2])", "ARRAY_CONTAINS_ALL(json_field['a'], [1, 2])",
"array_contains_all(json_field['a'], [1.0, 2.0])", "array_contains_all(json_field['a'], [2, 4])",
"array_contains_any(json_field['a'], [1, 2])", "ARRAY_CONTAINS_ANY(json_field['a'], [1, 2])",
"array_contains_any(json_field['a'], [1.0, 2.0])", "array_contains_any(json_field['a'], [2, 4])",
"array_length(json_field['a']) < 10", "ARRAY_LENGTH(json_field['a']) < 10"
]
else:
expressions = ["json_field['a'] <= 1", "json_field['a'] <= 1.0", "json_field['a'] >= 1", "json_field['a'] >= 1.0",
"json_field['a'] < 2", "json_field['a'] < 2.0", "json_field['a'] > 0", "json_field['a'] > 0.0",
"json_field['a'] <= '1'", "json_field['a'] >= '1'", "json_field['a'] < '2'", "json_field['a'] > '0'",
"json_field['a'] == 1", "json_field['a'] == 1.0", "json_field['a'] == True",
"json_field['a'] == 9707199254740993.0", "json_field['a'] == 9707199254740992",
"json_field['a'] == '1'",
"json_field['a'] != '1'", "json_field['a'] like '1%'", "json_field['a'] like '%1'",
"json_field['a'] like '%1%'", "json_field['a'] LIKE '1%'", "json_field['a'] LIKE '%1'",
"json_field['a'] LIKE '%1%'", "EXISTS json_field['a']", "exists json_field['a']",
"EXISTS json_field['a']['b']", "exists json_field['a']['b']", "json_field['a'] + 1 >= 2",
"json_field['a'] - 1 <= 0", "json_field['a'] + 1.0 >= 2", "json_field['a'] - 1.0 <= 0",
"json_field['a'] * 2 == 2", "json_field['a'] * 1.0 == 1.0", "json_field / 1 == 1",
"json_field['a'] / 1.0 == 1", "json_field['a'] % 10 == 1", "json_field['a'] == 1**2",
"json_field['a'][0] == 1 && json_field['a'][1] == 2",
"json_field['a'][0] == 1 and json_field['a'][1] == 2",
"json_field['a'][0]['b'] >=1 && json_field['a'][2] == 3",
"json_field['a'][0]['b'] >=1 and json_field['a'][2] == 3",
"json_field['a'] == 1 || json_field['a'] == '1'", "json_field['a'] == 1 or json_field['a'] == '1'",
"json_field['a'][0]['b'] >=1 || json_field['a']['b'] >=1",
"json_field['a'][0]['b'] >=1 or json_field['a']['b'] >=1",
"json_field['a'] in [1]", "json_field is null", "json_field IS NULL", "json_field is not null", "json_field IS NOT NULL",
"json_field['a'] is null", "json_field['a'] IS NULL", "json_field['a'] is not null", "json_field['a'] IS NOT NULL"
]
return expressions

View File

@ -4104,8 +4104,8 @@ class TestMilvusClientQueryJsonPathIndex(TestMilvusClientV2Base):
def supported_varchar_scalar_index(self, request):
yield request.param
# @pytest.fixture(scope="function", params=["DOUBLE", "VARCHAR", "json"", "bool"])
@pytest.fixture(scope="function", params=["DOUBLE"])
# @pytest.fixture(scope="function", params=["DOUBLE", "VARCHAR", "json", "bool", "ARRAY_DOUBLE"])
@pytest.fixture(scope="function", params=["DOUBLE", "ARRAY_DOUBLE"])
def supported_json_cast_type(self, request):
yield request.param
@ -4167,10 +4167,11 @@ class TestMilvusClientQueryJsonPathIndex(TestMilvusClientV2Base):
# 3. flush if specified
if is_flush:
self.flush(client, collection_name)
time.sleep(300)
# 4. query when there is no json path index under all expressions
# skip negative expression for issue 40685
# "my_json['a'] != 1", "my_json['a'] != 1.0", "my_json['a'] != '1'", "my_json['a'] != 1.1", "my_json['a'] not in [1]"
express_list = cf.gen_json_field_expressions_all_single_operator()
express_list = cf.gen_json_field_expressions_all_single_operator(supported_json_cast_type)
compare_dict = {}
for i in range(len(express_list)):
json_list = []
@ -4195,11 +4196,15 @@ class TestMilvusClientQueryJsonPathIndex(TestMilvusClientV2Base):
# 6. prepare index params with json path index
index_name = "json_index"
index_params = self.prepare_index_params(client)[0]
json_path_list = [f"{json_field_name}", f"{json_field_name}[0]", f"{json_field_name}[1]",
f"{json_field_name}[6]", f"{json_field_name}['a']", f"{json_field_name}['a']['b']",
f"{json_field_name}['a'][0]", f"{json_field_name}['a'][6]", f"{json_field_name}['a'][0]['b']",
f"{json_field_name}['a']['b']['c']", f"{json_field_name}['a']['b'][0]['d']",
f"{json_field_name}[10000]", f"{json_field_name}['a']['c'][0]['d']"]
if supported_json_cast_type == "ARRAY_DOUBLE":
# For ARRAY_DOUBLE type, use array paths
json_path_list = [f"{json_field_name}['a']"]
else:
json_path_list = [f"{json_field_name}", f"{json_field_name}[0]", f"{json_field_name}[1]",
f"{json_field_name}[6]", f"{json_field_name}['a']", f"{json_field_name}['a']['b']",
f"{json_field_name}['a'][0]", f"{json_field_name}['a'][6]", f"{json_field_name}['a'][0]['b']",
f"{json_field_name}['a']['b']['c']", f"{json_field_name}['a']['b'][0]['d']",
f"{json_field_name}[10000]", f"{json_field_name}['a']['c'][0]['d']"]
index_params.add_index(field_name=default_vector_field_name, index_type="AUTOINDEX", metric_type="COSINE")
for i in range(len(json_path_list)):
index_params.add_index(field_name=json_field_name, index_name=index_name + f'{i}',