fix: unescape JSON string values when building JSON stats (#44421)

#42533

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2025-09-17 20:54:01 +08:00 committed by GitHub
parent 5cd2d99799
commit 9b6703626d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 59 additions and 28 deletions

View File

@ -1301,30 +1301,14 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForPk(EvalCtx& context) {
auto op_type = expr_->op_type_;
PkType pk = value_arg_.GetValue<IndexInnerType>();
auto query_timestamp = context.get_exec_context()
->get_query_context()
->get_query_timestamp();
switch (op_type) {
case proto::plan::GreaterThan:
case proto::plan::GreaterEqual:
case proto::plan::LessThan:
case proto::plan::LessEqual:
case proto::plan::Equal:
segment_->pk_range(op_type, pk, cache_view);
break;
case proto::plan::NotEqual: {
segment_->pk_range(proto::plan::Equal, pk, cache_view);
cache_view.flip();
break;
}
default:
ThrowInfo(
OpTypeInvalid,
fmt::format("unsupported operator type for unary expr: {}",
op_type));
if (op_type == proto::plan::NotEqual) {
segment_->pk_range(proto::plan::Equal, pk, cache_view);
cache_view.flip();
} else {
segment_->pk_range(op_type, pk, cache_view);
}
}
TargetBitmap result;
result.append(
*cached_index_chunk_res_, current_data_global_pos_, real_batch_size);

View File

@ -442,12 +442,11 @@ JsonKeyStats::TraverseJsonForBuildStats(
}
index = j;
} else if (current.type == JSMN_STRING) {
auto value =
std::string(json + current.start, current.end - current.start);
auto unescaped = UnescapeJsonString(value);
Assert(current.size == 0);
AddKeyStats(
path,
JSONType::STRING,
std::string(json + current.start, current.end - current.start),
values);
AddKeyStats(path, JSONType::STRING, unescaped, values);
index++;
}
}

View File

@ -26,6 +26,8 @@
#include "common/jsmn.h"
#include "arrow/api.h"
#include "common/EasyAssert.h"
#include <simdjson.h>
#include <cstring>
namespace milvus::index {
@ -49,6 +51,47 @@ enum class JSONType {
OBJECT
};
// Reports whether a raw JSON string slice contains at least one backslash.
// Every JSON escape sequence starts with a backslash, so a slice without one
// can be used verbatim with no unescaping.
//
// @param s  string slice to inspect (may be empty)
// @return   true if `s` contains a '\\' character, false otherwise
inline bool
JsonStringHasEscape(std::string_view s) {
    // string_view::find is well-defined for an empty view whose data() is
    // null; handing a null pointer to std::memchr is undefined behavior even
    // when the length is zero.
    return s.find('\\') != std::string_view::npos;
}
// Unescape a JSON-escaped string slice (without surrounding quotes)
// Returns a decoded UTF-8 std::string or throws on error
inline std::string
UnescapeJsonString(const std::string& escaped) {
if (!JsonStringHasEscape(escaped)) {
return escaped;
}
try {
simdjson::dom::parser parser;
std::string quoted;
quoted.resize(escaped.size() + 2);
quoted[0] = '"';
std::memcpy(&quoted[1], escaped.data(), escaped.size());
quoted[quoted.size() - 1] = '"';
simdjson::dom::element elem = parser.parse(quoted);
if (elem.type() != simdjson::dom::element_type::STRING) {
ThrowInfo(ErrorCode::UnexpectedError,
"input is not a JSON string: {}",
escaped);
}
return std::string(std::string_view(elem.get_string()));
} catch (const simdjson::simdjson_error& e) {
ThrowInfo(ErrorCode::UnexpectedError,
"Failed to unescape json string (simdjson): {}, {}",
escaped,
e.what());
} catch (const std::exception& e) {
ThrowInfo(ErrorCode::UnexpectedError,
"Failed to unescape json string: {}, {}",
escaped,
e.what());
}
return {};
}
inline std::string
ToString(JSONType type) {
switch (type) {

View File

@ -76,7 +76,8 @@ TEST(TraverseJsonForBuildStatsTest,
{"id": 34495370646 ,"type":"PublicEvent","actor":{"id":126890008,"login":"gegangene","display_login":"gegangene","gravatar_id":"",
"url":"https:\/\/api.github.com\/users\/gegangene","avatar_url":"https:\/\/avatars.githubusercontent.com\/u\/126890008?"},
"repo":{"id":737601171,"name":"gegangene\/scheduler","url":"https:\/\/api.github.com\/repos\/gegangene\/scheduler"},
"payload":{},"public":true,"created_at":"2024-01-01T00:01:28Z"}
"payload":{},"public":true,"created_at":"2024-01-01T00:01:28Z",
"msg":"line1\nline2\t\u4e2d\u6587 \/ backslash \\"}
)";
auto tokens = Tokenize(json);
@ -113,4 +114,8 @@ TEST(TraverseJsonForBuildStatsTest,
expect_has("/payload", JSONType::OBJECT, "{}");
expect_has("/public", JSONType::BOOL, "true");
expect_has("/created_at", JSONType::STRING, "2024-01-01T00:01:28Z");
expect_has("/repo/url",
JSONType::STRING,
"https://api.github.com/repos/gegangene/scheduler");
expect_has("/msg", JSONType::STRING, "line1\nline2\t中文 / backslash \\");
}