fix: fix bug for shredding json when empty but not null json (#45214)

pr: #45221

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2025-11-04 11:13:34 +08:00 committed by GitHub
parent 4bc2dc86a0
commit cb2bc2b41b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 59 additions and 3 deletions

View File

@ -135,14 +135,23 @@ JsonKeyStats::TraverseJsonForStats(const char* json,
std::vector<std::string>& path,
std::map<JsonKey, KeyStatsInfo>& infos) {
jsmntok current = tokens[0];
Assert(current.type != JSMN_UNDEFINED);
AssertInfo(current.type != JSMN_UNDEFINED,
"current token type is undefined for json: {}.",
json);
if (current.type == JSMN_OBJECT) {
if (!path.empty()) {
AddKeyStatsInfo(path, JSONType::OBJECT, nullptr, infos);
}
int j = 1;
for (int i = 0; i < current.size; i++) {
Assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0);
AssertInfo(tokens[j].type == JSMN_STRING && tokens[j].size != 0,
"current token type is not string for json: {} at "
"type: {}, size: {}, value: {}",
json,
int(tokens[j].type),
tokens[j].size,
std::string(json + tokens[j].start,
tokens[j].end - tokens[j].start));
std::string key(json + tokens[j].start,
tokens[j].end - tokens[j].start);
path.push_back(key);
@ -221,6 +230,10 @@ JsonKeyStats::CollectSingleJsonStatsInfo(
break;
}
if (num_tokens == 0) {
return;
}
int index = 0;
std::vector<std::string> paths;
TraverseJsonForStats(json_str, tokens.data(), index, paths, infos);
@ -385,7 +398,7 @@ JsonKeyStats::TraverseJsonForBuildStats(
for (int i = 0; i < current.size; i++) {
AssertInfo(tokens[j].type == JSMN_STRING && tokens[j].size != 0,
"current token type is not string for json: {} at "
"index: {}, type: {}, size: {} value: {}",
"type: {}, size: {}, value: {}",
json,
int(tokens[j].type),
tokens[j].size,
@ -505,6 +518,10 @@ JsonKeyStats::BuildKeyStatsForRow(const char* json_str, uint32_t row_id) {
break;
}
if (num_tokens == 0) {
return;
}
int index = 0;
std::vector<std::string> paths;
std::map<JsonKey, std::string> values;
@ -576,6 +593,14 @@ JsonKeyStats::BuildKeyStats(const std::vector<FieldDataPtr>& field_datas,
static_cast<const milvus::Json*>(data->RawValue(i))
->data()
.data();
// some situations, such as empty json string,
// should be handled as null row
if (strlen(json_str) == 0) {
BuildKeyStatsForNullRow();
continue;
}
BuildKeyStatsForRow(json_str, row_id);
}
row_id++;

View File

@ -18,6 +18,7 @@
// Forward declaration of test accessor in global namespace for friend declaration
class TraverseJsonForBuildStatsAccessor;
class CollectSingleJsonStatsInfoAccessor;
#include <string>
#include <boost/filesystem.hpp>
@ -707,6 +708,7 @@ class JsonKeyStats : public ScalarIndex<std::string> {
// Friend accessor for unit tests to call private methods safely.
friend class ::TraverseJsonForBuildStatsAccessor;
friend class ::CollectSingleJsonStatsInfoAccessor;
};
using CacheJsonKeyStatsPtr =

View File

@ -39,6 +39,17 @@ class TraverseJsonForBuildStatsAccessor {
}
};
// Friend accessor declared in JsonKeyStats to invoke private method for UT
class CollectSingleJsonStatsInfoAccessor {
public:
static void
Call(JsonKeyStats& s,
const char* json,
std::map<JsonKey, milvus::index::KeyStatsInfo>& infos) {
s.CollectSingleJsonStatsInfo(json, infos);
}
};
namespace {
// Helper to tokenize JSON using jsmn
@ -120,3 +131,21 @@ TEST(TraverseJsonForBuildStatsTest,
"https://api.github.com/repos/gegangene/scheduler");
expect_has("/msg", JSONType::STRING, "line1\nline2\t中文 / backslash \\");
}
TEST(CollectSingleJsonStatsInfoTest, EmptyJsonStringThrows) {
const char* json = "";
milvus::storage::FieldDataMeta field_meta{1, 2, 3, 100, {}};
milvus::storage::IndexMeta index_meta{3, 100, 1, 1};
milvus::storage::StorageConfig storage_config;
storage_config.storage_type = "local";
storage_config.root_path = "/tmp/test-collect-single-json-stats-info";
auto cm = milvus::storage::CreateChunkManager(storage_config);
auto fs = milvus::storage::InitArrowFileSystem(storage_config);
milvus::storage::FileManagerContext ctx(field_meta, index_meta, cm, fs);
JsonKeyStats stats(ctx, true);
std::map<JsonKey, milvus::index::KeyStatsInfo> infos;
EXPECT_NO_THROW(
{ CollectSingleJsonStatsInfoAccessor::Call(stats, json, infos); });
}