enhance: Record simdjson error (#41003)

issue: #35528

---------

Signed-off-by: sunby <sunbingyi1992@gmail.com>
This commit is contained in:
Bingyi Sun 2025-03-31 17:56:19 +08:00 committed by GitHub
parent c02892e9fb
commit 27ff3a42e7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 155 additions and 8 deletions

View File

@ -46,14 +46,7 @@ JsonInvertedIndex<T>::build_index_for_json(
value_result<GetType> res = json_column->at<GetType>(nested_path_);
auto err = res.error();
if (err != simdjson::SUCCESS) {
AssertInfo(
err == simdjson::INCORRECT_TYPE ||
err == simdjson::NO_SUCH_FIELD ||
err == simdjson::INVALID_JSON_POINTER,
"Failed to parse json, err: {}, json: {}, pointer: {}",
err,
*json_column,
nested_path_);
error_recorder_.Record(*json_column, nested_path_, err);
if (err == simdjson::NO_SUCH_FIELD ||
err == simdjson::INVALID_JSON_POINTER) {
folly::SharedMutex::WriteHolder lock(this->mutex_);
@ -72,6 +65,8 @@ JsonInvertedIndex<T>::build_index_for_json(
}
}
}
error_recorder_.PrintErrStats();
}
template class JsonInvertedIndex<bool>;

View File

@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <cstdint>
#include "common/FieldDataInterface.h"
#include "index/InvertedIndexTantivy.h"
#include "index/ScalarIndex.h"
@ -18,6 +19,49 @@
#include "tantivy-binding.h"
namespace milvus::index {
class JsonInvertedIndexParseErrorRecorder {
public:
struct ErrorInstance {
std::string json_str;
std::string pointer;
};
struct ErrorStats {
int64_t count;
ErrorInstance first_instance;
};
void
Record(const std::string_view& json_str,
const std::string& pointer,
const simdjson::error_code& error_code) {
error_map_[error_code].count++;
if (error_map_[error_code].count == 1) {
error_map_[error_code].first_instance = {std::string(json_str),
pointer};
}
}
void
PrintErrStats() {
if (error_map_.empty()) {
LOG_INFO("No error found");
return;
}
for (const auto& [error_code, stats] : error_map_) {
LOG_INFO("Error code: {}, count: {}, first instance: {}",
error_code,
stats.count,
stats.first_instance.json_str);
}
}
std::unordered_map<simdjson::error_code, ErrorStats>&
GetErrorMap() {
return error_map_;
}
private:
std::unordered_map<simdjson::error_code, ErrorStats> error_map_;
};
template <typename T>
class JsonInvertedIndex : public index::InvertedIndexTantivy<T> {
@ -67,9 +111,15 @@ class JsonInvertedIndex : public index::InvertedIndexTantivy<T> {
return static_cast<enum DataType>(cast_type_);
}
// Returns a mutable reference to this index's parse-error recorder
// (the `error_recorder_` member), so callers can inspect the errors it holds.
JsonInvertedIndexParseErrorRecorder&
GetErrorRecorder() {
return error_recorder_;
}
private:
std::string nested_path_;
proto::schema::DataType cast_type_;
JsonInvertedIndexParseErrorRecorder error_recorder_;
};
} // namespace milvus::index

View File

@ -96,6 +96,7 @@ set(MILVUS_TEST_FILES
test_cached_search_iterator.cpp
test_build_inverted_index_with_single_segment.cpp
test_random_sample.cpp
test_json_index.cpp
)
if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -0,0 +1,101 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "common/JsonCastType.h"
#include "common/Schema.h"
#include "index/IndexFactory.h"
#include "index/JsonInvertedIndex.h"
#include <gtest/gtest.h>
#include <cstdint>
using namespace milvus;
using namespace milvus::index;
// Smoke test: builds a JSON inverted index for the path "hello" over two small
// documents and then finalizes it and opens a reader. There are no explicit
// expectations; the test passes when none of these steps throws or crashes.
TEST(JsonIndexTest, TestBuildNonExistJsonPath) {
    // NOTE(review): "hello" has no leading '/', so it is presumably meant to
    // be a path that cannot be resolved in the documents - confirm.
    std::string json_path = "hello";

    auto schema = std::make_shared<Schema>();
    auto json_fid = schema->AddDebugField("json", DataType::JSON);

    auto file_manager_ctx = storage::FileManagerContext();
    file_manager_ctx.fieldDataMeta.field_schema.set_data_type(
        milvus::proto::schema::JSON);
    file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get());
    file_manager_ctx.fieldDataMeta.field_id = json_fid.get();

    auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex(
        index::INVERTED_INDEX_TYPE,
        JsonCastType::DOUBLE,
        json_path,
        file_manager_ctx);
    // The index is requested with JsonCastType::DOUBLE, so the downcast uses
    // the double specialization (the same pairing TestJSONErrRecorder uses).
    // The previous int32_t target did not match the requested cast type.
    auto json_index = std::unique_ptr<JsonInvertedIndex<double>>(
        static_cast<JsonInvertedIndex<double>*>(inv_index.release()));

    const std::vector<std::string> json_raw_data = {R"({"hello": 1})",
                                                    R"({"world": 2})"};
    std::vector<milvus::Json> jsons;
    jsons.reserve(json_raw_data.size());
    for (const auto& json : json_raw_data) {
        jsons.push_back(milvus::Json(simdjson::padded_string(json)));
    }

    auto json_field =
        std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
    json_field->add_json_data(jsons);

    json_index->BuildWithFieldData({json_field});
    json_index->finish();
    json_index->create_reader();
}
// Builds a DOUBLE-cast JSON inverted index for pointer "/a" over documents
// whose "/a" value has varying shapes, then checks the per-error-code counts
// collected by the index's parse-error recorder.
TEST(JsonIndexTest, TestJSONErrRecorder) {
    const std::vector<std::string> json_raw_data = {
        R"(1)",
        R"({"a": true})",
        R"({"a": 1.0})",
        R"({"a": 1})",
        R"({"a": null})",
        R"({"a": [1,2,3]})",
        R"({"a": [1.0,2,3]})",
        R"({"a": {"b": 1}})",
        R"({"a": "1"})",
        R"({"a": 1, "a": 1.0})",
    };
    std::string json_path = "/a";

    auto schema = std::make_shared<Schema>();
    auto json_fid = schema->AddDebugField("json", DataType::JSON);

    auto file_manager_ctx = storage::FileManagerContext();
    file_manager_ctx.fieldDataMeta.field_schema.set_data_type(
        milvus::proto::schema::JSON);
    file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get());
    file_manager_ctx.fieldDataMeta.field_id = json_fid.get();

    auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex(
        index::INVERTED_INDEX_TYPE,
        JsonCastType::DOUBLE,
        json_path,
        file_manager_ctx);
    auto json_index = std::unique_ptr<JsonInvertedIndex<double>>(
        static_cast<JsonInvertedIndex<double>*>(inv_index.release()));

    std::vector<milvus::Json> jsons;
    jsons.reserve(json_raw_data.size());
    for (const auto& json : json_raw_data) {
        jsons.push_back(milvus::Json(simdjson::padded_string(json)));
    }

    auto json_field =
        std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
    json_field->add_json_data(jsons);
    json_index->BuildWithFieldData({json_field});

    // Bind by const reference: no copy of the map, and lookups below cannot
    // silently default-insert a missing error code the way operator[] would.
    const auto& error_map = json_index->GetErrorRecorder().GetErrorMap();
    ASSERT_EQ(error_map.size(), 2u);
    // Presence is asserted first so that at() below cannot throw.
    ASSERT_EQ(error_map.count(simdjson::error_code::INCORRECT_TYPE), 1u);
    ASSERT_EQ(error_map.count(simdjson::error_code::INVALID_JSON_POINTER), 1u);
    EXPECT_EQ(error_map.at(simdjson::error_code::INCORRECT_TYPE).count, 6);
    EXPECT_EQ(error_map.at(simdjson::error_code::INVALID_JSON_POINTER).count,
              1);
}