diff --git a/internal/core/src/index/JsonInvertedIndex.cpp b/internal/core/src/index/JsonInvertedIndex.cpp index fa5ed77aba..d248015e1c 100644 --- a/internal/core/src/index/JsonInvertedIndex.cpp +++ b/internal/core/src/index/JsonInvertedIndex.cpp @@ -46,14 +46,7 @@ JsonInvertedIndex::build_index_for_json( value_result res = json_column->at(nested_path_); auto err = res.error(); if (err != simdjson::SUCCESS) { - AssertInfo( - err == simdjson::INCORRECT_TYPE || - err == simdjson::NO_SUCH_FIELD || - err == simdjson::INVALID_JSON_POINTER, - "Failed to parse json, err: {}, json: {}, pointer: {}", - err, - *json_column, - nested_path_); + error_recorder_.Record(*json_column, nested_path_, err); if (err == simdjson::NO_SUCH_FIELD || err == simdjson::INVALID_JSON_POINTER) { folly::SharedMutex::WriteHolder lock(this->mutex_); @@ -72,6 +65,8 @@ JsonInvertedIndex::build_index_for_json( } } } + + error_recorder_.PrintErrStats(); } template class JsonInvertedIndex; diff --git a/internal/core/src/index/JsonInvertedIndex.h b/internal/core/src/index/JsonInvertedIndex.h index f8300d8c5d..a76decedcd 100644 --- a/internal/core/src/index/JsonInvertedIndex.h +++ b/internal/core/src/index/JsonInvertedIndex.h @@ -10,6 +10,7 @@ // or implied. 
See the License for the specific language governing permissions and limitations under the License
 
 #pragma once
+#include <cstdint>
 #include "common/FieldDataInterface.h"
 #include "index/InvertedIndexTantivy.h"
 #include "index/ScalarIndex.h"
@@ -18,6 +19,63 @@
 #include "tantivy-binding.h"
 
 namespace milvus::index {
+// Accumulates simdjson errors reported through Record(). For each distinct
+// error code it keeps an occurrence count plus a copy of the first
+// (json, pointer) pair seen with that code. The class does no locking.
+class JsonInvertedIndexParseErrorRecorder {
+ public:
+    // First input recorded for an error code.
+    struct ErrorInstance {
+        std::string json_str;
+        std::string pointer;
+    };
+    // Aggregate kept per error code.
+    struct ErrorStats {
+        int64_t count{0};
+        ErrorInstance first_instance;
+    };
+
+    // Counts one occurrence of `error_code`; only the first occurrence
+    // stores copies of `json_str` and `pointer`.
+    void
+    Record(const std::string_view& json_str,
+           const std::string& pointer,
+           const simdjson::error_code& error_code) {
+        // One hash lookup instead of three; operator[] inserts a
+        // zero-count entry the first time a code is seen.
+        auto& stats = error_map_[error_code];
+        if (stats.count++ == 0) {
+            stats.first_instance = {std::string(json_str), pointer};
+        }
+    }
+
+    // Logs one INFO line per recorded error code, or a single
+    // "No error found" line when nothing was recorded.
+    void
+    PrintErrStats() const {
+        if (error_map_.empty()) {
+            LOG_INFO("No error found");
+            return;
+        }
+        for (const auto& [error_code, stats] : error_map_) {
+            LOG_INFO(
+                "Error code: {}, count: {}, first instance: {}, pointer: {}",
+                error_code,
+                stats.count,
+                stats.first_instance.json_str,
+                stats.first_instance.pointer);
+        }
+    }
+
+    // Mutable access to the per-code statistics.
+    std::unordered_map<simdjson::error_code, ErrorStats>&
+    GetErrorMap() {
+        return error_map_;
+    }
+
+ private:
+    std::unordered_map<simdjson::error_code, ErrorStats> error_map_;
+};
 
 template <typename T>
 class JsonInvertedIndex : public index::InvertedIndexTantivy<T> {
@@ -67,9 +125,15 @@ class JsonInvertedIndex : public index::InvertedIndexTantivy<T> {
         return static_cast<enum DataType>(cast_type_);
     }
 
+    JsonInvertedIndexParseErrorRecorder&
+    GetErrorRecorder() {
+        return error_recorder_;
+    }
+
  private:
     std::string nested_path_;
     proto::schema::DataType cast_type_;
+    JsonInvertedIndexParseErrorRecorder error_recorder_;
 };
 
 }  // namespace milvus::index
diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt
index b513fc7f99..c78f608228 100644
--- a/internal/core/unittest/CMakeLists.txt
+++ b/internal/core/unittest/CMakeLists.txt
@@ -96,6 +96,7 @@ set(MILVUS_TEST_FILES
     test_cached_search_iterator.cpp
     test_build_inverted_index_with_single_segment.cpp
test_random_sample.cpp
+    test_json_index.cpp
     )
 
 if ( INDEX_ENGINE STREQUAL "cardinal" )
diff --git a/internal/core/unittest/test_json_index.cpp b/internal/core/unittest/test_json_index.cpp
new file mode 100644
index 0000000000..d75917f485
--- /dev/null
+++ b/internal/core/unittest/test_json_index.cpp
@@ -0,0 +1,109 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License
+
+#include "common/JsonCastType.h"
+#include "common/Schema.h"
+#include "index/IndexFactory.h"
+#include "index/JsonInvertedIndex.h"
+
+#include <gtest/gtest.h>
+#include <memory>
+#include <string>
+#include <vector>
+using namespace milvus;
+using namespace milvus::index;
+
+// Builds a DOUBLE-cast inverted index over path "hello" from two one-key JSON
+// rows, then calls finish() and create_reader(); it makes no assertions.
+TEST(JsonIndexTest, TestBuildNonExistJsonPath) {
+    std::string json_path = "hello";
+    auto schema = std::make_shared<Schema>();
+    auto json_fid = schema->AddDebugField("json", DataType::JSON);
+    auto file_manager_ctx = storage::FileManagerContext();
+    file_manager_ctx.fieldDataMeta.field_schema.set_data_type(
+        milvus::proto::schema::JSON);
+    file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get());
+    file_manager_ctx.fieldDataMeta.field_id = json_fid.get();
+    auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex(
+        index::INVERTED_INDEX_TYPE,
+        JsonCastType::DOUBLE,
+        json_path,
+        file_manager_ctx);
+    auto json_index = std::unique_ptr<JsonInvertedIndex<double>>(
+        static_cast<JsonInvertedIndex<double>*>(inv_index.release()));
+
+    std::vector<std::string> json_raw_data = {R"({"hello": 1})",
+                                              R"({"world": 2})"};
+
+    std::vector<milvus::Json> jsons;
+    for (auto& json : json_raw_data) {
+        jsons.push_back(milvus::Json(simdjson::padded_string(json)));
+    }
+
+    auto json_field =
+        std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
+    json_field->add_json_data(jsons);
+    json_index->BuildWithFieldData({json_field});
+    json_index->finish();
+    json_index->create_reader();
+}
+
+// Builds a DOUBLE-cast index over pointer "/a" from ten rows holding
+// differently typed values under "a" (plus one bare scalar document), then
+// checks the per-error-code counts held by the index's error recorder.
+TEST(JsonIndexTest, TestJSONErrRecorder) {
+    std::vector<std::string> json_raw_data = {
+        R"(1)",
+        R"({"a": true})",
+        R"({"a": 1.0})",
+        R"({"a": 1})",
+        R"({"a": null})",
+        R"({"a": [1,2,3]})",
+        R"({"a": [1.0,2,3]})",
+        R"({"a": {"b": 1}})",
+        R"({"a": "1"})",
+        R"({"a": 1, "a": 1.0})",
+    };
+
+    std::string json_path = "/a";
+    auto schema = std::make_shared<Schema>();
+    auto json_fid = schema->AddDebugField("json", DataType::JSON);
+
+    auto file_manager_ctx = storage::FileManagerContext();
+    file_manager_ctx.fieldDataMeta.field_schema.set_data_type(
+        milvus::proto::schema::JSON);
+    file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get());
+    file_manager_ctx.fieldDataMeta.field_id = json_fid.get();
+
+    auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex(
+        index::INVERTED_INDEX_TYPE,
+        JsonCastType::DOUBLE,
+        json_path,
+        file_manager_ctx);
+    auto json_index = std::unique_ptr<JsonInvertedIndex<double>>(
+        static_cast<JsonInvertedIndex<double>*>(inv_index.release()));
+
+    std::vector<milvus::Json> jsons;
+    for (auto& json : json_raw_data) {
+        jsons.push_back(milvus::Json(simdjson::padded_string(json)));
+    }
+
+    auto json_field =
+        std::make_shared<FieldData<milvus::Json>>(DataType::JSON, false);
+    json_field->add_json_data(jsons);
+    json_index->BuildWithFieldData({json_field});
+
+    // Bind by reference so the recorder's map (and its strings) is not copied.
+    auto& error_map = json_index->GetErrorRecorder().GetErrorMap();
+    EXPECT_EQ(error_map.size(), 2u);
+    EXPECT_EQ(error_map[simdjson::error_code::INCORRECT_TYPE].count, 6);
+    EXPECT_EQ(error_map[simdjson::error_code::INVALID_JSON_POINTER].count, 1);
+}