From 10fe53ff59b2810f47fcb837f7681fb9dbc64e1b Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 25 Jul 2025 10:28:54 +0800 Subject: [PATCH] feat: support json for ngram (#43170) Ref https://github.com/milvus-io/milvus/issues/42053 This PR enable ngram to support json data type. --------- Signed-off-by: SpadeA --- internal/core/src/exec/expression/Expr.h | 12 +- .../core/src/exec/expression/UnaryExpr.cpp | 37 ++- internal/core/src/exec/expression/UnaryExpr.h | 3 + internal/core/src/index/IndexFactory.cpp | 53 ++-- internal/core/src/index/IndexFactory.h | 10 +- internal/core/src/index/IndexInfo.h | 5 +- internal/core/src/index/JsonIndexBuilder.cpp | 140 ++++++++++ internal/core/src/index/JsonIndexBuilder.h | 88 ++++++ internal/core/src/index/JsonInvertedIndex.cpp | 87 ++---- .../core/src/index/NgramInvertedIndex.cpp | 259 ++++++++++++++---- internal/core/src/index/NgramInvertedIndex.h | 35 ++- .../src/segcore/ChunkedSegmentSealedImpl.cpp | 53 +++- .../src/segcore/ChunkedSegmentSealedImpl.h | 19 ++ .../core/src/segcore/SegmentInterface.cpp | 17 ++ internal/core/src/segcore/SegmentInterface.h | 22 ++ internal/core/src/segcore/SegmentSealed.h | 8 + internal/core/unittest/test_expr.cpp | 32 ++- .../core/unittest/test_json_flat_index.cpp | 8 +- internal/core/unittest/test_json_index.cpp | 28 +- internal/core/unittest/test_ngram_query.cpp | 136 +++++++++ .../indexparamcheck/ngram_index_checker.go | 9 +- 21 files changed, 847 insertions(+), 214 deletions(-) create mode 100644 internal/core/src/index/JsonIndexBuilder.cpp create mode 100644 internal/core/src/index/JsonIndexBuilder.h diff --git a/internal/core/src/exec/expression/Expr.h b/internal/core/src/exec/expression/Expr.h index 18c19b5906..6fde7f9745 100644 --- a/internal/core/src/exec/expression/Expr.h +++ b/internal/core/src/exec/expression/Expr.h @@ -1283,11 +1283,13 @@ class SegmentExpr : public Expr { bool CanUseNgramIndex(FieldId field_id) const { - if (segment_->type() != SegmentType::Sealed) { - return false; - } - auto cast_ptr = dynamic_cast(segment_); - return (cast_ptr != nullptr && cast_ptr->HasNgramIndex(field_id)); + return segment_->HasNgramIndex(field_id); + } + + bool + CanUseNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const { + return segment_->HasNgramIndexForJson(field_id, nested_path); } protected: diff --git a/internal/core/src/exec/expression/UnaryExpr.cpp b/internal/core/src/exec/expression/UnaryExpr.cpp index f92c905002..f11c7cb202 100644 --- a/internal/core/src/exec/expression/UnaryExpr.cpp +++ b/internal/core/src/exec/expression/UnaryExpr.cpp @@ -197,8 +197,19 @@ PhyUnaryRangeFilterExpr::Eval(EvalCtx& context, VectorPtr& result) { } case DataType::JSON: { auto val_type = expr_->val_.val_case(); - if (CanUseIndexForJson(FromValCase(val_type)) && + auto val_type_inner = FromValCase(val_type); + if (CanExecNgramMatchForJson(val_type_inner) && !has_offset_input_) { + auto res = ExecNgramMatch(); + // If nullopt is returned, it means the query cannot be + // optimized by ngram index. Forward it to the normal path. + if (res.has_value()) { + result = res.value(); + break; + } + } + + if (CanUseIndexForJson(val_type_inner) && !has_offset_input_) { switch (val_type) { case proto::plan::GenericValue::ValCase::kBoolVal: result = ExecRangeVisitorImplForIndex(); @@ -1885,6 +1896,7 @@ PhyUnaryRangeFilterExpr::CanUseIndexForJson(DataType val_type) { val_type); switch (val_type) { case DataType::STRING: + case DataType::VARCHAR: use_index_ = has_index && expr_->op_type_ != proto::plan::OpType::Match && expr_->op_type_ != proto::plan::OpType::PostfixMatch && @@ -1974,6 +1986,18 @@ PhyUnaryRangeFilterExpr::CanExecNgramMatch(proto::plan::OpType op_type) { !has_offset_input_ && CanUseNgramIndex(field_id_); } +bool +PhyUnaryRangeFilterExpr::CanExecNgramMatchForJson(DataType val_type) { + return (val_type == DataType::STRING || val_type == DataType::VARCHAR) && + (expr_->op_type_ == proto::plan::OpType::InnerMatch || + expr_->op_type_ == proto::plan::OpType::Match || + expr_->op_type_ == proto::plan::OpType::PrefixMatch || + expr_->op_type_ == proto::plan::OpType::PostfixMatch) && + !has_offset_input_ && + CanUseNgramIndexForJson( + field_id_, milvus::Json::pointer(expr_->column_.nested_path_)); +} + std::optional PhyUnaryRangeFilterExpr::ExecNgramMatch() { if (!arg_inited_) { @@ -1988,8 +2012,15 @@ PhyUnaryRangeFilterExpr::ExecNgramMatch() { } if (cached_ngram_match_res_ == nullptr) { - auto pinned_index = segment_->GetNgramIndex(field_id_); - auto index = pinned_index.get(); + index::NgramInvertedIndex* index; + if (expr_->column_.data_type_ == DataType::JSON) { + auto pinned_index = segment_->GetNgramIndexForJson( + field_id_, milvus::Json::pointer(expr_->column_.nested_path_)); + index = pinned_index.get(); + } else { + auto pinned_index = segment_->GetNgramIndex(field_id_); + index = pinned_index.get(); + } AssertInfo(index != nullptr, "ngram index should not be null, field_id: {}", field_id_.get()); diff --git a/internal/core/src/exec/expression/UnaryExpr.h b/internal/core/src/exec/expression/UnaryExpr.h index 3f55896791..cde6864c63 100644 --- a/internal/core/src/exec/expression/UnaryExpr.h +++ b/internal/core/src/exec/expression/UnaryExpr.h @@ -509,6 +509,9 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr { bool CanExecNgramMatch(proto::plan::OpType op_type); + bool + CanExecNgramMatchForJson(DataType val_type); + std::optional ExecNgramMatch(); diff --git a/internal/core/src/index/IndexFactory.cpp b/internal/core/src/index/IndexFactory.cpp index 60339fcf37..17ee9c921d 100644 --- a/internal/core/src/index/IndexFactory.cpp +++ b/internal/core/src/index/IndexFactory.cpp @@ -67,28 +67,6 @@ IndexFactory::CreatePrimitiveScalarIndex( return CreateScalarIndexSort(file_manager_context); } -IndexBasePtr -IndexFactory::CreateNgramIndex( - DataType data_type, - const NgramParams& params, - const storage::FileManagerContext& file_manager_context) { - switch (data_type) { - case DataType::VARCHAR: - case DataType::STRING: - return std::make_unique(file_manager_context, - params); - - case DataType::JSON: - ThrowInfo( - NotImplemented, - fmt::format("building ngram index in json is not implemented")); - default: - ThrowInfo(DataTypeInvalid, - fmt::format("invalid data type to build ngram index: {}", - data_type)); - } -} - template <> ScalarIndexPtr IndexFactory::CreatePrimitiveScalarIndex( @@ -364,8 +342,8 @@ IndexFactory::CreatePrimitiveScalarIndex( case DataType::VARCHAR: { auto& ngram_params = create_index_info.ngram_params; if (ngram_params.has_value()) { - return CreateNgramIndex( - data_type, ngram_params.value(), file_manager_context); + return std::make_unique( + file_manager_context, ngram_params.value()); } return CreatePrimitiveScalarIndex( create_index_info, file_manager_context); @@ -405,14 +383,15 @@ IndexFactory::CreateComplexScalarIndex( IndexBasePtr IndexFactory::CreateJsonIndex( - IndexType index_type, - JsonCastType cast_dtype, - const std::string& nested_path, - const storage::FileManagerContext& file_manager_context, - const std::string& json_cast_function) { - AssertInfo(index_type == INVERTED_INDEX_TYPE, + const CreateIndexInfo& create_index_info, + const storage::FileManagerContext& file_manager_context) { + AssertInfo(create_index_info.index_type == INVERTED_INDEX_TYPE || + create_index_info.index_type == NGRAM_INDEX_TYPE, "Invalid index type for json index"); + const auto& cast_dtype = create_index_info.json_cast_type; + const auto& nested_path = create_index_info.json_path; + const auto& json_cast_function = create_index_info.json_cast_function; switch (cast_dtype.element_type()) { case JsonCastType::DataType::BOOL: return std::make_unique>( @@ -426,12 +405,18 @@ IndexFactory::CreateJsonIndex( nested_path, file_manager_context, JsonCastFunction::FromString(json_cast_function)); - case JsonCastType::DataType::VARCHAR: + case JsonCastType::DataType::VARCHAR: { + auto& ngram_params = create_index_info.ngram_params; + if (ngram_params.has_value()) { + return std::make_unique( + file_manager_context, ngram_params.value(), nested_path); + } return std::make_unique>( cast_dtype, nested_path, file_manager_context, JsonCastFunction::FromString(json_cast_function)); + } case JsonCastType::DataType::JSON: return std::make_unique(file_manager_context, nested_path); @@ -462,11 +447,7 @@ IndexFactory::CreateScalarIndex( file_manager_context); } case DataType::JSON: { - return CreateJsonIndex(create_index_info.index_type, - create_index_info.json_cast_type, - create_index_info.json_path, - file_manager_context, - create_index_info.json_cast_function); + return CreateJsonIndex(create_index_info, file_manager_context); } default: ThrowInfo(DataTypeInvalid, "Invalid data type:{}", data_type); diff --git a/internal/core/src/index/IndexFactory.h b/internal/core/src/index/IndexFactory.h index c24cbb6e1a..b682423b51 100644 --- a/internal/core/src/index/IndexFactory.h +++ b/internal/core/src/index/IndexFactory.h @@ -116,13 +116,9 @@ class IndexFactory { storage::FileManagerContext()); IndexBasePtr - CreateJsonIndex( - IndexType index_type, - JsonCastType cast_dtype, - const std::string& nested_path, - const storage::FileManagerContext& file_manager_context = - storage::FileManagerContext(), - const std::string& json_cast_function = UNKNOW_CAST_FUNCTION_NAME); + CreateJsonIndex(const CreateIndexInfo& create_index_info, + const storage::FileManagerContext& file_manager_context = + storage::FileManagerContext()); IndexBasePtr CreateScalarIndex(const CreateIndexInfo& create_index_info, diff --git a/internal/core/src/index/IndexInfo.h b/internal/core/src/index/IndexInfo.h index cc9dd37916..ba071fd930 100644 --- a/internal/core/src/index/IndexInfo.h +++ b/internal/core/src/index/IndexInfo.h @@ -17,6 +17,7 @@ #include "common/JsonCastType.h" #include "common/Types.h" +#include "common/Consts.h" namespace milvus::index { @@ -37,8 +38,8 @@ struct CreateIndexInfo { uint32_t tantivy_index_version{7}; JsonCastType json_cast_type{JsonCastType::UNKNOWN}; std::string json_path; - std::string json_cast_function; - std::optional ngram_params; + std::string json_cast_function{UNKNOW_CAST_FUNCTION_NAME}; + std::optional ngram_params{std::nullopt}; }; } // namespace milvus::index diff --git a/internal/core/src/index/JsonIndexBuilder.cpp b/internal/core/src/index/JsonIndexBuilder.cpp new file mode 100644 index 0000000000..10f61e3443 --- /dev/null +++ b/internal/core/src/index/JsonIndexBuilder.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + +#include +#include +#include "common/JsonUtils.h" +#include "index/JsonIndexBuilder.h" +#include "simdjson/error.h" + +namespace milvus::index { + +template +void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder) { + int64_t offset = 0; + using SIMDJSON_T = + std::conditional_t, std::string_view, T>; + + auto tokens = parse_json_pointer(nested_path); + + bool is_array = cast_type.data_type() == JsonCastType::DataType::ARRAY; + + for (const auto& data : field_datas) { + auto n = data->get_num_rows(); + for (int64_t i = 0; i < n; i++) { + auto json_column = static_cast(data->RawValue(i)); + if (schema.nullable() && !data->is_valid(i)) { + null_adder(offset++); + continue; + } + + auto exists = path_exists(json_column->dom_doc(), tokens); + if (!exists || !json_column->exist(nested_path)) { + error_recorder( + *json_column, nested_path, simdjson::NO_SUCH_FIELD); + null_adder(offset++); + continue; + } + + folly::fbvector values; + if (is_array) { + auto doc = json_column->dom_doc(); + auto array_res = doc.at_pointer(nested_path).get_array(); + if (array_res.error() != simdjson::SUCCESS) { + error_recorder( + *json_column, nested_path, array_res.error()); + } else { + auto array_values = array_res.value(); + for (auto value : array_values) { + auto val = value.template get(); + + if (val.error() == simdjson::SUCCESS) { + values.push_back(static_cast(val.value())); + } + } + } + } else { + if (cast_function.match()) { + auto res = JsonCastFunction::CastJsonValue( + cast_function, *json_column, nested_path); + if (res.has_value()) { + values.push_back(res.value()); + } + } else { + value_result res = + json_column->at(nested_path); + if (res.error() != simdjson::SUCCESS) { + error_recorder(*json_column, nested_path, res.error()); + } else { + values.push_back(static_cast(res.value())); + } + } + } + + data_adder(values, offset++); + } + } +} + +template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +} // namespace milvus::index \ No newline at end of file diff --git a/internal/core/src/index/JsonIndexBuilder.h b/internal/core/src/index/JsonIndexBuilder.h new file mode 100644 index 0000000000..3e262e9d40 --- /dev/null +++ b/internal/core/src/index/JsonIndexBuilder.h @@ -0,0 +1,88 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License + +#pragma once +#include +#include "common/JsonCastType.h" +#include "common/JsonCastFunction.h" +#include "common/Json.h" +#include "folly/FBVector.h" + +namespace milvus::index { + +template +using JsonDataAdder = + std::function& values, int64_t offset)>; + +using JsonErrorRecorder = std::function; + +using JsonNullAdder = std::function; + +// A helper function for processing json data for building inverted index +template +void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +extern template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +extern template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +extern template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +extern template void +ProcessJsonFieldData( + const std::vector>& field_datas, + const proto::schema::FieldSchema& schema, + const std::string& nested_path, + const JsonCastType& cast_type, + JsonCastFunction cast_function, + JsonDataAdder data_adder, + JsonNullAdder null_adder, + JsonErrorRecorder error_recorder); + +} // namespace milvus::index \ No newline at end of file diff --git a/internal/core/src/index/JsonInvertedIndex.cpp b/internal/core/src/index/JsonInvertedIndex.cpp index ba6ec97a35..e34a88115e 100644 --- a/internal/core/src/index/JsonInvertedIndex.cpp +++ b/internal/core/src/index/JsonInvertedIndex.cpp @@ -21,6 +21,7 @@ #include "log/Log.h" #include "common/JsonUtils.h" #include "simdjson/error.h" +#include "index/JsonIndexBuilder.h" namespace milvus::index { @@ -42,74 +43,30 @@ template void JsonInvertedIndex::build_index_for_json( const std::vector>& field_datas) { - int64_t offset = 0; LOG_INFO("Start to build json inverted index for field: {}", nested_path_); - using SIMDJSON_T = - std::conditional_t, std::string_view, T>; - auto tokens = parse_json_pointer(nested_path_); - - bool is_array = cast_type_.data_type() == JsonCastType::DataType::ARRAY; - - for (const auto& data : field_datas) { - auto n = data->get_num_rows(); - for (int64_t i = 0; i < n; i++) { - auto json_column = static_cast(data->RawValue(i)); - if (this->schema_.nullable() && !data->is_valid(i)) { - this->null_offset_.push_back(offset); - this->wrapper_->template add_array_data( - nullptr, 0, offset++); - continue; - } - - auto exists = path_exists(json_column->dom_doc(), tokens); - if (!exists || !json_column->exist(nested_path_)) { - error_recorder_.Record( - *json_column, nested_path_, simdjson::NO_SUCH_FIELD); - this->null_offset_.push_back(offset); - this->wrapper_->template add_array_data( - nullptr, 0, offset++); - continue; - } - folly::fbvector values; - if (is_array) { - auto doc = json_column->dom_doc(); - auto array_res = doc.at_pointer(nested_path_).get_array(); - if (array_res.error() != simdjson::SUCCESS) { - error_recorder_.Record( - *json_column, nested_path_, array_res.error()); - } else { - auto array_values = array_res.value(); - for (auto value : array_values) { - auto val = value.template get(); - - if (val.error() == simdjson::SUCCESS) { - values.push_back(static_cast(val.value())); - } - } - } - } else { - if (cast_function_.match()) { - auto res = JsonCastFunction::CastJsonValue( - cast_function_, *json_column, nested_path_); - if (res.has_value()) { - values.push_back(res.value()); - } - } else { - value_result res = - json_column->at(nested_path_); - if (res.error() != simdjson::SUCCESS) { - error_recorder_.Record( - *json_column, nested_path_, res.error()); - } else { - values.push_back(static_cast(res.value())); - } - } - } + ProcessJsonFieldData( + field_datas, + this->schema_, + nested_path_, + cast_type_, + cast_function_, + // add data + [this](const folly::fbvector& values, int64_t offset) { this->wrapper_->template add_array_data( - values.data(), values.size(), offset++); - } - } + values.data(), values.size(), offset); + }, + // handle null + [this](int64_t offset) { + this->null_offset_.push_back(offset); + this->wrapper_->template add_array_data(nullptr, 0, offset); + }, + // handle error + [this](const Json& json, + const std::string& nested_path, + simdjson::error_code error) { + this->error_recorder_.Record(json, nested_path, error); + }); error_recorder_.PrintErrStats(); } diff --git a/internal/core/src/index/NgramInvertedIndex.cpp b/internal/core/src/index/NgramInvertedIndex.cpp index e1402827fd..c5cd172d14 100644 --- a/internal/core/src/index/NgramInvertedIndex.cpp +++ b/internal/core/src/index/NgramInvertedIndex.cpp @@ -11,8 +11,16 @@ #include "index/NgramInvertedIndex.h" #include "exec/expression/Expr.h" +#include "index/JsonIndexBuilder.h" namespace milvus::index { + +const JsonCastType JSON_CAST_TYPE = JsonCastType::FromString("VARCHAR"); +// ngram index doesn't need cast function +const JsonCastFunction JSON_CAST_FUNCTION = + JsonCastFunction::FromString("unknown"); + +// for string/varchar type NgramInvertedIndex::NgramInvertedIndex(const storage::FileManagerContext& ctx, const NgramParams& params) : min_gram_(params.min_gram), max_gram_(params.max_gram) { @@ -34,14 +42,69 @@ NgramInvertedIndex::NgramInvertedIndex(const storage::FileManagerContext& ctx, } } +// for json type +NgramInvertedIndex::NgramInvertedIndex(const storage::FileManagerContext& ctx, + const NgramParams& params, + const std::string& nested_path) + : NgramInvertedIndex(ctx, params) { + nested_path_ = nested_path; +} + void NgramInvertedIndex::BuildWithFieldData(const std::vector& datas) { AssertInfo(schema_.data_type() == proto::schema::DataType::String || - schema_.data_type() == proto::schema::DataType::VarChar, + schema_.data_type() == proto::schema::DataType::VarChar || + schema_.data_type() == proto::schema::DataType::JSON, "schema data type is {}", schema_.data_type()); + LOG_INFO("Start to build ngram index, data type {}, field id: {}", + schema_.data_type(), + field_id_); + index_build_begin_ = std::chrono::system_clock::now(); - InvertedIndexTantivy::BuildWithFieldData(datas); + if (schema_.data_type() == proto::schema::DataType::JSON) { + BuildWithJsonFieldData(datas); + } else { + InvertedIndexTantivy::BuildWithFieldData(datas); + } +} + +void +NgramInvertedIndex::BuildWithJsonFieldData( + const std::vector& field_datas) { + AssertInfo(schema_.data_type() == proto::schema::DataType::JSON, + "schema data should be json, but is {}", + schema_.data_type()); + LOG_INFO("Start to build ngram index for json, field_id: {}, field: {}", + field_id_, + nested_path_); + + index_build_begin_ = std::chrono::system_clock::now(); + ProcessJsonFieldData( + field_datas, + this->schema_, + nested_path_, + JSON_CAST_TYPE, + JSON_CAST_FUNCTION, + // add data + [this](const folly::fbvector& values, int64_t offset) { + this->wrapper_->template add_array_data( + values.data(), values.size(), offset); + }, + // handle null + [this](int64_t offset) { + this->null_offset_.push_back(offset); + this->wrapper_->template add_array_data( + nullptr, 0, offset); + }, + // handle error + [this](const Json& json, + const std::string& nested_path, + simdjson::error_code error) { + this->error_recorder_.Record(json, nested_path, error); + }); + + error_recorder_.PrintErrStats(); } IndexStatsPtr @@ -51,9 +114,12 @@ NgramInvertedIndex::Upload(const Config& config) { auto index_build_duration = std::chrono::duration(index_build_end - index_build_begin_) .count(); - LOG_INFO("index build done for ngram index, field id: {}, duration: {}s", - field_id_, - index_build_duration); + LOG_INFO( + "index build done for ngram index, data type {}, field id: {}, " + "duration: {}s", + schema_.data_type(), + field_id_, + index_build_duration); return InvertedIndexTantivy::Upload(config); } @@ -116,30 +182,88 @@ NgramInvertedIndex::ExecuteQuery(const std::string& literal, } switch (op_type) { - case proto::plan::OpType::InnerMatch: { - auto predicate = [&literal](const std::string_view& data) { - return data.find(literal) != std::string::npos; - }; - bool need_post_filter = literal.length() > max_gram_; - return ExecuteQueryWithPredicate( - literal, segment, predicate, need_post_filter); - } case proto::plan::OpType::Match: return MatchQuery(literal, segment); + case proto::plan::OpType::InnerMatch: { + bool need_post_filter = literal.length() > max_gram_; + + if (schema_.data_type() == proto::schema::DataType::JSON) { + auto predicate = [&literal, this](const milvus::Json& data) { + auto x = + data.template at(this->nested_path_); + if (x.error()) { + return false; + } + auto data_val = x.value(); + return data_val.find(literal) != std::string::npos; + }; + + return ExecuteQueryWithPredicate( + literal, segment, predicate, need_post_filter); + } else { + auto predicate = [&literal](const std::string_view& data) { + return data.find(literal) != std::string::npos; + }; + + return ExecuteQueryWithPredicate( + literal, segment, predicate, need_post_filter); + } + } case proto::plan::OpType::PrefixMatch: { - auto predicate = [&literal](const std::string_view& data) { - return data.length() >= literal.length() && - std::equal(literal.begin(), literal.end(), data.begin()); - }; - return ExecuteQueryWithPredicate(literal, segment, predicate, true); + if (schema_.data_type() == proto::schema::DataType::JSON) { + auto predicate = [&literal, this](const milvus::Json& data) { + auto x = + data.template at(this->nested_path_); + if (x.error()) { + return false; + } + auto data_val = x.value(); + return data_val.length() >= literal.length() && + std::equal(literal.begin(), + literal.end(), + data_val.begin()); + }; + + return ExecuteQueryWithPredicate( + literal, segment, predicate, true); + } else { + auto predicate = [&literal](const std::string_view& data) { + return data.length() >= literal.length() && + std::equal( + literal.begin(), literal.end(), data.begin()); + }; + + return ExecuteQueryWithPredicate( + literal, segment, predicate, true); + } } case proto::plan::OpType::PostfixMatch: { - auto predicate = [&literal](const std::string_view& data) { - return data.length() >= literal.length() && - std::equal( - literal.rbegin(), literal.rend(), data.rbegin()); - }; - return ExecuteQueryWithPredicate(literal, segment, predicate, true); + if (schema_.data_type() == proto::schema::DataType::JSON) { + auto predicate = [&literal, this](const milvus::Json& data) { + auto x = + data.template at(this->nested_path_); + if (x.error()) { + return false; + } + auto data_val = x.value(); + return data_val.length() >= literal.length() && + std::equal(literal.rbegin(), + literal.rend(), + data_val.rbegin()); + }; + + return ExecuteQueryWithPredicate( + literal, segment, predicate, true); + } else { + auto predicate = [&literal](const std::string_view& data) { + return data.length() >= literal.length() && + std::equal( + literal.rbegin(), literal.rend(), data.rbegin()); + }; + + return ExecuteQueryWithPredicate( + literal, segment, predicate, true); + } } default: LOG_WARN("unsupported op type for ngram index: {}", op_type); @@ -147,12 +271,12 @@ NgramInvertedIndex::ExecuteQuery(const std::string& literal, } } +template inline void -handle_batch(const std::string_view* data, - const int32_t* offsets, +handle_batch(const T* data, const int size, TargetBitmapView res, - std::function predicate) { + std::function predicate) { auto next_off_option = res.find_first(); while (next_off_option.has_value()) { auto next_off = next_off_option.value(); @@ -166,34 +290,35 @@ handle_batch(const std::string_view* data, } } +template std::optional NgramInvertedIndex::ExecuteQueryWithPredicate( const std::string& literal, exec::SegmentExpr* segment, - std::function predicate, + std::function predicate, bool need_post_filter) { TargetBitmap bitset{static_cast(Count())}; wrapper_->ngram_match_query(literal, min_gram_, max_gram_, &bitset); - TargetBitmapView res(bitset); - TargetBitmap valid(res.size(), true); - TargetBitmapView valid_res(valid.data(), valid.size()); - if (need_post_filter) { + TargetBitmapView res(bitset); + TargetBitmap valid(res.size(), true); + TargetBitmapView valid_res(valid.data(), valid.size()); + auto execute_batch = [&predicate]( - const std::string_view* data, + const T* data, // `valid_data` is not used as the results returned by ngram_match_query are all valid const bool* _valid_data, - const int32_t* offsets, + const int32_t* _offsets, const int size, TargetBitmapView res, // the same with `valid_data` TargetBitmapView _valid_res) { - handle_batch(data, offsets, size, res, predicate); + handle_batch(data, size, res, predicate); }; - segment->ProcessAllDataChunk( + segment->ProcessAllDataChunk( execute_batch, std::nullptr_t{}, res, valid_res); } @@ -250,24 +375,52 @@ NgramInvertedIndex::MatchQuery(const std::string& literal, auto regex_pattern = translator(literal); RegexMatcher matcher(regex_pattern); - auto predicate = [&matcher](const std::string_view& data) { - return matcher(data); - }; - - auto execute_batch = - [&predicate]( - const std::string_view* data, - // `_valid_data` is not used as the results returned by ngram_match_query are all valid - const bool* _valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - // the same with `_valid_data` - TargetBitmapView _valid_res) { - handle_batch(data, offsets, size, res, predicate); + if (schema_.data_type() == proto::schema::DataType::JSON) { + auto predicate = [&literal, &matcher, this](const milvus::Json& data) { + auto x = data.template at(this->nested_path_); + if (x.error()) { + return false; + } + auto data_val = x.value(); + return matcher(data_val); }; - segment->ProcessAllDataChunk( - execute_batch, std::nullptr_t{}, res, valid_res); + + auto execute_batch_json = + [&predicate]( + const milvus::Json* data, + // `valid_data` is not used as the results returned by ngram_match_query are all valid + const bool* _valid_data, + const int32_t* _offsets, + const int size, + TargetBitmapView res, + // the same with `valid_data` + TargetBitmapView _valid_res, + std::string val) { + handle_batch(data, size, res, predicate); + }; + + segment->ProcessAllDataChunk( + execute_batch_json, std::nullptr_t{}, res, valid_res, literal); + } else { + auto predicate = [&matcher](const std::string_view& data) { + return matcher(data); + }; + + auto execute_batch = + [&predicate]( + const std::string_view* data, + // `valid_data` is not used as the results returned by ngram_match_query are all valid + const bool* _valid_data, + const int32_t* _offsets, + const int size, + TargetBitmapView res, + // the same with `valid_data` + TargetBitmapView _valid_res) { + handle_batch(data, size, res, predicate); + }; + segment->ProcessAllDataChunk( + execute_batch, std::nullptr_t{}, res, valid_res); + } return std::optional(std::move(bitset)); } diff --git a/internal/core/src/index/NgramInvertedIndex.h b/internal/core/src/index/NgramInvertedIndex.h index 33418157b1..43bc828b0d 100644 --- a/internal/core/src/index/NgramInvertedIndex.h +++ b/internal/core/src/index/NgramInvertedIndex.h @@ -13,7 +13,7 @@ #include #include #include - +#include "index/JsonInvertedIndex.h" #include "index/InvertedIndexTantivy.h" namespace milvus::exec { @@ -23,9 +23,15 @@ class SegmentExpr; namespace milvus::index { class NgramInvertedIndex : public InvertedIndexTantivy { public: + // for string/varchar type explicit NgramInvertedIndex(const storage::FileManagerContext& ctx, const NgramParams& params); + // for json type + explicit NgramInvertedIndex(const storage::FileManagerContext& ctx, + const NgramParams& params, + const std::string& nested_path); + IndexStatsPtr Upload(const Config& config = {}) override; @@ -35,18 +41,31 @@ class NgramInvertedIndex : public InvertedIndexTantivy { void BuildWithFieldData(const std::vector& datas) override; + void + BuildWithJsonFieldData(const std::vector& datas); + std::optional ExecuteQuery(const std::string& literal, proto::plan::OpType op_type, exec::SegmentExpr* segment); + void + finish() { + this->wrapper_->finish(); + } + + void + create_reader(SetBitsetFn set_bitset) { + this->wrapper_->create_reader(set_bitset); + } + private: + template std::optional - ExecuteQueryWithPredicate( - const std::string& literal, - exec::SegmentExpr* segment, - std::function predicate, - bool need_post_filter); + ExecuteQueryWithPredicate(const std::string& literal, + exec::SegmentExpr* segment, + std::function predicate, + bool need_post_filter); // Match is something like xxx%xxx%xxx, xxx%xxx, %xxx%xxx, xxx_x etc. std::optional @@ -57,5 +76,9 @@ class NgramInvertedIndex : public InvertedIndexTantivy { uintptr_t max_gram_{0}; int64_t field_id_{0}; std::chrono::time_point index_build_begin_; + + // for json type + std::string nested_path_; + JsonInvertedIndexParseErrorRecorder error_recorder_; }; } // namespace milvus::index \ No newline at end of file diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp index 0f0cd2fdf8..e10623ab72 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp @@ -172,14 +172,27 @@ ChunkedSegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) { if (field_meta.get_data_type() == DataType::JSON) { auto path = info.index_params.at(JSON_PATH); - JsonIndex index; - index.nested_path = path; - index.field_id = field_id; - index.index = std::move(const_cast(info).cache_index); - index.cast_type = - JsonCastType::FromString(info.index_params.at(JSON_CAST_TYPE)); - json_indices.push_back(std::move(index)); - return; + if (auto it = info.index_params.find(index::INDEX_TYPE); + it != info.index_params.end() && + it->second == index::NGRAM_INDEX_TYPE) { + if (ngram_indexings_.find(field_id) == ngram_indexings_.end()) { + ngram_indexings_[field_id] = + std::unordered_map(); + } + ngram_indexings_[field_id][path] = + std::move(const_cast(info).cache_index); + return; + } else { + JsonIndex index; + index.nested_path = path; + index.field_id = field_id; + index.index = + std::move(const_cast(info).cache_index); + index.cast_type = + JsonCastType::FromString(info.index_params.at(JSON_CAST_TYPE)); + json_indices.push_back(std::move(index)); + return; + } } if (auto it = info.index_params.find(index::INDEX_TYPE); @@ -637,6 +650,29 @@ ChunkedSegmentSealedImpl::GetNgramIndex(FieldId field_id) const { return PinWrapper(ca, index); } +PinWrapper +ChunkedSegmentSealedImpl::GetNgramIndexForJson( + FieldId field_id, const std::string& nested_path) const { + std::shared_lock lck(mutex_); + auto iter = ngram_indexings_.find(field_id); + if (iter == ngram_indexings_.end() || + iter->second.find(nested_path) == iter->second.end()) { + return PinWrapper(nullptr); + } + + auto slot = iter->second.at(nested_path).get(); + lck.unlock(); + + auto ca = SemiInlineGet(slot->PinCells({0})); + auto index = dynamic_cast(ca->get_cell_of(0)); + AssertInfo(index != nullptr, + "ngram index cache for json is corrupted, field_id: {}, " + "nested_path: {}", + field_id.get(), + nested_path); + return PinWrapper(ca, index); +} + int64_t ChunkedSegmentSealedImpl::get_row_count() const { std::shared_lock lck(mutex_); @@ -1127,6 +1163,7 @@ ChunkedSegmentSealedImpl::ClearData() { ngram_fields_.clear(); scalar_indexings_.clear(); vector_indexings_.clear(); + ngram_indexings_.clear(); insert_record_.clear(); fields_.clear(); variable_fields_avg_size_.clear(); diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.h b/internal/core/src/segcore/ChunkedSegmentSealedImpl.h index aba051624f..aa4dd6df86 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.h +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.h @@ -123,9 +123,22 @@ class ChunkedSegmentSealedImpl : public SegmentSealed { return ngram_fields_.find(field_id) != ngram_fields_.end(); } + bool + HasNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const override { + std::shared_lock lck(mutex_); + return ngram_indexings_.find(field_id) != ngram_indexings_.end() && + ngram_indexings_.at(field_id).find(nested_path) != + ngram_indexings_.at(field_id).end(); + } + PinWrapper GetNgramIndex(FieldId field_id) const override; + PinWrapper + GetNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const override; + // TODO(tiered storage 1): should return a PinWrapper void BulkGetJsonData(FieldId field_id, @@ -432,6 +445,12 @@ class ChunkedSegmentSealedImpl : public SegmentSealed { // TODO: generate index for scalar std::optional num_rows_; + // ngram indexings for json type + std::unordered_map< + FieldId, + std::unordered_map> + ngram_indexings_; + // fields that has ngram index std::unordered_set ngram_fields_{}; diff --git a/internal/core/src/segcore/SegmentInterface.cpp b/internal/core/src/segcore/SegmentInterface.cpp index 988b3f8d5e..d607b7d2c3 100644 --- a/internal/core/src/segcore/SegmentInterface.cpp +++ b/internal/core/src/segcore/SegmentInterface.cpp @@ -542,4 +542,21 @@ SegmentInternalInterface::GetNgramIndex(FieldId field_id) const { return PinWrapper(nullptr); } +PinWrapper +SegmentInternalInterface::GetNgramIndexForJson( + FieldId field_id, const std::string& nested_path) const { + return PinWrapper(nullptr); +} + +bool +SegmentInternalInterface::HasNgramIndex(FieldId field_id) const { + return false; +} + +bool +SegmentInternalInterface::HasNgramIndexForJson( + FieldId field_id, const std::string& nested_path) const { + return false; +} + } // namespace milvus::segcore diff --git a/internal/core/src/segcore/SegmentInterface.h b/internal/core/src/segcore/SegmentInterface.h index c7a6962b7f..48d908debd 100644 --- a/internal/core/src/segcore/SegmentInterface.h +++ b/internal/core/src/segcore/SegmentInterface.h @@ -155,6 +155,17 @@ class SegmentInterface { virtual PinWrapper GetNgramIndex(FieldId field_id) const = 0; + virtual PinWrapper + GetNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const = 0; + + virtual bool + HasNgramIndex(FieldId field_id) const = 0; + + virtual bool + HasNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const = 0; + virtual void LazyCheckSchema(SchemaPtr sch) = 0; @@ -369,6 +380,17 @@ class SegmentInternalInterface : public SegmentInterface { virtual PinWrapper GetNgramIndex(FieldId field_id) const override; + virtual PinWrapper + GetNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const override; + + virtual bool + HasNgramIndex(FieldId field_id) const override; + + virtual bool + HasNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const override; + public: virtual void vector_search(SearchInfo& search_info, diff --git a/internal/core/src/segcore/SegmentSealed.h b/internal/core/src/segcore/SegmentSealed.h index 9442c8f622..b7f9bbd574 100644 --- a/internal/core/src/segcore/SegmentSealed.h +++ b/internal/core/src/segcore/SegmentSealed.h @@ -107,9 +107,17 @@ class SegmentSealed : public SegmentInternalInterface { virtual bool HasNgramIndex(FieldId field_id) const = 0; + virtual bool + HasNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const = 0; + virtual PinWrapper GetNgramIndex(FieldId field_id) const override = 0; + virtual PinWrapper + GetNgramIndexForJson(FieldId field_id, + const std::string& nested_path) const override = 0; + SegmentType type() const override { return SegmentType::Sealed; diff --git a/internal/core/unittest/test_expr.cpp b/internal/core/unittest/test_expr.cpp index 552c56db17..a6cacf20d9 100644 --- a/internal/core/unittest/test_expr.cpp +++ b/internal/core/unittest/test_expr.cpp @@ -16530,9 +16530,11 @@ TYPED_TEST(JsonIndexTestFixture, TestJsonIndexUnaryExpr) { file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get()); file_manager_ctx.fieldDataMeta.field_id = json_fid.get(); auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - this->cast_type, - this->json_path, + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = this->cast_type, + .json_path = this->json_path, + }, file_manager_ctx); using json_index_type = @@ -16664,10 +16666,13 @@ TEST(JsonIndexTest, TestJsonNotEqualExpr) { file_manager_ctx.fieldDataMeta.field_schema.set_data_type( milvus::proto::schema::JSON); file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get()); + auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - JsonCastType::FromString("DOUBLE"), - "/a", + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("DOUBLE"), + .json_path = "/a", + }, file_manager_ctx); using json_index_type = index::JsonInvertedIndex; @@ -16772,9 +16777,11 @@ TEST_P(JsonIndexExistsTest, TestExistsExpr) { file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get()); file_manager_ctx.fieldDataMeta.field_schema.set_nullable(true); auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - JsonCastType::FromString("DOUBLE"), - json_index_path, + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("DOUBLE"), + .json_path = json_index_path, + }, file_manager_ctx); using json_index_type = index::JsonInvertedIndex; @@ -16958,7 +16965,12 @@ TEST_P(JsonIndexBinaryExprTest, TestBinaryRangeExpr) { file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get()); auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, GetParam(), "/a", file_manager_ctx); + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = GetParam(), + .json_path = "/a", + }, + file_manager_ctx); using json_index_type = index::JsonInvertedIndex; auto json_index = std::unique_ptr( diff --git a/internal/core/unittest/test_json_flat_index.cpp b/internal/core/unittest/test_json_flat_index.cpp index 79f1a08a3a..34b1324270 100644 --- a/internal/core/unittest/test_json_flat_index.cpp +++ b/internal/core/unittest/test_json_flat_index.cpp @@ -609,9 +609,11 @@ class JsonFlatIndexExprTest : public ::testing::Test { json_fid_.get()); file_manager_ctx.fieldDataMeta.field_schema.set_nullable(true); auto index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - JsonCastType::FromString("JSON"), - json_index_path, + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("JSON"), + .json_path = json_index_path, + }, file_manager_ctx); json_index_ = std::unique_ptr( diff --git a/internal/core/unittest/test_json_index.cpp b/internal/core/unittest/test_json_index.cpp index 9fd7ec243e..a329cc75a0 100644 --- a/internal/core/unittest/test_json_index.cpp +++ b/internal/core/unittest/test_json_index.cpp @@ -57,9 +57,11 @@ TEST(JsonIndexTest, TestJSONErrRecorder) { file_manager_ctx.fieldDataMeta.field_id = json_fid.get(); auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - JsonCastType::FromString("DOUBLE"), - json_path, + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("DOUBLE"), + .json_path = json_path, + }, file_manager_ctx); auto json_index = std::unique_ptr>( static_cast*>(inv_index.release())); @@ -118,9 +120,11 @@ TEST(JsonIndexTest, TestJsonContains) { file_manager_ctx.fieldDataMeta.field_id = json_fid.get(); auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - JsonCastType::FromString("ARRAY_DOUBLE"), - json_path, + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("ARRAY_DOUBLE"), + .json_path = json_path, + }, file_manager_ctx); auto json_index = std::unique_ptr>( static_cast*>(inv_index.release())); @@ -212,11 +216,13 @@ TEST(JsonIndexTest, TestJsonCast) { file_manager_ctx.fieldDataMeta.field_id = json_fid.get(); auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( - index::INVERTED_INDEX_TYPE, - JsonCastType::FromString("DOUBLE"), - json_path, - file_manager_ctx, - "STRING_TO_DOUBLE"); + index::CreateIndexInfo{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("DOUBLE"), + .json_path = json_path, + .json_cast_function = "STRING_TO_DOUBLE", + }, + file_manager_ctx); auto json_index = std::unique_ptr>( static_cast*>(inv_index.release())); diff --git a/internal/core/unittest/test_ngram_query.cpp b/internal/core/unittest/test_ngram_query.cpp index d35bccfab5..6082e79f40 100644 --- a/internal/core/unittest/test_ngram_query.cpp +++ b/internal/core/unittest/test_ngram_query.cpp @@ -22,6 +22,8 @@ #include "index/IndexFactory.h" #include "index/NgramInvertedIndex.h" #include "segcore/load_index_c.h" +#include "test_cachinglayer/cachinglayer_test_utils.h" +#include "expr/ITypeExpr.h" using namespace milvus; using namespace milvus::query; @@ -381,3 +383,137 @@ TEST(NgramIndex, TestNgramSimple) { proto::plan::OpType::PostfixMatch, std::vector(10000, true)); } + +TEST(NgramIndex, TestNgramJson) { + std::vector json_raw_data = { + R"(1)", + R"({"a": "Milvus project"})", + R"({"a": "Zilliz cloud"})", + R"({"a": "Query Node"})", + R"({"a": "Data Node"})", + R"({"a": [1, 2, 3]})", + R"({"a": {"b": 1}})", + R"({"a": 1001})", + R"({"a": true})", + R"({"a": "Milvus", "b": "Zilliz cloud"})", + }; + + auto json_path = "/a"; + auto schema = std::make_shared(); + auto json_fid = schema->AddDebugField("json", DataType::JSON); + + auto file_manager_ctx = storage::FileManagerContext(); + file_manager_ctx.fieldDataMeta.field_schema.set_data_type( + milvus::proto::schema::JSON); + file_manager_ctx.fieldDataMeta.field_schema.set_fieldid(json_fid.get()); + file_manager_ctx.fieldDataMeta.field_id = json_fid.get(); + + index::CreateIndexInfo create_index_info{ + .index_type = index::INVERTED_INDEX_TYPE, + .json_cast_type = JsonCastType::FromString("VARCHAR"), + .json_path = json_path, + .ngram_params = std::optional{index::NgramParams{ + .loading_index = false, + .min_gram = 2, + .max_gram = 3, + }}, + }; + auto inv_index = index::IndexFactory::GetInstance().CreateJsonIndex( + create_index_info, file_manager_ctx); + + auto ngram_index = std::unique_ptr( + static_cast(inv_index.release())); + + std::vector jsons; + for (auto& json : json_raw_data) { + jsons.push_back(milvus::Json(simdjson::padded_string(json))); + } + + auto json_field = + std::make_shared>(DataType::JSON, false); + json_field->add_json_data(jsons); + ngram_index->BuildWithFieldData({json_field}); + ngram_index->finish(); + ngram_index->create_reader(milvus::index::SetBitsetSealed); + + auto segment = segcore::CreateSealedSegment(schema); + segcore::LoadIndexInfo load_index_info; + load_index_info.field_id = json_fid.get(); + load_index_info.field_type = DataType::JSON; + load_index_info.cache_index = + CreateTestCacheIndex("", std::move(ngram_index)); + + std::map index_params{ + {milvus::index::INDEX_TYPE, milvus::index::NGRAM_INDEX_TYPE}, + {milvus::index::MIN_GRAM, "2"}, + {milvus::index::MAX_GRAM, "3"}, + {milvus::LOAD_PRIORITY, "HIGH"}, + {JSON_PATH, json_path}, + {JSON_CAST_TYPE, "VARCHAR"}}; + load_index_info.index_params = index_params; + + segment->LoadIndex(load_index_info); + + auto cm = milvus::storage::RemoteChunkManagerSingleton::GetInstance() + .GetRemoteChunkManager(); + auto load_info = PrepareSingleFieldInsertBinlog( + 0, 0, 0, json_fid.get(), {json_field}, cm); + segment->LoadFieldData(load_info); + + std::vector, + proto::plan::OpType>> + test_cases; + proto::plan::GenericValue value; + value.set_string_val("nothing"); + test_cases.push_back(std::make_tuple( + value, std::vector{}, proto::plan::OpType::InnerMatch)); + + value.set_string_val("il"); + test_cases.push_back(std::make_tuple( + value, std::vector{1, 2, 9}, proto::plan::OpType::InnerMatch)); + + value.set_string_val("lliz"); + test_cases.push_back(std::make_tuple( + value, std::vector{2}, proto::plan::OpType::InnerMatch)); + + value.set_string_val("Zi"); + test_cases.push_back(std::make_tuple( + value, std::vector{2}, proto::plan::OpType::PrefixMatch)); + + value.set_string_val("Zilliz"); + test_cases.push_back(std::make_tuple( + value, std::vector{2}, proto::plan::OpType::PrefixMatch)); + + value.set_string_val("de"); + test_cases.push_back(std::make_tuple( + value, std::vector{3, 4}, proto::plan::OpType::PostfixMatch)); + + value.set_string_val("Node"); + test_cases.push_back(std::make_tuple( + value, std::vector{3, 4}, proto::plan::OpType::PostfixMatch)); + + value.set_string_val("%ery%ode%"); + test_cases.push_back(std::make_tuple( + value, std::vector{3}, proto::plan::OpType::Match)); + + for (auto& test_case : test_cases) { + auto value = std::get<0>(test_case); + auto expr = std::make_shared( + milvus::expr::ColumnInfo(json_fid, DataType::JSON, {"a"}, true), + std::get<2>(test_case), + value, + std::vector{}); + + auto plan = + std::make_shared(DEFAULT_PLANNODE_ID, expr); + + auto result = milvus::query::ExecuteQueryExpr( + plan, segment.get(), json_raw_data.size(), MAX_TIMESTAMP); + auto expect_result = std::get<1>(test_case); + EXPECT_EQ(result.count(), expect_result.size()); + for (auto& id : expect_result) { + EXPECT_TRUE(result[id]); + } + } +} \ No newline at end of file diff --git a/internal/util/indexparamcheck/ngram_index_checker.go b/internal/util/indexparamcheck/ngram_index_checker.go index 86a3ad6e12..416183dd81 100644 --- a/internal/util/indexparamcheck/ngram_index_checker.go +++ b/internal/util/indexparamcheck/ngram_index_checker.go @@ -23,9 +23,8 @@ func newNgramIndexChecker() *NgramIndexChecker { } func (c *NgramIndexChecker) CheckTrain(dataType schemapb.DataType, params map[string]string) error { - if dataType != schemapb.DataType_VarChar { - // todo(SpadeA): we may support it for json in the future - return merr.WrapErrParameterInvalidMsg("Ngram index can only be created on VARCHAR field") + if dataType != schemapb.DataType_VarChar && dataType != schemapb.DataType_JSON { + return merr.WrapErrParameterInvalidMsg("Ngram index can only be created on VARCHAR or JSON field") } minGramStr, minGramExist := params[MinGramKey] @@ -53,8 +52,8 @@ func (c *NgramIndexChecker) CheckTrain(dataType schemapb.DataType, params map[st func (c *NgramIndexChecker) CheckValidDataType(indexType IndexType, field *schemapb.FieldSchema) error { dType := field.GetDataType() - if !typeutil.IsStringType(dType) { - return fmt.Errorf("ngram index can only be created on VARCHAR field") + if !typeutil.IsStringType(dType) && dType != schemapb.DataType_JSON { + return fmt.Errorf("ngram index can only be created on VARCHAR or JSON field") } return nil }