// Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "JsonContainsExpr.h" #include #include "common/Types.h" namespace milvus { namespace exec { void PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) { auto input = context.get_offset_input(); SetHasOffsetInput((input != nullptr)); if (expr_->vals_.empty()) { auto real_batch_size = has_offset_input_ ? context.get_offset_input()->size() : GetNextBatchSize(); if (real_batch_size == 0) { result = nullptr; return; } if (expr_->op_ == proto::plan::JSONContainsExpr_JSONOp_ContainsAll) { result = std::make_shared( TargetBitmap(real_batch_size, true), TargetBitmap(real_batch_size, true)); } else { result = std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } MoveCursor(); return; } switch (expr_->column_.data_type_) { case DataType::ARRAY: { if (SegmentExpr::CanUseIndex() && !has_offset_input_) { result = EvalArrayContainsForIndexSegment( expr_->column_.element_type_); } else { result = EvalJsonContainsForDataSegment(context); } break; } case DataType::JSON: { if (SegmentExpr::CanUseIndex() && !has_offset_input_) { result = EvalArrayContainsForIndexSegment( value_type_ == DataType::INT64 ? DataType::DOUBLE : value_type_); } else { result = EvalJsonContainsForDataSegment(context); } break; } default: ThrowInfo(DataTypeInvalid, "unsupported data type: {}", expr_->column_.data_type_); } } VectorPtr PhyJsonContainsFilterExpr::EvalJsonContainsForDataSegment(EvalCtx& context) { auto data_type = expr_->column_.data_type_; switch (expr_->op_) { case proto::plan::JSONContainsExpr_JSONOp_Contains: case proto::plan::JSONContainsExpr_JSONOp_ContainsAny: { if (IsArrayDataType(data_type)) { auto val_type = expr_->column_.element_type_; switch (val_type) { case DataType::BOOL: { return ExecArrayContains(context); } case DataType::INT8: case DataType::INT16: case DataType::INT32: case DataType::INT64: { return ExecArrayContains(context); } case DataType::FLOAT: case DataType::DOUBLE: { return ExecArrayContains(context); } case DataType::STRING: case DataType::VARCHAR: { return ExecArrayContains(context); } default: ThrowInfo( DataTypeInvalid, fmt::format("unsupported array sub element type {}", val_type)); } } else { if (expr_->same_type_) { auto val_type = expr_->vals_[0].val_case(); switch (val_type) { case proto::plan::GenericValue::kBoolVal: { return ExecJsonContains(context); } case proto::plan::GenericValue::kInt64Val: { return ExecJsonContains(context); } case proto::plan::GenericValue::kFloatVal: { return ExecJsonContains(context); } case proto::plan::GenericValue::kStringVal: { return ExecJsonContains(context); } case proto::plan::GenericValue::kArrayVal: { return ExecJsonContainsArray(context); } default: ThrowInfo(DataTypeInvalid, "unsupported data type:{}", val_type); } } else { return ExecJsonContainsWithDiffType(context); } } } case proto::plan::JSONContainsExpr_JSONOp_ContainsAll: { if (IsArrayDataType(data_type)) { auto val_type = expr_->column_.element_type_; switch (val_type) { case DataType::BOOL: { return ExecArrayContainsAll(context); } case DataType::INT8: case DataType::INT16: case DataType::INT32: case DataType::INT64: { return ExecArrayContainsAll(context); } case DataType::FLOAT: case DataType::DOUBLE: { return ExecArrayContainsAll(context); } case DataType::STRING: case DataType::VARCHAR: { return ExecArrayContainsAll(context); } default: ThrowInfo( DataTypeInvalid, fmt::format("unsupported array sub element type {}", val_type)); } } else { if (expr_->same_type_) { auto val_type = expr_->vals_[0].val_case(); switch (val_type) { case proto::plan::GenericValue::kBoolVal: { return ExecJsonContainsAll(context); } case proto::plan::GenericValue::kInt64Val: { return ExecJsonContainsAll(context); } case proto::plan::GenericValue::kFloatVal: { return ExecJsonContainsAll(context); } case proto::plan::GenericValue::kStringVal: { return ExecJsonContainsAll(context); } case proto::plan::GenericValue::kArrayVal: { return ExecJsonContainsAllArray(context); } default: ThrowInfo(DataTypeInvalid, "unsupported data type:{}", val_type); } } else { return ExecJsonContainsAllWithDiffType(context); } } } default: ThrowInfo(ExprInvalid, "unsupported json contains type {}", proto::plan::JSONContainsExpr_JSONOp_Name(expr_->op_)); } } template VectorPtr PhyJsonContainsFilterExpr::ExecArrayContains(EvalCtx& context) { using GetType = std::conditional_t, std::string_view, ExprValueType>; auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } AssertInfo(expr_->column_.nested_path_.size() == 0, "[ExecArrayContains]nested path must be null"); auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); if (!arg_inited_) { arg_set_ = std::make_shared>(expr_->vals_); arg_inited_ = true; } int processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::ArrayView* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::shared_ptr& elements) { auto executor = [&](size_t i) { const auto& array = data[i]; for (int j = 0; j < array.length(); ++j) { if (elements->In(array.template get_data(j))) { return true; } } return false; }; bool has_bitmap_input = !bitmap_input.empty(); for (int i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, arg_set_); } else { processed_size = ProcessDataChunks( execute_sub_batch, std::nullptr_t{}, res, valid_res, arg_set_); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } template VectorPtr PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) { using GetType = std::conditional_t, std::string_view, ExprValueType>; auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { return ExecJsonContainsByStats(); } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); if (!arg_inited_) { arg_set_ = std::make_shared>(expr_->vals_); arg_inited_ = true; } size_t processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::Json* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer, const std::shared_ptr& elements) { auto executor = [&](size_t i) { auto doc = data[i].doc(); auto array = doc.at_pointer(pointer).get_array(); if (array.error()) { return false; } for (auto&& it : array) { auto val = it.template get(); if (val.error()) { continue; } if (elements->In(val.value()) > 0) { return true; } } return false; }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, arg_set_); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, arg_set_); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } template VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsByStats() { using GetType = std::conditional_t, std::string_view, ExprValueType>; auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } std::unordered_set elements; auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); if (!arg_inited_) { arg_set_ = std::make_shared>(expr_->vals_); if constexpr (std::is_same_v) { // for int64_t, we need to a double vector to store the values auto sort_arg_set = std::dynamic_pointer_cast>(arg_set_); std::vector double_vals; for (const auto& val : sort_arg_set->GetElements()) { double_vals.emplace_back(static_cast(val)); } arg_set_double_ = std::make_shared>(double_vals); } else if constexpr (std::is_same_v) { arg_set_double_ = arg_set_; } arg_inited_ = true; } if (arg_set_->Empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0 && segment_->type() == SegmentType::Sealed) { auto* segment = dynamic_cast(segment_); auto field_id = expr_->column_.field_id_; pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); cached_index_chunk_res_ = std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); TargetBitmapView valid_res_view(*cached_index_chunk_valid_res_); // process shredding data for ARRAY type (non-shared) { auto target_field = index->GetShreddingField( pointer, milvus::index::JSONType::ARRAY); if (!target_field.empty()) { ShreddingArrayBsonContainsAnyExecutor executor( arg_set_, arg_set_double_); index->ExecutorForShreddingData( op_ctx_, target_field, executor, nullptr, res_view, valid_res_view); } } // process shared data auto shared_executor = [this, &res_view](milvus::BsonView bson, uint32_t row_offset, uint32_t value_offset) { auto val = bson.ParseAsArrayAtOffset(value_offset); if (!val.has_value()) { res_view[row_offset] = false; return; } for (const auto& element : val.value()) { if constexpr (std::is_same_v || std::is_same_v) { auto value = milvus::BsonView::GetValueFromBsonView( element.get_value()); if (value.has_value() && this->arg_set_double_->In(value.value())) { res_view[row_offset] = true; return; } } else { auto value = milvus::BsonView::GetValueFromBsonView( element.get_value()); if (value.has_value() && this->arg_set_->In(value.value())) { res_view[row_offset] = true; return; } } } }; if (!index->CanSkipShared(pointer)) { index->ExecuteForSharedData(op_ctx_, pointer, shared_executor); } cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( *cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { return ExecJsonContainsArrayByStats(); } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); std::vector elements; for (auto const& element : expr_->vals_) { elements.emplace_back(GetValueFromProto(element)); } size_t processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::Json* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { auto executor = [&](size_t i) -> bool { auto doc = data[i].doc(); auto array = doc.at_pointer(pointer).get_array(); if (array.error()) { return false; } for (auto&& it : array) { auto val = it.get_array(); if (val.error()) { continue; } std::vector< simdjson::simdjson_result> json_array; json_array.reserve(val.count_elements()); for (auto&& e : val) { json_array.emplace_back(e); } for (auto const& element : elements) { if (CompareTwoJsonArray(json_array, element)) { return true; } } } return false; }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, elements); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsArrayByStats() { auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } std::vector elements; auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); for (auto const& element : expr_->vals_) { elements.emplace_back(GetValueFromProto(element)); } if (elements.empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0 && segment_->type() == SegmentType::Sealed) { auto* segment = dynamic_cast(segment_); auto field_id = expr_->column_.field_id_; pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); cached_index_chunk_res_ = std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); TargetBitmapView valid_res_view(*cached_index_chunk_valid_res_); // process shredding data for ARRAY type (non-shared) { auto target_field = index->GetShreddingField( pointer, milvus::index::JSONType::ARRAY); if (!target_field.empty()) { ShreddingArrayBsonContainsArrayExecutor executor(elements); index->ExecutorForShreddingData( op_ctx_, target_field, executor, nullptr, res_view, valid_res_view); } } auto shared_executor = [&elements, &res_view](milvus::BsonView bson, uint32_t row_offset, uint32_t value_offset) { auto array = bson.ParseAsArrayAtOffset(value_offset); if (!array.has_value()) { res_view[row_offset] = false; } for (const auto& sub_value : array.value()) { auto sub_array = milvus::BsonView::GetValueFromBsonView< bsoncxx::array::view>(sub_value.get_value()); if (!sub_array.has_value()) continue; for (const auto& element : elements) { if (CompareTwoJsonArray(sub_array.value(), element)) { return true; } } } return false; }; if (!index->CanSkipShared(pointer)) { index->ExecuteForSharedData(op_ctx_, pointer, shared_executor); } cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( *cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } template VectorPtr PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) { using GetType = std::conditional_t, std::string_view, ExprValueType>; auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); AssertInfo(expr_->column_.nested_path_.size() == 0, "[ExecArrayContainsAll]nested path must be null"); auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); std::set elements; for (auto const& element : expr_->vals_) { elements.insert(GetValueWithCastNumber(element)); } int processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::ArrayView* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::set& elements) { auto executor = [&](size_t i) { std::set tmp_elements(elements); // Note: array can only be iterated once for (int j = 0; j < data[i].length(); ++j) { tmp_elements.erase(data[i].template get_data(j)); if (tmp_elements.size() == 0) { return true; } } return tmp_elements.size() == 0; }; bool has_bitmap_input = !bitmap_input.empty(); for (int i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, elements); } else { processed_size = ProcessDataChunks( execute_sub_batch, std::nullptr_t{}, res, valid_res, elements); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } template VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) { using GetType = std::conditional_t, std::string_view, ExprValueType>; auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { return ExecJsonContainsAllByStats(); } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); std::set elements; for (auto const& element : expr_->vals_) { elements.insert(GetValueFromProto(element)); } int processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::Json* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer, const std::set& elements) { auto executor = [&](const size_t i) -> bool { auto doc = data[i].doc(); auto array = doc.at_pointer(pointer).get_array(); if (array.error()) { return false; } std::set tmp_elements(elements); // Note: array can only be iterated once for (auto&& it : array) { auto val = it.template get(); if (val.error()) { continue; } tmp_elements.erase(val.value()); if (tmp_elements.size() == 0) { return true; } } return tmp_elements.size() == 0; }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, elements); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } template VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllByStats() { using GetType = std::conditional_t, std::string_view, ExprValueType>; auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } std::set elements; auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); for (auto const& element : expr_->vals_) { elements.insert(GetValueFromProto(element)); } if (elements.empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0 && segment_->type() == SegmentType::Sealed) { auto* segment = dynamic_cast(segment_); auto field_id = expr_->column_.field_id_; pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); cached_index_chunk_res_ = std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); TargetBitmapView valid_res_view(*cached_index_chunk_valid_res_); // process shredding data for ARRAY type (non-shared) { auto target_field = index->GetShreddingField( pointer, milvus::index::JSONType::ARRAY); if (!target_field.empty()) { ShreddingArrayBsonContainsAllExecutor executor( elements); index->ExecutorForShreddingData( op_ctx_, target_field, executor, nullptr, res_view, valid_res_view); } } // process shared data auto shared_executor = [this, &elements, &res_view]( milvus::BsonView bson, uint32_t row_offset, uint32_t value_offset) { auto val = bson.ParseAsArrayAtOffset(value_offset); if (!val.has_value()) { res_view[row_offset] = false; return; } std::set tmp_elements(elements); for (const auto& element : val.value()) { auto value = milvus::BsonView::GetValueFromBsonView( element.get_value()); if (!value.has_value()) { continue; } tmp_elements.erase(value.value()); if (tmp_elements.size() == 0) { res_view[row_offset] = true; return; } } res_view[row_offset] = tmp_elements.empty(); }; if (!index->CanSkipShared(pointer)) { index->ExecuteForSharedData(op_ctx_, pointer, shared_executor); } cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( *cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { return ExecJsonContainsAllWithDiffTypeByStats(); } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); auto elements = expr_->vals_; std::unordered_set elements_index; int i = 0; for (auto& element : elements) { elements_index.insert(i); i++; } int processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::Json* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements, const std::unordered_set elements_index) { auto executor = [&](size_t i) -> bool { const auto& json = data[i]; auto doc = json.doc(); auto array = doc.at_pointer(pointer).get_array(); if (array.error()) { return false; } std::unordered_set tmp_elements_index(elements_index); for (auto&& it : array) { int i = -1; for (auto& element : elements) { i++; switch (element.val_case()) { case proto::plan::GenericValue::kBoolVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.bool_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kInt64Val: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.int64_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kFloatVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.float_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kStringVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.string_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kArrayVal: { auto val = it.get_array(); if (val.error()) { continue; } if (CompareTwoJsonArray(val, element.array_val())) { tmp_elements_index.erase(i); } break; } default: ThrowInfo(DataTypeInvalid, fmt::format("unsupported data type {}", element.val_case())); } if (tmp_elements_index.size() == 0) { return true; } } if (tmp_elements_index.size() == 0) { return true; } } return tmp_elements_index.size() == 0; }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, elements, elements_index); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements, elements_index); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByStats() { auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); auto elements = expr_->vals_; std::set elements_index; int i = 0; for (auto& element : elements) { elements_index.insert(i); i++; } if (elements.empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0 && segment_->type() == SegmentType::Sealed) { auto* segment = dynamic_cast(segment_); auto field_id = expr_->column_.field_id_; pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); cached_index_chunk_res_ = std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); TargetBitmapView valid_res_view(*cached_index_chunk_valid_res_); // process shredding data for ARRAY type (non-shared) { auto target_field = index->GetShreddingField( pointer, milvus::index::JSONType::ARRAY); if (!target_field.empty()) { ShreddingArrayBsonContainsAllWithDiffTypeExecutor executor( elements, elements_index); index->ExecutorForShreddingData( op_ctx_, target_field, executor, nullptr, res_view, valid_res_view); } } auto shared_executor = [&elements, &elements_index, &res_view]( milvus::BsonView bson, uint32_t row_offset, uint32_t value_offset) { std::set tmp_elements_index(elements_index); auto array = bson.ParseAsArrayAtOffset(value_offset); if (!array.has_value()) { res_view[row_offset] = false; return; } for (const auto& sub_value : array.value()) { int i = -1; for (auto& element : elements) { i++; switch (element.val_case()) { case proto::plan::GenericValue::kBoolVal: { auto val = milvus::BsonView::GetValueFromBsonView( sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.bool_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kInt64Val: { auto val = milvus::BsonView::GetValueFromBsonView( sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.int64_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kFloatVal: { auto val = milvus::BsonView::GetValueFromBsonView( sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.float_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kStringVal: { auto val = milvus::BsonView::GetValueFromBsonView< std::string>(sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.string_val()) { tmp_elements_index.erase(i); } break; } case proto::plan::GenericValue::kArrayVal: { auto val = milvus::BsonView::GetValueFromBsonView< bsoncxx::array::view>(sub_value.get_value()); if (!val.has_value()) { continue; } if (CompareTwoJsonArray(val.value(), element.array_val())) { tmp_elements_index.erase(i); } break; } default: ThrowInfo(DataTypeInvalid, fmt::format("unsupported data type {}", element.val_case())); } if (tmp_elements_index.size() == 0) { res_view[row_offset] = true; return; } } if (tmp_elements_index.size() == 0) { res_view[row_offset] = true; return; } } res_view[row_offset] = tmp_elements_index.size() == 0; }; if (!index->CanSkipShared(pointer)) { index->ExecuteForSharedData(op_ctx_, pointer, shared_executor); } cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( *cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { return ExecJsonContainsAllArrayByStats(); } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); std::vector elements; for (auto const& element : expr_->vals_) { elements.emplace_back(GetValueFromProto(element)); } size_t processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::Json* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { auto executor = [&](const size_t i) { auto doc = data[i].doc(); auto array = doc.at_pointer(pointer).get_array(); if (array.error()) { return false; } std::unordered_set exist_elements_index; for (auto&& it : array) { auto val = it.get_array(); if (val.error()) { continue; } std::vector< simdjson::simdjson_result> json_array; json_array.reserve(val.count_elements()); for (auto&& e : val) { json_array.emplace_back(e); } for (int index = 0; index < elements.size(); ++index) { if (CompareTwoJsonArray(json_array, elements[index])) { exist_elements_index.insert(index); } } if (exist_elements_index.size() == elements.size()) { return true; } } return exist_elements_index.size() == elements.size(); }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, elements); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByStats() { auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); std::vector elements; for (auto const& element : expr_->vals_) { elements.emplace_back(GetValueFromProto(element)); } if (elements.empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0 && segment_->type() == SegmentType::Sealed) { auto* segment = dynamic_cast(segment_); auto field_id = expr_->column_.field_id_; pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); cached_index_chunk_res_ = std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); TargetBitmapView valid_res_view(*cached_index_chunk_valid_res_); // process shredding data for ARRAY type (non-shared) { auto target_field = index->GetShreddingField( pointer, milvus::index::JSONType::ARRAY); if (!target_field.empty()) { ShreddingArrayBsonContainsAllArrayExecutor executor(elements); index->ExecutorForShreddingData( op_ctx_, target_field, executor, nullptr, res_view, valid_res_view); } } auto shared_executor = [&elements, &res_view](milvus::BsonView bson, uint32_t row_offset, uint32_t value_offset) { auto array = bson.ParseAsArrayAtOffset(value_offset); if (!array.has_value()) { res_view[row_offset] = false; return; } std::set exist_elements_index; for (const auto& sub_value : array.value()) { auto sub_array = milvus::BsonView::GetValueFromBsonView< bsoncxx::array::view>(sub_value.get_value()); if (!sub_array.has_value()) continue; for (int index = 0; index < elements.size(); ++index) { if (CompareTwoJsonArray(sub_array.value(), elements[index])) { exist_elements_index.insert(index); } } if (exist_elements_index.size() == elements.size()) { res_view[row_offset] = true; return; } } res_view[row_offset] = exist_elements_index.size() == elements.size(); }; if (!index->CanSkipShared(pointer)) { index->ExecuteForSharedData(op_ctx_, pointer, shared_executor); } cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( *cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) { auto* input = context.get_offset_input(); const auto& bitmap_input = context.get_bitmap_input(); FieldId field_id = expr_->column_.field_id_; if (!has_offset_input_ && CanUseJsonStats(context, field_id)) { return ExecJsonContainsWithDiffTypeByStats(); } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); TargetBitmapView res(res_vec->GetRawData(), real_batch_size); TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); auto elements = expr_->vals_; std::unordered_set elements_index; int i = 0; for (auto& element : elements) { elements_index.insert(i); i++; } size_t processed_cursor = 0; auto execute_sub_batch = [&processed_cursor, & bitmap_input ]( const milvus::Json* data, const bool* valid_data, const int32_t* offsets, const int size, TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { auto executor = [&](const size_t i) { auto& json = data[i]; auto doc = json.doc(); auto array = doc.at_pointer(pointer).get_array(); if (array.error()) { return false; } // Note: array can only be iterated once for (auto&& it : array) { for (auto const& element : elements) { switch (element.val_case()) { case proto::plan::GenericValue::kBoolVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.bool_val()) { return true; } break; } case proto::plan::GenericValue::kInt64Val: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.int64_val()) { return true; } break; } case proto::plan::GenericValue::kFloatVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.float_val()) { return true; } break; } case proto::plan::GenericValue::kStringVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.string_val()) { return true; } break; } case proto::plan::GenericValue::kArrayVal: { auto val = it.get_array(); if (val.error()) { continue; } if (CompareTwoJsonArray(val, element.array_val())) { return true; } break; } default: ThrowInfo(DataTypeInvalid, fmt::format("unsupported data type {}", element.val_case())); } } } return false; }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, elements); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByStats() { auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); auto elements = expr_->vals_; if (elements.empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0 && segment_->type() == SegmentType::Sealed) { auto* segment = dynamic_cast(segment_); auto field_id = expr_->column_.field_id_; pinned_json_stats_ = segment->GetJsonStats(op_ctx_, field_id); auto* index = pinned_json_stats_.get(); Assert(index != nullptr); cached_index_chunk_res_ = std::make_shared(active_count_); cached_index_chunk_valid_res_ = std::make_shared(active_count_, true); TargetBitmapView res_view(*cached_index_chunk_res_); TargetBitmapView valid_res_view(*cached_index_chunk_valid_res_); // process shredding data for ARRAY type (non-shared) { auto target_field = index->GetShreddingField( pointer, milvus::index::JSONType::ARRAY); if (!target_field.empty()) { ShreddingArrayBsonContainsAnyWithDiffTypeExecutor executor( elements); index->ExecutorForShreddingData( op_ctx_, target_field, executor, nullptr, res_view, valid_res_view); } } auto shared_executor = [&elements, &res_view](milvus::BsonView bson, uint32_t row_offset, uint32_t value_offset) { auto array = bson.ParseAsArrayAtOffset(value_offset); if (!array.has_value()) { res_view[row_offset] = false; return; } for (const auto& sub_value : array.value()) { for (auto const& element : elements) { switch (element.val_case()) { case proto::plan::GenericValue::kBoolVal: { auto val = milvus::BsonView::GetValueFromBsonView( sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.bool_val()) { res_view[row_offset] = true; return; } break; } case proto::plan::GenericValue::kInt64Val: { auto val = milvus::BsonView::GetValueFromBsonView( sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.int64_val()) { res_view[row_offset] = true; return; } break; } case proto::plan::GenericValue::kFloatVal: { auto val = milvus::BsonView::GetValueFromBsonView( sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.float_val()) { res_view[row_offset] = true; return; } break; } case proto::plan::GenericValue::kStringVal: { auto val = milvus::BsonView::GetValueFromBsonView< std::string>(sub_value.get_value()); if (!val.has_value()) { continue; } if (val.value() == element.string_val()) { res_view[row_offset] = true; return; } break; } case proto::plan::GenericValue::kArrayVal: { auto val = milvus::BsonView::GetValueFromBsonView< bsoncxx::array::view>(sub_value.get_value()); if (!val.has_value()) { continue; } if (CompareTwoJsonArray(val.value(), element.array_val())) { res_view[row_offset] = true; return; } break; } default: ThrowInfo(DataTypeInvalid, fmt::format("unsupported data type {}", element.val_case())); } } } }; if (!index->CanSkipShared(pointer)) { index->ExecuteForSharedData(op_ctx_, pointer, shared_executor); } cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( *cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } VectorPtr PhyJsonContainsFilterExpr::EvalArrayContainsForIndexSegment( DataType data_type) { switch (data_type) { case DataType::BOOL: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT8: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT16: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT32: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT64: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::FLOAT: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::DOUBLE: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::VARCHAR: case DataType::STRING: { return ExecArrayContainsForIndexSegmentImpl(); } default: ThrowInfo(DataTypeInvalid, fmt::format("unsupported data type for " "ExecArrayContainsForIndexSegmentImpl: {}", expr_->column_.element_type_)); } } template VectorPtr PhyJsonContainsFilterExpr::ExecArrayContainsForIndexSegmentImpl() { typedef std::conditional_t, std::string, ExprValueType> GetType; using Index = index::ScalarIndex; auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } std::unordered_set elements; for (auto const& element : expr_->vals_) { elements.insert(GetValueWithCastNumber(element)); } boost::container::vector elems(elements.begin(), elements.end()); auto execute_sub_batch = [this](Index* index_ptr, const boost::container::vector& vals) { switch (expr_->op_) { case proto::plan::JSONContainsExpr_JSONOp_Contains: case proto::plan::JSONContainsExpr_JSONOp_ContainsAny: { return index_ptr->In(vals.size(), vals.data()); } case proto::plan::JSONContainsExpr_JSONOp_ContainsAll: { TargetBitmap result(index_ptr->Count()); result.set(); for (size_t i = 0; i < vals.size(); i++) { auto sub = index_ptr->In(1, &vals[i]); result &= sub; } return result; } default: ThrowInfo( ExprInvalid, "unsupported array contains type {}", proto::plan::JSONContainsExpr_JSONOp_Name(expr_->op_)); } }; auto res = ProcessIndexChunks(execute_sub_batch, elems); AssertInfo(res->size() == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", res->size(), real_batch_size); return res; } } //namespace exec } // namespace milvus