// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE(review): throughout this file several constructs appear without their
// angle-bracketed parts (a bare `#include`, bare `template` keywords,
// `std::make_shared(`, `dynamic_cast(`, `it.template get()`, ...).
// Presumably the `<...>` text was lost before this copy was made; confirm
// against the upstream file before building. The code tokens below are kept
// exactly as found.

#include "JsonContainsExpr.h"
#include #include "common/Types.h"

namespace milvus {
namespace exec {

// Entry point of the filter expression.
//
// Records whether `context` carries an offset input, then dispatches on the
// data type of the target column:
//   * ARRAY: uses EvalArrayContainsForIndexSegment() when is_index_mode_ is
//     set and there is no offset input, otherwise the data-segment path.
//   * JSON: panics when is_index_mode_ is set and there is no offset input,
//     otherwise takes the data-segment path.
//   * anything else: panics with DataTypeInvalid.
// The outcome is written to `result`.
void
PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
    auto input = context.get_offset_input();
    SetHasOffsetInput((input != nullptr));
    switch (expr_->column_.data_type_) {
        case DataType::ARRAY: {
            if (is_index_mode_ && !has_offset_input_) {
                result = EvalArrayContainsForIndexSegment();
            } else {
                result = EvalJsonContainsForDataSegment(context);
            }
            break;
        }
        case DataType::JSON: {
            if (is_index_mode_ && !context.get_offset_input()) {
                PanicInfo(ExprInvalid,
                          "exists expr for json or array index mode not "
                          "supported");
            }
            result = EvalJsonContainsForDataSegment(context);
            break;
        }
        default:
            PanicInfo(DataTypeInvalid,
                      "unsupported data type: {}",
                      expr_->column_.data_type_);
    }
}

// Selects the concrete executor for the data-segment path.
//
// The outer switch is on the operator (Contains / ContainsAny share one
// branch, ContainsAll has its own). Inside each branch:
//   * array columns dispatch on the value case of the first query value;
//   * JSON columns dispatch on the first query value when expr_->same_type_
//     is set, and otherwise go to the "WithDiffType" executor.
// No case ends with `break`; every path either returns or calls PanicInfo.
//
// NOTE(review): expr_->vals_[0] is read without an emptiness check in this
// function; presumably an earlier stage guarantees at least one value —
// confirm.
VectorPtr
PhyJsonContainsFilterExpr::EvalJsonContainsForDataSegment(EvalCtx& context) {
    auto data_type = expr_->column_.data_type_;
    switch (expr_->op_) {
        case proto::plan::JSONContainsExpr_JSONOp_Contains:
        case proto::plan::JSONContainsExpr_JSONOp_ContainsAny: {
            if (IsArrayDataType(data_type)) {
                auto val_type = expr_->vals_[0].val_case();
                switch (val_type) {
                    case proto::plan::GenericValue::kBoolVal: {
                        return ExecArrayContains(context);
                    }
                    case proto::plan::GenericValue::kInt64Val: {
                        return ExecArrayContains(context);
                    }
                    case proto::plan::GenericValue::kFloatVal: {
                        return ExecArrayContains(context);
                    }
                    case proto::plan::GenericValue::kStringVal: {
                        return ExecArrayContains(context);
                    }
                    default:
                        PanicInfo(
                            DataTypeInvalid,
                            fmt::format("unsupported data type {}", val_type));
                }
            } else {
                if (expr_->same_type_) {
                    auto val_type = expr_->vals_[0].val_case();
                    switch (val_type) {
                        case proto::plan::GenericValue::kBoolVal: {
                            return ExecJsonContains(context);
                        }
                        case proto::plan::GenericValue::kInt64Val: {
                            return ExecJsonContains(context);
                        }
                        case proto::plan::GenericValue::kFloatVal: {
                            return ExecJsonContains(context);
                        }
                        case proto::plan::GenericValue::kStringVal: {
                            return ExecJsonContains(context);
                        }
                        case proto::plan::GenericValue::kArrayVal: {
                            return ExecJsonContainsArray(context);
                        }
                        default:
                            PanicInfo(DataTypeInvalid,
                                      "unsupported data type:{}",
                                      val_type);
                    }
                } else {
                    return ExecJsonContainsWithDiffType(context);
                }
            }
        }
        case proto::plan::JSONContainsExpr_JSONOp_ContainsAll: {
            if (IsArrayDataType(data_type)) {
                auto val_type = expr_->vals_[0].val_case();
                switch (val_type) {
                    case proto::plan::GenericValue::kBoolVal: {
                        return ExecArrayContainsAll(context);
                    }
                    case proto::plan::GenericValue::kInt64Val: {
                        return ExecArrayContainsAll(context);
                    }
                    case proto::plan::GenericValue::kFloatVal: {
                        return ExecArrayContainsAll(context);
                    }
                    case proto::plan::GenericValue::kStringVal: {
                        return ExecArrayContainsAll(context);
                    }
                    default:
                        PanicInfo(
                            DataTypeInvalid,
                            fmt::format("unsupported data type {}", val_type));
                }
            } else {
                if (expr_->same_type_) {
                    auto val_type = expr_->vals_[0].val_case();
                    switch (val_type) {
                        case proto::plan::GenericValue::kBoolVal: {
                            return ExecJsonContainsAll(context);
                        }
                        case proto::plan::GenericValue::kInt64Val: {
                            return ExecJsonContainsAll(context);
                        }
                        case proto::plan::GenericValue::kFloatVal: {
                            return ExecJsonContainsAll(context);
                        }
                        case proto::plan::GenericValue::kStringVal: {
                            return ExecJsonContainsAll(context);
                        }
                        case proto::plan::GenericValue::kArrayVal: {
                            return ExecJsonContainsAllArray(context);
                        }
                        default:
                            PanicInfo(DataTypeInvalid,
                                      "unsupported data type:{}",
                                      val_type);
                    }
                } else {
                    return ExecJsonContainsAllWithDiffType(context);
                }
            }
        }
        default:
            PanicInfo(ExprInvalid,
                      "unsupported json contains type {}",
                      proto::plan::JSONContainsExpr_JSONOp_Name(expr_->op_));
    }
}

// Contains / ContainsAny on an array column.
//
// Returns nullptr when the batch is empty. Otherwise returns a vector holding
// a result bitmap (initialised to false) and a validity bitmap (initialised
// to true), both of real_batch_size bits. A row is set to true as soon as one
// of its array elements is found in arg_set_, which is built from
// expr_->vals_ on first use and reused afterwards (guarded by arg_inited_).
// The column must have an empty nested path (asserted).
template
VectorPtr
PhyJsonContainsFilterExpr::ExecArrayContains(EvalCtx& context) {
    using GetType = std::conditional_t, std::string_view, ExprValueType>;
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    // With an offset input the batch is exactly the offsets supplied.
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }
    AssertInfo(expr_->column_.nested_path_.size() == 0,
               "[ExecArrayContains]nested path must be null");
    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    if (!arg_inited_) {
        arg_set_ = std::make_shared>(expr_->vals_);
        arg_inited_ = true;
    }

    // Number of rows handled by earlier invocations of execute_sub_batch;
    // used to index bitmap_input, which is addressed across the whole batch.
    int processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::ArrayView* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::shared_ptr& elements) {
            // True when any element of row i is a member of `elements`.
            auto executor = [&](size_t i) {
                const auto& array = data[i];
                for (int j = 0; j < array.length(); ++j) {
                    if (elements->In(array.template get_data(j))) {
                        return true;
                    }
                }
                return false;
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (int i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                // Invalid rows: result false and marked invalid.
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                // Rows cleared in bitmap_input are skipped; res[i] keeps its
                // initial value.
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              arg_set_);
    } else {
        processed_size = ProcessDataChunks(
            execute_sub_batch, std::nullptr_t{}, res, valid_res, arg_set_);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// Contains / ContainsAny on a JSON column when all query values share a type.
//
// Delegates to ExecJsonContainsByKeyIndex() when CanUseJsonKeyIndex(field_id)
// holds and there is no offset input. Otherwise scans rows: the value at the
// JSON pointer built from the nested path must be an array; a row matches as
// soon as one array element converts without error and is in arg_set_.
// Elements whose conversion reports an error are ignored. Returns nullptr for
// an empty batch.
template
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
    using GetType = std::conditional_t, std::string_view, ExprValueType>;
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsByKeyIndex();
    }

    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    if (!arg_inited_) {
        arg_set_ = std::make_shared>(expr_->vals_);
        arg_inited_ = true;
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    size_t processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::Json* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::string& pointer,
            const std::shared_ptr& elements) {
            auto executor = [&](size_t i) {
                auto doc = data[i].doc();
                auto array = doc.at_pointer(pointer).get_array();
                // Path missing or not an array: no match.
                if (array.error()) {
                    return false;
                }
                for (auto&& it : array) {
                    auto val = it.template get();
                    if (val.error()) {
                        continue;
                    }
                    if (elements->In(val.value()) > 0) {
                        return true;
                    }
                }
                return false;
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (size_t i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              pointer,
                                              arg_set_);
    } else {
        processed_size = ProcessDataChunks(execute_sub_batch,
                                           std::nullptr_t{},
                                           res,
                                           valid_res,
                                           pointer,
                                           arg_set_);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// JSON-key-index variant of ExecJsonContains.
//
// When arg_set_ is empty, advances the cursor and returns an all-false
// result. Otherwise, if cached_index_chunk_id_ != 0, runs
// index->FilterByPath() over active_count_ rows with a per-entry callback,
// stores a clone of the outcome in cached_index_chunk_res_ and sets
// cached_index_chunk_id_ to 0 so later calls reuse the cached bitmap. Each
// call then copies real_batch_size bits starting at current_data_global_pos_
// and advances the cursor. The returned validity bitmap is all true.
//
// NOTE(review): real_batch_size is derived from current_data_chunk_pos_ while
// the slice below starts at current_data_global_pos_ — confirm these two
// positions are meant to differ.
template
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() {
    using GetType = std::conditional_t, std::string_view, ExprValueType>;
    auto real_batch_size =
        current_data_chunk_pos_ + batch_size_ > active_count_
            ? active_count_ - current_data_chunk_pos_
            : batch_size_;
    // NOTE(review): `elements` is not referenced again in this function.
    std::unordered_set elements;
    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    if (!arg_inited_) {
        arg_set_ = std::make_shared>(expr_->vals_);
        arg_inited_ = true;
    }

    if (arg_set_->Empty()) {
        MoveCursor();
        return std::make_shared(
            TargetBitmap(real_batch_size, false),
            TargetBitmap(real_batch_size, true));
    }

    if (cached_index_chunk_id_ != 0) {
        // NOTE(review): `segment` stays nullptr if the segment type is
        // neither Growing nor Sealed, and is dereferenced unconditionally
        // below — confirm no other type can reach this point.
        const segcore::SegmentInternalInterface* segment = nullptr;
        if (segment_->type() == SegmentType::Growing) {
            segment = dynamic_cast(segment_);
        } else if (segment_->type() == SegmentType::Sealed) {
            segment = dynamic_cast(segment_);
        }
        auto field_id = expr_->column_.field_id_;
        auto* index = segment->GetJsonKeyIndex(field_id);
        Assert(index != nullptr);
        // Per-entry callback: entries flagged `valid` are rejected; for the
        // rest the row's JSON is fetched and the array located by
        // (offset, size) is scanned for any member of arg_set_.
        auto filter_func = [this, segment, &field_id](bool valid,
                                                      uint8_t type,
                                                      uint32_t row_id,
                                                      uint16_t offset,
                                                      uint16_t size,
                                                      int32_t value) {
            if (valid) {
                return false;
            } else {
                auto json_pair = segment->GetJsonData(field_id, row_id);
                if (!json_pair.second) {
                    return false;
                }
                auto& json = json_pair.first;
                auto array = json.array_at(offset, size);
                if (array.error()) {
                    return false;
                }
                for (auto&& it : array) {
                    auto val = it.template get();
                    if (val.error()) {
                        continue;
                    }
                    if (this->arg_set_->In(val.value())) {
                        return true;
                    }
                }
                return false;
            }
        };
        bool is_growing = segment_->type() == SegmentType::Growing;
        bool is_strong_consistency = consistency_level_ == 0;
        cached_index_chunk_res_ = index
                                      ->FilterByPath(pointer,
                                                     active_count_,
                                                     is_growing,
                                                     is_strong_consistency,
                                                     filter_func)
                                      .clone();
        cached_index_chunk_id_ = 0;
    }

    TargetBitmap result;
    result.append(
        cached_index_chunk_res_, current_data_global_pos_, real_batch_size);
    MoveCursor();
    return std::make_shared(std::move(result),
                            TargetBitmap(real_batch_size, true));
}

// Contains / ContainsAny on a JSON column when the query values are arrays.
//
// Delegates to ExecJsonContainsArrayByKeyIndex() when the key index can be
// used and there is no offset input. Otherwise, for each row, every element
// of the array at the JSON pointer that is itself an array is materialised
// into a vector and compared against each query array with
// CompareTwoJsonArray(); the first match makes the row true. Returns nullptr
// for an empty batch.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsArrayByKeyIndex();
    }

    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    std::vector elements;
    for (auto const& element : expr_->vals_) {
        elements.emplace_back(GetValueFromProto(element));
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    size_t processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::Json* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::string& pointer,
            const std::vector& elements) {
            auto executor = [&](size_t i) -> bool {
                auto doc = data[i].doc();
                auto array = doc.at_pointer(pointer).get_array();
                if (array.error()) {
                    return false;
                }
                for (auto&& it : array) {
                    auto val = it.get_array();
                    if (val.error()) {
                        continue;
                    }
                    // Copy the nested array's elements out before comparing
                    // against each query array.
                    std::vector<simdjson::simdjson_result> json_array;
                    json_array.reserve(val.count_elements());
                    for (auto&& e : val) {
                        json_array.emplace_back(e);
                    }
                    for (auto const& element : elements) {
                        if (CompareTwoJsonArray(json_array, element)) {
                            return true;
                        }
                    }
                }
                return false;
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (size_t i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              pointer,
                                              elements);
    } else {
        processed_size = ProcessDataChunks(execute_sub_batch,
                                           std::nullptr_t{},
                                           res,
                                           valid_res,
                                           pointer,
                                           elements);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// JSON-key-index variant of ExecJsonContainsArray.
//
// With no query values it advances the cursor and returns an all-false
// result. Otherwise the index filter is evaluated once (while
// cached_index_chunk_id_ != 0), cached in cached_index_chunk_res_, and each
// call copies real_batch_size bits starting at current_data_global_pos_.
//
// NOTE(review): real_batch_size is derived from current_data_chunk_pos_ while
// the slice starts at current_data_global_pos_ — confirm.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() {
    auto real_batch_size =
        current_data_chunk_pos_ + batch_size_ > active_count_
            ? active_count_ - current_data_chunk_pos_
            : batch_size_;
    std::vector elements;
    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    for (auto const& element : expr_->vals_) {
        elements.emplace_back(GetValueFromProto(element));
    }
    if (elements.empty()) {
        MoveCursor();
        return std::make_shared(
            TargetBitmap(real_batch_size, false),
            TargetBitmap(real_batch_size, true));
    }

    if (cached_index_chunk_id_ != 0) {
        // NOTE(review): `segment` stays nullptr for any segment type other
        // than Growing/Sealed and is dereferenced below — confirm.
        const segcore::SegmentInternalInterface* segment = nullptr;
        if (segment_->type() == SegmentType::Growing) {
            segment = dynamic_cast(segment_);
        } else if (segment_->type() == SegmentType::Sealed) {
            segment = dynamic_cast(segment_);
        }
        auto field_id = expr_->column_.field_id_;
        auto* index = segment->GetJsonKeyIndex(field_id);
        Assert(index != nullptr);
        // Per-entry callback: entries flagged `valid` are rejected; otherwise
        // the row matches when any nested array equals any query array.
        auto filter_func = [segment, &elements, &field_id](bool valid,
                                                           uint8_t type,
                                                           uint32_t row_id,
                                                           uint16_t offset,
                                                           uint16_t size,
                                                           int32_t value) {
            if (valid) {
                return false;
            } else {
                auto json_pair = segment->GetJsonData(field_id, row_id);
                if (!json_pair.second) {
                    return false;
                }
                auto& json = json_pair.first;
                auto array = json.array_at(offset, size);
                if (array.error()) {
                    return false;
                }
                for (auto&& it : array) {
                    auto val = it.get_array();
                    if (val.error()) {
                        continue;
                    }
                    for (auto const& element : elements) {
                        if (CompareTwoJsonArray(val, element)) {
                            return true;
                        }
                    }
                }
                return false;
            }
        };
        bool is_growing = segment_->type() == SegmentType::Growing;
        bool is_strong_consistency = consistency_level_ == 0;
        cached_index_chunk_res_ = index
                                      ->FilterByPath(pointer,
                                                     active_count_,
                                                     is_growing,
                                                     is_strong_consistency,
                                                     filter_func)
                                      .clone();
        cached_index_chunk_id_ = 0;
    }

    TargetBitmap result;
    result.append(
        cached_index_chunk_res_, current_data_global_pos_, real_batch_size);
    MoveCursor();
    return std::make_shared(std::move(result),
                            TargetBitmap(real_batch_size, true));
}

// ContainsAll on an array column.
//
// A row is true when every distinct query value occurs among its array
// elements. The query values are collected into a std::set; for each row a
// copy of that set is made and elements are erased as they are seen, so the
// row matches once the copy is empty. The column must have an empty nested
// path (asserted). Returns nullptr for an empty batch.
template
VectorPtr
PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
    using GetType = std::conditional_t, std::string_view, ExprValueType>;
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    AssertInfo(expr_->column_.nested_path_.size() == 0,
               "[ExecArrayContainsAll]nested path must be null");
    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    std::set elements;
    for (auto const& element : expr_->vals_) {
        elements.insert(GetValueFromProto(element));
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    int processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::ArrayView* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::set& elements) {
            auto executor = [&](size_t i) {
                // Per-row working copy: values still to be found.
                std::set tmp_elements(elements);
                // Note: array can only be iterated once
                for (int j = 0; j < data[i].length(); ++j) {
                    tmp_elements.erase(data[i].template get_data(j));
                    if (tmp_elements.size() == 0) {
                        return true;
                    }
                }
                return tmp_elements.size() == 0;
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (int i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              elements);
    } else {
        processed_size = ProcessDataChunks(
            execute_sub_batch, std::nullptr_t{}, res, valid_res, elements);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// ContainsAll on a JSON column when all query values share a type.
//
// Delegates to ExecJsonContainsAllByKeyIndex() when the key index can be used
// and there is no offset input. Otherwise a row is true when the value at the
// JSON pointer is an array and every distinct query value is erased from a
// per-row copy of the query set while walking that array. Elements whose
// conversion reports an error are ignored. Returns nullptr for an empty
// batch.
template
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
    using GetType = std::conditional_t, std::string_view, ExprValueType>;
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsAllByKeyIndex();
    }

    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    std::set elements;
    for (auto const& element : expr_->vals_) {
        elements.insert(GetValueFromProto(element));
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    int processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::Json* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::string& pointer,
            const std::set& elements) {
            auto executor = [&](const size_t i) -> bool {
                auto doc = data[i].doc();
                auto array = doc.at_pointer(pointer).get_array();
                if (array.error()) {
                    return false;
                }
                // Per-row working copy: values still to be found.
                std::set tmp_elements(elements);
                // Note: array can only be iterated once
                for (auto&& it : array) {
                    auto val = it.template get();
                    if (val.error()) {
                        continue;
                    }
                    tmp_elements.erase(val.value());
                    if (tmp_elements.size() == 0) {
                        return true;
                    }
                }
                return tmp_elements.size() == 0;
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (size_t i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              pointer,
                                              elements);
    } else {
        processed_size = ProcessDataChunks(execute_sub_batch,
                                           std::nullptr_t{},
                                           res,
                                           valid_res,
                                           pointer,
                                           elements);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// JSON-key-index variant of ExecJsonContainsAll.
//
// With no query values it advances the cursor and returns an all-false
// result. Otherwise the index filter is evaluated once (while
// cached_index_chunk_id_ != 0), cached in cached_index_chunk_res_, and each
// call copies real_batch_size bits starting at current_data_global_pos_.
//
// NOTE(review): real_batch_size is derived from current_data_chunk_pos_ while
// the slice starts at current_data_global_pos_ — confirm.
template
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() {
    using GetType = std::conditional_t, std::string_view, ExprValueType>;
    auto real_batch_size =
        current_data_chunk_pos_ + batch_size_ > active_count_
            ? active_count_ - current_data_chunk_pos_
            : batch_size_;
    std::set elements;
    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    for (auto const& element : expr_->vals_) {
        elements.insert(GetValueFromProto(element));
    }
    if (elements.empty()) {
        MoveCursor();
        return std::make_shared(
            TargetBitmap(real_batch_size, false),
            TargetBitmap(real_batch_size, true));
    }

    if (cached_index_chunk_id_ != 0) {
        // NOTE(review): `segment` stays nullptr for any segment type other
        // than Growing/Sealed and is dereferenced below — confirm.
        const segcore::SegmentInternalInterface* segment = nullptr;
        if (segment_->type() == SegmentType::Growing) {
            segment = dynamic_cast(segment_);
        } else if (segment_->type() == SegmentType::Sealed) {
            segment = dynamic_cast(segment_);
        }
        auto field_id = expr_->column_.field_id_;
        auto* index = segment->GetJsonKeyIndex(field_id);
        Assert(index != nullptr);
        // Per-entry callback: entries flagged `valid` are rejected; otherwise
        // the row matches when every query value is seen in the array.
        auto filter_func = [segment, &elements, &field_id](bool valid,
                                                           uint8_t type,
                                                           uint32_t row_id,
                                                           uint16_t offset,
                                                           uint16_t size,
                                                           int32_t value) {
            if (valid) {
                return false;
            } else {
                auto json_pair = segment->GetJsonData(field_id, row_id);
                if (!json_pair.second) {
                    return false;
                }
                auto& json = json_pair.first;
                auto array = json.array_at(offset, size);
                if (array.error()) {
                    return false;
                }
                std::set tmp_elements(elements);
                for (auto&& it : array) {
                    auto val = it.template get();
                    if (val.error()) {
                        continue;
                    }
                    tmp_elements.erase(val.value());
                    if (tmp_elements.size() == 0) {
                        return true;
                    }
                }
                return tmp_elements.empty();
            }
        };
        bool is_growing = segment_->type() == SegmentType::Growing;
        bool is_strong_consistency = consistency_level_ == 0;
        cached_index_chunk_res_ = index
                                      ->FilterByPath(pointer,
                                                     active_count_,
                                                     is_growing,
                                                     is_strong_consistency,
                                                     filter_func)
                                      .clone();
        cached_index_chunk_id_ = 0;
    }

    TargetBitmap result;
    result.append(
        cached_index_chunk_res_, current_data_global_pos_, real_batch_size);
    MoveCursor();
    return std::make_shared(std::move(result),
                            TargetBitmap(real_batch_size, true));
}

// ContainsAll on a JSON column when the query values have different types.
//
// Delegates to ExecJsonContainsAllWithDiffTypeByKeyIndex() when the key index
// can be used and there is no offset input. Otherwise each query value is
// identified by its position in expr_->vals_; for every row a copy of the
// position set is made and a position is erased when some array element
// equals that query value under the comparison selected by the value's case.
// The row is true once the copy is empty. Returns nullptr for an empty batch.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsAllWithDiffTypeByKeyIndex();
    }

    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    auto elements = expr_->vals_;
    // Positions 0..elements.size()-1 of the query values.
    std::unordered_set elements_index;
    int i = 0;
    for (auto& element : elements) {
        elements_index.insert(i);
        i++;
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    int processed_cursor = 0;
    // NOTE(review): `elements_index` is taken by value here (no `&`), unlike
    // the other parameters — confirm whether the copy is intended.
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::Json* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::string& pointer,
            const std::vector& elements,
            const std::unordered_set elements_index) {
            auto executor = [&](size_t i) -> bool {
                const auto& json = data[i];
                auto doc = json.doc();
                auto array = doc.at_pointer(pointer).get_array();
                if (array.error()) {
                    return false;
                }
                // Per-row working copy: positions still to be matched.
                std::unordered_set tmp_elements_index(elements_index);
                for (auto&& it : array) {
                    // Position of the current query value; this `i` shadows
                    // the executor's row parameter of the same name.
                    int i = -1;
                    for (auto& element : elements) {
                        i++;
                        // `continue` inside the switch advances to the next
                        // query value, skipping the emptiness check that
                        // follows the switch.
                        switch (element.val_case()) {
                            case proto::plan::GenericValue::kBoolVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.bool_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kInt64Val: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.int64_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kFloatVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.float_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kStringVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.string_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kArrayVal: {
                                auto val = it.get_array();
                                if (val.error()) {
                                    continue;
                                }
                                if (CompareTwoJsonArray(val,
                                                        element.array_val())) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            default:
                                PanicInfo(
                                    DataTypeInvalid,
                                    fmt::format("unsupported data type {}",
                                                element.val_case()));
                        }
                        if (tmp_elements_index.size() == 0) {
                            return true;
                        }
                    }
                    if (tmp_elements_index.size() == 0) {
                        return true;
                    }
                }
                return tmp_elements_index.size() == 0;
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (size_t i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              pointer,
                                              elements,
                                              elements_index);
    } else {
        processed_size = ProcessDataChunks(execute_sub_batch,
                                           std::nullptr_t{},
                                           res,
                                           valid_res,
                                           pointer,
                                           elements,
                                           elements_index);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// JSON-key-index variant of ExecJsonContainsAllWithDiffType.
//
// With no query values it advances the cursor and returns an all-false
// result. Otherwise the index filter is evaluated once (while
// cached_index_chunk_id_ != 0), cached in cached_index_chunk_res_, and each
// call copies real_batch_size bits starting at current_data_global_pos_.
//
// NOTE(review): real_batch_size is derived from current_data_chunk_pos_ while
// the slice starts at current_data_global_pos_ — confirm.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() {
    auto real_batch_size =
        current_data_chunk_pos_ + batch_size_ > active_count_
            ? active_count_ - current_data_chunk_pos_
            : batch_size_;
    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    auto elements = expr_->vals_;
    // Positions 0..elements.size()-1 of the query values.
    std::set elements_index;
    int i = 0;
    for (auto& element : elements) {
        elements_index.insert(i);
        i++;
    }
    if (elements.empty()) {
        MoveCursor();
        return std::make_shared(
            TargetBitmap(real_batch_size, false),
            TargetBitmap(real_batch_size, true));
    }

    if (cached_index_chunk_id_ != 0) {
        // NOTE(review): `segment` stays nullptr for any segment type other
        // than Growing/Sealed and is dereferenced below — confirm.
        const segcore::SegmentInternalInterface* segment = nullptr;
        if (segment_->type() == SegmentType::Growing) {
            segment = dynamic_cast(segment_);
        } else if (segment_->type() == SegmentType::Sealed) {
            segment = dynamic_cast(segment_);
        }
        auto field_id = expr_->column_.field_id_;
        auto* index = segment->GetJsonKeyIndex(field_id);
        Assert(index != nullptr);
        // Per-entry callback: entries flagged `valid` are rejected; otherwise
        // the row matches when every query value position has been erased
        // from the per-row copy.
        auto filter_func = [segment, &elements, &elements_index, &field_id](
                               bool valid,
                               uint8_t type,
                               uint32_t row_id,
                               uint16_t offset,
                               uint16_t size,
                               int32_t value) {
            if (valid) {
                return false;
            } else {
                auto json_pair = segment->GetJsonData(field_id, row_id);
                if (!json_pair.second) {
                    return false;
                }
                auto& json = json_pair.first;
                std::set tmp_elements_index(elements_index);
                auto array = json.array_at(offset, size);
                if (array.error()) {
                    return false;
                }
                for (auto&& it : array) {
                    // Position of the current query value; shadows the
                    // function-level `i` declared above.
                    int i = -1;
                    for (auto& element : elements) {
                        i++;
                        // `continue` inside the switch advances to the next
                        // query value, skipping the emptiness check that
                        // follows the switch.
                        switch (element.val_case()) {
                            case proto::plan::GenericValue::kBoolVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.bool_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kInt64Val: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.int64_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kFloatVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.float_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kStringVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.string_val()) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kArrayVal: {
                                auto val = it.get_array();
                                if (val.error()) {
                                    continue;
                                }
                                if (CompareTwoJsonArray(val,
                                                        element.array_val())) {
                                    tmp_elements_index.erase(i);
                                }
                                break;
                            }
                            default:
                                PanicInfo(
                                    DataTypeInvalid,
                                    fmt::format("unsupported data type {}",
                                                element.val_case()));
                        }
                        if (tmp_elements_index.size() == 0) {
                            return true;
                        }
                    }
                    if (tmp_elements_index.size() == 0) {
                        return true;
                    }
                }
                return tmp_elements_index.size() == 0;
            }
        };
        bool is_growing = segment_->type() == SegmentType::Growing;
        bool is_strong_consistency = consistency_level_ == 0;
        cached_index_chunk_res_ = index
                                      ->FilterByPath(pointer,
                                                     active_count_,
                                                     is_growing,
                                                     is_strong_consistency,
                                                     filter_func)
                                      .clone();
        cached_index_chunk_id_ = 0;
    }

    TargetBitmap result;
    result.append(
        cached_index_chunk_res_, current_data_global_pos_, real_batch_size);
    MoveCursor();
    return std::make_shared(std::move(result),
                            TargetBitmap(real_batch_size, true));
}

// ContainsAll on a JSON column when the query values are arrays.
//
// Delegates to ExecJsonContainsAllArrayByKeyIndex() when the key index can be
// used and there is no offset input. Otherwise, for each row, every nested
// array found at the JSON pointer is compared against each query array; the
// positions of the query arrays that matched are collected, and the row is
// true once that collection has as many entries as there are query arrays.
// Returns nullptr for an empty batch.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsAllArrayByKeyIndex();
    }

    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    std::vector elements;
    for (auto const& element : expr_->vals_) {
        elements.emplace_back(GetValueFromProto(element));
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    size_t processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::Json* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::string& pointer,
            const std::vector& elements) {
            auto executor = [&](const size_t i) {
                auto doc = data[i].doc();
                auto array = doc.at_pointer(pointer).get_array();
                if (array.error()) {
                    return false;
                }
                // Positions of query arrays matched so far for this row.
                std::unordered_set exist_elements_index;
                for (auto&& it : array) {
                    auto val = it.get_array();
                    if (val.error()) {
                        continue;
                    }
                    // Copy the nested array's elements out before comparing
                    // against each query array.
                    std::vector<simdjson::simdjson_result> json_array;
                    json_array.reserve(val.count_elements());
                    for (auto&& e : val) {
                        json_array.emplace_back(e);
                    }
                    for (int index = 0; index < elements.size(); ++index) {
                        if (CompareTwoJsonArray(json_array, elements[index])) {
                            exist_elements_index.insert(index);
                        }
                    }
                    if (exist_elements_index.size() == elements.size()) {
                        return true;
                    }
                }
                return exist_elements_index.size() == elements.size();
            };
            bool has_bitmap_input = !bitmap_input.empty();
            for (size_t i = 0; i < size; ++i) {
                auto offset = i;
                if constexpr (filter_type == FilterType::random) {
                    offset = (offsets) ? offsets[i] : i;
                }
                if (valid_data != nullptr && !valid_data[offset]) {
                    res[i] = valid_res[i] = false;
                    continue;
                }
                if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
                    continue;
                }
                res[i] = executor(offset);
            }
            processed_cursor += size;
        };

    int64_t processed_size;
    if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
                                              std::nullptr_t{},
                                              input,
                                              res,
                                              valid_res,
                                              pointer,
                                              elements);
    } else {
        processed_size = ProcessDataChunks(execute_sub_batch,
                                           std::nullptr_t{},
                                           res,
                                           valid_res,
                                           pointer,
                                           elements);
    }
    AssertInfo(processed_size == real_batch_size,
               "internal error: expr processed rows {} not equal "
               "expect batch size {}",
               processed_size,
               real_batch_size);
    return res_vec;
}

// JSON-key-index variant of ExecJsonContainsAllArray.
//
// With no query values it advances the cursor and returns an all-false
// result. Otherwise the index filter is evaluated once (while
// cached_index_chunk_id_ != 0), cached in cached_index_chunk_res_, and each
// call copies real_batch_size bits starting at current_data_global_pos_.
//
// NOTE(review): real_batch_size is derived from current_data_chunk_pos_ while
// the slice starts at current_data_global_pos_ — confirm.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() {
    auto real_batch_size =
        current_data_chunk_pos_ + batch_size_ > active_count_
            ? active_count_ - current_data_chunk_pos_
            : batch_size_;
    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    std::vector elements;
    for (auto const& element : expr_->vals_) {
        elements.emplace_back(GetValueFromProto(element));
    }
    if (elements.empty()) {
        MoveCursor();
        return std::make_shared(
            TargetBitmap(real_batch_size, false),
            TargetBitmap(real_batch_size, true));
    }

    if (cached_index_chunk_id_ != 0) {
        // NOTE(review): `segment` stays nullptr for any segment type other
        // than Growing/Sealed and is dereferenced below — confirm.
        const segcore::SegmentInternalInterface* segment = nullptr;
        if (segment_->type() == SegmentType::Growing) {
            segment = dynamic_cast(segment_);
        } else if (segment_->type() == SegmentType::Sealed) {
            segment = dynamic_cast(segment_);
        }
        auto field_id = expr_->column_.field_id_;
        auto* index = segment->GetJsonKeyIndex(field_id);
        Assert(index != nullptr);
        // Per-entry callback: entries flagged `valid` are rejected; otherwise
        // the row matches when every query array equals some nested array.
        auto filter_func = [segment, &elements, &field_id](bool valid,
                                                           uint8_t type,
                                                           uint32_t row_id,
                                                           uint16_t offset,
                                                           uint16_t size,
                                                           int32_t value) {
            if (valid) {
                return false;
            } else {
                auto json_pair = segment->GetJsonData(field_id, row_id);
                if (!json_pair.second) {
                    return false;
                }
                auto& json = json_pair.first;
                auto array = json.array_at(offset, size);
                if (array.error()) {
                    return false;
                }
                // Positions of query arrays matched so far for this row.
                std::set exist_elements_index;
                for (auto&& it : array) {
                    auto json_array = it.get_array();
                    if (json_array.error()) {
                        continue;
                    }
                    for (int index = 0; index < elements.size(); ++index) {
                        if (CompareTwoJsonArray(json_array, elements[index])) {
                            exist_elements_index.insert(index);
                        }
                    }
                    if (exist_elements_index.size() == elements.size()) {
                        return true;
                    }
                }
                return exist_elements_index.size() == elements.size();
            }
        };
        bool is_growing = segment_->type() == SegmentType::Growing;
        bool is_strong_consistency = consistency_level_ == 0;
        cached_index_chunk_res_ = index
                                      ->FilterByPath(pointer,
                                                     active_count_,
                                                     is_growing,
                                                     is_strong_consistency,
                                                     filter_func)
                                      .clone();
        cached_index_chunk_id_ = 0;
    }

    TargetBitmap result;
    result.append(
        cached_index_chunk_res_, current_data_global_pos_, real_batch_size);
    MoveCursor();
    return std::make_shared(std::move(result),
                            TargetBitmap(real_batch_size, true));
}

// Contains / ContainsAny on a JSON column when the query values have
// different types. Delegates to ExecJsonContainsWithDiffTypeByKeyIndex() when
// the key index can be used and there is no offset input; otherwise a row
// matches as soon as one array element equals one query value under the
// comparison selected by that value's case.
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
    auto* input = context.get_offset_input();
    const auto& bitmap_input = context.get_bitmap_input();
    FieldId field_id = expr_->column_.field_id_;
    if (CanUseJsonKeyIndex(field_id) && !has_offset_input_) {
        return ExecJsonContainsWithDiffTypeByKeyIndex();
    }

    auto real_batch_size =
        has_offset_input_ ? input->size() : GetNextBatchSize();
    if (real_batch_size == 0) {
        return nullptr;
    }

    auto res_vec = std::make_shared(TargetBitmap(real_batch_size, false),
                                    TargetBitmap(real_batch_size, true));
    TargetBitmapView res(res_vec->GetRawData(), real_batch_size);
    TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);

    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
    auto elements = expr_->vals_;
    // NOTE(review): `elements_index` is filled here but is not among the
    // parameters of the lambda declared below — confirm whether it is needed.
    std::unordered_set elements_index;
    int i = 0;
    for (auto& element : elements) {
        elements_index.insert(i);
        i++;
    }

    // Rows handled by earlier sub-batches; offsets into bitmap_input.
    size_t processed_cursor = 0;
    auto execute_sub_batch =
        [&processed_cursor, &bitmap_input](
            const milvus::Json* data,
            const bool* valid_data,
            const int32_t* offsets,
            const int size,
            TargetBitmapView res,
            TargetBitmapView valid_res,
            const std::string& pointer,
            const std::vector& elements) {
            auto executor = [&](const size_t i) {
                auto& json = data[i];
                auto doc = json.doc();
                auto array = doc.at_pointer(pointer).get_array();
                if (array.error()) {
                    return false;
                }
                // Note: array can only be iterated once
                for (auto&& it : array) {
                    for (auto const& element : elements) {
                        switch (element.val_case()) {
                            case proto::plan::GenericValue::kBoolVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.bool_val()) {
                                    return true;
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kInt64Val: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.int64_val()) {
                                    return true;
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kFloatVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.float_val()) {
                                    return true;
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kStringVal: {
                                auto val = it.template get();
                                if (val.error()) {
                                    continue;
                                }
                                if (val.value() == element.string_val()) {
                                    return true;
                                }
                                break;
                            }
                            case proto::plan::GenericValue::kArrayVal: {
                                auto val = it.get_array();
                                if (val.error()) {
                                    continue;
                                }
                                if (CompareTwoJsonArray(val,
                                                        element.array_val())) {
                                    return true;
                                }
                                break;
                            }
default: PanicInfo(DataTypeInvalid, fmt::format("unsupported data type {}", element.val_case())); } } } return false; }; bool has_bitmap_input = !bitmap_input.empty(); for (size_t i = 0; i < size; ++i) { auto offset = i; if constexpr (filter_type == FilterType::random) { offset = (offsets) ? offsets[i] : i; } if (valid_data != nullptr && !valid_data[offset]) { res[i] = valid_res[i] = false; continue; } if (has_bitmap_input && !bitmap_input[processed_cursor + i]) { continue; } res[i] = executor(offset); } processed_cursor += size; }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, std::nullptr_t{}, input, res, valid_res, pointer, elements); } else { processed_size = ProcessDataChunks(execute_sub_batch, std::nullptr_t{}, res, valid_res, pointer, elements); } AssertInfo(processed_size == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", processed_size, real_batch_size); return res_vec; } VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() { auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? 
active_count_ - current_data_chunk_pos_ : batch_size_; auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); auto elements = expr_->vals_; if (elements.empty()) { MoveCursor(); return std::make_shared( TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); } if (cached_index_chunk_id_ != 0) { const segcore::SegmentInternalInterface* segment = nullptr; if (segment_->type() == SegmentType::Growing) { segment = dynamic_cast(segment_); } else if (segment_->type() == SegmentType::Sealed) { segment = dynamic_cast(segment_); } auto field_id = expr_->column_.field_id_; auto* index = segment->GetJsonKeyIndex(field_id); Assert(index != nullptr); auto filter_func = [segment, &elements, &field_id](bool valid, uint8_t type, uint32_t row_id, uint16_t offset, uint16_t size, int32_t value) { if (valid) { return false; } else { auto json_pair = segment->GetJsonData(field_id, row_id); if (!json_pair.second) { return false; } auto& json = json_pair.first; auto array = json.array_at(offset, size); if (array.error()) { return false; } // Note: array can only be iterated once for (auto&& it : array) { for (auto const& element : elements) { switch (element.val_case()) { case proto::plan::GenericValue::kBoolVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.bool_val()) { return true; } break; } case proto::plan::GenericValue::kInt64Val: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.int64_val()) { return true; } break; } case proto::plan::GenericValue::kFloatVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.float_val()) { return true; } break; } case proto::plan::GenericValue::kStringVal: { auto val = it.template get(); if (val.error()) { continue; } if (val.value() == element.string_val()) { return true; } break; } case proto::plan::GenericValue::kArrayVal: { auto val = it.get_array(); if (val.error()) { continue; } if 
(CompareTwoJsonArray(val, element.array_val())) { return true; } break; } default: PanicInfo( DataTypeInvalid, fmt::format("unsupported data type {}", element.val_case())); } } } return false; } }; bool is_growing = segment_->type() == SegmentType::Growing; bool is_strong_consistency = consistency_level_ == 0; cached_index_chunk_res_ = index ->FilterByPath(pointer, active_count_, is_growing, is_strong_consistency, filter_func) .clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; result.append( cached_index_chunk_res_, current_data_global_pos_, real_batch_size); MoveCursor(); return std::make_shared(std::move(result), TargetBitmap(real_batch_size, true)); } VectorPtr PhyJsonContainsFilterExpr::EvalArrayContainsForIndexSegment() { switch (expr_->column_.element_type_) { case DataType::BOOL: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT8: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT16: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT32: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::INT64: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::FLOAT: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::DOUBLE: { return ExecArrayContainsForIndexSegmentImpl(); } case DataType::VARCHAR: case DataType::STRING: { return ExecArrayContainsForIndexSegmentImpl(); } default: PanicInfo(DataTypeInvalid, fmt::format("unsupported data type for " "ExecArrayContainsForIndexSegmentImpl: {}", expr_->column_.element_type_)); } } template VectorPtr PhyJsonContainsFilterExpr::ExecArrayContainsForIndexSegmentImpl() { typedef std::conditional_t, std::string, ExprValueType> GetType; using Index = index::ScalarIndex; auto real_batch_size = GetNextBatchSize(); if (real_batch_size == 0) { return nullptr; } std::unordered_set elements; for (auto const& element : expr_->vals_) { elements.insert(GetValueFromProto(element)); } boost::container::vector elems(elements.begin(), 
elements.end()); auto execute_sub_batch = [this](Index* index_ptr, const boost::container::vector& vals) { switch (expr_->op_) { case proto::plan::JSONContainsExpr_JSONOp_Contains: case proto::plan::JSONContainsExpr_JSONOp_ContainsAny: { return index_ptr->In(vals.size(), vals.data()); } case proto::plan::JSONContainsExpr_JSONOp_ContainsAll: { TargetBitmap result(index_ptr->Count()); result.set(); for (size_t i = 0; i < vals.size(); i++) { auto sub = index_ptr->In(1, &vals[i]); result &= sub; } return result; } default: PanicInfo( ExprInvalid, "unsupported array contains type {}", proto::plan::JSONContainsExpr_JSONOp_Name(expr_->op_)); } }; auto res = ProcessIndexChunks(execute_sub_batch, elems); AssertInfo(res->size() == real_batch_size, "internal error: expr processed rows {} not equal " "expect batch size {}", res->size(), real_batch_size); return res; } } //namespace exec } // namespace milvus