// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #include "index/JsonInvertedIndex.h" #include #include #include #include #include "common/EasyAssert.h" #include "common/FieldDataInterface.h" #include "common/Json.h" #include "common/Types.h" #include "folly/FBVector.h" #include "log/Log.h" #include "common/JsonUtils.h" #include "simdjson/error.h" #include "index/JsonIndexBuilder.h" namespace milvus::index { namespace json { bool IsDataTypeSupported(JsonCastType cast_type, DataType data_type, bool is_array) { bool cast_type_is_array = cast_type.data_type() == JsonCastType::DataType::ARRAY; auto type = cast_type.ToMilvusDataType(); return is_array == cast_type_is_array && (type == data_type || (data_type == DataType::INT64 && type == DataType::DOUBLE)); } } // namespace json template void JsonInvertedIndex::build_index_for_json( const std::vector>& field_datas) { LOG_INFO("Start to build json inverted index for field: {}", nested_path_); ProcessJsonFieldData( field_datas, this->schema_, nested_path_, cast_type_, cast_function_, // add data [this](const T* data, int64_t size, int64_t offset) { this->wrapper_->template add_array_data(data, size, offset); }, // handle null [this](int64_t offset) { this->null_offset_.push_back(offset); }, // handle non exist [this](int64_t offset) { this->non_exist_offsets_.push_back(offset); }, // handle error [this](const Json& json, const std::string& nested_path, simdjson::error_code error) { this->error_recorder_.Record(json, nested_path, error); }); error_recorder_.PrintErrStats(); } template TargetBitmap JsonInvertedIndex::Exists() { int64_t count = this->Count(); TargetBitmap bitset(count, true); auto fill_bitset = [this, count, &bitset]() { auto end = std::lower_bound( non_exist_offsets_.begin(), non_exist_offsets_.end(), count); for (auto iter = non_exist_offsets_.begin(); iter != end; ++iter) { bitset.reset(*iter); } }; if (this->is_growing_) { std::shared_lock lock(this->mutex_); fill_bitset(); } else { fill_bitset(); } return bitset; } template void JsonInvertedIndex::LoadIndexMetas( const std::vector& index_files, const Config& config) { InvertedIndexTantivy::LoadIndexMetas(index_files, config); auto fill_non_exist_offset = [&](const uint8_t* data, int64_t size) { non_exist_offsets_.resize((size_t)size / sizeof(size_t)); memcpy(non_exist_offsets_.data(), data, (size_t)size); }; auto non_exist_offset_file_itr = std::find_if( index_files.begin(), index_files.end(), [&](const std::string& file) { return boost::filesystem::path(file).filename().string() == INDEX_NON_EXIST_OFFSET_FILE_NAME; }); auto load_priority = GetValueFromConfig( config, milvus::LOAD_PRIORITY) .value_or(milvus::proto::common::LoadPriority::HIGH); if (non_exist_offset_file_itr != index_files.end()) { // null offset file is not sliced auto index_datas = this->mem_file_manager_->LoadIndexToMemory( {*non_exist_offset_file_itr}, load_priority); auto non_exist_offset_data = std::move(index_datas.at(INDEX_NON_EXIST_OFFSET_FILE_NAME)); fill_non_exist_offset(non_exist_offset_data->PayloadData(), non_exist_offset_data->PayloadSize()); return; } std::vector non_exist_offset_files; for (auto& file : index_files) { auto file_name = boost::filesystem::path(file).filename().string(); if (file_name.find(INDEX_NON_EXIST_OFFSET_FILE_NAME) != std::string::npos) { non_exist_offset_files.push_back(file); } } if (non_exist_offset_files.size() > 0) { // null offset file is sliced auto index_datas = this->mem_file_manager_->LoadIndexToMemory( non_exist_offset_files, load_priority); auto non_exist_offset_data = CompactIndexDatas(index_datas); auto non_exist_offset_data_codecs = std::move( non_exist_offset_data.at(INDEX_NON_EXIST_OFFSET_FILE_NAME)); for (auto&& non_exist_offset_codec : non_exist_offset_data_codecs.codecs_) { fill_non_exist_offset(non_exist_offset_codec->PayloadData(), non_exist_offset_codec->PayloadSize()); } return; } // Fallback: no non_exist_offset_file found. This occurs in two scenarios: // 1. Legacy v2.5.x data where non_exist_offset_file doesn't exist // 2. All records are valid (no invalid offsets to track) // // Use null_offset_ as the source for non_exist_offsets_ to maintain // backward compatibility. This ensures Exists() behaves like v2.5.x IsNotNull(). non_exist_offsets_ = this->null_offset_; } template void JsonInvertedIndex::RetainTantivyIndexFiles( std::vector& index_files) { index_files.erase( std::remove_if( index_files.begin(), index_files.end(), [&](const std::string& file) { auto file_name = boost::filesystem::path(file).filename().string(); return file_name.find(INDEX_NON_EXIST_OFFSET_FILE_NAME) != std::string::npos; }), index_files.end()); InvertedIndexTantivy::RetainTantivyIndexFiles(index_files); } template class JsonInvertedIndex; template class JsonInvertedIndex; template class JsonInvertedIndex; template class JsonInvertedIndex; } // namespace milvus::index