// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #include #include #include "index/JsonKeyStatsInvertedIndex.h" #include "index/InvertedIndexUtil.h" #include "index/Utils.h" #include "storage/MmapManager.h" namespace milvus::index { constexpr const char* TMP_JSON_INVERTED_LOG_PREFIX = "/tmp/milvus/json-key-inverted-index-log/"; constexpr size_t MEMORY_THRESHOLD = 128 * 1024 * 1024; constexpr size_t VECTOR_ELEMENT_SIZE = 8; constexpr size_t KEY_OVERHEAD = 2; size_t current_batch_size_ = 0; size_t calculateMemoryUsage(const std::map>& mp) { size_t total_memory = 0; for (const auto& [key, vec] : mp) { total_memory += (key.length()); total_memory += (vec.size() * VECTOR_ELEMENT_SIZE); } return total_memory * KEY_OVERHEAD; } void JsonKeyStatsInvertedIndex::AddJSONEncodeValue( const std::vector& paths, uint8_t flag, uint8_t type, uint32_t row_id, uint16_t offset, uint16_t length, int32_t value, std::map>& mp) { std::string key = milvus::Json::pointer(paths); LOG_DEBUG( "insert inverted key: {}, flag: {}, type: {}, row_id: {}, offset: " "{}, length:{}, value:{}", key, flag, type, row_id, offset, length, value); int64_t combine_id = 0; if (flag) { combine_id = EncodeValue(flag, type, row_id, value); } else { combine_id = EncodeOffset(flag, type, row_id, offset, length); } mp[key].push_back(combine_id); if (calculateMemoryUsage(mp) >= MEMORY_THRESHOLD) { AddInvertedRecord(mp); } } void JsonKeyStatsInvertedIndex::AddInvertedRecord( std::map>& mp) { std::vector json_offsets_lens; std::vector keys; std::vector json_offsets; if (mp.empty()) { return; } for (auto& iter : mp) { keys.push_back(iter.first.c_str()); json_offsets.push_back(iter.second.data()); json_offsets_lens.push_back(iter.second.size()); } wrapper_->add_json_key_stats_data_by_batch(keys.data(), json_offsets.data(), json_offsets_lens.data(), keys.size()); mp.clear(); current_batch_size_ = 0; } void JsonKeyStatsInvertedIndex::TravelJson( const char* json, jsmntok* tokens, int& index, std::vector& path, int32_t offset, std::map>& mp) { jsmntok current = tokens[0]; Assert(current.type != JSMN_UNDEFINED); if (current.type == JSMN_OBJECT) { if (!path.empty()) { AddJSONEncodeValue(path, 0, 0, offset, current.start, current.end - current.start, 0, mp); } int j = 1; for (int i = 0; i < current.size; i++) { Assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0); std::string key(json + tokens[j].start, tokens[j].end - tokens[j].start); path.push_back(key); j++; int consumed = 0; TravelJson(json, tokens + j, consumed, path, offset, mp); path.pop_back(); j += consumed; } index = j; } else if (current.type == JSMN_PRIMITIVE) { std::string value(json + current.start, current.end - current.start); auto type = getType(value); if (type == JSONType::INT32) { AddJSONEncodeValue(path, 1, static_cast(JSONType::INT32), offset, current.start, current.end - current.start, stoi(value), mp); } else if (type == JSONType::INT64) { AddJSONEncodeValue(path, 0, static_cast(JSONType::INT64), offset, current.start, current.end - current.start, 0, mp); } else if (type == JSONType::FLOAT) { auto fvalue = stof(value); int32_t valueBits = *reinterpret_cast(&fvalue); AddJSONEncodeValue(path, 1, static_cast(JSONType::FLOAT), offset, current.start, current.end - current.start, valueBits, mp); } else if (type == JSONType::DOUBLE) { AddJSONEncodeValue(path, 0, static_cast(JSONType::DOUBLE), offset, current.start, current.end - current.start, 0, mp); } else if (type == JSONType::BOOL) { AddJSONEncodeValue(path, 1, static_cast(JSONType::BOOL), offset, current.start, current.end - current.start, value == "true" ? 1 : 0, mp); } index++; } else if (current.type == JSMN_ARRAY) { AddJSONEncodeValue(path, 0, static_cast(JSONType::UNKNOWN), offset, current.start, current.end - current.start, 0, mp); // skip array parse int count = current.size; int j = 1; while (count > 0) { count--; if (tokens[j].size != 0) { count += tokens[j].size; } j++; } index = j; } else if (current.type == JSMN_STRING) { Assert(current.size == 0); std::string value(json + current.start, current.end - current.start); if (has_escape_sequence(value)) { AddJSONEncodeValue(path, 0, static_cast(JSONType::STRING_ESCAPE), offset, current.start - 1, current.end - current.start + 2, 0, mp); } else { AddJSONEncodeValue(path, 0, static_cast(JSONType::STRING), offset, current.start, current.end - current.start, 0, mp); } index++; } } void JsonKeyStatsInvertedIndex::AddJson( const char* json, int64_t offset, std::map>& mp) { jsmn_parser parser; jsmntok_t* tokens = (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); if (!tokens) { PanicInfo(ErrorCode::UnexpectedError, "alloc jsmn token failed"); return; } int num_tokens = 0; int token_capacity = 16; jsmn_init(&parser); while (1) { int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity); if (r < 0) { if (r == JSMN_ERROR_NOMEM) { // Reallocate tokens array if not enough space token_capacity *= 2; tokens = (jsmntok_t*)realloc( tokens, token_capacity * sizeof(jsmntok_t)); if (!tokens) { PanicInfo(ErrorCode::UnexpectedError, "realloc failed"); } continue; } else { free(tokens); PanicInfo(ErrorCode::UnexpectedError, "Failed to parse Json: {}, error: {}", json, int(r)); } } num_tokens = r; break; } int index = 0; std::vector paths; TravelJson(json, tokens, index, paths, offset, mp); free(tokens); } JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex( const storage::FileManagerContext& ctx, bool is_load, int64_t json_stats_tantivy_memory_budget, uint32_t tantivy_index_version) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { LOG_INFO("json_stats_tantivy_memory_budget:{}", json_stats_tantivy_memory_budget); schema_ = ctx.fieldDataMeta.field_schema; field_id_ = ctx.fieldDataMeta.field_id; mem_file_manager_ = std::make_shared(ctx); disk_file_manager_ = std::make_shared(ctx); if (is_load) { auto prefix = disk_file_manager_->GetLocalJsonKeyIndexPrefix(); path_ = prefix; } else { auto prefix = disk_file_manager_->GetJsonKeyIndexIdentifier(); path_ = std::string(TMP_JSON_INVERTED_LOG_PREFIX) + prefix; boost::filesystem::create_directories(path_); std::string field_name = std::to_string(disk_file_manager_->GetFieldDataMeta().field_id); wrapper_ = std::make_shared( field_name.c_str(), path_.c_str(), tantivy_index_version, false, /* in_ram */ 1, json_stats_tantivy_memory_budget); } } JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex( int64_t commit_interval_in_ms, const char* unique_id) : commit_interval_in_ms_(commit_interval_in_ms), last_commit_time_(stdclock::now()) { wrapper_ = std::make_shared( unique_id, "", TANTIVY_INDEX_LATEST_VERSION, true /* in_ram */); } JsonKeyStatsInvertedIndex::JsonKeyStatsInvertedIndex( int64_t commit_interval_in_ms, const char* unique_id, const std::string& path) : commit_interval_in_ms_(commit_interval_in_ms), last_commit_time_(stdclock::now()) { boost::filesystem::path prefix = path; boost::filesystem::path sub_path = unique_id; path_ = (prefix / sub_path).string(); boost::filesystem::create_directories(path_); wrapper_ = std::make_shared( unique_id, path_.c_str(), TANTIVY_INDEX_LATEST_VERSION); } IndexStatsPtr JsonKeyStatsInvertedIndex::Upload(const Config& config) { finish(); index_build_timestamps_.index_build_done_ = std::chrono::system_clock::now(); LOG_INFO( "build json key index done for field id:{}, json parse duration: {}s, " "tantivy document add schedule duration : {}s, " "tantivy total duration : {}s, " "total duration : {}s", field_id_, index_build_timestamps_.getJsonParsingDuration(), index_build_timestamps_.getTantivyAddSchedulingDuration(), index_build_timestamps_.getTantivyTotalDuration(), index_build_timestamps_.getIndexBuildTotalDuration()); boost::filesystem::path p(path_); boost::filesystem::directory_iterator end_iter; for (boost::filesystem::directory_iterator iter(p); iter != end_iter; iter++) { if (boost::filesystem::is_directory(*iter)) { LOG_WARN("{} is a directory", iter->path().string()); } else { LOG_INFO("trying to add json key inverted index log: {}", iter->path().string()); AssertInfo( disk_file_manager_->AddJsonKeyIndexLog(iter->path().string()), "failed to add json key inverted index log: {}", iter->path().string()); LOG_INFO("json key inverted index log: {} added", iter->path().string()); } } auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize(); auto binary_set = Serialize(config); mem_file_manager_->AddFile(binary_set); auto remote_mem_path_to_size = mem_file_manager_->GetRemotePathsToFileSize(); std::vector index_files; index_files.reserve(remote_paths_to_size.size() + remote_mem_path_to_size.size()); for (auto& file : remote_paths_to_size) { index_files.emplace_back(disk_file_manager_->GetFileName(file.first), file.second); } for (auto& file : remote_mem_path_to_size) { index_files.emplace_back(file.first, file.second); } return IndexStats::New(mem_file_manager_->GetAddedTotalMemSize() + disk_file_manager_->GetAddedTotalFileSize(), std::move(index_files)); } void JsonKeyStatsInvertedIndex::Load(milvus::tracer::TraceContext ctx, const Config& config) { auto index_files = GetValueFromConfig>(config, "index_files"); AssertInfo(index_files.has_value(), "index file paths is empty when load json key index"); for (auto& index_file : index_files.value()) { boost::filesystem::path p(index_file); if (!p.has_parent_path()) { auto remote_prefix = disk_file_manager_->GetRemoteJsonKeyLogPrefix(); index_file = remote_prefix + "/" + index_file; } } disk_file_manager_->CacheJsonKeyIndexToDisk(index_files.value()); AssertInfo( tantivy_index_exist(path_.c_str()), "index not exist: {}", path_); wrapper_ = std::make_shared(path_.c_str(), milvus::index::SetBitset); LOG_INFO("load json key index done for field id:{} with dir:{}", field_id_, path_); } void JsonKeyStatsInvertedIndex::BuildWithFieldData( const std::vector& field_datas) { AssertInfo(schema_.data_type() == proto::schema::DataType::JSON, "schema data type is {}", schema_.data_type()); BuildWithFieldData(field_datas, schema_.nullable()); } void JsonKeyStatsInvertedIndex::BuildWithFieldData( const std::vector& field_datas, bool nullable) { int64_t offset = 0; std::map> mp; index_build_timestamps_.index_build_begin_ = std::chrono::system_clock::now(); if (nullable) { for (const auto& data : field_datas) { auto n = data->get_num_rows(); for (int i = 0; i < n; i++) { if (!data->is_valid(i)) { continue; } AddJson(static_cast(data->RawValue(i)) ->data() .data(), offset++, mp); } } } else { for (const auto& data : field_datas) { auto n = data->get_num_rows(); for (int i = 0; i < n; i++) { AddJson(static_cast(data->RawValue(i)) ->data() .data(), offset++, mp); } } } index_build_timestamps_.tantivy_build_begin_ = std::chrono::system_clock::now(); // Schedule all document add operations to tantivy. AddInvertedRecord(mp); index_build_timestamps_.tantivy_add_schedule_end_ = std::chrono::system_clock::now(); } void JsonKeyStatsInvertedIndex::AddJSONDatas(size_t n, const std::string* jsonDatas, const bool* valids, int64_t offset_begin) { std::map> mp; for (int i = 0; i < n; i++) { auto offset = i + offset_begin; if (valids != nullptr && !valids[i]) { continue; } AddJson(jsonDatas[i].c_str(), offset, mp); } AddInvertedRecord(mp); is_data_uncommitted_ = true; LOG_INFO("build json key index done for AddJSONDatas"); if (shouldTriggerCommit()) { Commit(); } } void JsonKeyStatsInvertedIndex::Finish() { finish(); } bool JsonKeyStatsInvertedIndex::shouldTriggerCommit() { auto span = (std::chrono::duration( stdclock::now() - last_commit_time_.load())) .count(); return span > commit_interval_in_ms_; } void JsonKeyStatsInvertedIndex::Commit() { std::unique_lock lck(mtx_, std::defer_lock); if (lck.try_lock()) { is_data_uncommitted_ = false; wrapper_->commit(); last_commit_time_.store(stdclock::now()); } } void JsonKeyStatsInvertedIndex::Reload() { std::unique_lock lck(mtx_, std::defer_lock); if (lck.try_lock()) { wrapper_->reload(); } } void JsonKeyStatsInvertedIndex::CreateReader() { wrapper_->create_reader(); } } // namespace milvus::index