// Copyright (C) 2019-2020 Zilliz. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software distributed under the License // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License #include #include #include #include #include "index/TextMatchIndex.h" #include "index/InvertedIndexUtil.h" #include "index/Utils.h" #include "storage/ThreadPools.h" namespace milvus::index { TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms, const char* unique_id, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(commit_interval_in_ms), last_commit_time_(stdclock::now()) { d_type_ = TantivyDataType::Text; wrapper_ = std::make_shared( unique_id, true, "", TANTIVY_INDEX_LATEST_VERSION /* Growing segment has no reason to use old version index*/ , tokenizer_name, analyzer_params); set_is_growing(true); } TextMatchIndex::TextMatchIndex(const std::string& path, const char* unique_id, uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { d_type_ = TantivyDataType::Text; boost::filesystem::path prefix = path; boost::filesystem::path sub_path = unique_id; path_ = (prefix / sub_path).string(); boost::filesystem::create_directories(path_); wrapper_ = std::make_shared(unique_id, false, path_.c_str(), tantivy_index_version, tokenizer_name, analyzer_params); } TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx, uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { schema_ = ctx.fieldDataMeta.field_schema; mem_file_manager_ = std::make_shared(ctx); disk_file_manager_ = std::make_shared(ctx); path_ = disk_file_manager_->GetLocalTempTextIndexPrefix(); boost::filesystem::create_directories(path_); d_type_ = TantivyDataType::Text; std::string field_name = std::to_string(disk_file_manager_->GetFieldDataMeta().field_id); wrapper_ = std::make_shared(field_name.c_str(), false, path_.c_str(), tantivy_index_version, tokenizer_name, analyzer_params); } TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx) : commit_interval_in_ms_(std::numeric_limits::max()), last_commit_time_(stdclock::now()) { schema_ = ctx.fieldDataMeta.field_schema; mem_file_manager_ = std::make_shared(ctx); disk_file_manager_ = std::make_shared(ctx); d_type_ = TantivyDataType::Text; } IndexStatsPtr TextMatchIndex::Upload(const Config& config) { finish(); boost::filesystem::path p(path_); boost::filesystem::directory_iterator end_iter; for (boost::filesystem::directory_iterator iter(p); iter != end_iter; iter++) { if (boost::filesystem::is_directory(*iter)) { LOG_WARN("{} is a directory", iter->path().string()); } else { LOG_INFO("trying to add text log: {}", iter->path().string()); AssertInfo(disk_file_manager_->AddTextLog(iter->path().string()), "failed to add text log: {}", iter->path().string()); LOG_INFO("text log: {} added", iter->path().string()); } } auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize(); auto binary_set = Serialize(config); mem_file_manager_->AddTextLog(binary_set); auto remote_mem_path_to_size = mem_file_manager_->GetRemotePathsToFileSize(); std::vector index_files; index_files.reserve(remote_paths_to_size.size() + remote_mem_path_to_size.size()); for (auto& file : remote_paths_to_size) { index_files.emplace_back(file.first, file.second); } for (auto& file : remote_mem_path_to_size) { index_files.emplace_back(file.first, file.second); } return IndexStats::New(mem_file_manager_->GetAddedTotalMemSize() + disk_file_manager_->GetAddedTotalFileSize(), std::move(index_files)); } void TextMatchIndex::Load(const Config& config) { auto index_files = GetValueFromConfig>(config, INDEX_FILES); AssertInfo(index_files.has_value(), "index file paths is empty when load text log index"); auto prefix = disk_file_manager_->GetLocalTextIndexPrefix(); auto files_value = index_files.value(); auto it = std::find_if( files_value.begin(), files_value.end(), [](const std::string& file) { return file.substr(file.find_last_of('/') + 1) == "index_null_offset"; }); auto load_priority = GetValueFromConfig( config, milvus::LOAD_PRIORITY) .value_or(milvus::proto::common::LoadPriority::HIGH); if (it != files_value.end()) { std::vector file; file.push_back(*it); files_value.erase(it); auto index_datas = mem_file_manager_->LoadIndexToMemory(file, load_priority); BinarySet binary_set; AssembleIndexDatas(index_datas, binary_set); // clear index_datas to free memory early index_datas.clear(); auto index_valid_data = binary_set.GetByName("index_null_offset"); null_offset_.resize((size_t)index_valid_data->size / sizeof(size_t)); memcpy(null_offset_.data(), index_valid_data->data.get(), (size_t)index_valid_data->size); } disk_file_manager_->CacheTextLogToDisk(files_value, load_priority); AssertInfo( tantivy_index_exist(prefix.c_str()), "index not exist: {}", prefix); auto load_in_mmap = GetValueFromConfig(config, ENABLE_MMAP).value_or(true); wrapper_ = std::make_shared( prefix.c_str(), load_in_mmap, milvus::index::SetBitsetSealed); if (!load_in_mmap) { // the index is loaded in ram, so we can remove files in advance disk_file_manager_->RemoveTextLogFiles(); } } // Add text for sealed segment void TextMatchIndex::AddTextSealed(const std::string& text, const bool valid, int64_t offset) { if (!valid) { AddNullSealed(offset); return; } wrapper_->add_data(&text, 1, offset); } // Add null for sealed segment void TextMatchIndex::AddNullSealed(int64_t offset) { null_offset_.push_back(offset); // still need to add null to make offset is correct std::string empty = ""; wrapper_->add_array_data(&empty, 0, offset); } // Add texts for growing segment void TextMatchIndex::AddTextsGrowing(size_t n, const std::string* texts, const bool* valids, int64_t offset_begin) { if (valids != nullptr) { for (int i = 0; i < n; i++) { auto offset = i + offset_begin; if (!valids[i]) { std::unique_lock lock(mutex_); null_offset_.push_back(offset); } } } wrapper_->add_data(texts, n, offset_begin); if (shouldTriggerCommit()) { Commit(); } } // schema_ may not be initialized so we need this `nullable` parameter void TextMatchIndex::BuildIndexFromFieldData( const std::vector& field_datas, bool nullable) { int64_t offset = 0; if (nullable) { int64_t total = 0; for (const auto& data : field_datas) { total += data->get_null_count(); } { std::unique_lock lock(mutex_); null_offset_.reserve(total); } for (const auto& data : field_datas) { auto n = data->get_num_rows(); for (int i = 0; i < n; i++) { if (!data->is_valid(i)) { std::unique_lock lock(mutex_); null_offset_.push_back(i); } wrapper_->add_data( static_cast(data->RawValue(i)), data->is_valid(i) ? 1 : 0, offset++); } } } else { for (const auto& data : field_datas) { auto n = data->get_num_rows(); wrapper_->add_data( static_cast(data->Data()), n, offset); offset += n; } } } void TextMatchIndex::Finish() { finish(); } bool TextMatchIndex::shouldTriggerCommit() { auto span = (std::chrono::duration( stdclock::now() - last_commit_time_.load())) .count(); return span > commit_interval_in_ms_; } void TextMatchIndex::Commit() { std::unique_lock lck(mtx_, std::defer_lock); if (lck.try_lock()) { wrapper_->commit(); last_commit_time_.store(stdclock::now()); } } void TextMatchIndex::Reload() { std::unique_lock lck(mtx_, std::defer_lock); if (lck.try_lock()) { wrapper_->reload(); } } void TextMatchIndex::CreateReader(SetBitsetFn set_bitset) { wrapper_->create_reader(set_bitset); } void TextMatchIndex::RegisterTokenizer(const char* tokenizer_name, const char* analyzer_params) { wrapper_->register_tokenizer(tokenizer_name, analyzer_params); } TargetBitmap TextMatchIndex::MatchQuery(const std::string& query) { if (shouldTriggerCommit()) { Commit(); Reload(); } TargetBitmap bitset{static_cast(Count())}; // The count opeartion of tantivy may be get older cnt if the index is committed with new tantivy segment. // So we cannot use the count operation to get the total count for bitmap. // Just use the maximum offset of hits to get the total count for bitmap here. wrapper_->match_query(query, &bitset); return bitset; } TargetBitmap TextMatchIndex::PhraseMatchQuery(const std::string& query, uint32_t slop) { if (shouldTriggerCommit()) { Commit(); Reload(); } TargetBitmap bitset{static_cast(Count())}; // The count opeartion of tantivy may be get older cnt if the index is committed with new tantivy segment. // So we cannot use the count operation to get the total count for bitmap. // Just use the maximum offset of hits to get the total count for bitmap here. wrapper_->phrase_match_query(query, slop, &bitset); return bitset; } } // namespace milvus::index