// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <boost/uuid/random_generator.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <memory>
#include <shared_mutex>

#include "index/TextMatchIndex.h"
#include "index/InvertedIndexUtil.h"
#include "index/Utils.h"
#include "storage/ThreadPools.h"

namespace milvus::index {

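// Constructor for growing segments: no local path is supplied, so the index
// is kept in memory, and it is committed periodically according to
// commit_interval_in_ms.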
TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
                               const char* unique_id,
                               const char* tokenizer_name,
                               const char* analyzer_params)
    : commit_interval_in_ms_(commit_interval_in_ms),
      last_commit_time_(stdclock::now()) {
    d_type_ = TantivyDataType::Text;
    wrapper_ = std::make_shared<TantivyIndexWrapper>(
        unique_id,
        true,
        "",
        TANTIVY_INDEX_LATEST_VERSION /* Growing segment has no reason to use old version index */,
        tokenizer_name,
        analyzer_params);
    set_is_growing(true);
}

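// Constructor for an index built directly under a caller-provided local path.
// The commit interval is set to the maximum value, so automatic commits are
// effectively disabled and commits happen explicitly.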
TextMatchIndex::TextMatchIndex(const std::string& path,
                               const char* unique_id,
                               uint32_t tantivy_index_version,
                               const char* tokenizer_name,
                               const char* analyzer_params)
    : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
      last_commit_time_(stdclock::now()) {
    d_type_ = TantivyDataType::Text;
    boost::filesystem::path prefix = path;
    boost::filesystem::path sub_path = unique_id;
    path_ = (prefix / sub_path).string();
    boost::filesystem::create_directories(path_);
    wrapper_ = std::make_shared<TantivyIndexWrapper>(unique_id,
                                                     false,
                                                     path_.c_str(),
                                                     tantivy_index_version,
                                                     tokenizer_name,
                                                     analyzer_params);
}

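// Constructor used when building a sealed-segment index through the storage
// layer: the memory and disk file managers are created from the
// FileManagerContext and the index is written under the local temporary
// text-index prefix.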
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
                               uint32_t tantivy_index_version,
                               const char* tokenizer_name,
                               const char* analyzer_params)
    : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
      last_commit_time_(stdclock::now()) {
    schema_ = ctx.fieldDataMeta.field_schema;
    mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
    disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);

    path_ = disk_file_manager_->GetLocalTempTextIndexPrefix();

    boost::filesystem::create_directories(path_);
    d_type_ = TantivyDataType::Text;
    std::string field_name =
        std::to_string(disk_file_manager_->GetFieldDataMeta().field_id);
    wrapper_ = std::make_shared<TantivyIndexWrapper>(field_name.c_str(),
                                                     false,
                                                     path_.c_str(),
                                                     tantivy_index_version,
                                                     tokenizer_name,
                                                     analyzer_params);
}

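// Constructor used on the load path: only the file managers are set up here;
// the tantivy wrapper is created later in Load() once the index files have
// been fetched.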
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx)
    : commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
      last_commit_time_(stdclock::now()) {
    schema_ = ctx.fieldDataMeta.field_schema;
    mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
    disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
    d_type_ = TantivyDataType::Text;
}

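// Finalize the index and upload it: every file produced under path_ is added
// as a text log through the disk file manager, the serialized data from
// Serialize() is added through the memory file manager, and the combined
// remote file list and total added size are returned as IndexStats.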
IndexStatsPtr
TextMatchIndex::Upload(const Config& config) {
    finish();

    boost::filesystem::path p(path_);
    boost::filesystem::directory_iterator end_iter;

    for (boost::filesystem::directory_iterator iter(p); iter != end_iter;
         iter++) {
        if (boost::filesystem::is_directory(*iter)) {
            LOG_WARN("{} is a directory", iter->path().string());
        } else {
            LOG_INFO("trying to add text log: {}", iter->path().string());
            AssertInfo(disk_file_manager_->AddTextLog(iter->path().string()),
                       "failed to add text log: {}",
                       iter->path().string());
            LOG_INFO("text log: {} added", iter->path().string());
        }
    }

    auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize();

    auto binary_set = Serialize(config);
    mem_file_manager_->AddTextLog(binary_set);
    auto remote_mem_path_to_size =
        mem_file_manager_->GetRemotePathsToFileSize();

    std::vector<SerializedIndexFileInfo> index_files;
    index_files.reserve(remote_paths_to_size.size() +
                        remote_mem_path_to_size.size());
    for (auto& file : remote_paths_to_size) {
        index_files.emplace_back(file.first, file.second);
    }
    for (auto& file : remote_mem_path_to_size) {
        index_files.emplace_back(file.first, file.second);
    }
    return IndexStats::New(mem_file_manager_->GetAddedTotalMemSize() +
                               disk_file_manager_->GetAddedTotalFileSize(),
                           std::move(index_files));
}

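// Load a sealed-segment index: restore null_offset_ from the
// "index_null_offset" file if present, cache the remaining text log files to
// local disk, then open the tantivy index (memory-mapped by default). When
// mmap is disabled the on-disk files can be removed right after loading.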
void
TextMatchIndex::Load(const Config& config) {
    auto index_files =
        GetValueFromConfig<std::vector<std::string>>(config, INDEX_FILES);
    AssertInfo(index_files.has_value(),
               "index file paths are empty when loading text log index");
    auto prefix = disk_file_manager_->GetLocalTextIndexPrefix();
    auto files_value = index_files.value();
    auto it = std::find_if(
        files_value.begin(), files_value.end(), [](const std::string& file) {
            return file.substr(file.find_last_of('/') + 1) ==
                   "index_null_offset";
        });
    auto load_priority =
        GetValueFromConfig<milvus::proto::common::LoadPriority>(
            config, milvus::LOAD_PRIORITY)
            .value_or(milvus::proto::common::LoadPriority::HIGH);
    if (it != files_value.end()) {
        std::vector<std::string> file;
        file.push_back(*it);
        files_value.erase(it);
        auto index_datas =
            mem_file_manager_->LoadIndexToMemory(file, load_priority);
        BinarySet binary_set;
        AssembleIndexDatas(index_datas, binary_set);
        // clear index_datas to free memory early
        index_datas.clear();
        auto index_valid_data = binary_set.GetByName("index_null_offset");
        null_offset_.resize((size_t)index_valid_data->size / sizeof(size_t));
        memcpy(null_offset_.data(),
               index_valid_data->data.get(),
               (size_t)index_valid_data->size);
    }
    disk_file_manager_->CacheTextLogToDisk(files_value, load_priority);
    AssertInfo(
        tantivy_index_exist(prefix.c_str()), "index does not exist: {}", prefix);

    auto load_in_mmap =
        GetValueFromConfig<bool>(config, ENABLE_MMAP).value_or(true);

    wrapper_ = std::make_shared<TantivyIndexWrapper>(
        prefix.c_str(), load_in_mmap, milvus::index::SetBitsetSealed);

    if (!load_in_mmap) {
        // the index is loaded in RAM, so the on-disk files can be removed early
        disk_file_manager_->RemoveTextLogFiles();
    }
}

// Add text for sealed segment
void
TextMatchIndex::AddTextSealed(const std::string& text,
                              const bool valid,
                              int64_t offset) {
    if (!valid) {
        AddNullSealed(offset);
        return;
    }
    wrapper_->add_data(&text, 1, offset);
}

// Add null for sealed segment
void
TextMatchIndex::AddNullSealed(int64_t offset) {
    null_offset_.push_back(offset);
    // an empty entry is still added so that subsequent offsets stay correct
    std::string empty = "";
    wrapper_->add_array_data(&empty, 0, offset);
}

// Add texts for growing segment
void
TextMatchIndex::AddTextsGrowing(size_t n,
                                const std::string* texts,
                                const bool* valids,
                                int64_t offset_begin) {
    if (valids != nullptr) {
        for (int i = 0; i < n; i++) {
            auto offset = i + offset_begin;
            if (!valids[i]) {
                std::unique_lock<folly::SharedMutex> lock(mutex_);
                null_offset_.push_back(offset);
            }
        }
    }
    wrapper_->add_data(texts, n, offset_begin);
    if (shouldTriggerCommit()) {
        Commit();
    }
}

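// Build the index from raw field data chunks (used when constructing a
// sealed-segment index from existing column data).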
// schema_ may not be initialized so we need this `nullable` parameter
void
TextMatchIndex::BuildIndexFromFieldData(
    const std::vector<FieldDataPtr>& field_datas, bool nullable) {
    int64_t offset = 0;
    if (nullable) {
        int64_t total = 0;
        for (const auto& data : field_datas) {
            total += data->get_null_count();
        }
        {
            std::unique_lock<folly::SharedMutex> lock(mutex_);
            null_offset_.reserve(total);
        }
        for (const auto& data : field_datas) {
            auto n = data->get_num_rows();
            for (int i = 0; i < n; i++) {
                if (!data->is_valid(i)) {
                    std::unique_lock<folly::SharedMutex> lock(mutex_);
                    null_offset_.push_back(i);
                }
                wrapper_->add_data(
                    static_cast<const std::string*>(data->RawValue(i)),
                    data->is_valid(i) ? 1 : 0,
                    offset++);
            }
        }
    } else {
        for (const auto& data : field_datas) {
            auto n = data->get_num_rows();
            wrapper_->add_data(
                static_cast<const std::string*>(data->Data()), n, offset);
            offset += n;
        }
    }
}

void
TextMatchIndex::Finish() {
    finish();
}

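// Returns true once more than commit_interval_in_ms_ milliseconds have
// elapsed since the last commit.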
bool
TextMatchIndex::shouldTriggerCommit() {
    auto span = (std::chrono::duration<double, std::milli>(
                     stdclock::now() - last_commit_time_.load()))
                    .count();
    return span > commit_interval_in_ms_;
}

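// Best-effort commit: try_lock avoids blocking concurrent writers; if another
// thread is already committing, this call is a no-op.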
void
TextMatchIndex::Commit() {
    std::unique_lock<std::mutex> lck(mtx_, std::defer_lock);
    if (lck.try_lock()) {
        wrapper_->commit();
        last_commit_time_.store(stdclock::now());
    }
}

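// Best-effort reload so that queries observe the most recent commit; skipped
// if another thread currently holds the lock.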
void
TextMatchIndex::Reload() {
    std::unique_lock<std::mutex> lck(mtx_, std::defer_lock);
    if (lck.try_lock()) {
        wrapper_->reload();
    }
}

void
TextMatchIndex::CreateReader(SetBitsetFn set_bitset) {
    wrapper_->create_reader(set_bitset);
}

void
TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
                                  const char* analyzer_params) {
    wrapper_->register_tokenizer(tokenizer_name, analyzer_params);
}

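// Text match query. For growing segments, commit and reload first when the
// commit interval has elapsed so that newly added texts are visible to the
// query.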
TargetBitmap
TextMatchIndex::MatchQuery(const std::string& query) {
    if (shouldTriggerCommit()) {
        Commit();
        Reload();
    }

    TargetBitmap bitset{static_cast<size_t>(Count())};
    // Tantivy's count operation may return an older count if the index was
    // committed with a new tantivy segment, so we cannot use it to get the
    // total count for the bitmap. Instead, the maximum offset of the hits is
    // used as the total count for the bitmap here.
    wrapper_->match_query(query, &bitset);
    return bitset;
}

TargetBitmap
TextMatchIndex::PhraseMatchQuery(const std::string& query, uint32_t slop) {
    if (shouldTriggerCommit()) {
        Commit();
        Reload();
    }

    TargetBitmap bitset{static_cast<size_t>(Count())};
    // Tantivy's count operation may return an older count if the index was
    // committed with a new tantivy segment, so we cannot use it to get the
    // total count for the bitmap. Instead, the maximum offset of the hits is
    // used as the total count for the bitmap here.
    wrapper_->phrase_match_query(query, slop, &bitset);
    return bitset;
}

}  // namespace milvus::index