// Licensed to the LF AI & Data foundation under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include #include #include #include #include #include #include #include #include #include #include "common/EasyAssert.h" #include "common/Exception.h" #include "common/File.h" #include "common/FieldData.h" #include "common/Slice.h" #include "index/Utils.h" #include "index/Meta.h" #include "storage/Util.h" #include "knowhere/comp/index_param.h" namespace milvus::index { size_t get_file_size(int fd) { struct stat s; fstat(fd, &s); return s.st_size; } std::vector NM_List() { static std::vector ret{ knowhere::IndexEnum::INDEX_FAISS_IVFFLAT, }; return ret; } std::vector BIN_List() { static std::vector ret{ knowhere::IndexEnum::INDEX_FAISS_BIN_IDMAP, knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT, }; return ret; } // TODO caiyd: should list supported list std::vector> unsupported_index_combinations() { static std::vector> ret{ std::make_tuple(knowhere::IndexEnum::INDEX_FAISS_BIN_IVFFLAT, knowhere::metric::L2), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, knowhere::metric::L2), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, knowhere::metric::COSINE), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, knowhere::metric::HAMMING), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, knowhere::metric::JACCARD), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, knowhere::metric::SUBSTRUCTURE), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_INVERTED_INDEX, knowhere::metric::SUPERSTRUCTURE), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::L2), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::COSINE), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::HAMMING), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::JACCARD), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::SUBSTRUCTURE), std::make_tuple(knowhere::IndexEnum::INDEX_SPARSE_WAND, knowhere::metric::SUPERSTRUCTURE), }; return ret; } bool is_in_bin_list(const IndexType& index_type) { return is_in_list(index_type, BIN_List); } bool is_in_nm_list(const IndexType& index_type) { return is_in_list(index_type, NM_List); } bool is_unsupported(const IndexType& index_type, const MetricType& metric_type) { return is_in_list>( std::make_tuple(index_type, metric_type), unsupported_index_combinations); } bool CheckKeyInConfig(const Config& cfg, const std::string& key) { return cfg.contains(key); } void ParseFromString(google::protobuf::Message& params, const std::string& str) { auto ok = google::protobuf::TextFormat::ParseFromString(str, ¶ms); AssertInfo(ok, "failed to parse params from string"); } int64_t GetDimFromConfig(const Config& config) { auto dimension = GetValueFromConfig(config, "dim"); AssertInfo(dimension.has_value(), "dimension not exist in config"); try { return (std::stoi(dimension.value())); } catch (const std::logic_error& e) { auto err_message = fmt::format( "invalided dimension:{}, error:{}", dimension.value(), e.what()); LOG_ERROR(err_message); throw std::logic_error(err_message); } } std::string GetMetricTypeFromConfig(const Config& config) { auto metric_type = GetValueFromConfig(config, "metric_type"); AssertInfo(metric_type.has_value(), "metric_type not exist in config"); return metric_type.value(); } std::string GetIndexTypeFromConfig(const Config& config) { auto index_type = GetValueFromConfig(config, "index_type"); AssertInfo(index_type.has_value(), "index_type not exist in config"); return index_type.value(); } IndexVersion GetIndexEngineVersionFromConfig(const Config& config) { auto index_engine_version = GetValueFromConfig(config, INDEX_ENGINE_VERSION); AssertInfo(index_engine_version.has_value(), "index_engine not exist in config"); try { return (std::stoi(index_engine_version.value())); } catch (const std::logic_error& e) { auto err_message = fmt::format("invalided index engine version:{}, error:{}", index_engine_version.value(), e.what()); LOG_ERROR(err_message); throw std::logic_error(err_message); } } int32_t GetBitmapCardinalityLimitFromConfig(const Config& config) { auto bitmap_limit = GetValueFromConfig( config, index::BITMAP_INDEX_CARDINALITY_LIMIT); AssertInfo(bitmap_limit.has_value(), "bitmap cardinality limit not exist in config"); try { return (std::stoi(bitmap_limit.value())); } catch (const std::logic_error& e) { auto err_message = fmt::format("invalided bitmap limit:{}, error:{}", bitmap_limit.value(), e.what()); LOG_ERROR(err_message); throw std::logic_error(err_message); } } // TODO :: too ugly storage::FieldDataMeta GetFieldDataMetaFromConfig(const Config& config) { storage::FieldDataMeta field_data_meta; // set collection id auto collection_id = index::GetValueFromConfig(config, index::COLLECTION_ID); AssertInfo(collection_id.has_value(), "collection id not exist in index config"); field_data_meta.collection_id = std::stol(collection_id.value()); // set partition id auto partition_id = index::GetValueFromConfig(config, index::PARTITION_ID); AssertInfo(partition_id.has_value(), "partition id not exist in index config"); field_data_meta.partition_id = std::stol(partition_id.value()); // set segment id auto segment_id = index::GetValueFromConfig(config, index::SEGMENT_ID); AssertInfo(segment_id.has_value(), "segment id not exist in index config"); field_data_meta.segment_id = std::stol(segment_id.value()); // set field id auto field_id = index::GetValueFromConfig(config, index::FIELD_ID); AssertInfo(field_id.has_value(), "field id not exist in index config"); field_data_meta.field_id = std::stol(field_id.value()); return field_data_meta; } storage::IndexMeta GetIndexMetaFromConfig(const Config& config) { storage::IndexMeta index_meta; // set segment id auto segment_id = index::GetValueFromConfig(config, index::SEGMENT_ID); AssertInfo(segment_id.has_value(), "segment id not exist in index config"); index_meta.segment_id = std::stol(segment_id.value()); // set field id auto field_id = index::GetValueFromConfig(config, index::FIELD_ID); AssertInfo(field_id.has_value(), "field id not exist in index config"); index_meta.field_id = std::stol(field_id.value()); // set index version auto index_version = index::GetValueFromConfig(config, index::INDEX_VERSION); AssertInfo(index_version.has_value(), "index_version id not exist in index config"); index_meta.index_version = std::stol(index_version.value()); // set index id auto build_id = index::GetValueFromConfig(config, index::INDEX_BUILD_ID); AssertInfo(build_id.has_value(), "build id not exist in index config"); index_meta.build_id = std::stol(build_id.value()); return index_meta; } Config ParseConfigFromIndexParams( const std::map& index_params) { Config config; for (auto& p : index_params) { config[p.first] = p.second; } return config; } std::map CompactIndexDatas( std::map>& index_datas) { std::map index_file_slices; std::unordered_set compacted_files; if (index_datas.find(INDEX_FILE_SLICE_META) != index_datas.end()) { auto slice_meta = std::move(index_datas.at(INDEX_FILE_SLICE_META)); Config meta_data = Config::parse(std::string( reinterpret_cast(slice_meta->PayloadData()), slice_meta->PayloadSize())); compacted_files.insert(INDEX_FILE_SLICE_META); for (auto& item : meta_data[META]) { std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); auto data_len = 0; index_file_slices.insert({prefix, IndexDataCodec{}}); auto& index_data_codec = index_file_slices.at(prefix); for (auto i = 0; i < slice_num; ++i) { std::string file_name = GenSlicedFileName(prefix, i); AssertInfo(index_datas.find(file_name) != index_datas.end(), "lost index slice data"); index_data_codec.codecs_.push_back( std::move(index_datas.at(file_name))); compacted_files.insert(file_name); data_len += index_data_codec.codecs_.back()->PayloadSize(); } AssertInfo( total_len == data_len, "index len is inconsistent after disassemble and assemble"); if (index_datas.count(prefix) > 0) { index_data_codec.codecs_.push_back( std::move(index_datas[prefix])); compacted_files.insert(prefix); } index_data_codec.size_ = data_len; } } for (auto& index_data : index_datas) { if (compacted_files.find(index_data.first) == compacted_files.end()) { index_file_slices.insert({index_data.first, IndexDataCodec{}}); auto& index_data_codec = index_file_slices.at(index_data.first); index_data_codec.size_ = index_data.second->PayloadSize(); index_data_codec.codecs_.push_back(std::move(index_data.second)); } } return index_file_slices; } void AssembleIndexDatas( std::map>& index_datas, BinarySet& index_binary_set) { auto index_file_slices = CompactIndexDatas(index_datas); AssembleIndexDatas(index_file_slices, index_binary_set); } void AssembleIndexDatas(std::map& index_file_slices, BinarySet& index_binary_set) { for (auto& [key, index_slices] : index_file_slices) { auto index_size = index_slices.size_; auto buf = std::shared_ptr(new uint8_t[index_size]); int64_t offset = 0; for (auto&& index_slice : index_slices.codecs_) { std::memcpy(buf.get() + offset, index_slice->PayloadData(), index_slice->PayloadSize()); offset += index_slice->PayloadSize(); } index_binary_set.Append(key, buf, index_size); } } void AssembleIndexDatas(std::map& index_datas, std::unordered_map& result) { if (auto meta_iter = index_datas.find(INDEX_FILE_SLICE_META); meta_iter != index_datas.end()) { auto raw_metadata_array = storage::CollectFieldDataChannel(meta_iter->second); auto raw_metadata = storage::MergeFieldData(raw_metadata_array); result[INDEX_FILE_SLICE_META] = raw_metadata; index_datas.erase(INDEX_FILE_SLICE_META); Config metadata = Config::parse( std::string(static_cast(raw_metadata->Data()), raw_metadata->DataSize())); for (auto& item : metadata[META]) { std::string prefix = item[NAME]; int slice_num = item[SLICE_NUM]; auto total_len = static_cast(item[TOTAL_LEN]); // build index skip null value, so not need to set nullable == true auto new_field_data = storage::CreateFieldData(DataType::INT8, false, 1, total_len); for (auto i = 0; i < slice_num; ++i) { std::string file_name = GenSlicedFileName(prefix, i); auto it = index_datas.find(file_name); AssertInfo(it != index_datas.end(), "lost index slice data"); auto& channel = it->second; auto data_array = storage::CollectFieldDataChannel(channel); auto data = storage::MergeFieldData(data_array); auto len = data->DataSize(); new_field_data->FillFieldData(data->Data(), len); index_datas.erase(file_name); } AssertInfo( new_field_data->IsFull(), "index len is inconsistent after disassemble and assemble"); result[prefix] = new_field_data; } } for (auto& [key, channel] : index_datas) { if (key == INDEX_FILE_SLICE_META) { continue; } auto data_array = storage::CollectFieldDataChannel(channel); auto data = storage::MergeFieldData(data_array); result[key] = data; } } void ReadDataFromFD(int fd, void* buf, size_t size, size_t chunk_size) { lseek(fd, 0, SEEK_SET); while (size != 0) { const size_t count = (size < chunk_size) ? size : chunk_size; const ssize_t size_read = read(fd, buf, count); if (size_read != count) { ThrowInfo(ErrorCode::UnistdError, "read data from fd error, returned read size is " + std::to_string(size_read)); } buf = static_cast(buf) + size_read; size -= static_cast(size_read); } } bool CheckAndUpdateKnowhereRangeSearchParam(const SearchInfo& search_info, const int64_t topk, const MetricType& metric_type, knowhere::Json& search_config) { const auto radius = index::GetValueFromConfig(search_info.search_params_, RADIUS); if (!radius.has_value()) { return false; } search_config[RADIUS] = radius.value(); // `range_search_k` is only used as one of the conditions for iterator early termination. // not gurantee to return exactly `range_search_k` results, which may be more or less. // set it to -1 will return all results in the range. search_config[knowhere::meta::RANGE_SEARCH_K] = topk; const auto range_filter = GetValueFromConfig(search_info.search_params_, RANGE_FILTER); if (range_filter.has_value()) { search_config[RANGE_FILTER] = range_filter.value(); CheckRangeSearchParam( search_config[RADIUS], search_config[RANGE_FILTER], metric_type); } const auto page_retain_order = GetValueFromConfig(search_info.search_params_, PAGE_RETAIN_ORDER); if (page_retain_order.has_value()) { search_config[knowhere::meta::RETAIN_ITERATOR_ORDER] = page_retain_order.value(); } return true; } } // namespace milvus::index