From 310d5d70bce99d59adefa0b983ce4a5aaf619da9 Mon Sep 17 00:00:00 2001 From: op-hunter Date: Fri, 27 Mar 2020 09:52:31 +0800 Subject: [PATCH] Annoy support (#1746) * add annoy source code Signed-off-by: shengjun.li * add annoy knowhere Signed-off-by: shengjun.li * annoy local gtest passed Signed-off-by: lichengming * fix lint error and update changelog Signed-off-by: lichengming * fix compile error Signed-off-by: cmli * Update connect timeout in test cases Signed-off-by: zw * fix some potential bugs Signed-off-by: cmli * retry ci Signed-off-by: cmli * rerun ci! Signed-off-by: cmli * fix errors tested by c++ sdk Signed-off-by: cmli * fix lint error Signed-off-by: cmli Co-authored-by: shengjun.li Co-authored-by: lichengming Co-authored-by: zw --- CHANGELOG.md | 1 + NOTICE.md | 1 + core/src/db/DBImpl.cpp | 1 + core/src/db/engine/ExecutionEngine.h | 3 +- core/src/db/engine/ExecutionEngineImpl.cpp | 4 + core/src/index/knowhere/CMakeLists.txt | 1 + .../index/vector_index/ConfAdapter.cpp | 16 + .../knowhere/index/vector_index/ConfAdapter.h | 9 + .../index/vector_index/ConfAdapterMgr.cpp | 1 + .../index/vector_index/IndexAnnoy.cpp | 172 ++ .../knowhere/index/vector_index/IndexAnnoy.h | 74 + .../knowhere/index/vector_index/IndexType.cpp | 2 + .../knowhere/index/vector_index/IndexType.h | 2 + .../index/vector_index/VecIndexFactory.cpp | 3 + .../vector_index/helpers/IndexParameter.h | 4 + core/src/index/thirdparty/annoy/LICENSE | 202 +++ core/src/index/thirdparty/annoy/RELEASE.md | 15 + .../thirdparty/annoy/examples/mmap_test.py | 14 + .../annoy/examples/precision_test.cpp | 176 +++ .../annoy/examples/precision_test.py | 46 + .../annoy/examples/s_compile_cpp.sh | 7 + .../thirdparty/annoy/examples/simple_test.py | 10 + .../thirdparty/annoy/src/annoygomodule.h | 92 ++ .../thirdparty/annoy/src/annoygomodule.i | 96 ++ .../src/index/thirdparty/annoy/src/annoylib.h | 1377 +++++++++++++++++ .../thirdparty/annoy/src/annoyluamodule.cc | 318 ++++ 
.../index/thirdparty/annoy/src/annoymodule.cc | 632 ++++++++ .../index/thirdparty/annoy/src/kissrandom.h | 106 ++ core/src/index/thirdparty/annoy/src/mman.h | 238 +++ core/src/index/unittest/CMakeLists.txt | 11 + core/src/index/unittest/test_annoy.cpp | 221 +++ .../delivery/request/DeleteByIDRequest.cpp | 1 + tests/milvus_python_test/test_connect.py | 11 +- .../milvus_python_test/test_search_vectors.py | 8 +- 34 files changed, 3864 insertions(+), 11 deletions(-) create mode 100644 core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.cpp create mode 100644 core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.h create mode 100644 core/src/index/thirdparty/annoy/LICENSE create mode 100644 core/src/index/thirdparty/annoy/RELEASE.md create mode 100644 core/src/index/thirdparty/annoy/examples/mmap_test.py create mode 100644 core/src/index/thirdparty/annoy/examples/precision_test.cpp create mode 100644 core/src/index/thirdparty/annoy/examples/precision_test.py create mode 100755 core/src/index/thirdparty/annoy/examples/s_compile_cpp.sh create mode 100644 core/src/index/thirdparty/annoy/examples/simple_test.py create mode 100644 core/src/index/thirdparty/annoy/src/annoygomodule.h create mode 100644 core/src/index/thirdparty/annoy/src/annoygomodule.i create mode 100644 core/src/index/thirdparty/annoy/src/annoylib.h create mode 100644 core/src/index/thirdparty/annoy/src/annoyluamodule.cc create mode 100644 core/src/index/thirdparty/annoy/src/annoymodule.cc create mode 100644 core/src/index/thirdparty/annoy/src/kissrandom.h create mode 100644 core/src/index/thirdparty/annoy/src/mman.h create mode 100644 core/src/index/unittest/test_annoy.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 9832f15217..8e064e6788 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub - \#1756 Fix memory exhausted during searching ## Feature +- \#261 Integrate ANNOY into Milvus - \#1603 BinaryFlat add 2 
Metric: Substructure and Superstructure - \#1660 IVF PQ CPU support deleted vectors searching - \#1661 HNSW support deleted vectors searching diff --git a/NOTICE.md b/NOTICE.md index fb898419e7..b7cf8653e6 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -21,3 +21,4 @@ | aws-sdk-cpp | [Apache 2.0](https://github.com/aws/aws-sdk-cpp/blob/master/LICENSE) | | SPTAG | [MIT](https://github.com/microsoft/SPTAG/blob/master/LICENSE) | | hnswlib | [Apache 2.0](https://github.com/nmslib/hnswlib/blob/master/LICENSE) | +| annoy | [Apache 2.0](https://github.com/spotify/annoy/blob/master/LICENSE) | diff --git a/core/src/db/DBImpl.cpp b/core/src/db/DBImpl.cpp index cbb70b9466..fca45d369b 100644 --- a/core/src/db/DBImpl.cpp +++ b/core/src/db/DBImpl.cpp @@ -291,6 +291,7 @@ DBImpl::GetTableInfo(const std::string& table_id, TableInfo& table_info) { {(int32_t)engine::EngineType::FAISS_IVFFLAT, "IVFFLAT"}, {(int32_t)engine::EngineType::FAISS_IVFSQ8, "IVFSQ8"}, {(int32_t)engine::EngineType::NSG_MIX, "NSG"}, + {(int32_t)engine::EngineType::ANNOY, "ANNOY"}, {(int32_t)engine::EngineType::FAISS_IVFSQ8H, "IVFSQ8H"}, {(int32_t)engine::EngineType::FAISS_PQ, "PQ"}, {(int32_t)engine::EngineType::SPTAG_KDT, "KDT"}, diff --git a/core/src/db/engine/ExecutionEngine.h b/core/src/db/engine/ExecutionEngine.h index e7739d4d53..56f829960d 100644 --- a/core/src/db/engine/ExecutionEngine.h +++ b/core/src/db/engine/ExecutionEngine.h @@ -35,7 +35,8 @@ enum class EngineType { FAISS_BIN_IDMAP, FAISS_BIN_IVFFLAT, HNSW, - MAX_VALUE = HNSW, + ANNOY, + MAX_VALUE = ANNOY, }; enum class MetricType { diff --git a/core/src/db/engine/ExecutionEngineImpl.cpp b/core/src/db/engine/ExecutionEngineImpl.cpp index 3864fe514c..9fcd8863de 100644 --- a/core/src/db/engine/ExecutionEngineImpl.cpp +++ b/core/src/db/engine/ExecutionEngineImpl.cpp @@ -216,6 +216,10 @@ ExecutionEngineImpl::CreatetVecIndex(EngineType type) { index = vec_index_factory.CreateVecIndex(knowhere::IndexEnum::INDEX_HNSW, mode); break; } + case EngineType::ANNOY: { 
+ index = vec_index_factory.CreateVecIndex(knowhere::IndexEnum::INDEX_ANNOY, mode); + break; + } default: { ENGINE_LOG_ERROR << "Unsupported index type " << (int)type; return nullptr; diff --git a/core/src/index/knowhere/CMakeLists.txt b/core/src/index/knowhere/CMakeLists.txt index 02add8ef97..b6aff8a894 100644 --- a/core/src/index/knowhere/CMakeLists.txt +++ b/core/src/index/knowhere/CMakeLists.txt @@ -50,6 +50,7 @@ set(index_srcs knowhere/index/vector_index/IndexSPTAG.cpp knowhere/index/vector_index/IndexType.cpp knowhere/index/vector_index/VecIndexFactory.cpp + knowhere/index/vector_index/IndexAnnoy.cpp ) set(depend_libs diff --git a/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.cpp b/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.cpp index a2df3d1c78..0f69887715 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.cpp @@ -297,5 +297,21 @@ BinIVFConfAdapter::CheckTrain(Config& oricfg, const IndexMode mode) { return true; } +bool +ANNOYConfAdapter::CheckTrain(Config& oricfg, const IndexMode mode) { + static int64_t MIN_NTREES = 0; + // too large of n_trees takes much time, if there is real requirement, change this threshold. 
+ static int64_t MAX_NTREES = 16384; + + CheckIntByRange(knowhere::IndexParams::n_trees, MIN_NTREES, MAX_NTREES); + + return ConfAdapter::CheckTrain(oricfg, mode); +} + +bool +ANNOYConfAdapter::CheckSearch(Config& oricfg, const IndexType type, const IndexMode mode) { + return ConfAdapter::CheckSearch(oricfg, type, mode); +} + } // namespace knowhere } // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.h b/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.h index a46da52cbf..2ff6ebe3d3 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapter.h @@ -84,5 +84,14 @@ class HNSWConfAdapter : public ConfAdapter { CheckSearch(Config& oricfg, const IndexType type, const IndexMode mode) override; }; +class ANNOYConfAdapter : public ConfAdapter { + public: + bool + CheckTrain(Config& oricfg, const IndexMode mode) override; + + bool + CheckSearch(Config& oricfg, const IndexType type, const IndexMode mode) override; +}; + } // namespace knowhere } // namespace milvus diff --git a/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapterMgr.cpp b/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapterMgr.cpp index 91cd5d05be..2de15290e2 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapterMgr.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/ConfAdapterMgr.cpp @@ -46,6 +46,7 @@ AdapterMgr::RegisterAdapter() { REGISTER_CONF_ADAPTER(ConfAdapter, IndexEnum::INDEX_SPTAG_KDT_RNT, sptag_kdt_adapter); REGISTER_CONF_ADAPTER(ConfAdapter, IndexEnum::INDEX_SPTAG_BKT_RNT, sptag_bkt_adapter); REGISTER_CONF_ADAPTER(HNSWConfAdapter, IndexEnum::INDEX_HNSW, hnsw_adapter); + REGISTER_CONF_ADAPTER(ANNOYConfAdapter, IndexEnum::INDEX_ANNOY, annoy_adapter); } } // namespace knowhere diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.cpp 
b/core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.cpp
new file mode 100644
index 0000000000..4adb9f62c5
--- /dev/null
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.cpp
@@ -0,0 +1,222 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+#include "knowhere/index/vector_index/IndexAnnoy.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "hnswlib/hnswalg.h"
+#include "hnswlib/space_ip.h"
+#include "hnswlib/space_l2.h"
+#include "knowhere/common/Exception.h"
+#include "knowhere/common/Log.h"
+#include "knowhere/index/vector_index/adapter/VectorAdapter.h"
+#include "knowhere/index/vector_index/helpers/FaissIO.h"
+
+namespace milvus {
+namespace knowhere {
+
+namespace {
+
+// Copies `size` bytes from `src` into a new heap buffer owned by the returned pointer.
+std::shared_ptr<uint8_t>
+CopyToBinary(const void* src, size_t size) {
+    std::shared_ptr<uint8_t> buffer(new uint8_t[size], [](uint8_t* ptr) { delete[] ptr; });
+    if (size > 0) {
+        memcpy(buffer.get(), src, size);
+    }
+    return buffer;
+}
+
+}  // namespace
+
+// Serializes the index into three binaries: "annoy_metric_type", "annoy_dim" and "annoy_index_data".
+// Raises through KNOWHERE_THROW_MSG when no index has been built or loaded yet.
+BinarySet
+IndexAnnoy::Serialize(const Config& config) {
+    if (!index_) {
+        KNOWHERE_THROW_MSG("index not initialize or trained");
+    }
+
+    const auto metric_type_length = metric_type_.length();
+    auto metric_type = CopyToBinary(metric_type_.data(), metric_type_length);
+
+    // The dimension is stored as a raw 8-byte integer; Load() reads it back the same way.
+    const uint64_t dim = static_cast<uint64_t>(Dim());
+    auto dim_data = CopyToBinary(&dim, sizeof(dim));
+
+    const auto index_length = index_->get_index_length();
+    auto index_data = CopyToBinary(index_->get_index(), static_cast<size_t>(index_length));
+
+    BinarySet res_set;
+    res_set.Append("annoy_metric_type", metric_type, metric_type_length);
+    res_set.Append("annoy_dim", dim_data, sizeof(uint64_t));
+    res_set.Append("annoy_index_data", index_data, index_length);
+    return res_set;
+}
+
+// Restores the index from the three binaries written by Serialize().
+// Raises when a binary is missing or malformed, when the stored metric is neither L2 nor IP, or when loading fails.
+void
+IndexAnnoy::Load(const BinarySet& index_binary) {
+    auto metric_type = index_binary.GetByName("annoy_metric_type");
+    auto dim_data = index_binary.GetByName("annoy_dim");
+    auto index_data = index_binary.GetByName("annoy_index_data");
+    if (!metric_type || !dim_data || !index_data) {
+        KNOWHERE_THROW_MSG("annoy index binary is incomplete");
+    }
+
+    // Take exactly the stored bytes. Resizing to size + 1 leaves a trailing '\0' inside the string, which makes
+    // the comparisons against Metric::L2 / Metric::IP below fail for every serialized index.
+    const auto metric_type_length = static_cast<size_t>(metric_type->size);
+    metric_type_.resize(metric_type_length);
+    memcpy(metric_type_.data(), metric_type->data.get(), metric_type_length);
+
+    // Serialize() always writes the dimension as 8 bytes; any other size would overrun or under-fill `dim`.
+    if (static_cast<size_t>(dim_data->size) != sizeof(uint64_t)) {
+        KNOWHERE_THROW_MSG("annoy_dim binary has unexpected size");
+    }
+    uint64_t dim = 0;
+    memcpy(&dim, dim_data->data.get(), sizeof(dim));
+
+    if (metric_type_ == Metric::L2) {
+        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::Euclidean, ::Kiss64Random>>(dim);
+    } else if (metric_type_ == Metric::IP) {
+        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::DotProduct, ::Kiss64Random>>(dim);
+    } else {
+        KNOWHERE_THROW_MSG("metric not supported " + metric_type_);
+    }
+
+    char* error = nullptr;
+    if (!index_->load_index(index_data->data.get(), index_data->size, &error)) {
+        // The message is released with free(); fall back to a fixed text if none was provided.
+        std::string error_msg(error != nullptr ? error : "failed to load annoy index");
+        free(error);
+        index_ = nullptr;
+        KNOWHERE_THROW_MSG(error_msg);
+    }
+}
+
+// Builds the whole index in one pass from the vectors and ids carried by `dataset_ptr`.
+// Does nothing when an index is already present; raises when the configured metric is neither L2 nor IP.
+void
+IndexAnnoy::BuildAll(const DatasetPtr& dataset_ptr, const Config& config) {
+    if (index_) {
+        // an index already exists, keep it
+        return;
+    }
+
+    GETTENSORWITHIDS(dataset_ptr)
+
+    metric_type_ = config[Metric::TYPE];
+    if (metric_type_ == Metric::L2) {
+        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::Euclidean, ::Kiss64Random>>(dim);
+    } else if (metric_type_ == Metric::IP) {
+        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::DotProduct, ::Kiss64Random>>(dim);
+    } else {
+        KNOWHERE_THROW_MSG("metric not supported " + metric_type_);
+    }
+
+    for (int64_t i = 0; i < rows; ++i) {
+        index_->add_item(p_ids[i], reinterpret_cast<const float*>(p_data) + dim * i);
+    }
+
+    index_->build(config[IndexParams::n_trees].get<int64_t>());
+}
+
+// Searches the top-k neighbours of every query vector in `dataset_ptr`.
+// Returns a dataset whose meta::IDS and meta::DISTANCE entries are malloc'ed arrays of rows * k elements.
+// Rows for which fewer than k neighbours are found are padded with id -1 and an infinite distance.
+DatasetPtr
+IndexAnnoy::Query(const DatasetPtr& dataset_ptr, const Config& config) {
+    if (!index_) {
+        KNOWHERE_THROW_MSG("index not initialize or trained");
+    }
+
+    GETTENSOR(dataset_ptr)
+    const auto k = config[meta::TOPK].get<int64_t>();
+    const auto search_k = config[IndexParams::search_k].get<int64_t>();
+    const auto all_num = rows * k;
+    auto p_id = static_cast<int64_t*>(malloc(all_num * sizeof(int64_t)));
+    auto p_dist = static_cast<float*>(malloc(all_num * sizeof(float)));
+    if (all_num > 0 && (p_id == nullptr || p_dist == nullptr)) {
+        free(p_id);
+        free(p_dist);
+        KNOWHERE_THROW_MSG("failed to allocate annoy query result buffers");
+    }
+    faiss::ConcurrentBitsetPtr blacklist = nullptr;
+    GetBlacklist(blacklist);
+
+#pragma omp parallel for
+    for (int64_t i = 0; i < rows; ++i) {
+        std::vector<int64_t> result;
+        result.reserve(k);
+        std::vector<float> distances;
+        distances.reserve(k);
+        index_->get_nns_by_vector(reinterpret_cast<const float*>(p_data) + i * dim, k, search_k, &result, &distances,
+                                  blacklist);
+
+        // The search can yield fewer than k hits. Copy only what was actually returned and pad the remainder,
+        // instead of reading k elements out of vectors that may be shorter.
+        const int64_t found = std::min<int64_t>(k, std::min(result.size(), distances.size()));
+        int64_t* row_ids = p_id + k * i;
+        float* row_dists = p_dist + k * i;
+        std::copy_n(result.begin(), found, row_ids);
+        std::copy_n(distances.begin(), found, row_dists);
+        std::fill(row_ids + found, row_ids + k, static_cast<int64_t>(-1));
+        std::fill(row_dists + found, row_dists + k, std::numeric_limits<float>::infinity());
+    }
+
+    auto ret_ds = std::make_shared<Dataset>();
+    ret_ds->Set(meta::IDS, p_id);
+    ret_ds->Set(meta::DISTANCE, p_dist);
+    return ret_ds;
+}
+
+// Returns the number of items held by the index; raises when no index exists.
+int64_t
+IndexAnnoy::Count() {
+    if (!index_) {
+        KNOWHERE_THROW_MSG("index not initialize");
+    }
+
+    return index_->get_n_items();
+}
+
+// Returns the vector dimension of the index; raises when no index exists.
+int64_t
+IndexAnnoy::Dim() {
+    if (!index_) {
+        KNOWHERE_THROW_MSG("index not initialize");
+    }
+
+    return index_->get_dim();
+}
+
+// Returns index_size_ when it is already set (anything but -1); otherwise stores and returns
+// Dim() * Count() * sizeof(float), i.e. the size of the raw vectors.
+int64_t
+IndexAnnoy::IndexSize() {
+    if (index_size_ != -1) {
+        return index_size_;
+    }
+
+    return index_size_ = Dim() * Count() * sizeof(float);
+}
+}  // namespace knowhere
+}  // namespace milvus
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.h
new file mode 100644
index 0000000000..cbef69c5a4
--- /dev/null
+++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexAnnoy.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "annoy/src/annoylib.h"
+#include "annoy/src/kissrandom.h"
+
+#include "knowhere/common/Exception.h"
+#include "knowhere/index/vector_index/VecIndex.h"
+
+namespace milvus {
+namespace knowhere {
+
+// VecIndex implementation backed by the annoy library.
+// The index has to be built in one shot with BuildAll(); Train(), Add() and AddWithoutIds() only raise an error.
+class IndexAnnoy : public VecIndex {
+ public:
+    IndexAnnoy() {
+        index_type_ = IndexEnum::INDEX_ANNOY;
+    }
+
+    BinarySet
+    Serialize(const Config& config = Config()) override;
+
+    void
+    Load(const BinarySet& index_binary) override;
+
+    void
+    BuildAll(const DatasetPtr& dataset_ptr, const Config& config) override;
+
+    // Not supported: always raises, use BuildAll() instead.
+    void
+    Train(const DatasetPtr& dataset_ptr, const Config& config) override {
+        KNOWHERE_THROW_MSG("Annoy not support build item dynamically, please invoke BuildAll interface.");
+    }
+
+    // Not supported: always raises, use BuildAll() instead.
+    void
+    Add(const DatasetPtr& dataset_ptr, const Config& config) override {
+        KNOWHERE_THROW_MSG("Annoy not support add item dynamically, please invoke BuildAll interface.");
+    }
+
+    // Not supported: always raises.
+    void
+    AddWithoutIds(const DatasetPtr&, const Config&) override {
+        KNOWHERE_THROW_MSG("Incremental index is not supported");
+    }
+
+    DatasetPtr
+    Query(const DatasetPtr& dataset_ptr, const Config& config) override;
+
+    int64_t
+    Count() override;
+
+    int64_t
+    Dim() override;
+
+    int64_t
+    IndexSize() override;
+
+ private:
+    // Metric the index was built or loaded with; compared against Metric::L2 and Metric::IP.
+    MetricType metric_type_;
+    // Underlying annoy index; null until BuildAll() or Load() has created it.
+    std::shared_ptr<AnnoyIndexInterface<int64_t, float>> index_ = nullptr;
+};
+
+}  // namespace knowhere
+}  // namespace milvus
diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexType.cpp b/core/src/index/knowhere/knowhere/index/vector_index/IndexType.cpp
index 27ec3444d1..bb7aec4ed2 100644
---
a/core/src/index/knowhere/knowhere/index/vector_index/IndexType.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexType.cpp @@ -34,6 +34,7 @@ static std::unordered_map old_index_type_str_map = { {(int32_t)OldIndexType::SPTAG_KDT_RNT_CPU, IndexEnum::INDEX_SPTAG_KDT_RNT}, {(int32_t)OldIndexType::SPTAG_BKT_RNT_CPU, IndexEnum::INDEX_SPTAG_BKT_RNT}, {(int32_t)OldIndexType::HNSW, IndexEnum::INDEX_HNSW}, + {(int32_t)OldIndexType::ANNOY, IndexEnum::INDEX_ANNOY}, {(int32_t)OldIndexType::FAISS_BIN_IDMAP, IndexEnum::INDEX_FAISS_BIN_IDMAP}, {(int32_t)OldIndexType::FAISS_BIN_IVFLAT_CPU, IndexEnum::INDEX_FAISS_BIN_IVFFLAT}, }; @@ -49,6 +50,7 @@ static std::unordered_map str_old_index_type_map = { {IndexEnum::INDEX_SPTAG_KDT_RNT, (int32_t)OldIndexType::SPTAG_KDT_RNT_CPU}, {IndexEnum::INDEX_SPTAG_BKT_RNT, (int32_t)OldIndexType::SPTAG_BKT_RNT_CPU}, {IndexEnum::INDEX_HNSW, (int32_t)OldIndexType::HNSW}, + {IndexEnum::INDEX_ANNOY, (int32_t)OldIndexType::ANNOY}, {IndexEnum::INDEX_FAISS_BIN_IDMAP, (int32_t)OldIndexType::FAISS_BIN_IDMAP}, {IndexEnum::INDEX_FAISS_BIN_IVFFLAT, (int32_t)OldIndexType::FAISS_BIN_IVFLAT_CPU}, }; diff --git a/core/src/index/knowhere/knowhere/index/vector_index/IndexType.h b/core/src/index/knowhere/knowhere/index/vector_index/IndexType.h index 1f2b26866d..fcb1dbd430 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/IndexType.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/IndexType.h @@ -34,6 +34,7 @@ enum class OldIndexType { FAISS_IVFPQ_MIX, SPTAG_BKT_RNT_CPU, HNSW, + ANNOY, FAISS_BIN_IDMAP = 100, FAISS_BIN_IVFLAT_CPU = 101, }; @@ -54,6 +55,7 @@ constexpr const char* INDEX_NSG = "NSG"; constexpr const char* INDEX_SPTAG_KDT_RNT = "SPTAG_KDT_RNT"; constexpr const char* INDEX_SPTAG_BKT_RNT = "SPTAG_BKT_RNT"; constexpr const char* INDEX_HNSW = "HNSW"; +constexpr const char* INDEX_ANNOY = "ANNOY"; } // namespace IndexEnum enum class IndexMode { MODE_CPU = 0, MODE_GPU = 1 }; diff --git 
a/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp b/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp index 5a512870ed..8e3119ecac 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/VecIndexFactory.cpp @@ -13,6 +13,7 @@ #include "knowhere/common/Exception.h" #include "knowhere/common/Log.h" +#include "knowhere/index/vector_index/IndexAnnoy.h" #include "knowhere/index/vector_index/IndexBinaryIDMAP.h" #include "knowhere/index/vector_index/IndexBinaryIVF.h" #include "knowhere/index/vector_index/IndexHNSW.h" @@ -78,6 +79,8 @@ VecIndexFactory::CreateVecIndex(const IndexType& type, const IndexMode mode) { return std::make_shared("BKT"); } else if (type == IndexEnum::INDEX_HNSW) { return std::make_shared(); + } else if (type == IndexEnum::INDEX_ANNOY) { + return std::make_shared(); } else { return nullptr; } diff --git a/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h b/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h index b9f6b361fe..b37988d881 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h +++ b/core/src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h @@ -44,6 +44,10 @@ constexpr const char* candidate = "candidate_pool_size"; constexpr const char* efConstruction = "efConstruction"; constexpr const char* M = "M"; constexpr const char* ef = "ef"; + +// Annoy Params +constexpr const char* n_trees = "n_trees"; +constexpr const char* search_k = "search_k"; } // namespace IndexParams namespace Metric { diff --git a/core/src/index/thirdparty/annoy/LICENSE b/core/src/index/thirdparty/annoy/LICENSE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/core/src/index/thirdparty/annoy/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND 
CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/core/src/index/thirdparty/annoy/RELEASE.md b/core/src/index/thirdparty/annoy/RELEASE.md new file mode 100644 index 0000000000..c3a1147ce9 --- /dev/null +++ b/core/src/index/thirdparty/annoy/RELEASE.md @@ -0,0 +1,15 @@ +How to release +-------------- + +1. Make sure you're on master. `git checkout master && git fetch && git reset --hard origin/master` +1. Update `setup.py` to the newest version, `git add setup.py && git commit -m "version 1.2.3"` +1. `python setup.py sdist bdist_wheel` +1. `git tag -a v1.2.3 -m "version 1.2.3"` +1. `git push --tags origin master` to push the last version to Github +1. Go to https://github.com/spotify/annoy/releases and click "Draft a new release" +1. 
`twine upload dist/annoy-1.2.3*` + +TODO +---- + +* Wheel diff --git a/core/src/index/thirdparty/annoy/examples/mmap_test.py b/core/src/index/thirdparty/annoy/examples/mmap_test.py new file mode 100644 index 0000000000..4f86e86713 --- /dev/null +++ b/core/src/index/thirdparty/annoy/examples/mmap_test.py @@ -0,0 +1,14 @@ +from annoy import AnnoyIndex + +a = AnnoyIndex(3, 'angular') +a.add_item(0, [1, 0, 0]) +a.add_item(1, [0, 1, 0]) +a.add_item(2, [0, 0, 1]) +a.build(-1) +a.save('test.tree') + +b = AnnoyIndex(3) +b.load('test.tree') + +print(b.get_nns_by_item(0, 100)) +print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100)) diff --git a/core/src/index/thirdparty/annoy/examples/precision_test.cpp b/core/src/index/thirdparty/annoy/examples/precision_test.cpp new file mode 100644 index 0000000000..2c006487c9 --- /dev/null +++ b/core/src/index/thirdparty/annoy/examples/precision_test.cpp @@ -0,0 +1,176 @@ +/* + * precision_test.cpp + + * + * Created on: Jul 13, 2016 + * Author: Claudio Sanhueza + * Contact: csanhuezalobos@gmail.com + */ + +#include +#include +#include "../src/kissrandom.h" +#include "../src/annoylib.h" +#include +#include +#include +#include + + +int precision(int f=40, int n=1000000){ + std::chrono::high_resolution_clock::time_point t_start, t_end; + + std::default_random_engine generator; + std::normal_distribution distribution(0.0, 1.0); + + //****************************************************** + //Building the tree + AnnoyIndex t = AnnoyIndex(f); + + std::cout << "Building index ... be patient !!" << std::endl; + std::cout << "\"Trees that are slow to grow bear the best fruit\" (Moliere)" << std::endl; + + + + for(int i=0; i( t_end - t_start ).count(); + std::cout << " Done in "<< duration << " secs." 
<< std::endl; + + + std::cout << "Saving index ..."; + t.save("precision.tree"); + std::cout << " Done" << std::endl; + + + + //****************************************************** + std::vector limits = {10, 100, 1000, 10000}; + int K=10; + int prec_n = 1000; + + std::map prec_sum; + std::map time_sum; + std::vector closest; + + //init precision and timers map + for(std::vector::iterator it = limits.begin(); it!=limits.end(); ++it){ + prec_sum[(*it)] = 0.0; + time_sum[(*it)] = 0.0; + } + + // doing the work + for(int i=0; i toplist; + std::vector intersection; + + for(std::vector::iterator limit = limits.begin(); limit!=limits.end(); ++limit){ + + t_start = std::chrono::high_resolution_clock::now(); + t.get_nns_by_item(j, (*limit), (size_t) -1, &toplist, nullptr); //search_k defaults to "n_trees * n" if not provided. + t_end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast( t_end - t_start ).count(); + + //intersecting results + std::sort(closest.begin(), closest.end(), std::less()); + std::sort(toplist.begin(), toplist.end(), std::less()); + intersection.resize(std::max(closest.size(), toplist.size())); + std::vector::iterator it_set = std::set_intersection(closest.begin(), closest.end(), toplist.begin(), toplist.end(), intersection.begin()); + intersection.resize(it_set-intersection.begin()); + + // storing metrics + int found = intersection.size(); + double hitrate = found / (double) K; + prec_sum[(*limit)] += hitrate; + + time_sum[(*limit)] += duration; + + + //deallocate memory + vector().swap(intersection); + vector().swap(toplist); + } + + //print resulting metrics + for(std::vector::iterator limit = limits.begin(); limit!=limits.end(); ++limit){ + std::cout << "limit: " << (*limit) << "\tprecision: "<< std::fixed << std::setprecision(2) << (100.0 * prec_sum[(*limit)] / (i + 1)) << "% \tavg. 
time: "<< std::fixed<< std::setprecision(6) << (time_sum[(*limit)] / (i + 1)) * 1e-04 << "s" << std::endl; + } + + closest.clear(); vector().swap(closest); + + } + + std::cout << "\nDone" << std::endl; + return 0; +} + + +void help(){ + std::cout << "Annoy Precision C++ example" << std::endl; + std::cout << "Usage:" << std::endl; + std::cout << "(default) ./precision" << std::endl; + std::cout << "(using parameters) ./precision num_features num_nodes" << std::endl; + std::cout << std::endl; +} + +void feedback(int f, int n){ + std::cout<<"Runing precision example with:" << std::endl; + std::cout<<"num. features: "<< f << std::endl; + std::cout<<"num. nodes: "<< n << std::endl; + std::cout << std::endl; +} + + +int main(int argc, char **argv) { + int f, n; + + + if(argc == 1){ + f = 40; + n = 1000000; + + feedback(f,n); + + precision(40, 1000000); + } + else if(argc == 3){ + + f = atoi(argv[1]); + n = atoi(argv[2]); + + feedback(f,n); + + precision(f, n); + } + else { + help(); + return EXIT_FAILURE; + } + + + return EXIT_SUCCESS; +} diff --git a/core/src/index/thirdparty/annoy/examples/precision_test.py b/core/src/index/thirdparty/annoy/examples/precision_test.py new file mode 100644 index 0000000000..d179e6b9ba --- /dev/null +++ b/core/src/index/thirdparty/annoy/examples/precision_test.py @@ -0,0 +1,46 @@ +from __future__ import print_function +import random, time +from annoy import AnnoyIndex + +try: + xrange +except NameError: + # Python 3 compat + xrange = range + +n, f = 100000, 40 + +t = AnnoyIndex(f, 'angular') +for i in xrange(n): + v = [] + for z in xrange(f): + v.append(random.gauss(0, 1)) + t.add_item(i, v) + +t.build(2 * f) +t.save('test.tree') + +limits = [10, 100, 1000, 10000] +k = 10 +prec_sum = {} +prec_n = 1000 +time_sum = {} + +for i in xrange(prec_n): + j = random.randrange(0, n) + + closest = set(t.get_nns_by_item(j, k, n)) + for limit in limits: + t0 = time.time() + toplist = t.get_nns_by_item(j, k, limit) + T = time.time() - t0 + + found = 
len(closest.intersection(toplist)) + hitrate = 1.0 * found / k + prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate + time_sum[limit] = time_sum.get(limit, 0.0) + T + +for limit in limits: + print('limit: %-9d precision: %6.2f%% avg time: %.6fs' + % (limit, 100.0 * prec_sum[limit] / (i + 1), + time_sum[limit] / (i + 1))) diff --git a/core/src/index/thirdparty/annoy/examples/s_compile_cpp.sh b/core/src/index/thirdparty/annoy/examples/s_compile_cpp.sh new file mode 100755 index 0000000000..687a6082b2 --- /dev/null +++ b/core/src/index/thirdparty/annoy/examples/s_compile_cpp.sh @@ -0,0 +1,7 @@ +#!/bin/bash + + +echo "compiling precision example..." +cmd="g++ precision_test.cpp -o precision_test -std=c++11" +eval $cmd +echo "Done" diff --git a/core/src/index/thirdparty/annoy/examples/simple_test.py b/core/src/index/thirdparty/annoy/examples/simple_test.py new file mode 100644 index 0000000000..27e0343a26 --- /dev/null +++ b/core/src/index/thirdparty/annoy/examples/simple_test.py @@ -0,0 +1,10 @@ +from annoy import AnnoyIndex + +a = AnnoyIndex(3, 'angular') +a.add_item(0, [1, 0, 0]) +a.add_item(1, [0, 1, 0]) +a.add_item(2, [0, 0, 1]) +a.build(-1) + +print(a.get_nns_by_item(0, 100)) +print(a.get_nns_by_vector([1.0, 0.5, 0.5], 100)) diff --git a/core/src/index/thirdparty/annoy/src/annoygomodule.h b/core/src/index/thirdparty/annoy/src/annoygomodule.h new file mode 100644 index 0000000000..005ed06558 --- /dev/null +++ b/core/src/index/thirdparty/annoy/src/annoygomodule.h @@ -0,0 +1,92 @@ +#include "annoylib.h" +#include "kissrandom.h" + +namespace GoAnnoy { + +class AnnoyIndex { + protected: + ::AnnoyIndexInterface *ptr; + + int f; + + public: + ~AnnoyIndex() { + delete ptr; + }; + void addItem(int item, const float* w) { + ptr->add_item(item, w); + }; + void build(int q) { + ptr->build(q); + }; + bool save(const char* filename, bool prefault) { + return ptr->save(filename, prefault); + }; + bool save(const char* filename) { + return ptr->save(filename, true); + }; + void 
unload() { + ptr->unload(); + }; + bool load(const char* filename, bool prefault) { + return ptr->load(filename, prefault); + }; + bool load(const char* filename) { + return ptr->load(filename, true); + }; + float getDistance(int i, int j) { + return ptr->get_distance(i, j); + }; + void getNnsByItem(int item, int n, int search_k, vector* result, vector* distances) { + ptr->get_nns_by_item(item, n, search_k, result, distances); + }; + void getNnsByVector(const float* w, int n, int search_k, vector* result, vector* distances) { + ptr->get_nns_by_vector(w, n, search_k, result, distances); + }; + void getNnsByItem(int item, int n, int search_k, vector* result) { + ptr->get_nns_by_item(item, n, search_k, result, NULL); + }; + void getNnsByVector(const float* w, int n, int search_k, vector* result) { + ptr->get_nns_by_vector(w, n, search_k, result, NULL); + }; + + int getNItems() { + return (int)ptr->get_n_items(); + }; + void verbose(bool v) { + ptr->verbose(v); + }; + void getItem(int item, vector *v) { + v->resize(this->f); + ptr->get_item(item, &v->front()); + }; + bool onDiskBuild(const char* filename) { + return ptr->on_disk_build(filename); + }; +}; + +class AnnoyIndexAngular : public AnnoyIndex +{ + public: + AnnoyIndexAngular(int f) { + ptr = new ::AnnoyIndex(f); + this->f = f; + } +}; + +class AnnoyIndexEuclidean : public AnnoyIndex { + public: + AnnoyIndexEuclidean(int f) { + ptr = new ::AnnoyIndex(f); + this->f = f; + } +}; + +class AnnoyIndexManhattan : public AnnoyIndex { + public: + AnnoyIndexManhattan(int f) { + ptr = new ::AnnoyIndex(f); + this->f = f; + } +}; +} diff --git a/core/src/index/thirdparty/annoy/src/annoygomodule.i b/core/src/index/thirdparty/annoy/src/annoygomodule.i new file mode 100644 index 0000000000..9882cbeb2c --- /dev/null +++ b/core/src/index/thirdparty/annoy/src/annoygomodule.i @@ -0,0 +1,96 @@ +%module annoyindex + +%{ +#include "annoygomodule.h" +%} + + +// const float * +%typemap(gotype) (const float *) "[]float32" + 
+%typemap(in) (const float *) +%{ + float *v; + vector w; + v = (float *)$input.array; + for (int i = 0; i < $input.len; i++) { + w.push_back(v[i]); + } + $1 = &w[0]; +%} + +// vector * +%typemap(gotype) (vector *) "*[]int" + +%typemap(in) (vector *) +%{ + $1 = new vector(); +%} + +%typemap(freearg) (vector *) +%{ + delete $1; +%} + +%typemap(argout) (vector *) +%{ + { + $input->len = $1->size(); + $input->cap = $1->size(); + $input->array = malloc($input->len * sizeof(intgo)); + for (int i = 0; i < $1->size(); i++) { + ((intgo *)$input->array)[i] = (intgo)(*$1)[i]; + } + } +%} + + +// vector * +%typemap(gotype) (vector *) "*[]float32" + +%typemap(in) (vector *) +%{ + $1 = new vector(); +%} + +%typemap(freearg) (vector *) +%{ + delete $1; +%} + +%typemap(argout) (vector *) +%{ + { + $input->len = $1->size(); + $input->cap = $1->size(); + $input->array = malloc($input->len * sizeof(float)); + for (int i = 0; i < $1->size(); i++) { + ((float *)$input->array)[i] = (float)(*$1)[i]; + } + } +%} + + +%typemap(gotype) (const char *) "string" + +%typemap(in) (const char *) +%{ + $1 = (char *)calloc((((_gostring_)$input).n + 1), sizeof(char)); + strncpy($1, (((_gostring_)$input).p), ((_gostring_)$input).n); +%} + +%typemap(freearg) (const char *) +%{ + free($1); +%} + + +/* Let's just grab the original header file here */ +%include "annoygomodule.h" + +%feature("notabstract") GoAnnoyIndexAngular; +%feature("notabstract") GoAnnoyIndexEuclidean; +%feature("notabstract") GoAnnoyIndexManhattan; + + + diff --git a/core/src/index/thirdparty/annoy/src/annoylib.h b/core/src/index/thirdparty/annoy/src/annoylib.h new file mode 100644 index 0000000000..eebfa78d62 --- /dev/null +++ b/core/src/index/thirdparty/annoy/src/annoylib.h @@ -0,0 +1,1377 @@ +// Copyright (c) 2013 Spotify AB +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. 
You may obtain a copy of +// the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + + +#ifndef ANNOYLIB_H +#define ANNOYLIB_H + +#include +#include +#ifndef _MSC_VER +#include +#endif +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) && _MSC_VER == 1500 +typedef unsigned char uint8_t; +typedef signed __int32 int32_t; +typedef unsigned __int64 uint64_t; +typedef signed __int64 int64_t; +#else +#include +#endif + +#if defined(_MSC_VER) || defined(__MINGW32__) + // a bit hacky, but override some definitions to support 64 bit + #define off_t int64_t + #define lseek_getsize(fd) _lseeki64(fd, 0, SEEK_END) + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include "mman.h" + #include +#else + #include + #define lseek_getsize(fd) lseek(fd, 0, SEEK_END) +#endif + +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +// Needed for Visual Studio to disable runtime checks for mempcy +#pragma runtime_checks("s", off) +#endif + +// This allows others to supply their own logger / error printer without +// requiring Annoy to import their headers. See RcppAnnoy for a use case. +#ifndef __ERROR_PRINTER_OVERRIDE__ + #define showUpdate(...) { fprintf(stderr, __VA_ARGS__ ); } +#else + #define showUpdate(...) 
{ __ERROR_PRINTER_OVERRIDE__( __VA_ARGS__ ); } +#endif + +// Portable alloc definition, cf Writing R Extensions, Section 1.6.4 +#ifdef __GNUC__ + // Includes GCC, clang and Intel compilers + # undef alloca + # define alloca(x) __builtin_alloca((x)) +#elif defined(__sun) || defined(_AIX) + // this is necessary (and sufficient) for Solaris 10 and AIX 6: + # include +#endif + +inline void set_error_from_errno(char **error, const char* msg) { + showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno); + if (error) { + *error = (char *)malloc(256); // TODO: win doesn't support snprintf + sprintf(*error, "%s: %s (%d)", msg, strerror(errno), errno); + } +} + +inline void set_error_from_string(char **error, const char* msg) { + showUpdate("%s\n", msg); + if (error) { + *error = (char *)malloc(strlen(msg) + 1); + strcpy(*error, msg); + } +} + +// We let the v array in the Node struct take whatever space is needed, so this is a mostly insignificant number. +// Compilers need *some* size defined for the v array, and some memory checking tools will flag for buffer overruns if this is set too low. 
+#define V_ARRAY_SIZE 65536 + +#ifndef _MSC_VER +#define popcount __builtin_popcountll +#else // See #293, #358 +#define isnan(x) _isnan(x) +#define popcount cole_popcount +#endif + +#if !defined(NO_MANUAL_VECTORIZATION) && defined(__GNUC__) && (__GNUC__ >6) && defined(__AVX512F__) // See #402 +#define USE_AVX512 +#elif !defined(NO_MANUAL_VECTORIZATION) && defined(__AVX__) && defined (__SSE__) && defined(__SSE2__) && defined(__SSE3__) +#define USE_AVX +#else +#endif + +#if defined(USE_AVX) || defined(USE_AVX512) +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) +#include +#include + +#endif +#endif + + +using std::vector; +using std::pair; +using std::numeric_limits; +using std::make_pair; + +inline void* remap_memory(void* _ptr, int _fd, size_t old_size, size_t new_size) { +#ifdef __linux__ + _ptr = mremap(_ptr, old_size, new_size, MREMAP_MAYMOVE); +#else + munmap(_ptr, old_size); +#ifdef MAP_POPULATE + _ptr = mmap(_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0); +#else + _ptr = mmap(_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0); +#endif +#endif + return _ptr; +} + +namespace { + +template +inline Node* get_node_ptr(const void* _nodes, const size_t _s, const S i) { + return (Node*)((uint8_t *)_nodes + (_s * i)); +} + +template +inline T dot(const T* x, const T* y, int f) { + T s = 0; + for (int z = 0; z < f; z++) { + s += (*x) * (*y); + x++; + y++; + } + return s; +} + +template +inline T manhattan_distance(const T* x, const T* y, int f) { + T d = 0.0; + for (int i = 0; i < f; i++) + d += fabs(x[i] - y[i]); + return d; +} + +template +inline T euclidean_distance(const T* x, const T* y, int f) { + // Don't use dot-product: avoid catastrophic cancellation in #314. + T d = 0.0; + for (int i = 0; i < f; ++i) { + const T tmp=*x - *y; + d += tmp * tmp; + ++x; + ++y; + } + return d; +} + +#ifdef USE_AVX +// Horizontal single sum of 256bit vector. 
+inline float hsum256_ps_avx(__m256 v) { + const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v)); + const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); + const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); + return _mm_cvtss_f32(x32); +} + +template<> +inline float dot(const float* x, const float *y, int f) { + float result = 0; + if (f > 7) { + __m256 d = _mm256_setzero_ps(); + for (; f > 7; f -= 8) { + d = _mm256_add_ps(d, _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y))); + x += 8; + y += 8; + } + // Sum all floats in dot register. + result += hsum256_ps_avx(d); + } + // Don't forget the remaining values. + for (; f > 0; f--) { + result += *x * *y; + x++; + y++; + } + return result; +} + +template<> +inline float manhattan_distance(const float* x, const float* y, int f) { + float result = 0; + int i = f; + if (f > 7) { + __m256 manhattan = _mm256_setzero_ps(); + __m256 minus_zero = _mm256_set1_ps(-0.0f); + for (; i > 7; i -= 8) { + const __m256 x_minus_y = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)); + const __m256 distance = _mm256_andnot_ps(minus_zero, x_minus_y); // Absolute value of x_minus_y (forces sign bit to zero) + manhattan = _mm256_add_ps(manhattan, distance); + x += 8; + y += 8; + } + // Sum all floats in manhattan register. + result = hsum256_ps_avx(manhattan); + } + // Don't forget the remaining values. + for (; i > 0; i--) { + result += fabsf(*x - *y); + x++; + y++; + } + return result; +} + +template<> +inline float euclidean_distance(const float* x, const float* y, int f) { + float result=0; + if (f > 7) { + __m256 d = _mm256_setzero_ps(); + for (; f > 7; f -= 8) { + const __m256 diff = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)); + d = _mm256_add_ps(d, _mm256_mul_ps(diff, diff)); // no support for fmadd in AVX... + x += 8; + y += 8; + } + // Sum all floats in dot register. + result = hsum256_ps_avx(d); + } + // Don't forget the remaining values. 
+ for (; f > 0; f--) { + float tmp = *x - *y; + result += tmp * tmp; + x++; + y++; + } + return result; +} + +#endif + +#ifdef USE_AVX512 +template<> +inline float dot(const float* x, const float *y, int f) { + float result = 0; + if (f > 15) { + __m512 d = _mm512_setzero_ps(); + for (; f > 15; f -= 16) { + //AVX512F includes FMA + d = _mm512_fmadd_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y), d); + x += 16; + y += 16; + } + // Sum all floats in dot register. + result += _mm512_reduce_add_ps(d); + } + // Don't forget the remaining values. + for (; f > 0; f--) { + result += *x * *y; + x++; + y++; + } + return result; +} + +template<> +inline float manhattan_distance(const float* x, const float* y, int f) { + float result = 0; + int i = f; + if (f > 15) { + __m512 manhattan = _mm512_setzero_ps(); + for (; i > 15; i -= 16) { + const __m512 x_minus_y = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y)); + manhattan = _mm512_add_ps(manhattan, _mm512_abs_ps(x_minus_y)); + x += 16; + y += 16; + } + // Sum all floats in manhattan register. + result = _mm512_reduce_add_ps(manhattan); + } + // Don't forget the remaining values. + for (; i > 0; i--) { + result += fabsf(*x - *y); + x++; + y++; + } + return result; +} + +template<> +inline float euclidean_distance(const float* x, const float* y, int f) { + float result=0; + if (f > 15) { + __m512 d = _mm512_setzero_ps(); + for (; f > 15; f -= 16) { + const __m512 diff = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y)); + d = _mm512_fmadd_ps(diff, diff, d); + x += 16; + y += 16; + } + // Sum all floats in dot register. + result = _mm512_reduce_add_ps(d); + } + // Don't forget the remaining values. 
+ for (; f > 0; f--) { + float tmp = *x - *y; + result += tmp * tmp; + x++; + y++; + } + return result; +} + +#endif + + +template +inline T get_norm(T* v, int f) { + return sqrt(dot(v, v, f)); +} + +template +inline void two_means(const vector& nodes, int f, Random& random, bool cosine, Node* p, Node* q) { + /* + This algorithm is a huge heuristic. Empirically it works really well, but I + can't motivate it well. The basic idea is to keep two centroids and assign + points to either one of them. We weight each centroid by the number of points + assigned to it, so to balance it. + */ + static int iteration_steps = 200; + size_t count = nodes.size(); + + size_t i = random.index(count); + size_t j = random.index(count-1); + j += (j >= i); // ensure that i != j + + Distance::template copy_node(p, nodes[i], f); + Distance::template copy_node(q, nodes[j], f); + + if (cosine) { Distance::template normalize(p, f); Distance::template normalize(q, f); } + Distance::init_node(p, f); + Distance::init_node(q, f); + + int ic = 1, jc = 1; + for (int l = 0; l < iteration_steps; l++) { + size_t k = random.index(count); + T di = ic * Distance::distance(p, nodes[k], f), + dj = jc * Distance::distance(q, nodes[k], f); + T norm = cosine ? get_norm(nodes[k]->v, f) : 1; + if (!(norm > T(0))) { + continue; + } + if (di < dj) { + for (int z = 0; z < f; z++) + p->v[z] = (p->v[z] * ic + nodes[k]->v[z] / norm) / (ic + 1); + Distance::init_node(p, f); + ic++; + } else if (dj < di) { + for (int z = 0; z < f; z++) + q->v[z] = (q->v[z] * jc + nodes[k]->v[z] / norm) / (jc + 1); + Distance::init_node(q, f); + jc++; + } + } +} +} // namespace + +struct Base { + template + static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) { + // Override this in specific metric structs below if you need to do any pre-processing + // on the entire set of nodes passed into this index. 
+ } + + template + static inline void zero_value(Node* dest) { + // Initialize any fields that require sane defaults within this node. + } + + template + static inline void copy_node(Node* dest, const Node* source, const int f) { + memcpy(dest->v, source->v, f * sizeof(T)); + } + + template + static inline void normalize(Node* node, int f) { + T norm = get_norm(node->v, f); + if (norm > 0) { + for (int z = 0; z < f; z++) + node->v[z] /= norm; + } + } +}; + +struct Angular : Base { + template + struct Node { + /* + * We store a binary tree where each node has two things + * - A vector associated with it + * - Two children + * All nodes occupy the same amount of memory + * All nodes with n_descendants == 1 are leaf nodes. + * A memory optimization is that for nodes with 2 <= n_descendants <= K, + * we skip the vector. Instead we store a list of all descendants. K is + * determined by the number of items that fits in the space of the vector. + * For nodes with n_descendants == 1 the vector is a data point. + * For nodes with n_descendants > K the vector is the normal of the split plane. + * Note that we can't really do sizeof(node) because we cheat and allocate + * more memory to be able to fit the vector outside + */ + S n_descendants; + union { + S children[2]; // Will possibly store more than 2 + T norm; + }; + T v[V_ARRAY_SIZE]; + }; + template + static inline T distance(const Node* x, const Node* y, int f) { + // want to calculate (a/|a| - b/|b|)^2 + // = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b| + // = 2 - 2cos + T pp = x->norm ? x->norm : dot(x->v, x->v, f); // For backwards compatibility reasons, we need to fall back and compute the norm here + T qq = y->norm ? 
y->norm : dot(y->v, y->v, f); + T pq = dot(x->v, y->v, f); + T ppqq = pp * qq; + if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq); + else return 2.0; // cos is 0 + } + template + static inline T margin(const Node* n, const T* y, int f) { + return dot(n->v, y, f); + } + template + static inline bool side(const Node* n, const T* y, int f, Random& random) { + T dot = margin(n, y, f); + if (dot != 0) + return (dot > 0); + else + return (bool)random.flip(); + } + template + static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { + Node* p = (Node*)alloca(s); + Node* q = (Node*)alloca(s); + two_means >(nodes, f, random, true, p, q); + for (int z = 0; z < f; z++) + n->v[z] = p->v[z] - q->v[z]; + Base::normalize >(n, f); + } + template + static inline T normalized_distance(T distance) { + // Used when requesting distances from Python layer + // Turns out sometimes the squared distance is -0.0 + // so we have to make sure it's a positive number. + return sqrt(std::max(distance, T(0))); + } + template + static inline T pq_distance(T distance, T margin, int child_nr) { + if (child_nr == 0) + margin = -margin; + return std::min(distance, margin); + } + template + static inline T pq_initial_value() { + return numeric_limits::infinity(); + } + template + static inline void init_node(Node* n, int f) { + n->norm = dot(n->v, n->v, f); + } + static const char* name() { + return "angular"; + } +}; + + +struct DotProduct : Angular { + template + struct Node { + /* + * This is an extension of the Angular node with an extra attribute for the scaled norm. 
+ */ + S n_descendants; + S children[2]; // Will possibly store more than 2 + T dot_factor; + T v[V_ARRAY_SIZE]; + }; + + static const char* name() { + return "dot"; + } + template + static inline T distance(const Node* x, const Node* y, int f) { + return -dot(x->v, y->v, f); + } + + template + static inline void zero_value(Node* dest) { + dest->dot_factor = 0; + } + + template + static inline void init_node(Node* n, int f) { + } + + template + static inline void copy_node(Node* dest, const Node* source, const int f) { + memcpy(dest->v, source->v, f * sizeof(T)); + dest->dot_factor = source->dot_factor; + } + + template + static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { + Node* p = (Node*)alloca(s); + Node* q = (Node*)alloca(s); + DotProduct::zero_value(p); + DotProduct::zero_value(q); + two_means >(nodes, f, random, true, p, q); + for (int z = 0; z < f; z++) + n->v[z] = p->v[z] - q->v[z]; + n->dot_factor = p->dot_factor - q->dot_factor; + DotProduct::normalize >(n, f); + } + + template + static inline void normalize(Node* node, int f) { + T norm = sqrt(dot(node->v, node->v, f) + pow(node->dot_factor, 2)); + if (norm > 0) { + for (int z = 0; z < f; z++) + node->v[z] /= norm; + node->dot_factor /= norm; + } + } + + template + static inline T margin(const Node* n, const T* y, int f) { + return dot(n->v, y, f) + (n->dot_factor * n->dot_factor); + } + + template + static inline bool side(const Node* n, const T* y, int f, Random& random) { + T dot = margin(n, y, f); + if (dot != 0) + return (dot > 0); + else + return (bool)random.flip(); + } + + template + static inline T normalized_distance(T distance) { + return -distance; + } + + template + static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) { + // This uses a method from Microsoft Research for transforming inner product spaces to cosine/angular-compatible spaces. 
+ // (Bachrach et al., 2014, see https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf) + + // Step one: compute the norm of each vector and store that in its extra dimension (f-1) + for (S i = 0; i < node_count; i++) { + Node* node = get_node_ptr(nodes, _s, i); + T norm = sqrt(dot(node->v, node->v, f)); + if (isnan(norm)) norm = 0; + node->dot_factor = norm; + } + + // Step two: find the maximum norm + T max_norm = 0; + for (S i = 0; i < node_count; i++) { + Node* node = get_node_ptr(nodes, _s, i); + if (node->dot_factor > max_norm) { + max_norm = node->dot_factor; + } + } + + // Step three: set each vector's extra dimension to sqrt(max_norm^2 - norm^2) + for (S i = 0; i < node_count; i++) { + Node* node = get_node_ptr(nodes, _s, i); + T node_norm = node->dot_factor; + + T dot_factor = sqrt(pow(max_norm, static_cast(2.0)) - pow(node_norm, static_cast(2.0))); + if (isnan(dot_factor)) dot_factor = 0; + + node->dot_factor = dot_factor; + } + } +}; + +struct Hamming : Base { + template + struct Node { + S n_descendants; + S children[2]; + T v[V_ARRAY_SIZE]; + }; + + static const size_t max_iterations = 20; + + template + static inline T pq_distance(T distance, T margin, int child_nr) { + return distance - (margin != (unsigned int) child_nr); + } + + template + static inline T pq_initial_value() { + return numeric_limits::max(); + } + template + static inline int cole_popcount(T v) { + // Note: Only used with MSVC 9, which lacks intrinsics and fails to + // calculate std::bitset::count for v > 32bit. Uses the generalized + // approach by Eric Cole. 
+ // See https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 + v = v - ((v >> 1) & (T)~(T)0/3); + v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); + v = (v + (v >> 4)) & (T)~(T)0/255*15; + return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8; + } + template + static inline T distance(const Node* x, const Node* y, int f) { + size_t dist = 0; + for (int i = 0; i < f; i++) { + dist += popcount(x->v[i] ^ y->v[i]); + } + return dist; + } + template + static inline bool margin(const Node* n, const T* y, int f) { + static const size_t n_bits = sizeof(T) * 8; + T chunk = n->v[0] / n_bits; + return (y[chunk] & (static_cast(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0; + } + template + static inline bool side(const Node* n, const T* y, int f, Random& random) { + return margin(n, y, f); + } + template + static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { + size_t cur_size = 0; + size_t i = 0; + int dim = f * 8 * sizeof(T); + for (; i < max_iterations; i++) { + // choose random position to split at + n->v[0] = random.index(dim); + cur_size = 0; + for (typename vector*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) { + if (margin(n, (*it)->v, f)) { + cur_size++; + } + } + if (cur_size > 0 && cur_size < nodes.size()) { + break; + } + } + // brute-force search for splitting coordinate + if (i == max_iterations) { + int j = 0; + for (; j < dim; j++) { + n->v[0] = j; + cur_size = 0; + for (typename vector*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) { + if (margin(n, (*it)->v, f)) { + cur_size++; + } + } + if (cur_size > 0 && cur_size < nodes.size()) { + break; + } + } + } + } + template + static inline T normalized_distance(T distance) { + return distance; + } + template + static inline void init_node(Node* n, int f) { + } + static const char* name() { + return "hamming"; + } +}; + + +struct Minkowski : Base { + template + struct Node { + S n_descendants; + T a; // need an extra 
constant term to determine the offset of the plane + S children[2]; + T v[V_ARRAY_SIZE]; + }; + template + static inline T margin(const Node* n, const T* y, int f) { + return n->a + dot(n->v, y, f); + } + template + static inline bool side(const Node* n, const T* y, int f, Random& random) { + T dot = margin(n, y, f); + if (dot != 0) + return (dot > 0); + else + return (bool)random.flip(); + } + template + static inline T pq_distance(T distance, T margin, int child_nr) { + if (child_nr == 0) + margin = -margin; + return std::min(distance, margin); + } + template + static inline T pq_initial_value() { + return numeric_limits::infinity(); + } +}; + + +struct Euclidean : Minkowski { + template + static inline T distance(const Node* x, const Node* y, int f) { + return euclidean_distance(x->v, y->v, f); + } + template + static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { + Node* p = (Node*)alloca(s); + Node* q = (Node*)alloca(s); + two_means >(nodes, f, random, false, p, q); + + for (int z = 0; z < f; z++) + n->v[z] = p->v[z] - q->v[z]; + Base::normalize >(n, f); + n->a = 0.0; + for (int z = 0; z < f; z++) + n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2; + } + template + static inline T normalized_distance(T distance) { + return sqrt(std::max(distance, T(0))); + } + template + static inline void init_node(Node* n, int f) { + } + static const char* name() { + return "euclidean"; + } + +}; + +struct Manhattan : Minkowski { + template + static inline T distance(const Node* x, const Node* y, int f) { + return manhattan_distance(x->v, y->v, f); + } + template + static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { + Node* p = (Node*)alloca(s); + Node* q = (Node*)alloca(s); + two_means >(nodes, f, random, false, p, q); + + for (int z = 0; z < f; z++) + n->v[z] = p->v[z] - q->v[z]; + Base::normalize >(n, f); + n->a = 0.0; + for (int z = 0; z < f; z++) + n->a += -n->v[z] * (p->v[z] + 
q->v[z]) / 2; + } + template + static inline T normalized_distance(T distance) { + return std::max(distance, T(0)); + } + template + static inline void init_node(Node* n, int f) { + } + static const char* name() { + return "manhattan"; + } +}; + +template +class AnnoyIndexInterface { + public: + // Note that the methods with an **error argument will allocate memory and write the pointer to that string if error is non-NULL + virtual ~AnnoyIndexInterface() {}; + virtual bool add_item(S item, const T* w, char** error=NULL) = 0; + virtual bool build(int q, char** error=NULL) = 0; + virtual bool unbuild(char** error=NULL) = 0; + virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0; + virtual void unload() = 0; + virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0; + virtual bool load_index(const unsigned char* index_data, const int64_t& index_size, char** error = NULL) = 0; + virtual T get_distance(S i, S j) const = 0; + virtual void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances, + faiss::ConcurrentBitsetPtr bitset = nullptr) const = 0; + virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* distances, + faiss::ConcurrentBitsetPtr bitset = nullptr) const = 0; + virtual S get_n_items() const = 0; + virtual S get_dim() const = 0; + virtual S get_n_trees() const = 0; + virtual int64_t get_index_length() const = 0; + virtual void* get_index() const = 0; + virtual void verbose(bool v) = 0; + virtual void get_item(S item, T* v) const = 0; + virtual void set_seed(int q) = 0; + virtual bool on_disk_build(const char* filename, char** error=NULL) = 0; +}; + +template + class AnnoyIndex : public AnnoyIndexInterface { + /* + * We use random projection to build a forest of binary trees of all items. + * Basically just split the hyperspace into two sides by a hyperplane, + * then recursively split each of those subtrees etc. 
+ * We create a tree like this q times. The default q is determined automatically + * in such a way that we at most use 2x as much memory as the vectors take. + */ +public: + typedef Distance D; + typedef typename D::template Node Node; + +protected: + const int _f; + size_t _s; + S _n_items; + Random _random; + void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate + S _n_nodes; + S _nodes_size; + vector _roots; + S _K; + bool _loaded; + bool _verbose; + int _fd; + bool _on_disk; + bool _built; +public: + + AnnoyIndex(int f) : _f(f), _random() { + _s = offsetof(Node, v) + _f * sizeof(T); // Size of each node + _verbose = false; + _built = false; + _K = (S) (((size_t) (_s - offsetof(Node, children))) / sizeof(S)); // Max number of descendants to fit into node + reinitialize(); // Reset everything + } + ~AnnoyIndex() { + unload(); + } + + int get_f() const { + return _f; + } + + bool add_item(S item, const T* w, char** error=NULL) { + return add_item_impl(item, w, error); + } + + template + bool add_item_impl(S item, const W& w, char** error=NULL) { + if (_loaded) { + set_error_from_string(error, "You can't add an item to a loaded index"); + return false; + } + _allocate_size(item + 1); + Node* n = _get(item); + + D::zero_value(n); + + n->children[0] = 0; + n->children[1] = 0; + n->n_descendants = 1; + + for (int z = 0; z < _f; z++) + n->v[z] = w[z]; + + D::init_node(n, _f); + + if (item >= _n_items) + _n_items = item + 1; + + return true; + } + + bool on_disk_build(const char* file, char** error=NULL) { + _on_disk = true; + _fd = open(file, O_RDWR | O_CREAT | O_TRUNC, (int) 0600); + if (_fd == -1) { + set_error_from_errno(error, "Unable to open"); + _fd = 0; + return false; + } + _nodes_size = 1; + if (ftruncate(_fd, _s * _nodes_size) == -1) { + set_error_from_errno(error, "Unable to truncate"); + return false; + } +#ifdef MAP_POPULATE + _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, 
_fd, 0); +#else + _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0); +#endif + return true; + } + + bool build(int q, char** error=NULL) { + if (_loaded) { + set_error_from_string(error, "You can't build a loaded index"); + return false; + } + + if (_built) { + set_error_from_string(error, "You can't build a built index"); + return false; + } + + D::template preprocess(_nodes, _s, _n_items, _f); + + _n_nodes = _n_items; + while (1) { + if (q == -1 && _n_nodes >= _n_items * 2) + break; + if (q != -1 && _roots.size() >= (size_t)q) + break; + if (_verbose) showUpdate("pass %zd...\n", _roots.size()); + + vector indices; + for (S i = 0; i < _n_items; i++) { + if (_get(i)->n_descendants >= 1) // Issue #223 + indices.push_back(i); + } + + _roots.push_back(_make_tree(indices, true)); + } + + // Also, copy the roots into the last segment of the array + // This way we can load them faster without reading the whole file + _allocate_size(_n_nodes + (S)_roots.size()); + for (size_t i = 0; i < _roots.size(); i++) + memcpy(_get(_n_nodes + (S)i), _get(_roots[i]), _s); + _n_nodes += _roots.size(); + + if (_verbose) showUpdate("has %d nodes\n", _n_nodes); + + if (_on_disk) { + _nodes = remap_memory(_nodes, _fd, _s * _nodes_size, _s * _n_nodes); + if (ftruncate(_fd, _s * _n_nodes)) { + // TODO: this probably creates an index in a corrupt state... 
not sure what to do + set_error_from_errno(error, "Unable to truncate"); + return false; + } + _nodes_size = _n_nodes; + } + _built = true; + return true; + } + + bool unbuild(char** error=NULL) { + if (_loaded) { + set_error_from_string(error, "You can't unbuild a loaded index"); + return false; + } + + _roots.clear(); + _n_nodes = _n_items; + _built = false; + + return true; + } + + bool save(const char* filename, bool prefault=false, char** error=NULL) { + if (!_built) { + set_error_from_string(error, "You can't save an index that hasn't been built"); + return false; + } + if (_on_disk) { + return true; + } else { + // Delete file if it already exists (See issue #335) + unlink(filename); + + FILE *f = fopen(filename, "wb"); + if (f == NULL) { + set_error_from_errno(error, "Unable to open"); + return false; + } + + if (fwrite(_nodes, _s, _n_nodes, f) != (size_t) _n_nodes) { + set_error_from_errno(error, "Unable to write"); + return false; + } + + if (fclose(f) == EOF) { + set_error_from_errno(error, "Unable to close"); + return false; + } + + unload(); + return load(filename, prefault, error); + } + } + + void reinitialize() { + _fd = 0; + _nodes = NULL; + _loaded = false; + _n_items = 0; + _n_nodes = 0; + _nodes_size = 0; + _on_disk = false; + _roots.clear(); + } + + void unload() { + if (_on_disk && _fd) { + close(_fd); + munmap(_nodes, _s * _nodes_size); + } else { + if (_fd) { + // we have mmapped data + close(_fd); + munmap(_nodes, _n_nodes * _s); + } else if (_nodes) { + // We have heap allocated data + free(_nodes); + } + } + reinitialize(); + if (_verbose) showUpdate("unloaded\n"); + } + + bool load(const char* filename, bool prefault=false, char** error=NULL) { + _fd = open(filename, O_RDONLY, (int)0400); + if (_fd == -1) { + set_error_from_errno(error, "Unable to open"); + _fd = 0; + return false; + } + off_t size = lseek_getsize(_fd); + if (size == -1) { + set_error_from_errno(error, "Unable to get size"); + return false; + } else if (size == 0) { + 
set_error_from_errno(error, "Size of file is zero"); + return false; + } else if (size % _s) { + // Something is fishy with this index! + set_error_from_errno(error, "Index size is not a multiple of vector size"); + return false; + } + + int flags = MAP_SHARED; + if (prefault) { +#ifdef MAP_POPULATE + flags |= MAP_POPULATE; +#else + showUpdate("prefault is set to true, but MAP_POPULATE is not defined on this platform"); +#endif + } + _nodes = (Node*)mmap(0, size, PROT_READ, flags, _fd, 0); + _n_nodes = (S)(size / _s); + + // Find the roots by scanning the end of the file and taking the nodes with most descendants + _roots.clear(); + S m = -1; + for (S i = _n_nodes - 1; i >= 0; i--) { + S k = _get(i)->n_descendants; + if (m == -1 || k == m) { + _roots.push_back(i); + m = k; + } else { + break; + } + } + // hacky fix: since the last root precedes the copy of all roots, delete it + if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0]) + _roots.pop_back(); + _loaded = true; + _built = true; + _n_items = m; + if (_verbose) showUpdate("found %lu roots with degree %d\n", _roots.size(), m); + return true; + } + + bool load_index(const unsigned char* index_data, const int64_t& index_size, char** error) { + if (index_size == -1) { + set_error_from_errno(error, "Unable to get size"); + return false; + } else if (index_size == 0) { + set_error_from_errno(error, "Size of file is zero"); + return false; + } else if (index_size % _s) { + // Something is fishy with this index! 
+ set_error_from_errno(error, "Index size is not a multiple of vector size"); + return false; + } + + _n_nodes = (S)(index_size / _s); + _nodes = (Node*)malloc(_s * _n_nodes); + memcpy(_nodes, index_data, (size_t)index_size); + + // Find the roots by scanning the end of the file and taking the nodes with most descendants + _roots.clear(); + S m = -1; + for (S i = _n_nodes - 1; i >= 0; i--) { + S k = _get(i)->n_descendants; + if (m == -1 || k == m) { + _roots.push_back(i); + m = k; + } else { + break; + } + } + // hacky fix: since the last root precedes the copy of all roots, delete it + if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0]) + _roots.pop_back(); + _loaded = true; + _built = true; + _n_items = m; + if (_verbose) showUpdate("found %lu roots with degree %d\n", _roots.size(), m); + return true; + } + + T get_distance(S i, S j) const { + return D::normalized_distance(D::distance(_get(i), _get(j), _f)); + } + + void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances, + faiss::ConcurrentBitsetPtr bitset) const { + // TODO: handle OOB + const Node* m = _get(item); + _get_all_nns(m->v, n, search_k, result, distances, bitset); + } + + void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* distances, + faiss::ConcurrentBitsetPtr bitset) const { + _get_all_nns(w, n, search_k, result, distances, bitset); + } + + S get_n_items() const { + return _n_items; + } + + S get_dim() const { + return _f; + } + + S get_n_trees() const { + return (S)_roots.size(); + } + + int64_t get_index_length() const { + return (int64_t)_s * _nodes_size; + } + + void* get_index() const { + return _nodes; + } + + void verbose(bool v) { + _verbose = v; + } + + void get_item(S item, T* v) const { + // TODO: handle OOB + Node* m = _get(item); + memcpy(v, m->v, (_f) * sizeof(T)); + } + + void set_seed(int seed) { + _random.set_seed(seed); + } + +protected: + void _allocate_size(S n) { + if (n 
> _nodes_size) { + const double reallocation_factor = 1.3; + S new_nodes_size = std::max(n, (S) ((_nodes_size + 1) * reallocation_factor)); + void *old = _nodes; + + if (_on_disk) { + int rc = ftruncate(_fd, _s * new_nodes_size); + if (_verbose && rc) showUpdate("File truncation error\n"); + _nodes = remap_memory(_nodes, _fd, _s * _nodes_size, _s * new_nodes_size); + } else { + _nodes = realloc(_nodes, _s * new_nodes_size); + memset((char *) _nodes + (_nodes_size * _s) / sizeof(char), 0, (new_nodes_size - _nodes_size) * _s); + } + + _nodes_size = new_nodes_size; + if (_verbose) showUpdate("Reallocating to %d nodes: old_address=%p, new_address=%p\n", new_nodes_size, old, _nodes); + } + } + + inline Node* _get(const S i) const { + return get_node_ptr(_nodes, _s, i); + } + + S _make_tree(const vector& indices, bool is_root) { + // The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node. + // There's some regrettable complications caused by the problem that root nodes have to be "special": + // 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have + // 2. Root nodes with only 1 child need to be a "dummy" parent + // 3. Due to the _n_items "hack", we need to be careful with the cases where _n_items <= _K or _n_items > _K + if (indices.size() == 1 && !is_root) + return indices[0]; + + if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) { + _allocate_size(_n_nodes + 1); + S item = _n_nodes++; + Node* m = _get(item); + m->n_descendants = is_root ? _n_items : (S)indices.size(); + + // Using std::copy instead of a loop seems to resolve issues #3 and #13, + // probably because gcc 4.8 goes overboard with optimizations. + // Using memcpy instead of std::copy for MSVC compatibility. #235 + // Only copy when necessary to avoid crash in MSVC 9. 
#293 + if (!indices.empty()) + memcpy(m->children, &indices[0], indices.size() * sizeof(S)); + return item; + } + + vector children; + for (size_t i = 0; i < indices.size(); i++) { + S j = indices[i]; + Node* n = _get(j); + if (n) + children.push_back(n); + } + + vector children_indices[2]; + Node* m = (Node*)alloca(_s); + D::create_split(children, _f, _s, _random, m); + + for (size_t i = 0; i < indices.size(); i++) { + S j = indices[i]; + Node* n = _get(j); + if (n) { + bool side = D::side(m, n->v, _f, _random); + children_indices[side].push_back(j); + } else { + showUpdate("No node for index %d?\n", j); + } + } + + // If we didn't find a hyperplane, just randomize sides as a last option + while (children_indices[0].size() == 0 || children_indices[1].size() == 0) { + if (_verbose) + showUpdate("\tNo hyperplane found (left has %ld children, right has %ld children)\n", + children_indices[0].size(), children_indices[1].size()); + if (_verbose && indices.size() > 100000) + showUpdate("Failed splitting %lu items\n", indices.size()); + + children_indices[0].clear(); + children_indices[1].clear(); + + // Set the vector to 0.0 + for (int z = 0; z < _f; z++) + m->v[z] = 0; + + for (size_t i = 0; i < indices.size(); i++) { + S j = indices[i]; + // Just randomize... + children_indices[_random.flip()].push_back(j); + } + } + + int flip = (children_indices[0].size() > children_indices[1].size()); + + m->n_descendants = is_root ? 
_n_items : (S)indices.size(); + for (int side = 0; side < 2; side++) { + // run _make_tree for the smallest child first (for cache locality) + m->children[side^flip] = _make_tree(children_indices[side^flip], false); + } + + _allocate_size(_n_nodes + 1); + S item = _n_nodes++; + memcpy(_get(item), m, _s); + + return item; + } + + void _get_all_nns(const T* v, size_t n, int search_k, vector* result, vector* distances, + faiss::ConcurrentBitsetPtr bitset) const { + Node* v_node = (Node *)alloca(_s); + D::template zero_value(v_node); + memcpy(v_node->v, v, sizeof(T) * _f); + D::init_node(v_node, _f); + + std::priority_queue > q; + + if (search_k <= 0) { + search_k = n * _roots.size(); + } + + for (size_t i = 0; i < _roots.size(); i++) { + q.push(make_pair(Distance::template pq_initial_value(), _roots[i])); + } + + std::vector nns; + while (nns.size() < (size_t)search_k && !q.empty()) { + const pair& top = q.top(); + T d = top.first; + S i = top.second; + Node* nd = _get(i); + q.pop(); + if (nd->n_descendants == 1 && i < _n_items) { // raw data + if (bitset == nullptr || !bitset->test((faiss::ConcurrentBitset::id_type_t)i)) + nns.push_back(i); + } else if (nd->n_descendants <= _K) { + const S* dst = nd->children; + for (auto ii = 0; ii < nd->n_descendants; ++ ii) { + if (bitset == nullptr || !bitset->test((faiss::ConcurrentBitset::id_type_t)dst[ii])) + nns.push_back(dst[ii]); +// nns.insert(nns.end(), dst, &dst[nd->n_descendants]); + } + } else { + T margin = D::margin(nd, v, _f); + q.push(make_pair(D::pq_distance(d, margin, 1), static_cast(nd->children[1]))); + q.push(make_pair(D::pq_distance(d, margin, 0), static_cast(nd->children[0]))); + } + } + + // Get distances for all items + // To avoid calculating distance multiple times for any items, sort by id + std::sort(nns.begin(), nns.end()); + vector > nns_dist; + S last = -1; + for (size_t i = 0; i < nns.size(); i++) { + S j = nns[i]; + if (j == last) + continue; + last = j; + if (_get(j)->n_descendants == 1) // This 
is only to guard a really obscure case, #284 + nns_dist.push_back(make_pair(D::distance(v_node, _get(j), _f), j)); + } + + size_t m = nns_dist.size(); + size_t p = n < m ? n : m; // Return this many items + std::partial_sort(nns_dist.begin(), nns_dist.begin() + p, nns_dist.end()); + for (size_t i = 0; i < p; i++) { + if (distances) + distances->push_back(D::normalized_distance(nns_dist[i].first)); + result->push_back(nns_dist[i].second); + } + } +}; + +#endif +// vim: tabstop=2 shiftwidth=2 diff --git a/core/src/index/thirdparty/annoy/src/annoyluamodule.cc b/core/src/index/thirdparty/annoy/src/annoyluamodule.cc new file mode 100644 index 0000000000..76fec7c9e0 --- /dev/null +++ b/core/src/index/thirdparty/annoy/src/annoyluamodule.cc @@ -0,0 +1,318 @@ +// Copyright (c) 2016 Boris Nagaev +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy of +// the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. 
+ +#include +#include + +#include + +#include "annoylib.h" +#include "kissrandom.h" + +#if LUA_VERSION_NUM == 501 +#define compat_setfuncs(L, funcs) luaL_register(L, NULL, funcs) +#define compat_rawlen lua_objlen +#else +#define compat_setfuncs(L, funcs) luaL_setfuncs(L, funcs, 0) +#define compat_rawlen lua_rawlen +#endif + +template +class LuaAnnoy { +public: + typedef int32_t AnnoyS; + typedef float AnnoyT; + typedef AnnoyIndex Impl; + typedef LuaAnnoy ThisClass; + + class LuaArrayProxy { + public: + LuaArrayProxy(lua_State* L, int object, int f) + : L_(L) + , object_(object) + { + luaL_checktype(L, object, LUA_TTABLE); + int v_len = compat_rawlen(L, object); + luaL_argcheck(L, v_len == f, object, "Length of v != f"); + } + + double operator[](int index) const { + lua_rawgeti(L_, object_, index + 1); + double result = lua_tonumber(L_, -1); + lua_pop(L_, 1); + return result; + } + + private: + lua_State* L_; + int object_; + }; + + static void toVector(lua_State* L, int object, int f, AnnoyT* dst) { + LuaArrayProxy proxy(L, object, f); + for (int i = 0; i < f; i++) { + dst[i] = proxy[i]; + } + } + + template + static void pushVector(lua_State* L, const Vector& v) { + lua_createtable(L, v.size(), 0); + for (int j = 0; j < v.size(); j++) { + lua_pushnumber(L, v[j]); + lua_rawseti(L, -2, j + 1); + } + } + + static const char* typeAsString() { + return typeid(Impl).name(); + } + + static Impl* getAnnoy(lua_State* L, int object) { + return reinterpret_cast( + luaL_checkudata(L, object, typeAsString()) + ); + } + + static int getItemIndex(lua_State* L, int object, int size = -1) { + int item = luaL_checkinteger(L, object); + luaL_argcheck(L, item >= 0, object, "Index must be >= 0"); + if (size != -1) { + luaL_argcheck(L, item < size, object, "Index must be < size"); + } + return item; + } + + static int gc(lua_State* L) { + Impl* self = getAnnoy(L, 1); + self->~Impl(); + return 0; + } + + static int tostring(lua_State* L) { + Impl* self = getAnnoy(L, 1); + 
lua_pushfstring( + L, + "annoy.AnnoyIndex object (%dx%d, %s distance)", + self->get_n_items(), self->get_f(), Distance::name() + ); + return 1; + } + + static int add_item(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int item = getItemIndex(L, 2); + self->add_item_impl(item, LuaArrayProxy(L, 3, self->get_f())); + return 0; + } + + static int build(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int n_trees = luaL_checkinteger(L, 2); + self->build(n_trees); + lua_pushboolean(L, true); + return 1; + } + + static int on_disk_build(lua_State* L) { + Impl* self = getAnnoy(L, 1); + const char* filename = luaL_checkstring(L, 2); + self->on_disk_build(filename); + lua_pushboolean(L, true); + return 1; + } + + static int save(lua_State* L) { + int nargs = lua_gettop(L); + Impl* self = getAnnoy(L, 1); + const char* filename = luaL_checkstring(L, 2); + bool prefault = true; + if (nargs >= 3) { + prefault = lua_toboolean(L, 3); + } + self->save(filename, prefault); + lua_pushboolean(L, true); + return 1; + } + + static int load(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int nargs = lua_gettop(L); + const char* filename = luaL_checkstring(L, 2); + bool prefault = true; + if (nargs >= 3) { + prefault = lua_toboolean(L, 3); + } + if (!self->load(filename, prefault)) { + return luaL_error(L, "Can't load file: %s", filename); + } + lua_pushboolean(L, true); + return 1; + } + + static int unload(lua_State* L) { + Impl* self = getAnnoy(L, 1); + self->unload(); + lua_pushboolean(L, true); + return 1; + } + + struct Searcher { + std::vector result; + std::vector distances; + Impl* self; + int n; + int search_k; + bool include_distances; + + Searcher(lua_State* L) { + int nargs = lua_gettop(L); + self = getAnnoy(L, 1); + n = luaL_checkinteger(L, 3); + search_k = -1; + if (nargs >= 4) { + search_k = luaL_checkinteger(L, 4); + } + include_distances = false; + if (nargs >= 5) { + include_distances = lua_toboolean(L, 5); + } + } + + int pushResults(lua_State* L) { + pushVector(L, 
result); + if (include_distances) { + pushVector(L, distances); + } + return include_distances ? 2 : 1; + } + }; + + static int get_nns_by_item(lua_State* L) { + Searcher s(L); + int item = getItemIndex(L, 2, s.self->get_n_items()); + s.self->get_nns_by_item(item, s.n, s.search_k, &s.result, + s.include_distances ? &s.distances : NULL); + return s.pushResults(L); + } + + static int get_nns_by_vector(lua_State* L) { + Searcher s(L); + std::vector _vec(s.self->get_f()); + AnnoyT* vec = &(_vec[0]); + toVector(L, 2, s.self->get_f(), vec); + s.self->get_nns_by_vector(vec, s.n, s.search_k, &s.result, + s.include_distances ? &s.distances : NULL); + return s.pushResults(L); + } + + static int get_item_vector(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int item = getItemIndex(L, 2, self->get_n_items()); + std::vector _vec(self->get_f()); + AnnoyT* vec = &(_vec[0]); + self->get_item(item, vec); + pushVector(L, _vec); + return 1; + } + + static int get_distance(lua_State* L) { + Impl* self = getAnnoy(L, 1); + int i = getItemIndex(L, 2, self->get_n_items()); + int j = getItemIndex(L, 3, self->get_n_items()); + AnnoyT distance = self->get_distance(i, j); + lua_pushnumber(L, distance); + return 1; + } + + static int get_n_items(lua_State* L) { + Impl* self = getAnnoy(L, 1); + lua_pushnumber(L, self->get_n_items()); + return 1; + } + + static const luaL_Reg* getMetatable() { + static const luaL_Reg funcs[] = { + {"__gc", &ThisClass::gc}, + {"__tostring", &ThisClass::tostring}, + {NULL, NULL}, + }; + return funcs; + } + + static const luaL_Reg* getMethods() { + static const luaL_Reg funcs[] = { + {"add_item", &ThisClass::add_item}, + {"build", &ThisClass::build}, + {"save", &ThisClass::save}, + {"load", &ThisClass::load}, + {"unload", &ThisClass::unload}, + {"get_nns_by_item", &ThisClass::get_nns_by_item}, + {"get_nns_by_vector", &ThisClass::get_nns_by_vector}, + {"get_item_vector", &ThisClass::get_item_vector}, + {"get_distance", &ThisClass::get_distance}, + {"get_n_items", 
&ThisClass::get_n_items}, + {"on_disk_build", &ThisClass::on_disk_build}, + {NULL, NULL}, + }; + return funcs; + } + + static void createNew(lua_State* L, int f) { + void* self = lua_newuserdata(L, sizeof(Impl)); + if (luaL_newmetatable(L, typeAsString())) { + compat_setfuncs(L, getMetatable()); + lua_newtable(L); + compat_setfuncs(L, getMethods()); + lua_setfield(L, -2, "__index"); + } + new (self) Impl(f); + lua_setmetatable(L, -2); + } +}; + +static int lua_an_make(lua_State* L) { + int f = luaL_checkinteger(L, 1); + const char* metric = "angular"; + if (lua_gettop(L) >= 2) { + metric = luaL_checkstring(L, 2); + } + if (strcmp(metric, "angular") == 0) { + LuaAnnoy::createNew(L, f); + return 1; + } else if (strcmp(metric, "euclidean") == 0) { + LuaAnnoy::createNew(L, f); + return 1; + } else if (strcmp(metric, "manhattan") == 0) { + LuaAnnoy::createNew(L, f); + return 1; + } else { + return luaL_error(L, "Unknown metric: %s", metric); + } +} + +static const luaL_Reg LUA_ANNOY_FUNCS[] = { + {"AnnoyIndex", lua_an_make}, + {NULL, NULL}, +}; + +extern "C" { +int luaopen_annoy(lua_State* L) { + lua_newtable(L); + compat_setfuncs(L, LUA_ANNOY_FUNCS); + return 1; +} +} + +// vim: tabstop=2 shiftwidth=2 diff --git a/core/src/index/thirdparty/annoy/src/annoymodule.cc b/core/src/index/thirdparty/annoy/src/annoymodule.cc new file mode 100644 index 0000000000..f15a0cc692 --- /dev/null +++ b/core/src/index/thirdparty/annoy/src/annoymodule.cc @@ -0,0 +1,632 @@ +// Copyright (c) 2013 Spotify AB +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy of +// the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +// License for the specific language governing permissions and limitations under +// the License. + +#include "annoylib.h" +#include "kissrandom.h" +#include "Python.h" +#include "structmember.h" +#include +#if defined(_MSC_VER) && _MSC_VER == 1500 +typedef signed __int32 int32_t; +#else +#include +#endif + + +#if defined(USE_AVX512) +#define AVX_INFO "Using 512-bit AVX instructions" +#elif defined(USE_AVX128) +#define AVX_INFO "Using 128-bit AVX instructions" +#else +#define AVX_INFO "Not using AVX instructions" +#endif + +#if defined(_MSC_VER) +#define COMPILER_INFO "Compiled using MSC" +#elif defined(__GNUC__) +#define COMPILER_INFO "Compiled on GCC" +#else +#define COMPILER_INFO "Compiled on unknown platform" +#endif + +#define ANNOY_DOC (COMPILER_INFO ". " AVX_INFO ".") + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef Py_TYPE + #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) +#endif + +#ifdef IS_PY3K + #define PyInt_FromLong PyLong_FromLong +#endif + + +template class AnnoyIndexInterface; + +class HammingWrapper : public AnnoyIndexInterface { + // Wrapper class for Hamming distance, using composition. + // This translates binary (float) vectors into packed uint64_t vectors. + // This is questionable from a performance point of view. Should reconsider this solution. 
+private: + int32_t _f_external, _f_internal; + AnnoyIndex _index; + void _pack(const float* src, uint64_t* dst) const { + for (int32_t i = 0; i < _f_internal; i++) { + dst[i] = 0; + for (int32_t j = 0; j < 64 && i*64+j < _f_external; j++) { + dst[i] |= (uint64_t)(src[i * 64 + j] > 0.5) << j; + } + } + }; + void _unpack(const uint64_t* src, float* dst) const { + for (int32_t i = 0; i < _f_external; i++) { + dst[i] = (src[i / 64] >> (i % 64)) & 1; + } + }; +public: + HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _index((f + 63) / 64) {}; + bool add_item(int32_t item, const float* w, char**error) { + vector w_internal(_f_internal, 0); + _pack(w, &w_internal[0]); + return _index.add_item(item, &w_internal[0], error); + }; + bool build(int q, char** error) { return _index.build(q, error); }; + bool unbuild(char** error) { return _index.unbuild(error); }; + bool save(const char* filename, bool prefault, char** error) { return _index.save(filename, prefault, error); }; + void unload() { _index.unload(); }; + bool load(const char* filename, bool prefault, char** error) { return _index.load(filename, prefault, error); }; + float get_distance(int32_t i, int32_t j) const { return _index.get_distance(i, j); }; + void get_nns_by_item(int32_t item, size_t n, int search_k, vector* result, vector* distances) const { + if (distances) { + vector distances_internal; + _index.get_nns_by_item(item, n, search_k, result, &distances_internal); + distances->insert(distances->begin(), distances_internal.begin(), distances_internal.end()); + } else { + _index.get_nns_by_item(item, n, search_k, result, NULL); + } + }; + void get_nns_by_vector(const float* w, size_t n, int search_k, vector* result, vector* distances) const { + vector w_internal(_f_internal, 0); + _pack(w, &w_internal[0]); + if (distances) { + vector distances_internal; + _index.get_nns_by_vector(&w_internal[0], n, search_k, result, &distances_internal); + distances->insert(distances->begin(), 
distances_internal.begin(), distances_internal.end()); + } else { + _index.get_nns_by_vector(&w_internal[0], n, search_k, result, NULL); + } + }; + int32_t get_n_items() const { return _index.get_n_items(); }; + int32_t get_n_trees() const { return _index.get_n_trees(); }; + void verbose(bool v) { _index.verbose(v); }; + void get_item(int32_t item, float* v) const { + vector v_internal(_f_internal, 0); + _index.get_item(item, &v_internal[0]); + _unpack(&v_internal[0], v); + }; + void set_seed(int q) { _index.set_seed(q); }; + bool on_disk_build(const char* filename, char** error) { return _index.on_disk_build(filename, error); }; +}; + +// annoy python object +typedef struct { + PyObject_HEAD + int f; + AnnoyIndexInterface* ptr; +} py_annoy; + + +static PyObject * +py_an_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) { + py_annoy *self = (py_annoy *)type->tp_alloc(type, 0); + if (self == NULL) { + return NULL; + } + const char *metric = NULL; + + static char const * kwlist[] = {"f", "metric", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &self->f, &metric)) + return NULL; + if (!metric) { + // This keeps coming up, see #368 etc + PyErr_WarnEx(PyExc_FutureWarning, "The default argument for metric will be removed " + "in future version of Annoy. 
Please pass metric='angular' explicitly.", 1); + self->ptr = new AnnoyIndex(self->f); + } else if (!strcmp(metric, "angular")) { + self->ptr = new AnnoyIndex(self->f); + } else if (!strcmp(metric, "euclidean")) { + self->ptr = new AnnoyIndex(self->f); + } else if (!strcmp(metric, "manhattan")) { + self->ptr = new AnnoyIndex(self->f); + } else if (!strcmp(metric, "hamming")) { + self->ptr = new HammingWrapper(self->f); + } else if (!strcmp(metric, "dot")) { + self->ptr = new AnnoyIndex(self->f); + } else { + PyErr_SetString(PyExc_ValueError, "No such metric"); + return NULL; + } + + return (PyObject *)self; +} + + +static int +py_an_init(py_annoy *self, PyObject *args, PyObject *kwargs) { + // Seems to be needed for Python 3 + const char *metric = NULL; + int f; + static char const * kwlist[] = {"f", "metric", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &f, &metric)) + return (int) NULL; + return 0; +} + + +static void +py_an_dealloc(py_annoy* self) { + delete self->ptr; + Py_TYPE(self)->tp_free((PyObject*)self); +} + + +static PyMemberDef py_annoy_members[] = { + {(char*)"f", T_INT, offsetof(py_annoy, f), 0, + (char*)""}, + {NULL} /* Sentinel */ +}; + + +static PyObject * +py_an_load(py_annoy *self, PyObject *args, PyObject *kwargs) { + char *filename, *error; + bool prefault = false; + if (!self->ptr) + return NULL; + static char const * kwlist[] = {"fn", "prefault", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault)) + return NULL; + + if (!self->ptr->load(filename, prefault, &error)) { + PyErr_SetString(PyExc_IOError, error); + free(error); + return NULL; + } + Py_RETURN_TRUE; +} + + +static PyObject * +py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) { + char *filename, *error; + bool prefault = false; + if (!self->ptr) + return NULL; + static char const * kwlist[] = {"fn", "prefault", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, 
&filename, &prefault)) + return NULL; + + if (!self->ptr->save(filename, prefault, &error)) { + PyErr_SetString(PyExc_IOError, error); + free(error); + return NULL; + } + Py_RETURN_TRUE; +} + + +PyObject* +get_nns_to_python(const vector& result, const vector& distances, int include_distances) { + PyObject* l = PyList_New(result.size()); + for (size_t i = 0; i < result.size(); i++) + PyList_SetItem(l, i, PyInt_FromLong(result[i])); + if (!include_distances) + return l; + + PyObject* d = PyList_New(distances.size()); + for (size_t i = 0; i < distances.size(); i++) + PyList_SetItem(d, i, PyFloat_FromDouble(distances[i])); + + PyObject* t = PyTuple_New(2); + PyTuple_SetItem(t, 0, l); + PyTuple_SetItem(t, 1, d); + + return t; +} + + +bool check_constraints(py_annoy *self, int32_t item, bool building) { + if (item < 0) { + PyErr_SetString(PyExc_IndexError, "Item index can not be negative"); + return false; + } else if (!building && item >= self->ptr->get_n_items()) { + PyErr_SetString(PyExc_IndexError, "Item index larger than the largest item index"); + return false; + } else { + return true; + } +} + +static PyObject* +py_an_get_nns_by_item(py_annoy *self, PyObject *args, PyObject *kwargs) { + int32_t item, n, search_k=-1, include_distances=0; + if (!self->ptr) + return NULL; + + static char const * kwlist[] = {"i", "n", "search_k", "include_distances", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ii|ii", (char**)kwlist, &item, &n, &search_k, &include_distances)) + return NULL; + + if (!check_constraints(self, item, false)) { + return NULL; + } + + vector result; + vector distances; + + Py_BEGIN_ALLOW_THREADS; + self->ptr->get_nns_by_item(item, n, search_k, &result, include_distances ? 
&distances : NULL); + Py_END_ALLOW_THREADS; + + return get_nns_to_python(result, distances, include_distances); +} + + +bool +convert_list_to_vector(PyObject* v, int f, vector* w) { + if (PyObject_Size(v) == -1) { + char buf[256]; + snprintf(buf, 256, "Expected an iterable, got an object of type \"%s\"", v->ob_type->tp_name); + PyErr_SetString(PyExc_ValueError, buf); + return false; + } + if (PyObject_Size(v) != f) { + char buf[128]; + snprintf(buf, 128, "Vector has wrong length (expected %d, got %ld)", f, PyObject_Size(v)); + PyErr_SetString(PyExc_IndexError, buf); + return false; + } + for (int z = 0; z < f; z++) { + PyObject *key = PyInt_FromLong(z); + PyObject *pf = PyObject_GetItem(v, key); + (*w)[z] = PyFloat_AsDouble(pf); + Py_DECREF(key); + Py_DECREF(pf); + } + return true; +} + +static PyObject* +py_an_get_nns_by_vector(py_annoy *self, PyObject *args, PyObject *kwargs) { + PyObject* v; + int32_t n, search_k=-1, include_distances=0; + if (!self->ptr) + return NULL; + + static char const * kwlist[] = {"vector", "n", "search_k", "include_distances", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi|ii", (char**)kwlist, &v, &n, &search_k, &include_distances)) + return NULL; + + vector w(self->f); + if (!convert_list_to_vector(v, self->f, &w)) { + return NULL; + } + + vector result; + vector distances; + + Py_BEGIN_ALLOW_THREADS; + self->ptr->get_nns_by_vector(&w[0], n, search_k, &result, include_distances ? 
&distances : NULL); + Py_END_ALLOW_THREADS; + + return get_nns_to_python(result, distances, include_distances); +} + + +static PyObject* +py_an_get_item_vector(py_annoy *self, PyObject *args) { + int32_t item; + if (!self->ptr) + return NULL; + if (!PyArg_ParseTuple(args, "i", &item)) + return NULL; + + if (!check_constraints(self, item, false)) { + return NULL; + } + + vector v(self->f); + self->ptr->get_item(item, &v[0]); + PyObject* l = PyList_New(self->f); + for (int z = 0; z < self->f; z++) { + PyList_SetItem(l, z, PyFloat_FromDouble(v[z])); + } + + return l; +} + + +static PyObject* +py_an_add_item(py_annoy *self, PyObject *args, PyObject* kwargs) { + PyObject* v; + int32_t item; + if (!self->ptr) + return NULL; + static char const * kwlist[] = {"i", "vector", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "iO", (char**)kwlist, &item, &v)) + return NULL; + + if (!check_constraints(self, item, true)) { + return NULL; + } + + vector w(self->f); + if (!convert_list_to_vector(v, self->f, &w)) { + return NULL; + } + char* error; + if (!self->ptr->add_item(item, &w[0], &error)) { + PyErr_SetString(PyExc_Exception, error); + free(error); + return NULL; + } + + Py_RETURN_NONE; +} + +static PyObject * +py_an_on_disk_build(py_annoy *self, PyObject *args, PyObject *kwargs) { + char *filename, *error; + if (!self->ptr) + return NULL; + static char const * kwlist[] = {"fn", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &filename)) + return NULL; + + if (!self->ptr->on_disk_build(filename, &error)) { + PyErr_SetString(PyExc_IOError, error); + free(error); + return NULL; + } + Py_RETURN_TRUE; +} + +static PyObject * +py_an_build(py_annoy *self, PyObject *args, PyObject *kwargs) { + int q; + if (!self->ptr) + return NULL; + static char const * kwlist[] = {"n_trees", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i", (char**)kwlist, &q)) + return NULL; + + bool res; + char* error; + Py_BEGIN_ALLOW_THREADS; + res = 
self->ptr->build(q, &error); + Py_END_ALLOW_THREADS; + if (!res) { + PyErr_SetString(PyExc_Exception, error); + free(error); + return NULL; + } + + Py_RETURN_TRUE; +} + + +static PyObject * +py_an_unbuild(py_annoy *self) { + if (!self->ptr) + return NULL; + + char* error; + if (!self->ptr->unbuild(&error)) { + PyErr_SetString(PyExc_Exception, error); + free(error); + return NULL; + } + + Py_RETURN_TRUE; +} + + +static PyObject * +py_an_unload(py_annoy *self) { + if (!self->ptr) + return NULL; + + self->ptr->unload(); + + Py_RETURN_TRUE; +} + + +static PyObject * +py_an_get_distance(py_annoy *self, PyObject *args) { + int32_t i, j; + if (!self->ptr) + return NULL; + if (!PyArg_ParseTuple(args, "ii", &i, &j)) + return NULL; + + if (!check_constraints(self, i, false) || !check_constraints(self, j, false)) { + return NULL; + } + + double d = self->ptr->get_distance(i,j); + return PyFloat_FromDouble(d); +} + + +static PyObject * +py_an_get_n_items(py_annoy *self) { + if (!self->ptr) + return NULL; + + int32_t n = self->ptr->get_n_items(); + return PyInt_FromLong(n); +} + +static PyObject * +py_an_get_n_trees(py_annoy *self) { + if (!self->ptr) + return NULL; + + int32_t n = self->ptr->get_n_trees(); + return PyInt_FromLong(n); +} + +static PyObject * +py_an_verbose(py_annoy *self, PyObject *args) { + int verbose; + if (!self->ptr) + return NULL; + if (!PyArg_ParseTuple(args, "i", &verbose)) + return NULL; + + self->ptr->verbose((bool)verbose); + + Py_RETURN_TRUE; +} + + +static PyObject * +py_an_set_seed(py_annoy *self, PyObject *args) { + int q; + if (!self->ptr) + return NULL; + if (!PyArg_ParseTuple(args, "i", &q)) + return NULL; + + self->ptr->set_seed(q); + + Py_RETURN_NONE; +} + + +static PyMethodDef AnnoyMethods[] = { + {"load", (PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, "Loads (mmaps) an index from disk."}, + {"save", (PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, "Saves the index to disk."}, + 
{"get_nns_by_item",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to item `i`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."}, + {"get_nns_by_vector",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to vector `vector`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."}, + {"get_item_vector",(PyCFunction)py_an_get_item_vector, METH_VARARGS, "Returns the vector for item `i` that was previously added."}, + {"add_item",(PyCFunction)py_an_add_item, METH_VARARGS | METH_KEYWORDS, "Adds item `i` (any nonnegative integer) with vector `v`.\n\nNote that it will allocate memory for `max(i)+1` items."}, + {"on_disk_build",(PyCFunction)py_an_on_disk_build, METH_VARARGS | METH_KEYWORDS, "Build will be performed with storage on disk instead of RAM."}, + {"build",(PyCFunction)py_an_build, METH_VARARGS | METH_KEYWORDS, "Builds a forest of `n_trees` trees.\n\nMore trees give higher precision when querying. 
After calling `build`,\nno more items can be added."}, + {"unbuild",(PyCFunction)py_an_unbuild, METH_NOARGS, "Unbuilds the tree in order to allows adding new items.\n\nbuild() has to be called again afterwards in order to\nrun queries."}, + {"unload",(PyCFunction)py_an_unload, METH_NOARGS, "Unloads an index from disk."}, + {"get_distance",(PyCFunction)py_an_get_distance, METH_VARARGS, "Returns the distance between items `i` and `j`."}, + {"get_n_items",(PyCFunction)py_an_get_n_items, METH_NOARGS, "Returns the number of items in the index."}, + {"get_n_trees",(PyCFunction)py_an_get_n_trees, METH_NOARGS, "Returns the number of trees in the index."}, + {"verbose",(PyCFunction)py_an_verbose, METH_VARARGS, ""}, + {"set_seed",(PyCFunction)py_an_set_seed, METH_VARARGS, "Sets the seed of Annoy's random number generator."}, + {NULL, NULL, 0, NULL} /* Sentinel */ +}; + + +static PyTypeObject PyAnnoyType = { + PyVarObject_HEAD_INIT(NULL, 0) + "annoy.Annoy", /*tp_name*/ + sizeof(py_annoy), /*tp_basicsize*/ + 0, /*tp_itemsize*/ + (destructor)py_an_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + 0, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + ANNOY_DOC, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + AnnoyMethods, /* tp_methods */ + py_annoy_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)py_an_init, /* tp_init */ + 0, /* tp_alloc */ + py_an_new, /* tp_new */ +}; + +static PyMethodDef module_methods[] = { + {NULL} /* Sentinel */ +}; + +#if PY_MAJOR_VERSION >= 3 + static struct PyModuleDef 
moduledef = { + PyModuleDef_HEAD_INIT, + "annoylib", /* m_name */ + ANNOY_DOC, /* m_doc */ + -1, /* m_size */ + module_methods, /* m_methods */ + NULL, /* m_reload */ + NULL, /* m_traverse */ + NULL, /* m_clear */ + NULL, /* m_free */ + }; +#endif + +PyObject *create_module(void) { + PyObject *m; + + if (PyType_Ready(&PyAnnoyType) < 0) + return NULL; + +#if PY_MAJOR_VERSION >= 3 + m = PyModule_Create(&moduledef); +#else + m = Py_InitModule("annoylib", module_methods); +#endif + + if (m == NULL) + return NULL; + + Py_INCREF(&PyAnnoyType); + PyModule_AddObject(m, "Annoy", (PyObject *)&PyAnnoyType); + return m; +} + +#if PY_MAJOR_VERSION >= 3 + PyMODINIT_FUNC PyInit_annoylib(void) { + return create_module(); // it should return moudule object in py3 + } +#else + PyMODINIT_FUNC initannoylib(void) { + create_module(); + } +#endif + + +// vim: tabstop=2 shiftwidth=2 diff --git a/core/src/index/thirdparty/annoy/src/kissrandom.h b/core/src/index/thirdparty/annoy/src/kissrandom.h new file mode 100644 index 0000000000..9e40110f3e --- /dev/null +++ b/core/src/index/thirdparty/annoy/src/kissrandom.h @@ -0,0 +1,106 @@ +#ifndef KISSRANDOM_H +#define KISSRANDOM_H + +#if defined(_MSC_VER) && _MSC_VER == 1500 +typedef unsigned __int32 uint32_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif + +// KISS = "keep it simple, stupid", but high quality random number generator +// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code" +// http://mathforum.org/kb/message.jspa?messageID=6627731 +// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator) + +// 32 bit KISS +struct Kiss32Random { + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t c; + + // seed must be != 0 + Kiss32Random(uint32_t seed = 123456789) { + x = seed; + y = 362436000; + z = 521288629; + c = 7654321; + } + + uint32_t kiss() { + // Linear congruence generator + x = 69069 * x + 12345; + + // Xor shift + y ^= y << 13; + y ^= y >> 17; + y ^= y << 5; + 
+ // Multiply-with-carry + uint64_t t = 698769069ULL * z + c; + c = t >> 32; + z = (uint32_t) t; + + return x + y + z; + } + inline int flip() { + // Draw random 0 or 1 + return kiss() & 1; + } + inline size_t index(size_t n) { + // Draw random integer between 0 and n-1 where n is at most the number of data points you have + return kiss() % n; + } + inline void set_seed(uint32_t seed) { + x = seed; + } +}; + +// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) ) +struct Kiss64Random { + uint64_t x; + uint64_t y; + uint64_t z; + uint64_t c; + + // seed must be != 0 + Kiss64Random(uint64_t seed = 1234567890987654321ULL) { + x = seed; + y = 362436362436362436ULL; + z = 1066149217761810ULL; + c = 123456123456123456ULL; + } + + uint64_t kiss() { + // Linear congruence generator + z = 6906969069LL*z+1234567; + + // Xor shift + y ^= (y<<13); + y ^= (y>>17); + y ^= (y<<43); + + // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t) + uint64_t t = (x<<58)+c; + c = (x>>6); + x += t; + c += (x +#include +#include +#include + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void *)-1) + +/* Flags for msync. */ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +#ifndef FILE_MAP_EXECUTE +#define FILE_MAP_EXECUTE 0x0020 +#endif + +static int __map_mman_error(const DWORD err, const int deferr) +{ + if (err == 0) + return 0; + //TODO: implement + return err; +} + +static DWORD __map_mmap_prot_page(const int prot) +{ + DWORD protect = 0; + + if (prot == PROT_NONE) + return protect; + + if ((prot & PROT_EXEC) != 0) + { + protect = ((prot & PROT_WRITE) != 0) ? + PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } + else + { + protect = ((prot & PROT_WRITE) != 0) ? 
+ PAGE_READWRITE : PAGE_READONLY; + } + + return protect; +} + +static DWORD __map_mmap_prot_file(const int prot) +{ + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) + return desiredAccess; + + if ((prot & PROT_READ) != 0) + desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) + desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) + desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +inline void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off) +{ + HANDLE fm, h; + + void * map = MAP_FAILED; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4293) +#endif + + const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const off_t maxSize = off + (off_t)len; + + const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ? + (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Usupported protection combinations */ + || prot == PROT_EXEC) + { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? 
+ (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) + { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) + { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +inline int munmap(void *addr, size_t len) +{ + if (UnmapViewOfFile(addr)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +inline int mprotect(void *addr, size_t len, int prot) +{ + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +inline int msync(void *addr, size_t len, int flags) +{ + if (FlushViewOfFile(addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +inline int mlock(const void *addr, size_t len) +{ + if (VirtualLock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +inline int munlock(const void *addr, size_t len) +{ + if (VirtualUnlock((LPVOID)addr, len)) + return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +#if !defined(__MINGW32__) +inline int ftruncate(int fd, unsigned int size) { + if (fd < 0) { + errno = EBADF; + return -1; + } + + HANDLE h = (HANDLE)_get_osfhandle(fd); + unsigned int cur = SetFilePointer(h, 0, NULL, FILE_CURRENT); + if (cur == ~0 || SetFilePointer(h, size, NULL, FILE_BEGIN) == ~0 || !SetEndOfFile(h)) { + int error = GetLastError(); + switch (GetLastError()) { + case ERROR_INVALID_HANDLE: + errno = EBADF; + break; + default: + errno = 
EIO; + break; + } + return -1; + } + + return 0; +} +#endif + +#endif diff --git a/core/src/index/unittest/CMakeLists.txt b/core/src/index/unittest/CMakeLists.txt index 234b75394c..b9ce7ee834 100644 --- a/core/src/index/unittest/CMakeLists.txt +++ b/core/src/index/unittest/CMakeLists.txt @@ -89,6 +89,16 @@ if (NOT TARGET test_idmap) endif () target_link_libraries(test_idmap ${depend_libs} ${unittest_libs} ${basic_libs}) +# +set(annoy_srcs + ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexAnnoy.cpp + ) + +if (NOT TARGET test_annoy) + add_executable(test_annoy test_annoy.cpp ${annoy_srcs} ${util_srcs}) +endif () +target_link_libraries(test_annoy ${depend_libs} ${unittest_libs} ${basic_libs}) + # set(hnsw_srcs ${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexHNSW.cpp @@ -144,6 +154,7 @@ install(TARGETS test_idmap DESTINATION unittest) install(TARGETS test_binaryidmap DESTINATION unittest) install(TARGETS test_sptag DESTINATION unittest) install(TARGETS test_knowhere_common DESTINATION unittest) +install(TARGETS test_annoy DESTINATION unittest) if (KNOWHERE_GPU_VERSION) install(TARGETS test_gpuresource DESTINATION unittest) diff --git a/core/src/index/unittest/test_annoy.cpp b/core/src/index/unittest/test_annoy.cpp new file mode 100644 index 0000000000..77dc49d19a --- /dev/null +++ b/core/src/index/unittest/test_annoy.cpp @@ -0,0 +1,221 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +#include +#include +#include +#include + +#include "knowhere/common/Exception.h" +#include "knowhere/index/vector_index/IndexAnnoy.h" + +#include "unittest/utils.h" + +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; + +int +main() { + int64_t d = 64; // dimension + int64_t nb = 10000; // database size + int64_t nq = 10; // 10000; // nb of queries + faiss::ConcurrentBitsetPtr bitset = std::make_shared(nb); + + int64_t* ids = new int64_t[nb]; + float* xb = new float[d * nb]; + float* xq = new float[d * nq]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) xb[d * i + j] = (float)drand48(); + xb[d * i] += i / 1000.; + ids[i] = i; + } + printf("gen xb and ids done! \n"); + + // srand((unsigned)time(NULL)); + auto random_seed = (unsigned)time(NULL); + printf("delete ids: \n"); + for (int i = 0; i < nq; i++) { + auto tmp = rand_r(&random_seed) % nb; + printf("%d\n", tmp); + // std::cout << "before delete, test result: " << bitset->test(tmp) << std::endl; + bitset->set(tmp); + // std::cout << "after delete, test result: " << bitset->test(tmp) << std::endl; + for (int j = 0; j < d; j++) xq[d * i + j] = xb[d * tmp + j]; + // xq[d * i] += i / 1000.; + } + printf("\n"); + + int k = 4; + int n_trees = 5; + int search_k = 100; + milvus::knowhere::IndexAnnoy index; + milvus::knowhere::DatasetPtr base_dataset = generate_dataset(nb, d, (const void*)xb, ids); + + milvus::knowhere::Config base_conf{ + {milvus::knowhere::meta::DIM, d}, + {milvus::knowhere::meta::TOPK, k}, + {milvus::knowhere::IndexParams::n_trees, n_trees}, + {milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, + }; + milvus::knowhere::DatasetPtr query_dataset = generate_query_dataset(nq, d, (const void*)xq); + milvus::knowhere::Config query_conf{ + {milvus::knowhere::meta::DIM, d}, + {milvus::knowhere::meta::TOPK, k}, + {milvus::knowhere::IndexParams::search_k, search_k}, + }; + + index.BuildAll(base_dataset, base_conf); + + printf("------------sanity 
check----------------\n"); + { // sanity check + auto res = index.Query(query_dataset, query_conf); + printf("Query done!\n"); + const int64_t* I = res->Get(milvus::knowhere::meta::IDS); + float* D = res->Get(milvus::knowhere::meta::DISTANCE); + + printf("I=\n"); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < k; j++) printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("D=\n"); + for (int i = 0; i < 5; i++) { + for (int j = 0; j < k; j++) printf("%7g ", D[i * k + j]); + printf("\n"); + } + } + + printf("---------------search xq-------------\n"); + { // search xq + auto res = index.Query(query_dataset, query_conf); + const int64_t* I = res->Get(milvus::knowhere::meta::IDS); + + printf("I=\n"); + for (int i = 0; i < nq; i++) { + for (int j = 0; j < k; j++) printf("%5ld ", I[i * k + j]); + printf("\n"); + } + } + + printf("----------------search xq with delete------------\n"); + { // search xq with delete + index.SetBlacklist(bitset); + auto res = index.Query(query_dataset, query_conf); + auto I = res->Get(milvus::knowhere::meta::IDS); + + printf("I=\n"); + for (int i = 0; i < nq; i++) { + for (int j = 0; j < k; j++) printf("%5ld ", I[i * k + j]); + printf("\n"); + } + } + + delete[] xb; + delete[] xq; + delete[] ids; + + return 0; +} + +/* +class AnnoyTest : public DataGen, public TestWithParam { + protected: + void + SetUp() override { + IndexType = GetParam(); + std::cout << "IndexType from GetParam() is: " << IndexType << std::endl; + Generate(128, 1000, 5); + index_ = std::make_shared(); + conf = milvus::knowhere::Config{ + {milvus::knowhere::meta::DIM, dim}, + {milvus::knowhere::meta::TOPK, 1}, + {milvus::knowhere::IndexParams::n_trees, 4}, + {milvus::knowhere::IndexParams::search_k, 100}, + {milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2}, + }; + +// Init_with_default(); + } + + protected: + milvus::knowhere::Config conf; + std::shared_ptr index_ = nullptr; + std::string IndexType; +}; + +INSTANTIATE_TEST_CASE_P(AnnoyParameters, 
AnnoyTest, Values("")); + +TEST_P(AnnoyTest, annoy_basic) { + assert(!xb.empty()); + +// index_->Train(base_dataset, conf); + index_->BuildAll(base_dataset, conf); + auto result = index_->Query(query_dataset, conf); + AssertAnns(result, nq, k); + + { + auto ids = result->Get(milvus::knowhere::meta::IDS); + auto dist = result->Get(milvus::knowhere::meta::DISTANCE); + + std::stringstream ss_id; + std::stringstream ss_dist; + for (auto i = 0; i < nq; i++) { + for (auto j = 0; j < k; ++j) { + // ss_id << *ids->data()->GetValues(1, i * k + j) << " "; + // ss_dist << *dists->data()->GetValues(1, i * k + j) << " "; + ss_id << *((int64_t*)(ids) + i * k + j) << " "; + ss_dist << *((float*)(dist) + i * k + j) << " "; + } + ss_id << std::endl; + ss_dist << std::endl; + } + std::cout << "id\n" << ss_id.str() << std::endl; + std::cout << "dist\n" << ss_dist.str() << std::endl; + } +} + +TEST_P(AnnoyTest, annoy_delete) { + assert(!xb.empty()); + +// index_->Train(base_dataset, conf); + index_->BuildAll(base_dataset, conf); + // index_->Add(base_dataset, conf); + faiss::ConcurrentBitsetPtr bitset = std::make_shared(nb); + for (auto i = 0; i < nq; ++ i) { + bitset->set(i); + + auto result = index_->Query(query_dataset, conf); + AssertAnns(result, nq, k); + + { + auto ids = result->Get(milvus::knowhere::meta::IDS); + auto dist = result->Get(milvus::knowhere::meta::DISTANCE); + + std::stringstream ss_id; + std::stringstream ss_dist; + for (auto i = 0; i < nq; i++) { + for (auto j = 0; j < k; ++j) { + // ss_id << *ids->data()->GetValues(1, i * k + j) << " "; + // ss_dist << *dists->data()->GetValues(1, i * k + j) << " "; + ss_id << *((int64_t*)(ids) + i * k + j) << " "; + ss_dist << *((float*)(dist) + i * k + j) << " "; + } + ss_id << std::endl; + ss_dist << std::endl; + } + std::cout << "id\n" << ss_id.str() << std::endl; + std::cout << "dist\n" << ss_dist.str() << std::endl; + } } +} +*/ diff --git a/core/src/server/delivery/request/DeleteByIDRequest.cpp 
b/core/src/server/delivery/request/DeleteByIDRequest.cpp index 9262c8386a..92bd066d42 100644 --- a/core/src/server/delivery/request/DeleteByIDRequest.cpp +++ b/core/src/server/delivery/request/DeleteByIDRequest.cpp @@ -71,6 +71,7 @@ DeleteByIDRequest::OnExecute() { if (table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IDMAP && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_BIN_IDMAP && table_schema.engine_type_ != (int32_t)engine::EngineType::HNSW && + table_schema.engine_type_ != (int32_t)engine::EngineType::ANNOY && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IVFFLAT && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_BIN_IVFFLAT && table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IVFSQ8 && diff --git a/tests/milvus_python_test/test_connect.py b/tests/milvus_python_test/test_connect.py index efc0f92359..14704b657d 100644 --- a/tests/milvus_python_test/test_connect.py +++ b/tests/milvus_python_test/test_connect.py @@ -38,7 +38,7 @@ class TestConnect: if not connect.connected(): milvus = get_milvus(args["handler"]) uri_value = "tcp://%s:%s" % (args["ip"], args["port"]) - milvus.connect(uri=uri_value) + milvus.connect(uri=uri_value, timeout=5) res = milvus.disconnect() with pytest.raises(Exception) as e: res = milvus.disconnect() @@ -181,9 +181,8 @@ class TestConnect: ''' milvus = get_milvus(args["handler"]) uri_value = "tcp://%s:%s" % (args["ip"], args["port"]) - milvus.connect(uri=uri_value) - - milvus.connect(uri=uri_value) + milvus.connect(uri=uri_value, timeout=5) + milvus.connect(uri=uri_value, timeout=5) assert milvus.connected() def test_connect_disconnect_repeatedly_once(self, args): @@ -209,10 +208,10 @@ class TestConnect: times = 10 milvus = get_milvus(args["handler"]) uri_value = "tcp://%s:%s" % (args["ip"], args["port"]) - milvus.connect(uri=uri_value) + milvus.connect(uri=uri_value, timeout=5) for i in range(times): milvus.disconnect() - milvus.connect(uri=uri_value) + 
milvus.connect(uri=uri_value, timeout=5) assert milvus.connected() # TODO: enable diff --git a/tests/milvus_python_test/test_search_vectors.py b/tests/milvus_python_test/test_search_vectors.py index 2033715692..21a482d830 100644 --- a/tests/milvus_python_test/test_search_vectors.py +++ b/tests/milvus_python_test/test_search_vectors.py @@ -851,7 +851,7 @@ class TestSearchBase: 'store_raw_vector': False} # create collection milvus = get_milvus(args["handler"]) - milvus.connect(uri=uri) + milvus.connect(uri=uri, timeout=5) milvus.create_collection(param) vectors, ids = self.init_data(milvus, collection, nb=nb) query_vecs = vectors[nb//2:nb] @@ -864,7 +864,7 @@ class TestSearchBase: for i in range(threads_num): milvus = get_milvus(args["handler"]) - milvus.connect(uri=uri) + milvus.connect(uri=uri, timeout=5) t = threading.Thread(target=search, args=(milvus, )) threads.append(t) t.start() @@ -932,7 +932,7 @@ class TestSearchBase: 'metric_type': MetricType.L2} # create collection milvus = get_milvus(args["handler"]) - milvus.connect(uri=uri) + milvus.connect(uri=uri, timeout=5) milvus.create_collection(param) status, ids = milvus.add_vectors(collection, vectors) assert status.OK() @@ -973,7 +973,7 @@ class TestSearchBase: 'metric_type': MetricType.L2} # create collection milvus = get_milvus(args["handler"]) - milvus.connect(uri=uri) + milvus.connect(uri=uri, timeout=5) milvus.create_collection(param) status, ids = milvus.add_vectors(collection, vectors) assert status.OK()