Annoy support (#1746)

* add annoy source code

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* add annoy knowhere

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* annoy local gtest passed

Signed-off-by: lichengming <chengming.li@zilliz.com>

* fix lint error and update changelog

Signed-off-by: lichengming <chengming.li@zilliz.com>

* fix compile error

Signed-off-by: cmli <chengming.li@zilliz.com>

* Update connect timeout in test cases

Signed-off-by: zw <zw@milvus.io>

* fix some potential bugs

Signed-off-by: cmli <chengming.li@zilliz.com>

* retry ci

Signed-off-by: cmli <chengming.li@zilliz.com>

* rerun ci!

Signed-off-by: cmli <chengming.li@zilliz.com>

* fix errors tested by c++ sdk

Signed-off-by: cmli <chengming.li@zilliz.com>

* fix lint error

Signed-off-by: cmli <chengming.li@zilliz.com>

Co-authored-by: shengjun.li <shengjun.li@zilliz.com>
Co-authored-by: lichengming <chengming.li@zilliz.com>
Co-authored-by: zw <zw@milvus.io>
This commit is contained in:
op-hunter 2020-03-27 09:52:31 +08:00 committed by GitHub
parent 23e2780309
commit 310d5d70bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
34 changed files with 3864 additions and 11 deletions

View File

@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub
- \#1756 Fix memory exhausted during searching
## Feature
- \#261 Integrate ANNOY into Milvus
- \#1603 BinaryFlat add 2 Metric: Substructure and Superstructure
- \#1660 IVF PQ CPU support deleted vectors searching
- \#1661 HNSW support deleted vectors searching

View File

@ -21,3 +21,4 @@
| aws-sdk-cpp | [Apache 2.0](https://github.com/aws/aws-sdk-cpp/blob/master/LICENSE) |
| SPTAG | [MIT](https://github.com/microsoft/SPTAG/blob/master/LICENSE) |
| hnswlib | [Apache 2.0](https://github.com/nmslib/hnswlib/blob/master/LICENSE) |
| annoy | [Apache 2.0](https://github.com/spotify/annoy/blob/master/LICENSE) |

View File

@ -291,6 +291,7 @@ DBImpl::GetTableInfo(const std::string& table_id, TableInfo& table_info) {
{(int32_t)engine::EngineType::FAISS_IVFFLAT, "IVFFLAT"},
{(int32_t)engine::EngineType::FAISS_IVFSQ8, "IVFSQ8"},
{(int32_t)engine::EngineType::NSG_MIX, "NSG"},
{(int32_t)engine::EngineType::ANNOY, "ANNOY"},
{(int32_t)engine::EngineType::FAISS_IVFSQ8H, "IVFSQ8H"},
{(int32_t)engine::EngineType::FAISS_PQ, "PQ"},
{(int32_t)engine::EngineType::SPTAG_KDT, "KDT"},

View File

@ -35,7 +35,8 @@ enum class EngineType {
FAISS_BIN_IDMAP,
FAISS_BIN_IVFFLAT,
HNSW,
MAX_VALUE = HNSW,
ANNOY,
MAX_VALUE = ANNOY,
};
enum class MetricType {

View File

@ -216,6 +216,10 @@ ExecutionEngineImpl::CreatetVecIndex(EngineType type) {
index = vec_index_factory.CreateVecIndex(knowhere::IndexEnum::INDEX_HNSW, mode);
break;
}
case EngineType::ANNOY: {
index = vec_index_factory.CreateVecIndex(knowhere::IndexEnum::INDEX_ANNOY, mode);
break;
}
default: {
ENGINE_LOG_ERROR << "Unsupported index type " << (int)type;
return nullptr;

View File

@ -50,6 +50,7 @@ set(index_srcs
knowhere/index/vector_index/IndexSPTAG.cpp
knowhere/index/vector_index/IndexType.cpp
knowhere/index/vector_index/VecIndexFactory.cpp
knowhere/index/vector_index/IndexAnnoy.cpp
)
set(depend_libs

View File

@ -297,5 +297,21 @@ BinIVFConfAdapter::CheckTrain(Config& oricfg, const IndexMode mode) {
return true;
}
// Validates the build-time configuration of an ANNOY index.
//
// oricfg: user supplied index configuration; must carry an integer `n_trees`.
// mode:   index mode, forwarded unchanged to the generic adapter check.
// Returns the result of ConfAdapter::CheckTrain once the ANNOY specific range check passed.
bool
ANNOYConfAdapter::CheckTrain(Config& oricfg, const IndexMode mode) {
    // A forest needs at least one tree: Annoy's build() stops as soon as the requested number of
    // trees exists, so n_trees == 0 would produce an index that can never return a neighbour.
    static int64_t MIN_NTREES = 1;
    // too large of n_trees takes much time, if there is real requirement, change this threshold.
    static int64_t MAX_NTREES = 16384;

    // Range check helper defined outside this block; it operates on `oricfg`.
    CheckIntByRange(knowhere::IndexParams::n_trees, MIN_NTREES, MAX_NTREES);

    return ConfAdapter::CheckTrain(oricfg, mode);
}
// Validates a search configuration for ANNOY.
// No ANNOY specific rule is applied here; the verdict of the generic adapter is returned as is.
bool
ANNOYConfAdapter::CheckSearch(Config& oricfg, const IndexType type, const IndexMode mode) {
    const bool generic_check_passed = ConfAdapter::CheckSearch(oricfg, type, mode);
    return generic_check_passed;
}
} // namespace knowhere
} // namespace milvus

View File

@ -84,5 +84,14 @@ class HNSWConfAdapter : public ConfAdapter {
CheckSearch(Config& oricfg, const IndexType type, const IndexMode mode) override;
};
// Configuration adapter for the ANNOY index type.
// Overrides both validation hooks of ConfAdapter so ANNOY specific parameters can be checked
// before an index is trained or searched.
class ANNOYConfAdapter : public ConfAdapter {
public:
// Validates the build (train) configuration for the given index mode.
bool
CheckTrain(Config& oricfg, const IndexMode mode) override;
// Validates the search configuration for the given index type and mode.
bool
CheckSearch(Config& oricfg, const IndexType type, const IndexMode mode) override;
};
} // namespace knowhere
} // namespace milvus

View File

@ -46,6 +46,7 @@ AdapterMgr::RegisterAdapter() {
REGISTER_CONF_ADAPTER(ConfAdapter, IndexEnum::INDEX_SPTAG_KDT_RNT, sptag_kdt_adapter);
REGISTER_CONF_ADAPTER(ConfAdapter, IndexEnum::INDEX_SPTAG_BKT_RNT, sptag_bkt_adapter);
REGISTER_CONF_ADAPTER(HNSWConfAdapter, IndexEnum::INDEX_HNSW, hnsw_adapter);
REGISTER_CONF_ADAPTER(ANNOYConfAdapter, IndexEnum::INDEX_ANNOY, annoy_adapter);
}
} // namespace knowhere

View File

@ -0,0 +1,172 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
#include "knowhere/index/vector_index/IndexAnnoy.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <iterator>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "hnswlib/hnswalg.h"
#include "hnswlib/space_ip.h"
#include "hnswlib/space_l2.h"
#include "knowhere/common/Exception.h"
#include "knowhere/common/Log.h"
#include "knowhere/index/vector_index/adapter/VectorAdapter.h"
#include "knowhere/index/vector_index/helpers/FaissIO.h"
namespace milvus {
namespace knowhere {
// Packs the index into a BinarySet made of three entries:
//   "annoy_metric_type" - the raw characters of the metric name (no terminator),
//   "annoy_dim"         - the dimension, copied as sizeof(uint64_t) bytes,
//   "annoy_index_data"  - the bytes exposed by the underlying annoy index.
// Throws when no index has been built or loaded yet. `config` is not used.
BinarySet
IndexAnnoy::Serialize(const Config& config) {
    if (!index_) {
        KNOWHERE_THROW_MSG("index not initialize or trained");
    }

    // Every blob is a new[]-allocated byte array, so all of them share this deleter.
    auto release_bytes = [](uint8_t* raw) { delete[] raw; };

    auto metric_bytes = metric_type_.length();
    std::shared_ptr<uint8_t> metric_blob(new uint8_t[metric_bytes], release_bytes);
    memcpy(metric_blob.get(), metric_type_.data(), metric_type_.length());

    std::shared_ptr<uint8_t> dim_blob(new uint8_t[sizeof(uint64_t)], release_bytes);
    auto dimension = Dim();
    memcpy(dim_blob.get(), &dimension, sizeof(uint64_t));

    auto index_bytes = index_->get_index_length();
    std::shared_ptr<uint8_t> index_blob(new uint8_t[index_bytes], release_bytes);
    memcpy(index_blob.get(), index_->get_index(), (size_t)index_bytes);

    BinarySet res_set;
    res_set.Append("annoy_metric_type", metric_blob, metric_bytes);
    res_set.Append("annoy_dim", dim_blob, sizeof(uint64_t));
    res_set.Append("annoy_index_data", index_blob, index_bytes);
    return res_set;
}
// Rebuilds the index from the three entries written by Serialize():
// "annoy_metric_type", "annoy_dim" and "annoy_index_data".
//
// Throws when the stored dimension blob has an unexpected size, when the stored metric is
// neither L2 nor IP, or when the underlying annoy index refuses the serialized data.
void
IndexAnnoy::Load(const BinarySet& index_binary) {
    // The metric name was stored without a terminator, so restore exactly `size` characters.
    // Growing the string by one extra byte would embed a '\0' and make the comparisons against
    // Metric::L2 / Metric::IP below fail for every serialized index.
    auto metric_type = index_binary.GetByName("annoy_metric_type");
    metric_type_.resize((size_t)metric_type->size);
    memcpy(metric_type_.data(), metric_type->data.get(), (size_t)metric_type->size);

    // Only copy the dimension when the blob has exactly the expected width; trusting the stored
    // size would either overflow `dim` or leave part of it uninitialized.
    auto dim_data = index_binary.GetByName("annoy_dim");
    uint64_t dim = 0;
    if ((size_t)dim_data->size != sizeof(dim)) {
        KNOWHERE_THROW_MSG("invalid size of annoy_dim in serialized annoy index");
    }
    memcpy(&dim, dim_data->data.get(), sizeof(dim));

    if (metric_type_ == Metric::L2) {
        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::Euclidean, ::Kiss64Random>>(dim);
    } else if (metric_type_ == Metric::IP) {
        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::DotProduct, ::Kiss64Random>>(dim);
    } else {
        KNOWHERE_THROW_MSG("metric not supported " + metric_type_);
    }

    auto index_data = index_binary.GetByName("annoy_index_data");
    char* p = nullptr;
    if (!index_->load_index(index_data->data.get(), index_data->size, &p)) {
        // Guard against a failure that did not fill in an error text: constructing a
        // std::string from a null pointer is undefined behaviour.
        std::string error_msg(p != nullptr ? p : "failed to load annoy index");
        free(p);
        KNOWHERE_THROW_MSG(error_msg);
    }
}
// Builds the whole index in one shot from the vectors and ids carried by `dataset_ptr`.
//
// config must provide Metric::TYPE (L2 or IP) and IndexParams::n_trees.
// Calling it again on an already built or loaded index is a no-op.
// Throws when the metric is not supported.
void
IndexAnnoy::BuildAll(const DatasetPtr& dataset_ptr, const Config& config) {
    if (index_) {
        // the index has already been built
        return;
    }

    // Macro defined elsewhere; the code below relies on it providing dim, rows, p_data and p_ids.
    GETTENSORWITHIDS(dataset_ptr)

    metric_type_ = config[Metric::TYPE];
    if (metric_type_ == Metric::L2) {
        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::Euclidean, ::Kiss64Random>>(dim);
    } else if (metric_type_ == Metric::IP) {
        index_ = std::make_shared<AnnoyIndex<int64_t, float, ::DotProduct, ::Kiss64Random>>(dim);
    } else {
        KNOWHERE_THROW_MSG("metric not supported " + metric_type_);
    }

    // 64-bit counter: a plain int would overflow for datasets with more than INT_MAX rows.
    for (int64_t i = 0; i < rows; ++i) {
        index_->add_item(p_ids[i], (const float*)p_data + dim * i);
    }

    index_->build(config[IndexParams::n_trees].get<int64_t>());
}
// Runs a top-k search for every query vector in `dataset_ptr`.
//
// config must provide meta::TOPK and IndexParams::search_k.
// Returns a Dataset holding two malloc'ed arrays of rows * k entries (meta::IDS and
// meta::DISTANCE). When fewer than k neighbours are found for a query, the remaining slots are
// filled with id -1 and the largest finite float so callers never read uninitialized memory.
// Throws when the index has not been built or loaded, or when the result buffers cannot be allocated.
DatasetPtr
IndexAnnoy::Query(const DatasetPtr& dataset_ptr, const Config& config) {
    if (!index_) {
        KNOWHERE_THROW_MSG("index not initialize or trained");
    }

    // Macro defined elsewhere; the code below relies on it providing dim, rows and p_data.
    GETTENSOR(dataset_ptr)
    auto k = config[meta::TOPK].get<int64_t>();
    auto search_k = config[IndexParams::search_k].get<int64_t>();
    auto all_num = rows * k;
    auto p_id = (int64_t*)malloc(all_num * sizeof(int64_t));
    auto p_dist = (float*)malloc(all_num * sizeof(float));
    // Fail before entering the parallel region; exceptions must not escape from inside it.
    if (all_num > 0 && (p_id == nullptr || p_dist == nullptr)) {
        free(p_id);
        free(p_dist);
        KNOWHERE_THROW_MSG("failed to allocate memory for annoy query results");
    }

    faiss::ConcurrentBitsetPtr blacklist = nullptr;
    GetBlacklist(blacklist);

#pragma omp parallel for
    for (int64_t i = 0; i < rows; ++i) {
        std::vector<int64_t> result;
        result.reserve(k);
        std::vector<float> distances;
        distances.reserve(k);
        index_->get_nns_by_vector((const float*)p_data + i * dim, k, search_k, &result, &distances, blacklist);

        // Copy only what was really returned: the search may yield fewer than k neighbours
        // (small index, filtered items), and copying k elements would read past the vectors' size.
        const size_t wanted = static_cast<size_t>(k);
        const size_t found = std::min(std::min(result.size(), distances.size()), wanted);
        int64_t* row_ids = p_id + k * i;
        float* row_dists = p_dist + k * i;
        memcpy(row_ids, result.data(), found * sizeof(int64_t));
        memcpy(row_dists, distances.data(), found * sizeof(float));
        for (size_t slot = found; slot < wanted; ++slot) {
            row_ids[slot] = -1;
            row_dists[slot] = std::numeric_limits<float>::max();
        }
    }

    auto ret_ds = std::make_shared<Dataset>();
    ret_ds->Set(meta::IDS, p_id);
    ret_ds->Set(meta::DISTANCE, p_dist);
    return ret_ds;
}
// Number of items held by the underlying annoy index.
// Throws when no index has been built or loaded.
int64_t
IndexAnnoy::Count() {
    if (index_) {
        return index_->get_n_items();
    }
    KNOWHERE_THROW_MSG("index not initialize");
}
// Vector dimension reported by the underlying annoy index.
// Throws when no index has been built or loaded.
int64_t
IndexAnnoy::Dim() {
    if (index_) {
        return index_->get_dim();
    }
    KNOWHERE_THROW_MSG("index not initialize");
}
// Size estimate of the index: dimension * item count * sizeof(float).
// The value is computed on first use and cached in index_size_ (-1 marks "not computed yet").
int64_t
IndexAnnoy::IndexSize() {
    if (index_size_ == -1) {
        index_size_ = Dim() * Count() * sizeof(float);
    }
    return index_size_;
}
} // namespace knowhere
} // namespace milvus

View File

@ -0,0 +1,74 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
#pragma once
#include <memory>
#include <mutex>
#include "annoy/src/annoylib.h"
#include "annoy/src/kissrandom.h"
#include "knowhere/common/Exception.h"
#include "knowhere/index/vector_index/VecIndex.h"
namespace milvus {
namespace knowhere {
// VecIndex implementation backed by an Annoy index.
// The index can only be created in one shot through BuildAll() or restored through Load();
// the incremental entry points (Train / Add / AddWithoutIds) always throw.
class IndexAnnoy : public VecIndex {
public:
// Tags the instance with the ANNOY index type; no underlying index exists until BuildAll() or Load().
IndexAnnoy() {
index_type_ = IndexEnum::INDEX_ANNOY;
}
// Serializes the index into a BinarySet.
BinarySet
Serialize(const Config& config = Config()) override;
// Restores the index from a BinarySet.
void
Load(const BinarySet& index_binary) override;
// Builds the complete index from the given dataset and configuration.
void
BuildAll(const DatasetPtr& dataset_ptr, const Config& config) override;
// Not supported: always throws, callers must use BuildAll().
void
Train(const DatasetPtr& dataset_ptr, const Config& config) override {
KNOWHERE_THROW_MSG("Annoy not support build item dynamically, please invoke BuildAll interface.");
}
// Not supported: always throws, callers must use BuildAll().
void
Add(const DatasetPtr& dataset_ptr, const Config& config) override {
KNOWHERE_THROW_MSG("Annoy not support add item dynamically, please invoke BuildAll interface.");
}
// Not supported: always throws.
void
AddWithoutIds(const DatasetPtr&, const Config&) override {
KNOWHERE_THROW_MSG("Incremental index is not supported");
}
// Searches the index for the query vectors in dataset_ptr.
DatasetPtr
Query(const DatasetPtr& dataset_ptr, const Config& config) override;
// Number of items in the index.
int64_t
Count() override;
// Dimension of the indexed vectors.
int64_t
Dim() override;
// Size estimate of the index.
int64_t
IndexSize() override;
private:
// Metric name the index was built or loaded with.
MetricType metric_type_;
// Underlying annoy index; stays null until BuildAll() or Load() succeeds.
std::shared_ptr<AnnoyIndexInterface<int64_t, float>> index_ = nullptr;
};
} // namespace knowhere
} // namespace milvus

View File

@ -34,6 +34,7 @@ static std::unordered_map<int32_t, std::string> old_index_type_str_map = {
{(int32_t)OldIndexType::SPTAG_KDT_RNT_CPU, IndexEnum::INDEX_SPTAG_KDT_RNT},
{(int32_t)OldIndexType::SPTAG_BKT_RNT_CPU, IndexEnum::INDEX_SPTAG_BKT_RNT},
{(int32_t)OldIndexType::HNSW, IndexEnum::INDEX_HNSW},
{(int32_t)OldIndexType::ANNOY, IndexEnum::INDEX_ANNOY},
{(int32_t)OldIndexType::FAISS_BIN_IDMAP, IndexEnum::INDEX_FAISS_BIN_IDMAP},
{(int32_t)OldIndexType::FAISS_BIN_IVFLAT_CPU, IndexEnum::INDEX_FAISS_BIN_IVFFLAT},
};
@ -49,6 +50,7 @@ static std::unordered_map<std::string, int32_t> str_old_index_type_map = {
{IndexEnum::INDEX_SPTAG_KDT_RNT, (int32_t)OldIndexType::SPTAG_KDT_RNT_CPU},
{IndexEnum::INDEX_SPTAG_BKT_RNT, (int32_t)OldIndexType::SPTAG_BKT_RNT_CPU},
{IndexEnum::INDEX_HNSW, (int32_t)OldIndexType::HNSW},
{IndexEnum::INDEX_ANNOY, (int32_t)OldIndexType::ANNOY},
{IndexEnum::INDEX_FAISS_BIN_IDMAP, (int32_t)OldIndexType::FAISS_BIN_IDMAP},
{IndexEnum::INDEX_FAISS_BIN_IVFFLAT, (int32_t)OldIndexType::FAISS_BIN_IVFLAT_CPU},
};

View File

@ -34,6 +34,7 @@ enum class OldIndexType {
FAISS_IVFPQ_MIX,
SPTAG_BKT_RNT_CPU,
HNSW,
ANNOY,
FAISS_BIN_IDMAP = 100,
FAISS_BIN_IVFLAT_CPU = 101,
};
@ -54,6 +55,7 @@ constexpr const char* INDEX_NSG = "NSG";
constexpr const char* INDEX_SPTAG_KDT_RNT = "SPTAG_KDT_RNT";
constexpr const char* INDEX_SPTAG_BKT_RNT = "SPTAG_BKT_RNT";
constexpr const char* INDEX_HNSW = "HNSW";
constexpr const char* INDEX_ANNOY = "ANNOY";
} // namespace IndexEnum
enum class IndexMode { MODE_CPU = 0, MODE_GPU = 1 };

View File

@ -13,6 +13,7 @@
#include "knowhere/common/Exception.h"
#include "knowhere/common/Log.h"
#include "knowhere/index/vector_index/IndexAnnoy.h"
#include "knowhere/index/vector_index/IndexBinaryIDMAP.h"
#include "knowhere/index/vector_index/IndexBinaryIVF.h"
#include "knowhere/index/vector_index/IndexHNSW.h"
@ -78,6 +79,8 @@ VecIndexFactory::CreateVecIndex(const IndexType& type, const IndexMode mode) {
return std::make_shared<knowhere::CPUSPTAGRNG>("BKT");
} else if (type == IndexEnum::INDEX_HNSW) {
return std::make_shared<knowhere::IndexHNSW>();
} else if (type == IndexEnum::INDEX_ANNOY) {
return std::make_shared<knowhere::IndexAnnoy>();
} else {
return nullptr;
}

View File

@ -44,6 +44,10 @@ constexpr const char* candidate = "candidate_pool_size";
constexpr const char* efConstruction = "efConstruction";
constexpr const char* M = "M";
constexpr const char* ef = "ef";
// Annoy Params
constexpr const char* n_trees = "n_trees";
constexpr const char* search_k = "search_k";
} // namespace IndexParams
namespace Metric {

202
core/src/index/thirdparty/annoy/LICENSE vendored Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,15 @@
How to release
--------------
1. Make sure you're on master. `git checkout master && git fetch && git reset --hard origin/master`
1. Update `setup.py` to the newest version, `git add setup.py && git commit -m "version 1.2.3"`
1. `python setup.py sdist bdist_wheel`
1. `git tag -a v1.2.3 -m "version 1.2.3"`
1. `git push --tags origin master` to push the latest version to GitHub
1. Go to https://github.com/spotify/annoy/releases and click "Draft a new release"
1. `twine upload dist/annoy-1.2.3*`
TODO
----
* Wheel

View File

@ -0,0 +1,14 @@
from annoy import AnnoyIndex

# Build a tiny 3-dimensional angular index and persist it to disk.
a = AnnoyIndex(3, 'angular')
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)
a.save('test.tree')

# Re-open the saved file through a second index object.
# The metric is passed explicitly: relying on the implicit default is deprecated in Annoy,
# and the metric has to match the one the file was built with.
b = AnnoyIndex(3, 'angular')
b.load('test.tree')

print(b.get_nns_by_item(0, 100))
print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100))

View File

@ -0,0 +1,176 @@
/*
* precision_test.cpp
*
* Created on: Jul 13, 2016
* Author: Claudio Sanhueza
* Contact: csanhuezalobos@gmail.com
*/
#include <iostream>
#include <iomanip>
#include "../src/kissrandom.h"
#include "../src/annoylib.h"
#include <chrono>
#include <algorithm>
#include <map>
#include <random>
// Builds an angular Annoy index of n random f-dimensional vectors, saves it to "precision.tree"
// and then measures, for prec_n random items, how many of the exact K nearest neighbours are
// found (and how long the lookup takes) for several result-size limits.
//
// f: number of features per vector.
// n: number of vectors to index.
// Always returns 0.
int precision(int f=40, int n=1000000){
    std::chrono::high_resolution_clock::time_point t_start, t_end;

    std::default_random_engine generator;
    std::normal_distribution<double> distribution(0.0, 1.0);

    //******************************************************
    //Building the tree
    AnnoyIndex<int, double, Angular, Kiss32Random> t(f);

    std::cout << "Building index ... be patient !!" << std::endl;
    std::cout << "\"Trees that are slow to grow bear the best fruit\" (Moliere)" << std::endl;

    // One reusable buffer instead of a malloc per item that was never freed;
    // add_item copies the values into the index.
    std::vector<double> vec(f);
    for(int i=0; i<n; ++i){
        for(int z=0; z<f; ++z){
            vec[z] = (distribution(generator));
        }
        t.add_item(i, vec.data());
        std::cout << "Loading objects ...\t object: "<< i+1 << "\tProgress:"<< std::fixed << std::setprecision(2) << (double) i / (double)(n + 1) * 100 << "%\r";
    }
    std::cout << std::endl;

    std::cout << "Building index num_trees = 2 * num_features ...";
    t_start = std::chrono::high_resolution_clock::now();
    t.build(2 * f);
    t_end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::seconds>( t_end - t_start ).count();
    std::cout << " Done in "<< duration << " secs." << std::endl;

    std::cout << "Saving index ...";
    t.save("precision.tree");
    std::cout << " Done" << std::endl;

    //******************************************************
    std::vector<int> limits = {10, 100, 1000, 10000};
    int K=10;
    int prec_n = 1000;

    std::map<int, double> prec_sum;
    std::map<int, double> time_sum;
    std::vector<int> closest;

    //init precision and timers map
    for(std::vector<int>::iterator it = limits.begin(); it!=limits.end(); ++it){
        prec_sum[(*it)] = 0.0;
        time_sum[(*it)] = 0.0;
    }

    // doing the work
    for(int i=0; i<prec_n; ++i){
        //select a random node
        int j = rand() % n;

        std::cout << "finding nbs for " << j << std::endl;

        // getting the K closest
        t.get_nns_by_item(j, K, n, &closest, nullptr);

        std::vector<int> toplist;
        std::vector<int> intersection;

        for(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){
            t_start = std::chrono::high_resolution_clock::now();
            t.get_nns_by_item(j, (*limit), (size_t) -1, &toplist, nullptr); //search_k defaults to "n_trees * n" if not provided.
            t_end = std::chrono::high_resolution_clock::now();
            auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t_end - t_start ).count();

            //intersecting results
            std::sort(closest.begin(), closest.end(), std::less<int>());
            std::sort(toplist.begin(), toplist.end(), std::less<int>());
            intersection.resize(std::max(closest.size(), toplist.size()));
            std::vector<int>::iterator it_set = std::set_intersection(closest.begin(), closest.end(), toplist.begin(), toplist.end(), intersection.begin());
            intersection.resize(it_set-intersection.begin());

            // storing metrics
            int found = intersection.size();
            double hitrate = found / (double) K;
            prec_sum[(*limit)] += hitrate;

            time_sum[(*limit)] += duration;

            //deallocate memory (std:: qualified: do not depend on a using-declaration from annoylib.h)
            std::vector<int>().swap(intersection);
            std::vector<int>().swap(toplist);
        }

        //print resulting metrics
        // time_sum is accumulated in milliseconds, hence the 1e-03 factor to report seconds.
        for(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){
            std::cout << "limit: " << (*limit) << "\tprecision: "<< std::fixed << std::setprecision(2) << (100.0 * prec_sum[(*limit)] / (i + 1)) << "% \tavg. time: "<< std::fixed<< std::setprecision(6) << (time_sum[(*limit)] / (i + 1)) * 1e-03 << "s" << std::endl;
        }

        // get_nns_by_item appends to its output vector, so start the next round from empty.
        closest.clear(); std::vector<int>().swap(closest);
    }

    std::cout << "\nDone" << std::endl;
    return 0;
}
// Prints the command line usage of the precision example to stdout, followed by a blank line.
void help(){
    static const char* const usage_lines[] = {
        "Annoy Precision C++ example",
        "Usage:",
        "(default) ./precision",
        "(using parameters) ./precision num_features num_nodes",
    };
    for (const char* line : usage_lines) {
        std::cout << line << std::endl;
    }
    std::cout << std::endl;
}
// Echoes the parameters the example is about to run with.
// f: number of features per vector; n: number of vectors (nodes) to index.
void feedback(int f, int n){
    std::cout<<"Running precision example with:" << std::endl;
    std::cout<<"num. features: "<< f << std::endl;
    std::cout<<"num. nodes: "<< n << std::endl;
    std::cout << std::endl;
}
// Entry point of the precision example.
//   ./precision                          runs with the defaults (40 features, 1000000 nodes)
//   ./precision num_features num_nodes   runs with the given positive values
// Any other argument count, or a non-positive / non-numeric value, prints the usage and fails.
int main(int argc, char **argv) {
    // Defaults, kept in one place so the banner and the run cannot drift apart.
    int f = 40;
    int n = 1000000;

    if(argc == 3){
        f = atoi(argv[1]);
        n = atoi(argv[2]);
        // atoi yields 0 for non-numeric input; n == 0 would later divide by zero in rand() % n.
        if(f <= 0 || n <= 0){
            help();
            return EXIT_FAILURE;
        }
    }
    else if(argc != 1){
        help();
        return EXIT_FAILURE;
    }

    feedback(f,n);
    precision(f, n);

    return EXIT_SUCCESS;
}

View File

@ -0,0 +1,46 @@
from __future__ import print_function
import random, time
from annoy import AnnoyIndex

try:
    xrange
except NameError:
    # Python 3 compat
    xrange = range

# Index size and vector dimensionality.
n, f = 100000, 40

# Fill an angular index with n random gaussian vectors, build 2 * f trees and save it.
t = AnnoyIndex(f, 'angular')
for i in xrange(n):
    v = [random.gauss(0, 1) for _ in xrange(f)]
    t.add_item(i, v)

t.build(2 * f)
t.save('test.tree')

# search_k budgets to evaluate, and how many neighbours are requested per query.
limits = [10, 100, 1000, 10000]
k = 10
prec_n = 1000
prec_sum = {}
time_sum = {}

for i in xrange(prec_n):
    j = random.randrange(0, n)

    # Reference answer: search with a budget of n nodes.
    closest = set(t.get_nns_by_item(j, k, n))
    for limit in limits:
        started = time.time()
        toplist = t.get_nns_by_item(j, k, limit)
        elapsed = time.time() - started

        hits = len(closest.intersection(toplist))
        hitrate = 1.0 * hits / k
        prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
        time_sum[limit] = time_sum.get(limit, 0.0) + elapsed

    # Running averages over the i + 1 queries evaluated so far.
    for limit in limits:
        queries_done = i + 1
        print('limit: %-9d precision: %6.2f%% avg time: %.6fs'
              % (limit, 100.0 * prec_sum[limit] / queries_done,
                 time_sum[limit] / queries_done))

View File

@ -0,0 +1,7 @@
#!/bin/bash
# Compiles the C++ precision example (precision_test.cpp) into ./precision_test.

# Stop at the first failing command so "Done" is only printed after a successful build.
set -e

echo "compiling precision example..."
# Run the compiler directly; routing the command line through eval is unnecessary.
g++ precision_test.cpp -o precision_test -std=c++11
echo "Done"

View File

@ -0,0 +1,10 @@
from annoy import AnnoyIndex

# Index the three unit vectors of a 3-dimensional space under the angular metric.
index = AnnoyIndex(3, 'angular')
for item_id, vector in enumerate(([1, 0, 0], [0, 1, 0], [0, 0, 1])):
    index.add_item(item_id, vector)
index.build(-1)

# Query once by stored item and once by an arbitrary vector.
print(index.get_nns_by_item(0, 100))
print(index.get_nns_by_vector([1.0, 0.5, 0.5], 100))

View File

@ -0,0 +1,92 @@
#include "annoylib.h"
#include "kissrandom.h"
namespace GoAnnoy {
class AnnoyIndex {
protected:
::AnnoyIndexInterface<int32_t, float> *ptr;
int f;
public:
~AnnoyIndex() {
delete ptr;
};
void addItem(int item, const float* w) {
ptr->add_item(item, w);
};
void build(int q) {
ptr->build(q);
};
bool save(const char* filename, bool prefault) {
return ptr->save(filename, prefault);
};
bool save(const char* filename) {
return ptr->save(filename, true);
};
void unload() {
ptr->unload();
};
bool load(const char* filename, bool prefault) {
return ptr->load(filename, prefault);
};
bool load(const char* filename) {
return ptr->load(filename, true);
};
float getDistance(int i, int j) {
return ptr->get_distance(i, j);
};
void getNnsByItem(int item, int n, int search_k, vector<int32_t>* result, vector<float>* distances) {
ptr->get_nns_by_item(item, n, search_k, result, distances);
};
void getNnsByVector(const float* w, int n, int search_k, vector<int32_t>* result, vector<float>* distances) {
ptr->get_nns_by_vector(w, n, search_k, result, distances);
};
void getNnsByItem(int item, int n, int search_k, vector<int32_t>* result) {
ptr->get_nns_by_item(item, n, search_k, result, NULL);
};
void getNnsByVector(const float* w, int n, int search_k, vector<int32_t>* result) {
ptr->get_nns_by_vector(w, n, search_k, result, NULL);
};
int getNItems() {
return (int)ptr->get_n_items();
};
void verbose(bool v) {
ptr->verbose(v);
};
void getItem(int item, vector<float> *v) {
v->resize(this->f);
ptr->get_item(item, &v->front());
};
bool onDiskBuild(const char* filename) {
return ptr->on_disk_build(filename);
};
};
class AnnoyIndexAngular : public AnnoyIndex
{
public:
AnnoyIndexAngular(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::Angular, ::Kiss64Random>(f);
this->f = f;
}
};
class AnnoyIndexEuclidean : public AnnoyIndex {
public:
AnnoyIndexEuclidean(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::Euclidean, ::Kiss64Random>(f);
this->f = f;
}
};
class AnnoyIndexManhattan : public AnnoyIndex {
public:
AnnoyIndexManhattan(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::Manhattan, ::Kiss64Random>(f);
this->f = f;
}
};
}

View File

@ -0,0 +1,96 @@
// SWIG interface for the Go bindings: wraps the classes declared in
// annoygomodule.h and defines how Go slices and strings are converted to the
// C++ argument types used by that header.
%module annoyindex
%{
#include "annoygomodule.h"
%}
// const float *
// A Go []float32 is copied element by element into a local vector<float>,
// and the wrapped function receives a pointer to that copy.
// NOTE(review): &w[0] is taken even when the slice is empty - confirm callers never pass one.
%typemap(gotype) (const float *) "[]float32"
%typemap(in) (const float *)
%{
float *v;
vector<float> w;
v = (float *)$input.array;
for (int i = 0; i < $input.len; i++) {
w.push_back(v[i]);
}
$1 = &w[0];
%}
// vector<int32_t> *
// Output parameter: a fresh vector is allocated for the call (in), its
// contents are copied into a malloc'ed intgo array stored in the Go slice
// header (argout), and the temporary vector is deleted (freearg).
// NOTE(review): the malloc result is not checked.
%typemap(gotype) (vector<int32_t> *) "*[]int"
%typemap(in) (vector<int32_t> *)
%{
$1 = new vector<int32_t>();
%}
%typemap(freearg) (vector<int32_t> *)
%{
delete $1;
%}
%typemap(argout) (vector<int32_t> *)
%{
{
$input->len = $1->size();
$input->cap = $1->size();
$input->array = malloc($input->len * sizeof(intgo));
for (int i = 0; i < $1->size(); i++) {
((intgo *)$input->array)[i] = (intgo)(*$1)[i];
}
}
%}
// vector<float> *
// Same output-parameter scheme as above, with float elements.
%typemap(gotype) (vector<float> *) "*[]float32"
%typemap(in) (vector<float> *)
%{
$1 = new vector<float>();
%}
%typemap(freearg) (vector<float> *)
%{
delete $1;
%}
%typemap(argout) (vector<float> *)
%{
{
$input->len = $1->size();
$input->cap = $1->size();
$input->array = malloc($input->len * sizeof(float));
for (int i = 0; i < $1->size(); i++) {
((float *)$input->array)[i] = (float)(*$1)[i];
}
}
%}
// const char *
// A Go string is copied into a zero-filled buffer one byte longer than the
// string, which keeps it NUL-terminated; the buffer is freed after the call.
%typemap(gotype) (const char *) "string"
%typemap(in) (const char *)
%{
$1 = (char *)calloc((((_gostring_)$input).n + 1), sizeof(char));
strncpy($1, (((_gostring_)$input).p), ((_gostring_)$input).n);
%}
%typemap(freearg) (const char *)
%{
free($1);
%}
/* Let's just grab the original header file here */
%include "annoygomodule.h"
// NOTE(review): the header declares GoAnnoy::AnnoyIndexAngular / Euclidean /
// Manhattan, not GoAnnoyIndex*, and these directives come after the %include -
// confirm that they have any effect.
%feature("notabstract") GoAnnoyIndexAngular;
%feature("notabstract") GoAnnoyIndexEuclidean;
%feature("notabstract") GoAnnoyIndexManhattan;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,318 @@
// Copyright (c) 2016 Boris Nagaev
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#include <cstring>
#include <typeinfo>
#include <lua.hpp>
#include "annoylib.h"
#include "kissrandom.h"
// Compatibility shims for the Lua C API: when LUA_VERSION_NUM is 501 the
// functions are registered with luaL_register and lengths are read with
// lua_objlen; every other version uses luaL_setfuncs and lua_rawlen.
#if LUA_VERSION_NUM == 501
#define compat_setfuncs(L, funcs) luaL_register(L, NULL, funcs)
#define compat_rawlen lua_objlen
#else
#define compat_setfuncs(L, funcs) luaL_setfuncs(L, funcs, 0)
#define compat_rawlen lua_rawlen
#endif
template<typename Distance>
class LuaAnnoy {
public:
typedef int32_t AnnoyS;
typedef float AnnoyT;
typedef AnnoyIndex<AnnoyS, AnnoyT, Distance, Kiss64Random> Impl;
typedef LuaAnnoy<Distance> ThisClass;
class LuaArrayProxy {
public:
LuaArrayProxy(lua_State* L, int object, int f)
: L_(L)
, object_(object)
{
luaL_checktype(L, object, LUA_TTABLE);
int v_len = compat_rawlen(L, object);
luaL_argcheck(L, v_len == f, object, "Length of v != f");
}
double operator[](int index) const {
lua_rawgeti(L_, object_, index + 1);
double result = lua_tonumber(L_, -1);
lua_pop(L_, 1);
return result;
}
private:
lua_State* L_;
int object_;
};
static void toVector(lua_State* L, int object, int f, AnnoyT* dst) {
LuaArrayProxy proxy(L, object, f);
for (int i = 0; i < f; i++) {
dst[i] = proxy[i];
}
}
template <typename Vector>
static void pushVector(lua_State* L, const Vector& v) {
lua_createtable(L, v.size(), 0);
for (int j = 0; j < v.size(); j++) {
lua_pushnumber(L, v[j]);
lua_rawseti(L, -2, j + 1);
}
}
static const char* typeAsString() {
return typeid(Impl).name();
}
static Impl* getAnnoy(lua_State* L, int object) {
return reinterpret_cast<Impl*>(
luaL_checkudata(L, object, typeAsString())
);
}
static int getItemIndex(lua_State* L, int object, int size = -1) {
int item = luaL_checkinteger(L, object);
luaL_argcheck(L, item >= 0, object, "Index must be >= 0");
if (size != -1) {
luaL_argcheck(L, item < size, object, "Index must be < size");
}
return item;
}
static int gc(lua_State* L) {
Impl* self = getAnnoy(L, 1);
self->~Impl();
return 0;
}
static int tostring(lua_State* L) {
Impl* self = getAnnoy(L, 1);
lua_pushfstring(
L,
"annoy.AnnoyIndex object (%dx%d, %s distance)",
self->get_n_items(), self->get_f(), Distance::name()
);
return 1;
}
static int add_item(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int item = getItemIndex(L, 2);
self->add_item_impl(item, LuaArrayProxy(L, 3, self->get_f()));
return 0;
}
static int build(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int n_trees = luaL_checkinteger(L, 2);
self->build(n_trees);
lua_pushboolean(L, true);
return 1;
}
static int on_disk_build(lua_State* L) {
Impl* self = getAnnoy(L, 1);
const char* filename = luaL_checkstring(L, 2);
self->on_disk_build(filename);
lua_pushboolean(L, true);
return 1;
}
static int save(lua_State* L) {
int nargs = lua_gettop(L);
Impl* self = getAnnoy(L, 1);
const char* filename = luaL_checkstring(L, 2);
bool prefault = true;
if (nargs >= 3) {
prefault = lua_toboolean(L, 3);
}
self->save(filename, prefault);
lua_pushboolean(L, true);
return 1;
}
static int load(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int nargs = lua_gettop(L);
const char* filename = luaL_checkstring(L, 2);
bool prefault = true;
if (nargs >= 3) {
prefault = lua_toboolean(L, 3);
}
if (!self->load(filename, prefault)) {
return luaL_error(L, "Can't load file: %s", filename);
}
lua_pushboolean(L, true);
return 1;
}
static int unload(lua_State* L) {
Impl* self = getAnnoy(L, 1);
self->unload();
lua_pushboolean(L, true);
return 1;
}
struct Searcher {
std::vector<AnnoyS> result;
std::vector<AnnoyT> distances;
Impl* self;
int n;
int search_k;
bool include_distances;
Searcher(lua_State* L) {
int nargs = lua_gettop(L);
self = getAnnoy(L, 1);
n = luaL_checkinteger(L, 3);
search_k = -1;
if (nargs >= 4) {
search_k = luaL_checkinteger(L, 4);
}
include_distances = false;
if (nargs >= 5) {
include_distances = lua_toboolean(L, 5);
}
}
int pushResults(lua_State* L) {
pushVector(L, result);
if (include_distances) {
pushVector(L, distances);
}
return include_distances ? 2 : 1;
}
};
static int get_nns_by_item(lua_State* L) {
Searcher s(L);
int item = getItemIndex(L, 2, s.self->get_n_items());
s.self->get_nns_by_item(item, s.n, s.search_k, &s.result,
s.include_distances ? &s.distances : NULL);
return s.pushResults(L);
}
static int get_nns_by_vector(lua_State* L) {
Searcher s(L);
std::vector<AnnoyT> _vec(s.self->get_f());
AnnoyT* vec = &(_vec[0]);
toVector(L, 2, s.self->get_f(), vec);
s.self->get_nns_by_vector(vec, s.n, s.search_k, &s.result,
s.include_distances ? &s.distances : NULL);
return s.pushResults(L);
}
static int get_item_vector(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int item = getItemIndex(L, 2, self->get_n_items());
std::vector<AnnoyT> _vec(self->get_f());
AnnoyT* vec = &(_vec[0]);
self->get_item(item, vec);
pushVector(L, _vec);
return 1;
}
static int get_distance(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int i = getItemIndex(L, 2, self->get_n_items());
int j = getItemIndex(L, 3, self->get_n_items());
AnnoyT distance = self->get_distance(i, j);
lua_pushnumber(L, distance);
return 1;
}
static int get_n_items(lua_State* L) {
Impl* self = getAnnoy(L, 1);
lua_pushnumber(L, self->get_n_items());
return 1;
}
static const luaL_Reg* getMetatable() {
static const luaL_Reg funcs[] = {
{"__gc", &ThisClass::gc},
{"__tostring", &ThisClass::tostring},
{NULL, NULL},
};
return funcs;
}
static const luaL_Reg* getMethods() {
static const luaL_Reg funcs[] = {
{"add_item", &ThisClass::add_item},
{"build", &ThisClass::build},
{"save", &ThisClass::save},
{"load", &ThisClass::load},
{"unload", &ThisClass::unload},
{"get_nns_by_item", &ThisClass::get_nns_by_item},
{"get_nns_by_vector", &ThisClass::get_nns_by_vector},
{"get_item_vector", &ThisClass::get_item_vector},
{"get_distance", &ThisClass::get_distance},
{"get_n_items", &ThisClass::get_n_items},
{"on_disk_build", &ThisClass::on_disk_build},
{NULL, NULL},
};
return funcs;
}
static void createNew(lua_State* L, int f) {
void* self = lua_newuserdata(L, sizeof(Impl));
if (luaL_newmetatable(L, typeAsString())) {
compat_setfuncs(L, getMetatable());
lua_newtable(L);
compat_setfuncs(L, getMethods());
lua_setfield(L, -2, "__index");
}
new (self) Impl(f);
lua_setmetatable(L, -2);
}
};
// Lua-visible constructor AnnoyIndex(f [, metric]).
// `metric` defaults to "angular"; "euclidean" and "manhattan" are the other
// accepted names. Pushes the new index userdata, or raises a Lua error for an
// unknown metric.
static int lua_an_make(lua_State* L) {
  const int f = luaL_checkinteger(L, 1);
  const char* metric = (lua_gettop(L) >= 2) ? luaL_checkstring(L, 2) : "angular";

  if (strcmp(metric, "angular") == 0) {
    LuaAnnoy<Angular>::createNew(L, f);
  } else if (strcmp(metric, "euclidean") == 0) {
    LuaAnnoy<Euclidean>::createNew(L, f);
  } else if (strcmp(metric, "manhattan") == 0) {
    LuaAnnoy<Manhattan>::createNew(L, f);
  } else {
    return luaL_error(L, "Unknown metric: %s", metric);
  }
  return 1;
}
// Functions placed in the table returned by luaopen_annoy.
static const luaL_Reg LUA_ANNOY_FUNCS[] = {
  {"AnnoyIndex", lua_an_make},
  {NULL, NULL},
};

// Module entry point (C linkage): pushes a new table containing
// LUA_ANNOY_FUNCS and returns it as the single result.
extern "C" int luaopen_annoy(lua_State* L) {
  lua_newtable(L);
  compat_setfuncs(L, LUA_ANNOY_FUNCS);
  return 1;
}
// vim: tabstop=2 shiftwidth=2

View File

@ -0,0 +1,632 @@
// Copyright (c) 2013 Spotify AB
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#include "annoylib.h"
#include "kissrandom.h"
#include "Python.h"
#include "structmember.h"
#include <exception>
#if defined(_MSC_VER) && _MSC_VER == 1500
typedef signed __int32 int32_t;
#else
#include <stdint.h>
#endif
// Build-time description of the AVX support selected through USE_AVX512 / USE_AVX128.
#if defined(USE_AVX512)
#define AVX_INFO "Using 512-bit AVX instructions"
#elif defined(USE_AVX128)
#define AVX_INFO "Using 128-bit AVX instructions"
#else
#define AVX_INFO "Not using AVX instructions"
#endif
// Build-time description of the compiler family.
#if defined(_MSC_VER)
#define COMPILER_INFO "Compiled using MSC"
#elif defined(__GNUC__)
#define COMPILER_INFO "Compiled on GCC"
#else
#define COMPILER_INFO "Compiled on unknown platform"
#endif
// Documentation string used for both the Annoy type (tp_doc) and the module (m_doc).
#define ANNOY_DOC (COMPILER_INFO ". " AVX_INFO ".")
// Marker used below to select Python 3 specific code.
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
// Fallback for Python headers that do not define Py_TYPE.
#ifndef Py_TYPE
#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
#endif
// On Python 3 the PyInt_FromLong calls in this file are routed to PyLong_FromLong.
#ifdef IS_PY3K
#define PyInt_FromLong PyLong_FromLong
#endif
template class AnnoyIndexInterface<int32_t, float>;
// Adapter that exposes a Hamming-distance index through the float-based
// AnnoyIndexInterface (composition, not inheritance of the index itself).
// Incoming float vectors are read as bit vectors - a component counts as 1
// when it is > 0.5 - and packed 64 bits per uint64_t word.
// This is questionable from a performance point of view. Should reconsider this solution.
class HammingWrapper : public AnnoyIndexInterface<int32_t, float> {
private:
  int32_t _f_external, _f_internal;  // external length in floats / internal length in 64-bit words
  AnnoyIndex<int32_t, uint64_t, Hamming, Kiss64Random> _index;

  // Packs _f_external floats from src into _f_internal words at dst.
  void _pack(const float* src, uint64_t* dst) const {
    for (int32_t word = 0; word < _f_internal; word++) {
      dst[word] = 0;
      for (int32_t bit = 0; bit < 64 && word*64+bit < _f_external; bit++) {
        const uint64_t is_set = (uint64_t)(src[word * 64 + bit] > 0.5);
        dst[word] |= is_set << bit;
      }
    }
  }

  // Expands the packed words at src back into _f_external floats (0 or 1).
  void _unpack(const uint64_t* src, float* dst) const {
    for (int32_t pos = 0; pos < _f_external; pos++) {
      const uint64_t word = src[pos / 64];
      dst[pos] = (word >> (pos % 64)) & 1;
    }
  }

public:
  HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _index((f + 63) / 64) {}

  bool add_item(int32_t item, const float* w, char**error) {
    vector<uint64_t> packed(_f_internal, 0);
    _pack(w, &packed[0]);
    return _index.add_item(item, &packed[0], error);
  }

  // Plain delegations to the wrapped index.
  bool build(int q, char** error) {
    return _index.build(q, error);
  }
  bool unbuild(char** error) {
    return _index.unbuild(error);
  }
  bool save(const char* filename, bool prefault, char** error) {
    return _index.save(filename, prefault, error);
  }
  void unload() {
    _index.unload();
  }
  bool load(const char* filename, bool prefault, char** error) {
    return _index.load(filename, prefault, error);
  }
  float get_distance(int32_t i, int32_t j) const {
    return _index.get_distance(i, j);
  }

  // The wrapped index reports distances as uint64_t; they are converted to
  // float and inserted at the front of *distances when it is requested.
  void get_nns_by_item(int32_t item, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {
    if (!distances) {
      _index.get_nns_by_item(item, n, search_k, result, NULL);
      return;
    }
    vector<uint64_t> raw_distances;
    _index.get_nns_by_item(item, n, search_k, result, &raw_distances);
    distances->insert(distances->begin(), raw_distances.begin(), raw_distances.end());
  }

  void get_nns_by_vector(const float* w, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {
    vector<uint64_t> packed(_f_internal, 0);
    _pack(w, &packed[0]);
    if (!distances) {
      _index.get_nns_by_vector(&packed[0], n, search_k, result, NULL);
      return;
    }
    vector<uint64_t> raw_distances;
    _index.get_nns_by_vector(&packed[0], n, search_k, result, &raw_distances);
    distances->insert(distances->begin(), raw_distances.begin(), raw_distances.end());
  }

  int32_t get_n_items() const {
    return _index.get_n_items();
  }
  int32_t get_n_trees() const {
    return _index.get_n_trees();
  }
  void verbose(bool v) {
    _index.verbose(v);
  }

  // Fetches the packed item and unpacks it into v (_f_external floats).
  void get_item(int32_t item, float* v) const {
    vector<uint64_t> packed(_f_internal, 0);
    _index.get_item(item, &packed[0]);
    _unpack(&packed[0], v);
  }

  void set_seed(int q) {
    _index.set_seed(q);
  }
  bool on_disk_build(const char* filename, char** error) {
    return _index.on_disk_build(filename, error);
  }
};
// annoy python object
// Instance layout of the Python-visible Annoy type.
typedef struct {
PyObject_HEAD
// Value of the "f" constructor argument; also exposed through py_annoy_members.
int f;
// Native index for the chosen metric; created in py_an_new, deleted in py_an_dealloc.
AnnoyIndexInterface<int32_t, float>* ptr;
} py_annoy;
// tp_new slot: allocates the Python object and creates the native index that
// matches the `metric` keyword ("angular", "euclidean", "manhattan",
// "hamming" or "dot"; omitted means angular plus a FutureWarning).
// Returns a new reference, or NULL with an exception set. On every failure
// path the half-built object is released instead of being leaked.
static PyObject *
py_an_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
  py_annoy *self = (py_annoy *)type->tp_alloc(type, 0);
  if (self == NULL) {
    return NULL;
  }
  // py_an_dealloc deletes self->ptr, so keep it well-defined for the early
  // Py_DECREF(self) calls below.
  self->ptr = NULL;

  const char *metric = NULL;
  static char const * kwlist[] = {"f", "metric", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &self->f, &metric)) {
    Py_DECREF(self);
    return NULL;
  }

  if (!metric) {
    // This keeps coming up, see #368 etc
    if (PyErr_WarnEx(PyExc_FutureWarning, "The default argument for metric will be removed "
                     "in future version of Annoy. Please pass metric='angular' explicitly.", 1) < 0) {
      // The warning was turned into an exception: propagate it rather than
      // returning an object while an error is set.
      Py_DECREF(self);
      return NULL;
    }
    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random>(self->f);
  } else if (!strcmp(metric, "angular")) {
    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random>(self->f);
  } else if (!strcmp(metric, "euclidean")) {
    self->ptr = new AnnoyIndex<int32_t, float, Euclidean, Kiss64Random>(self->f);
  } else if (!strcmp(metric, "manhattan")) {
    self->ptr = new AnnoyIndex<int32_t, float, Manhattan, Kiss64Random>(self->f);
  } else if (!strcmp(metric, "hamming")) {
    self->ptr = new HammingWrapper(self->f);
  } else if (!strcmp(metric, "dot")) {
    self->ptr = new AnnoyIndex<int32_t, float, DotProduct, Kiss64Random>(self->f);
  } else {
    PyErr_SetString(PyExc_ValueError, "No such metric");
    Py_DECREF(self);
    return NULL;
  }

  return (PyObject *)self;
}
// tp_init slot. All real construction happens in py_an_new; this only
// re-parses the arguments so that the same signature is accepted.
// Returns 0 on success and -1 (exception set by the parser) on failure, as
// the tp_init protocol requires. The previous `(int) NULL` evaluated to 0 and
// reported success while an exception was pending.
static int
py_an_init(py_annoy *self, PyObject *args, PyObject *kwargs) {
  // Seems to be needed for Python 3
  const char *metric = NULL;
  int f;
  static char const * kwlist[] = {"f", "metric", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &f, &metric))
    return -1;
  return 0;
}
// tp_dealloc slot: destroys the native index, then releases the Python
// object through its type's tp_free.
static void
py_an_dealloc(py_annoy* self) {
  AnnoyIndexInterface<int32_t, float>* native_index = self->ptr;
  delete native_index;
  Py_TYPE(self)->tp_free((PyObject*)self);
}
// Attributes exposed directly from the py_annoy struct: "f" maps onto the
// int field py_annoy::f, with flags 0 and an empty docstring.
static PyMemberDef py_annoy_members[] = {
{(char*)"f", T_INT, offsetof(py_annoy, f), 0,
(char*)""},
{NULL} /* Sentinel */
};
// load(fn, prefault=False): loads an index file into the native index.
// Returns True; on failure raises IOError carrying the message produced by
// the native index.
static PyObject *
py_an_load(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *path, *message;
  bool want_prefault = false;
  if (self->ptr == NULL) {
    return NULL;
  }
  static char const * kwlist[] = {"fn", "prefault", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &path, &want_prefault);
  if (!parsed) {
    return NULL;
  }

  const bool loaded = self->ptr->load(path, want_prefault, &message);
  if (loaded) {
    Py_RETURN_TRUE;
  }
  PyErr_SetString(PyExc_IOError, message);
  free(message);
  return NULL;
}
// save(fn, prefault=False): writes the index to a file.
// Returns True; on failure raises IOError carrying the message produced by
// the native index.
static PyObject *
py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *path, *message;
  bool want_prefault = false;
  if (self->ptr == NULL) {
    return NULL;
  }
  static char const * kwlist[] = {"fn", "prefault", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &path, &want_prefault);
  if (!parsed) {
    return NULL;
  }

  const bool saved = self->ptr->save(path, want_prefault, &message);
  if (saved) {
    Py_RETURN_TRUE;
  }
  PyErr_SetString(PyExc_IOError, message);
  free(message);
  return NULL;
}
// Converts a neighbour query result into Python objects.
// Returns a new list of item ids, or - when include_distances is non-zero -
// a 2-tuple (ids, distances). Returns NULL with an exception set when any
// allocation fails; partially built containers are released in that case
// (previously NULL items could be stored and containers leaked).
PyObject*
get_nns_to_python(const vector<int32_t>& result, const vector<float>& distances, int include_distances) {
  PyObject* l = PyList_New(result.size());
  if (l == NULL)
    return NULL;
  for (size_t i = 0; i < result.size(); i++) {
    PyObject* id = PyInt_FromLong(result[i]);
    if (id == NULL) {
      Py_DECREF(l);
      return NULL;
    }
    PyList_SetItem(l, i, id);  // steals the reference to id
  }
  if (!include_distances)
    return l;

  PyObject* d = PyList_New(distances.size());
  if (d == NULL) {
    Py_DECREF(l);
    return NULL;
  }
  for (size_t i = 0; i < distances.size(); i++) {
    PyObject* dist = PyFloat_FromDouble(distances[i]);
    if (dist == NULL) {
      Py_DECREF(l);
      Py_DECREF(d);
      return NULL;
    }
    PyList_SetItem(d, i, dist);  // steals the reference to dist
  }

  PyObject* t = PyTuple_New(2);
  if (t == NULL) {
    Py_DECREF(l);
    Py_DECREF(d);
    return NULL;
  }
  PyTuple_SetItem(t, 0, l);  // the tuple takes over both list references
  PyTuple_SetItem(t, 1, d);
  return t;
}
// Validates an item index. Negative indices are always rejected; unless
// `building` is true the index must also be below get_n_items().
// Returns true when valid, otherwise sets IndexError and returns false.
bool check_constraints(py_annoy *self, int32_t item, bool building) {
  if (item < 0) {
    PyErr_SetString(PyExc_IndexError, "Item index can not be negative");
    return false;
  }
  if (!building && item >= self->ptr->get_n_items()) {
    PyErr_SetString(PyExc_IndexError, "Item index larger than the largest item index");
    return false;
  }
  return true;
}
// get_nns_by_item(i, n, search_k=-1, include_distances=0).
// Runs the query with the GIL released and converts the result with
// get_nns_to_python.
static PyObject*
py_an_get_nns_by_item(py_annoy *self, PyObject *args, PyObject *kwargs) {
  int32_t item_id, count, budget=-1, want_distances=0;
  if (self->ptr == NULL) {
    return NULL;
  }

  static char const * kwlist[] = {"i", "n", "search_k", "include_distances", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ii|ii", (char**)kwlist, &item_id, &count, &budget, &want_distances)) {
    return NULL;
  }
  if (!check_constraints(self, item_id, false))
    return NULL;

  vector<int32_t> ids;
  vector<float> dists;
  vector<float>* dists_out = want_distances ? &dists : NULL;

  Py_BEGIN_ALLOW_THREADS;
  self->ptr->get_nns_by_item(item_id, count, budget, &ids, dists_out);
  Py_END_ALLOW_THREADS;

  return get_nns_to_python(ids, dists, want_distances);
}
// Copies the f elements v[0] .. v[f-1] of a Python object into *w, which
// must already hold at least f elements.
// Returns false with a Python exception set when v has no length
// (ValueError), has the wrong length (IndexError), cannot be indexed, or
// holds an element that is not convertible to float.
bool
convert_list_to_vector(PyObject* v, int f, vector<float>* w) {
  const Py_ssize_t length = PyObject_Size(v);
  if (length == -1) {
    char buf[256];
    snprintf(buf, 256, "Expected an iterable, got an object of type \"%s\"", Py_TYPE(v)->tp_name);
    PyErr_SetString(PyExc_ValueError, buf);
    return false;
  }
  if (length != f) {
    char buf[128];
    // Py_ssize_t is not `long` on every platform; cast to match %ld.
    snprintf(buf, 128, "Vector has wrong length (expected %d, got %ld)", f, (long)length);
    PyErr_SetString(PyExc_IndexError, buf);
    return false;
  }
  for (int z = 0; z < f; z++) {
    PyObject *key = PyInt_FromLong(z);
    if (key == NULL)
      return false;
    PyObject *pf = PyObject_GetItem(v, key);
    Py_DECREF(key);
    if (pf == NULL)
      return false;  // e.g. a set or dict: the lookup already set the exception
    const double value = PyFloat_AsDouble(pf);
    Py_DECREF(pf);
    // -1.0 is a legitimate value, so the error indicator is the tie-breaker.
    if (value == -1.0 && PyErr_Occurred())
      return false;
    (*w)[z] = value;
  }
  return true;
}
// get_nns_by_vector(vector, n, search_k=-1, include_distances=0).
// Converts the Python vector to floats, runs the query with the GIL released
// and converts the result with get_nns_to_python.
static PyObject*
py_an_get_nns_by_vector(py_annoy *self, PyObject *args, PyObject *kwargs) {
  PyObject* py_vector;
  int32_t count, budget=-1, want_distances=0;
  if (self->ptr == NULL) {
    return NULL;
  }

  static char const * kwlist[] = {"vector", "n", "search_k", "include_distances", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi|ii", (char**)kwlist, &py_vector, &count, &budget, &want_distances)) {
    return NULL;
  }

  vector<float> query(self->f);
  if (!convert_list_to_vector(py_vector, self->f, &query))
    return NULL;

  vector<int32_t> ids;
  vector<float> dists;
  vector<float>* dists_out = want_distances ? &dists : NULL;

  Py_BEGIN_ALLOW_THREADS;
  self->ptr->get_nns_by_vector(&query[0], count, budget, &ids, dists_out);
  Py_END_ALLOW_THREADS;

  return get_nns_to_python(ids, dists, want_distances);
}
// get_item_vector(i): returns the stored vector of item i as a list of f floats.
static PyObject*
py_an_get_item_vector(py_annoy *self, PyObject *args) {
  int32_t item_id;
  if (self->ptr == NULL) {
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "i", &item_id)) {
    return NULL;
  }
  if (!check_constraints(self, item_id, false))
    return NULL;

  vector<float> values(self->f);
  self->ptr->get_item(item_id, &values[0]);

  PyObject* py_values = PyList_New(self->f);
  for (int pos = 0; pos < self->f; pos++)
    PyList_SetItem(py_values, pos, PyFloat_FromDouble(values[pos]));
  return py_values;
}
// add_item(i, vector): adds a vector under the non-negative index i.
// Returns None; raises Exception with the native index's message on failure.
static PyObject*
py_an_add_item(py_annoy *self, PyObject *args, PyObject* kwargs) {
  PyObject* py_vector;
  int32_t item_id;
  if (self->ptr == NULL) {
    return NULL;
  }

  static char const * kwlist[] = {"i", "vector", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "iO", (char**)kwlist, &item_id, &py_vector)) {
    return NULL;
  }
  if (!check_constraints(self, item_id, true))
    return NULL;

  vector<float> values(self->f);
  if (!convert_list_to_vector(py_vector, self->f, &values))
    return NULL;

  char* message;
  const bool added = self->ptr->add_item(item_id, &values[0], &message);
  if (added) {
    Py_RETURN_NONE;
  }
  PyErr_SetString(PyExc_Exception, message);
  free(message);
  return NULL;
}
// on_disk_build(fn): forwards the file name to the native on_disk_build.
// Returns True; raises IOError with the native index's message on failure.
static PyObject *
py_an_on_disk_build(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *path, *message;
  if (self->ptr == NULL) {
    return NULL;
  }
  static char const * kwlist[] = {"fn", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &path)) {
    return NULL;
  }

  const bool prepared = self->ptr->on_disk_build(path, &message);
  if (prepared) {
    Py_RETURN_TRUE;
  }
  PyErr_SetString(PyExc_IOError, message);
  free(message);
  return NULL;
}
// build(n_trees): builds the index with the GIL released.
// Returns True; raises Exception with the native index's message on failure.
static PyObject *
py_an_build(py_annoy *self, PyObject *args, PyObject *kwargs) {
  int n_trees;
  if (self->ptr == NULL) {
    return NULL;
  }
  static char const * kwlist[] = {"n_trees", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i", (char**)kwlist, &n_trees)) {
    return NULL;
  }

  bool built;
  char* message;
  Py_BEGIN_ALLOW_THREADS;
  built = self->ptr->build(n_trees, &message);
  Py_END_ALLOW_THREADS;

  if (built) {
    Py_RETURN_TRUE;
  }
  PyErr_SetString(PyExc_Exception, message);
  free(message);
  return NULL;
}
// unbuild(): returns True; raises Exception with the native index's message
// on failure.
static PyObject *
py_an_unbuild(py_annoy *self) {
  if (self->ptr == NULL) {
    return NULL;
  }

  char* message;
  const bool unbuilt = self->ptr->unbuild(&message);
  if (unbuilt) {
    Py_RETURN_TRUE;
  }
  PyErr_SetString(PyExc_Exception, message);
  free(message);
  return NULL;
}
// unload(): forwards to the native index and returns True.
static PyObject *
py_an_unload(py_annoy *self) {
  AnnoyIndexInterface<int32_t, float>* native_index = self->ptr;
  if (native_index == NULL) {
    return NULL;
  }
  native_index->unload();
  Py_RETURN_TRUE;
}
// get_distance(i, j): returns the distance between two existing items as a float.
static PyObject *
py_an_get_distance(py_annoy *self, PyObject *args) {
  int32_t first, second;
  if (self->ptr == NULL) {
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "ii", &first, &second)) {
    return NULL;
  }

  // The second index is only validated when the first one passed.
  const bool both_valid = check_constraints(self, first, false) && check_constraints(self, second, false);
  if (!both_valid) {
    return NULL;
  }

  const double distance = self->ptr->get_distance(first,second);
  return PyFloat_FromDouble(distance);
}
// get_n_items(): returns the native index's item count as a Python integer.
static PyObject *
py_an_get_n_items(py_annoy *self) {
  if (self->ptr == NULL) {
    return NULL;
  }
  const int32_t item_count = self->ptr->get_n_items();
  return PyInt_FromLong(item_count);
}
// get_n_trees(): returns the native index's tree count as a Python integer.
static PyObject *
py_an_get_n_trees(py_annoy *self) {
  if (self->ptr == NULL) {
    return NULL;
  }
  const int32_t tree_count = self->ptr->get_n_trees();
  return PyInt_FromLong(tree_count);
}
// verbose(v): passes the integer flag, converted to bool, to the native
// index. Returns True.
static PyObject *
py_an_verbose(py_annoy *self, PyObject *args) {
  int flag;
  if (self->ptr == NULL) {
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "i", &flag)) {
    return NULL;
  }

  const bool enabled = (bool)flag;
  self->ptr->verbose(enabled);
  Py_RETURN_TRUE;
}
// set_seed(seed): forwards the integer seed to the native index. Returns None.
static PyObject *
py_an_set_seed(py_annoy *self, PyObject *args) {
  int seed;
  if (self->ptr == NULL) {
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "i", &seed)) {
    return NULL;
  }

  self->ptr->set_seed(seed);
  Py_RETURN_NONE;
}
// Method table of the Annoy type: Python name, C implementation, calling
// convention flags and docstring. Entries flagged METH_KEYWORDS are
// implemented by functions that also take a kwargs argument.
static PyMethodDef AnnoyMethods[] = {
{"load", (PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, "Loads (mmaps) an index from disk."},
{"save", (PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, "Saves the index to disk."},
{"get_nns_by_item",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to item `i`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."},
{"get_nns_by_vector",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to vector `vector`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."},
{"get_item_vector",(PyCFunction)py_an_get_item_vector, METH_VARARGS, "Returns the vector for item `i` that was previously added."},
{"add_item",(PyCFunction)py_an_add_item, METH_VARARGS | METH_KEYWORDS, "Adds item `i` (any nonnegative integer) with vector `v`.\n\nNote that it will allocate memory for `max(i)+1` items."},
{"on_disk_build",(PyCFunction)py_an_on_disk_build, METH_VARARGS | METH_KEYWORDS, "Build will be performed with storage on disk instead of RAM."},
{"build",(PyCFunction)py_an_build, METH_VARARGS | METH_KEYWORDS, "Builds a forest of `n_trees` trees.\n\nMore trees give higher precision when querying. After calling `build`,\nno more items can be added."},
{"unbuild",(PyCFunction)py_an_unbuild, METH_NOARGS, "Unbuilds the tree in order to allows adding new items.\n\nbuild() has to be called again afterwards in order to\nrun queries."},
{"unload",(PyCFunction)py_an_unload, METH_NOARGS, "Unloads an index from disk."},
{"get_distance",(PyCFunction)py_an_get_distance, METH_VARARGS, "Returns the distance between items `i` and `j`."},
{"get_n_items",(PyCFunction)py_an_get_n_items, METH_NOARGS, "Returns the number of items in the index."},
{"get_n_trees",(PyCFunction)py_an_get_n_trees, METH_NOARGS, "Returns the number of trees in the index."},
{"verbose",(PyCFunction)py_an_verbose, METH_VARARGS, ""},
{"set_seed",(PyCFunction)py_an_set_seed, METH_VARARGS, "Sets the seed of Annoy's random number generator."},
{NULL, NULL, 0, NULL} /* Sentinel */
};
// Type object for "annoy.Annoy". Only the slots this module needs are
// filled: deallocation, flags (the type can be subclassed), docstring,
// methods, members, tp_init and tp_new; every other slot is 0.
static PyTypeObject PyAnnoyType = {
PyVarObject_HEAD_INIT(NULL, 0)
"annoy.Annoy", /*tp_name*/
sizeof(py_annoy), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)py_an_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
ANNOY_DOC, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
AnnoyMethods, /* tp_methods */
py_annoy_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)py_an_init, /* tp_init */
0, /* tp_alloc */
py_an_new, /* tp_new */
};
// The module has no module-level functions; everything is exposed through
// the Annoy type.
static PyMethodDef module_methods[] = {
{NULL} /* Sentinel */
};
// Module definition, used only when building for Python 3.
#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"annoylib", /* m_name */
ANNOY_DOC, /* m_doc */
-1, /* m_size */
module_methods, /* m_methods */
NULL, /* m_reload */
NULL, /* m_traverse */
NULL, /* m_clear */
NULL, /* m_free */
};
#endif
// Shared module initialisation for Python 2 and 3: readies the Annoy type,
// creates the "annoylib" module and registers the type in it as "Annoy".
// Returns the module, or NULL when the type or the module cannot be created.
PyObject *create_module(void) {
  if (PyType_Ready(&PyAnnoyType) < 0) {
    return NULL;
  }

#if PY_MAJOR_VERSION >= 3
  PyObject *module = PyModule_Create(&moduledef);
#else
  PyObject *module = Py_InitModule("annoylib", module_methods);
#endif
  if (!module) {
    return NULL;
  }

  Py_INCREF(&PyAnnoyType);
  PyModule_AddObject(module, "Annoy", (PyObject *)&PyAnnoyType);
  return module;
}
// Interpreter entry points: Python 3 looks up PyInit_annoylib and expects the
// module object back; the Python 2 build defines initannoylib, which returns nothing.
#if PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC PyInit_annoylib(void) {
return create_module(); // it should return module object in py3
}
#else
PyMODINIT_FUNC initannoylib(void) {
create_module();
}
#endif
// vim: tabstop=2 shiftwidth=2

View File

@ -0,0 +1,106 @@
#ifndef KISSRANDOM_H
#define KISSRANDOM_H
#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif
// KISS = "keep it simple, stupid", but high quality random number generator
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
// http://mathforum.org/kb/message.jspa?messageID=6627731
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
// 32 bit KISS
// Combines three generators: a linear congruential one (x), an xorshift (y)
// and a multiply-with-carry (z with carry c); each draw returns their sum.
struct Kiss32Random {
  uint32_t x;  // linear congruential state; this is where the seed is stored
  uint32_t y;  // xorshift state
  uint32_t z;  // multiply-with-carry state
  uint32_t c;  // multiply-with-carry carry

  // seed must be != 0
  Kiss32Random(uint32_t seed = 123456789)
    : x(seed), y(362436000), z(521288629), c(7654321) {}

  // Advances all three generators and returns the combined 32-bit value.
  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry: low half becomes the state, high half the carry
    const uint64_t product = 698769069ULL * z + c;
    z = (uint32_t) product;
    c = (uint32_t) (product >> 32);

    return x + y + z;
  }

  inline int flip() {
    // Draw random 0 or 1
    const uint32_t draw = kiss();
    return draw & 1;
  }

  inline size_t index(size_t n) {
    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
    const size_t draw = kiss();
    return draw % n;
  }

  // Replaces only x; y, z and c keep their current values.
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};
// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
// Combines a linear congruential generator (z), an xorshift (y) and a
// multiply-with-carry (x with carry c); each draw returns their sum.
struct Kiss64Random {
  uint64_t x;  // multiply-with-carry state; this is where the seed is stored
  uint64_t y;  // xorshift state
  uint64_t z;  // linear congruential state
  uint64_t c;  // multiply-with-carry carry

  // seed must be != 0
  Kiss64Random(uint64_t seed = 1234567890987654321ULL)
    : x(seed), y(362436362436362436ULL), z(1066149217761810ULL), c(123456123456123456ULL) {}

  // Advances all three generators and returns the combined 64-bit value.
  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069LL*z+1234567;

    // Xor shift
    y ^= (y<<13);
    y ^= (y>>17);
    y ^= (y<<43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    const uint64_t low = (x<<58)+c;
    c = (x>>6);
    x += low;
    if (x < low) {
      ++c;  // the 64-bit addition wrapped around: carry into the high part
    }

    return x + y + z;
  }

  inline int flip() {
    // Draw random 0 or 1
    const uint64_t draw = kiss();
    return draw & 1;
  }

  inline size_t index(size_t n) {
    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
    const uint64_t draw = kiss();
    return draw % n;
  }

  // Replaces only x; y, z and c keep their current values.
  // NOTE(review): the parameter is 32 bits wide although x is 64 bits - confirm this is intended.
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};
#endif
// vim: tabstop=2 shiftwidth=2

View File

@ -0,0 +1,238 @@
// This is from https://code.google.com/p/mman-win32/
//
// Licensed under MIT
#ifndef _MMAN_WIN32_H
#define _MMAN_WIN32_H
#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later.
#define _WIN32_WINNT 0x0501 // Change this to the appropriate value to target other versions of Windows.
#endif
#include <sys/types.h>
#include <windows.h>
#include <errno.h>
#include <io.h>
/* Protection bits accepted in the `prot` argument of mmap(). */
#define PROT_NONE 0
#define PROT_READ 1
#define PROT_WRITE 2
#define PROT_EXEC 4
/* Mapping flags accepted in the `flags` argument of mmap(). */
#define MAP_FILE 0
#define MAP_SHARED 1
#define MAP_PRIVATE 2
#define MAP_TYPE 0xf
#define MAP_FIXED 0x10
#define MAP_ANONYMOUS 0x20
#define MAP_ANON MAP_ANONYMOUS
/* Value returned by mmap() on failure. */
#define MAP_FAILED ((void *)-1)
/* Flags for msync. */
#define MS_ASYNC 1
#define MS_SYNC 2
#define MS_INVALIDATE 4
/* Fallback for Windows headers that do not define FILE_MAP_EXECUTE. */
#ifndef FILE_MAP_EXECUTE
#define FILE_MAP_EXECUTE 0x0020
#endif
/* Maps a Windows error code (callers pass GetLastError()) to the value that
 * is stored in errno. Currently a pass-through: the code is returned
 * unchanged and `deferr` is not used. */
static int __map_mman_error(const DWORD err, const int deferr)
{
    //TODO: implement
    if (err != 0)
    {
        return err;
    }
    return 0;
}
/* Pick the Win32 page-protection constant for a set of PROT_* bits.
 * PROT_NONE yields 0; otherwise the result depends only on whether
 * PROT_EXEC and PROT_WRITE are set (PROT_READ is not inspected). */
static DWORD __map_mmap_prot_page(const int prot)
{
    const int writable = (prot & PROT_WRITE) != 0;

    if (prot == PROT_NONE)
        return 0;

    if ((prot & PROT_EXEC) != 0)
        return writable ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;

    return writable ? PAGE_READWRITE : PAGE_READONLY;
}
/* Build the FILE_MAP_* access mask for a set of PROT_* bits: each of
 * PROT_READ, PROT_WRITE and PROT_EXEC contributes its FILE_MAP_* flag.
 * PROT_NONE yields 0. */
static DWORD __map_mmap_prot_file(const int prot)
{
    DWORD access = 0;

    if (prot != PROT_NONE)
    {
        if ((prot & PROT_READ) != 0)
            access |= FILE_MAP_READ;
        if ((prot & PROT_WRITE) != 0)
            access |= FILE_MAP_WRITE;
        if ((prot & PROT_EXEC) != 0)
            access |= FILE_MAP_EXECUTE;
    }

    return access;
}
/* mmap() replacement built on CreateFileMapping / MapViewOfFile.
 *
 * addr    ignored by this implementation (never read).
 * len     number of bytes to map; 0 is rejected with EINVAL.
 * prot    PROT_* bits; exactly PROT_EXEC on its own is rejected with EINVAL.
 * flags   MAP_* bits; MAP_FIXED is rejected with EINVAL, MAP_ANONYMOUS maps
 *         without a backing file handle.
 * fildes  CRT file descriptor, used only when MAP_ANONYMOUS is not set.
 * off     byte offset into the file at which the view starts.
 *
 * Returns the mapped address, or MAP_FAILED with errno set.
 */
inline void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off)
{
HANDLE fm, h;
void * map = MAP_FAILED;
/* The 64-bit split below contains `>> 32` on off_t values; presumably C4293
 * (shift count too big) is silenced for builds where off_t is only 32 bits. */
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4293)
#endif
/* Split the view offset into the low/high DWORD pair MapViewOfFile expects. */
const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)off : (DWORD)(off & 0xFFFFFFFFL);
const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL);
const DWORD protect = __map_mmap_prot_page(prot);
const DWORD desiredAccess = __map_mmap_prot_file(prot);
/* The mapping object must extend to the end of the requested view. */
const off_t maxSize = off + (off_t)len;
const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL);
const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL);
#ifdef _MSC_VER
#pragma warning(pop)
#endif
errno = 0;
if (len == 0
/* Unsupported flag combinations */
|| (flags & MAP_FIXED) != 0
/* Unsupported protection combinations */
|| prot == PROT_EXEC)
{
errno = EINVAL;
return MAP_FAILED;
}
/* Anonymous mappings pass INVALID_HANDLE_VALUE to CreateFileMapping;
 * file mappings use the OS handle behind the CRT descriptor. */
h = ((flags & MAP_ANONYMOUS) == 0) ?
(HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE;
if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE)
{
errno = EBADF;
return MAP_FAILED;
}
fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);
if (fm == NULL)
{
errno = __map_mman_error(GetLastError(), EPERM);
return MAP_FAILED;
}
map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);
/* The mapping handle is closed right away, whether or not the view was created. */
/* NOTE(review): GetLastError() below is read after CloseHandle(); confirm
 * CloseHandle cannot overwrite the MapViewOfFile error code. */
CloseHandle(fm);
if (map == NULL)
{
errno = __map_mman_error(GetLastError(), EPERM);
return MAP_FAILED;
}
return map;
}
/* Unmap the view that starts at `addr` via UnmapViewOfFile.
 * `len` is accepted for signature compatibility but not used.
 * Returns 0 on success, -1 with errno set on failure. */
inline int munmap(void *addr, size_t len)
{
    if (!UnmapViewOfFile(addr))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
inline int mprotect(void *addr, size_t len, int prot)
{
DWORD newProtect = __map_mmap_prot_page(prot);
DWORD oldProtect = 0;
if (VirtualProtect(addr, len, newProtect, &oldProtect))
return 0;
errno = __map_mman_error(GetLastError(), EPERM);
return -1;
}
/* Flush [addr, addr + len) of a mapped view with FlushViewOfFile.
 * `flags` (MS_*) is accepted but not consulted.
 * Returns 0 on success, -1 with errno set on failure. */
inline int msync(void *addr, size_t len, int flags)
{
    if (!FlushViewOfFile(addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
/* Lock [addr, addr + len) using VirtualLock.
 * Returns 0 on success, -1 with errno set on failure. */
inline int mlock(const void *addr, size_t len)
{
    if (!VirtualLock((LPVOID)addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
/* Unlock [addr, addr + len) using VirtualUnlock.
 * Returns 0 on success, -1 with errno set on failure. */
inline int munlock(const void *addr, size_t len)
{
    if (!VirtualUnlock((LPVOID)addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
#if !defined(__MINGW32__)
inline int ftruncate(int fd, unsigned int size) {
if (fd < 0) {
errno = EBADF;
return -1;
}
HANDLE h = (HANDLE)_get_osfhandle(fd);
unsigned int cur = SetFilePointer(h, 0, NULL, FILE_CURRENT);
if (cur == ~0 || SetFilePointer(h, size, NULL, FILE_BEGIN) == ~0 || !SetEndOfFile(h)) {
int error = GetLastError();
switch (GetLastError()) {
case ERROR_INVALID_HANDLE:
errno = EBADF;
break;
default:
errno = EIO;
break;
}
return -1;
}
return 0;
}
#endif
#endif

View File

@ -89,6 +89,16 @@ if (NOT TARGET test_idmap)
endif ()
target_link_libraries(test_idmap ${depend_libs} ${unittest_libs} ${basic_libs})
#<ANNOY-TEST>
# Index implementation compiled directly into the Annoy unit-test binary.
set(annoy_srcs
${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexAnnoy.cpp
)
# Declare the executable only if no target named test_annoy exists yet.
if (NOT TARGET test_annoy)
add_executable(test_annoy test_annoy.cpp ${annoy_srcs} ${util_srcs})
endif ()
# Link against the depend_libs, unittest_libs and basic_libs lists
# (all defined outside this excerpt).
target_link_libraries(test_annoy ${depend_libs} ${unittest_libs} ${basic_libs})
#<HNSW-TEST>
set(hnsw_srcs
${INDEX_SOURCE_DIR}/knowhere/knowhere/index/vector_index/IndexHNSW.cpp
@ -144,6 +154,7 @@ install(TARGETS test_idmap DESTINATION unittest)
install(TARGETS test_binaryidmap DESTINATION unittest)
install(TARGETS test_sptag DESTINATION unittest)
install(TARGETS test_knowhere_common DESTINATION unittest)
install(TARGETS test_annoy DESTINATION unittest)
if (KNOWHERE_GPU_VERSION)
install(TARGETS test_gpuresource DESTINATION unittest)

View File

@ -0,0 +1,221 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
#include <gtest/gtest.h>
#include <src/index/knowhere/knowhere/index/vector_index/helpers/IndexParameter.h>
#include <iostream>
#include <sstream>
#include "knowhere/common/Exception.h"
#include "knowhere/index/vector_index/IndexAnnoy.h"
#include "unittest/utils.h"
using ::testing::Combine;
using ::testing::TestWithParam;
using ::testing::Values;
// Standalone smoke test for milvus::knowhere::IndexAnnoy.
//
// Builds an index over `nb` random `d`-dimensional vectors, picks `nq` of
// those vectors at random as queries (marking each picked id in a bitset),
// and prints the top-`k` results three times: a short sanity check, the full
// query set, and the full query set again after the bitset has been installed
// as the index blacklist. Results are only printed, never asserted.
int
main() {
    int64_t d = 64;      // dimension
    int64_t nb = 10000;  // database size
    int64_t nq = 10;     // 10000; // nb of queries
    faiss::ConcurrentBitsetPtr bitset = std::make_shared<faiss::ConcurrentBitset>(nb);

    int64_t* ids = new int64_t[nb];
    float* xb = new float[d * nb];
    float* xq = new float[d * nq];

    // Random base vectors; the first component is shifted by i / 1000 and the
    // id of each vector is its row number.
    for (int i = 0; i < nb; i++) {
        for (int j = 0; j < d; j++) xb[d * i + j] = (float)drand48();
        xb[d * i] += i / 1000.;
        ids[i] = i;
    }
    printf("gen xb and ids done! \n");

    // srand((unsigned)time(NULL));
    auto random_seed = (unsigned)time(NULL);
    printf("delete ids: \n");
    for (int i = 0; i < nq; i++) {
        // `tmp` is int64_t (int % int64_t), so it must not be printed with %d.
        auto tmp = rand_r(&random_seed) % nb;
        printf("%lld\n", static_cast<long long>(tmp));
        // std::cout << "before delete, test result: " << bitset->test(tmp) << std::endl;
        bitset->set(tmp);
        // std::cout << "after delete, test result: " << bitset->test(tmp) << std::endl;
        // Each query is an exact copy of the base vector that was just marked.
        for (int j = 0; j < d; j++) xq[d * i + j] = xb[d * tmp + j];
        // xq[d * i] += i / 1000.;
    }
    printf("\n");

    int k = 4;
    int n_trees = 5;
    int search_k = 100;
    milvus::knowhere::IndexAnnoy index;
    milvus::knowhere::DatasetPtr base_dataset = generate_dataset(nb, d, (const void*)xb, ids);

    milvus::knowhere::Config base_conf{
        {milvus::knowhere::meta::DIM, d},
        {milvus::knowhere::meta::TOPK, k},
        {milvus::knowhere::IndexParams::n_trees, n_trees},
        {milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
    };
    milvus::knowhere::DatasetPtr query_dataset = generate_query_dataset(nq, d, (const void*)xq);
    milvus::knowhere::Config query_conf{
        {milvus::knowhere::meta::DIM, d},
        {milvus::knowhere::meta::TOPK, k},
        {milvus::knowhere::IndexParams::search_k, search_k},
    };

    index.BuildAll(base_dataset, base_conf);

    printf("------------sanity check----------------\n");
    {  // sanity check
        auto res = index.Query(query_dataset, query_conf);
        printf("Query done!\n");
        const int64_t* I = res->Get<int64_t*>(milvus::knowhere::meta::IDS);
        float* D = res->Get<float*>(milvus::knowhere::meta::DISTANCE);

        // Show at most the first five result rows, never more than were computed.
        const int64_t n_show = nq < 5 ? nq : 5;

        printf("I=\n");
        for (int64_t i = 0; i < n_show; i++) {
            for (int j = 0; j < k; j++) printf("%5lld ", static_cast<long long>(I[i * k + j]));
            printf("\n");
        }

        printf("D=\n");
        for (int64_t i = 0; i < n_show; i++) {
            for (int j = 0; j < k; j++) printf("%7g ", D[i * k + j]);
            printf("\n");
        }
    }

    printf("---------------search xq-------------\n");
    {  // search xq
        auto res = index.Query(query_dataset, query_conf);
        const int64_t* I = res->Get<int64_t*>(milvus::knowhere::meta::IDS);

        printf("I=\n");
        for (int i = 0; i < nq; i++) {
            for (int j = 0; j < k; j++) printf("%5lld ", static_cast<long long>(I[i * k + j]));
            printf("\n");
        }
    }

    printf("----------------search xq with delete------------\n");
    {  // search xq with delete
        index.SetBlacklist(bitset);
        auto res = index.Query(query_dataset, query_conf);
        auto I = res->Get<int64_t*>(milvus::knowhere::meta::IDS);

        printf("I=\n");
        for (int i = 0; i < nq; i++) {
            for (int j = 0; j < k; j++) printf("%5lld ", static_cast<long long>(I[i * k + j]));
            printf("\n");
        }
    }

    delete[] xb;
    delete[] xq;
    delete[] ids;

    return 0;
}
/*
class AnnoyTest : public DataGen, public TestWithParam<std::string> {
protected:
void
SetUp() override {
IndexType = GetParam();
std::cout << "IndexType from GetParam() is: " << IndexType << std::endl;
Generate(128, 1000, 5);
index_ = std::make_shared<milvus::knowhere::IndexAnnoy>();
conf = milvus::knowhere::Config{
{milvus::knowhere::meta::DIM, dim},
{milvus::knowhere::meta::TOPK, 1},
{milvus::knowhere::IndexParams::n_trees, 4},
{milvus::knowhere::IndexParams::search_k, 100},
{milvus::knowhere::Metric::TYPE, milvus::knowhere::Metric::L2},
};
// Init_with_default();
}
protected:
milvus::knowhere::Config conf;
std::shared_ptr<milvus::knowhere::IndexAnnoy> index_ = nullptr;
std::string IndexType;
};
INSTANTIATE_TEST_CASE_P(AnnoyParameters, AnnoyTest, Values(""));
TEST_P(AnnoyTest, annoy_basic) {
assert(!xb.empty());
// index_->Train(base_dataset, conf);
index_->BuildAll(base_dataset, conf);
auto result = index_->Query(query_dataset, conf);
AssertAnns(result, nq, k);
{
auto ids = result->Get<int64_t*>(milvus::knowhere::meta::IDS);
auto dist = result->Get<float*>(milvus::knowhere::meta::DISTANCE);
std::stringstream ss_id;
std::stringstream ss_dist;
for (auto i = 0; i < nq; i++) {
for (auto j = 0; j < k; ++j) {
// ss_id << *ids->data()->GetValues<int64_t>(1, i * k + j) << " ";
// ss_dist << *dists->data()->GetValues<float>(1, i * k + j) << " ";
ss_id << *((int64_t*)(ids) + i * k + j) << " ";
ss_dist << *((float*)(dist) + i * k + j) << " ";
}
ss_id << std::endl;
ss_dist << std::endl;
}
std::cout << "id\n" << ss_id.str() << std::endl;
std::cout << "dist\n" << ss_dist.str() << std::endl;
}
}
TEST_P(AnnoyTest, annoy_delete) {
assert(!xb.empty());
// index_->Train(base_dataset, conf);
index_->BuildAll(base_dataset, conf);
// index_->Add(base_dataset, conf);
faiss::ConcurrentBitsetPtr bitset = std::make_shared<faiss::ConcurrentBitset>(nb);
for (auto i = 0; i < nq; ++ i) {
bitset->set(i);
auto result = index_->Query(query_dataset, conf);
AssertAnns(result, nq, k);
{
auto ids = result->Get<int64_t*>(milvus::knowhere::meta::IDS);
auto dist = result->Get<float*>(milvus::knowhere::meta::DISTANCE);
std::stringstream ss_id;
std::stringstream ss_dist;
for (auto i = 0; i < nq; i++) {
for (auto j = 0; j < k; ++j) {
// ss_id << *ids->data()->GetValues<int64_t>(1, i * k + j) << " ";
// ss_dist << *dists->data()->GetValues<float>(1, i * k + j) << " ";
ss_id << *((int64_t*)(ids) + i * k + j) << " ";
ss_dist << *((float*)(dist) + i * k + j) << " ";
}
ss_id << std::endl;
ss_dist << std::endl;
}
std::cout << "id\n" << ss_id.str() << std::endl;
std::cout << "dist\n" << ss_dist.str() << std::endl;
} }
}
*/

View File

@ -71,6 +71,7 @@ DeleteByIDRequest::OnExecute() {
if (table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IDMAP &&
table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_BIN_IDMAP &&
table_schema.engine_type_ != (int32_t)engine::EngineType::HNSW &&
table_schema.engine_type_ != (int32_t)engine::EngineType::ANNOY &&
table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IVFFLAT &&
table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_BIN_IVFFLAT &&
table_schema.engine_type_ != (int32_t)engine::EngineType::FAISS_IVFSQ8 &&

View File

@ -38,7 +38,7 @@ class TestConnect:
if not connect.connected():
milvus = get_milvus(args["handler"])
uri_value = "tcp://%s:%s" % (args["ip"], args["port"])
milvus.connect(uri=uri_value)
milvus.connect(uri=uri_value, timeout=5)
res = milvus.disconnect()
with pytest.raises(Exception) as e:
res = milvus.disconnect()
@ -181,9 +181,8 @@ class TestConnect:
'''
milvus = get_milvus(args["handler"])
uri_value = "tcp://%s:%s" % (args["ip"], args["port"])
milvus.connect(uri=uri_value)
milvus.connect(uri=uri_value)
milvus.connect(uri=uri_value, timeout=5)
milvus.connect(uri=uri_value, timeout=5)
assert milvus.connected()
def test_connect_disconnect_repeatedly_once(self, args):
@ -209,10 +208,10 @@ class TestConnect:
times = 10
milvus = get_milvus(args["handler"])
uri_value = "tcp://%s:%s" % (args["ip"], args["port"])
milvus.connect(uri=uri_value)
milvus.connect(uri=uri_value, timeout=5)
for i in range(times):
milvus.disconnect()
milvus.connect(uri=uri_value)
milvus.connect(uri=uri_value, timeout=5)
assert milvus.connected()
# TODO: enable

View File

@ -851,7 +851,7 @@ class TestSearchBase:
'store_raw_vector': False}
# create collection
milvus = get_milvus(args["handler"])
milvus.connect(uri=uri)
milvus.connect(uri=uri, timeout=5)
milvus.create_collection(param)
vectors, ids = self.init_data(milvus, collection, nb=nb)
query_vecs = vectors[nb//2:nb]
@ -864,7 +864,7 @@ class TestSearchBase:
for i in range(threads_num):
milvus = get_milvus(args["handler"])
milvus.connect(uri=uri)
milvus.connect(uri=uri, timeout=5)
t = threading.Thread(target=search, args=(milvus, ))
threads.append(t)
t.start()
@ -932,7 +932,7 @@ class TestSearchBase:
'metric_type': MetricType.L2}
# create collection
milvus = get_milvus(args["handler"])
milvus.connect(uri=uri)
milvus.connect(uri=uri, timeout=5)
milvus.create_collection(param)
status, ids = milvus.add_vectors(collection, vectors)
assert status.OK()
@ -973,7 +973,7 @@ class TestSearchBase:
'metric_type': MetricType.L2}
# create collection
milvus = get_milvus(args["handler"])
milvus.connect(uri=uri)
milvus.connect(uri=uri, timeout=5)
milvus.create_collection(param)
status, ids = milvus.add_vectors(collection, vectors)
assert status.OK()