From 386e58ce0d6a05af99e9a2234d18ba6c1b1e3b08 Mon Sep 17 00:00:00 2001 From: Cai Yudong Date: Fri, 22 May 2020 09:27:16 +0800 Subject: [PATCH] upgrade faiss 1.6.3 (#2400) * roll back to original faiss 1.6.0 Signed-off-by: yudong.cai * update to faiss_1.6.3 Signed-off-by: yudong.cai * patch all change to faiss 1.6.3 Signed-off-by: yudong.cai * faiss CPU version build pass Signed-off-by: yudong.cai * faiss GPU version build pass Signed-off-by: yudong.cai --- .../src/index/thirdparty/faiss/Clustering.cpp | 375 +++++++++-- core/src/index/thirdparty/faiss/Clustering.h | 50 +- core/src/index/thirdparty/faiss/DirectMap.cpp | 267 ++++++++ core/src/index/thirdparty/faiss/DirectMap.h | 120 ++++ core/src/index/thirdparty/faiss/IVFlib.cpp | 28 +- core/src/index/thirdparty/faiss/IVFlib.h | 4 + core/src/index/thirdparty/faiss/Index.cpp | 7 +- core/src/index/thirdparty/faiss/Index.h | 46 +- .../index/thirdparty/faiss/Index2Layer.cpp | 3 +- core/src/index/thirdparty/faiss/IndexBinary.h | 20 +- .../thirdparty/faiss/IndexBinaryFlat.cpp | 11 +- .../index/thirdparty/faiss/IndexBinaryFlat.h | 9 +- .../thirdparty/faiss/IndexBinaryFromFloat.cpp | 3 +- .../thirdparty/faiss/IndexBinaryFromFloat.h | 3 +- .../thirdparty/faiss/IndexBinaryHNSW.cpp | 3 +- .../index/thirdparty/faiss/IndexBinaryHNSW.h | 3 +- .../thirdparty/faiss/IndexBinaryHash.cpp | 496 ++++++++++++++ .../index/thirdparty/faiss/IndexBinaryHash.h | 120 ++++ .../index/thirdparty/faiss/IndexBinaryIVF.cpp | 301 +++++---- .../index/thirdparty/faiss/IndexBinaryIVF.h | 19 +- core/src/index/thirdparty/faiss/IndexFlat.cpp | 15 +- core/src/index/thirdparty/faiss/IndexFlat.h | 3 +- core/src/index/thirdparty/faiss/IndexHNSW.cpp | 84 ++- core/src/index/thirdparty/faiss/IndexHNSW.h | 13 +- core/src/index/thirdparty/faiss/IndexIVF.cpp | 168 +++-- core/src/index/thirdparty/faiss/IndexIVF.h | 35 +- .../index/thirdparty/faiss/IndexIVFFlat.cpp | 73 +-- .../src/index/thirdparty/faiss/IndexIVFFlat.h | 12 +- .../src/index/thirdparty/faiss/IndexIVFPQ.cpp | 85 ++- core/src/index/thirdparty/faiss/IndexIVFPQ.h | 19 +- .../index/thirdparty/faiss/IndexIVFPQR.cpp | 4 +- .../thirdparty/faiss/IndexIVFSpectralHash.cpp | 4 +- core/src/index/thirdparty/faiss/IndexLSH.h | 5 +- .../index/thirdparty/faiss/IndexLattice.cpp | 2 +- .../src/index/thirdparty/faiss/IndexLattice.h | 3 +- core/src/index/thirdparty/faiss/IndexPQ.cpp | 4 +- core/src/index/thirdparty/faiss/IndexPQ.h | 3 +- .../thirdparty/faiss/IndexPreTransform.cpp | 4 +- .../index/thirdparty/faiss/IndexReplicas.h | 3 +- .../thirdparty/faiss/IndexScalarQuantizer.cpp | 9 +- .../thirdparty/faiss/IndexScalarQuantizer.h | 1 - .../index/thirdparty/faiss/IndexShards.cpp | 3 +- core/src/index/thirdparty/faiss/IndexShards.h | 3 +- .../index/thirdparty/faiss/InvertedLists.cpp | 2 - .../index/thirdparty/faiss/InvertedLists.h | 2 +- core/src/index/thirdparty/faiss/Makefile | 2 +- .../index/thirdparty/faiss/MetaIndexes.cpp | 9 +- core/src/index/thirdparty/faiss/MetaIndexes.h | 6 +- core/src/index/thirdparty/faiss/MetricType.h | 41 ++ core/src/index/thirdparty/faiss/README.md | 6 +- .../faiss/acinclude/ax_check_cpu.m4 | 2 +- .../faiss/benchs/bench_all_ivf/datasets.py | 1 + .../thirdparty/faiss/benchs/bench_gpu_1bn.py | 6 +- .../faiss/benchs/bench_polysemous_1bn.py | 6 +- .../faiss/benchs/bench_polysemous_sift1m.py | 3 +- .../faiss/benchs/bench_vector_ops.py | 4 +- .../faiss/benchs/distributed_ondisk/README.md | 1 + .../distributed_ondisk/combined_index.py | 4 +- .../distributed_ondisk/distributed_kmeans.py | 6 +- .../distributed_query_demo.py | 1 
+ .../distributed_ondisk/make_index_vslice.py | 1 + .../distributed_ondisk/merge_to_ondisk.py | 2 +- .../faiss/benchs/distributed_ondisk/rpc.py | 5 +- .../distributed_ondisk/search_server.py | 1 + core/src/index/thirdparty/faiss/build.sh | 5 +- .../thirdparty/faiss/c_api/AutoTune_c.cpp | 12 +- .../index/thirdparty/faiss/c_api/AutoTune_c.h | 7 +- .../thirdparty/faiss/c_api/Clustering_c.cpp | 20 +- .../thirdparty/faiss/c_api/Clustering_c.h | 16 +- .../thirdparty/faiss/c_api/IndexIVF_c.cpp | 7 + .../index/thirdparty/faiss/c_api/IndexIVF_c.h | 7 + .../faiss/c_api/IndexPreTransform_c.cpp | 21 + .../faiss/c_api/IndexPreTransform_c.h | 32 + .../index/thirdparty/faiss/c_api/Index_c.cpp | 7 +- .../index/thirdparty/faiss/c_api/Index_c.h | 29 +- .../src/index/thirdparty/faiss/c_api/Makefile | 40 +- .../thirdparty/faiss/c_api/clone_index_c.cpp | 23 + .../thirdparty/faiss/c_api/clone_index_c.h | 32 + .../index/thirdparty/faiss/c_api/example_c.c | 1 + .../c_api/{ => impl}/AuxIndexStructures_c.cpp | 31 +- .../c_api/{ => impl}/AuxIndexStructures_c.h | 20 +- .../faiss/c_api/index_factory_c.cpp | 26 + .../thirdparty/faiss/c_api/index_factory_c.h | 30 + .../thirdparty/faiss/c_api/index_io_c.cpp | 9 +- .../index/thirdparty/faiss/c_api/index_io_c.h | 7 +- .../index/thirdparty/faiss/clone_index.cpp | 6 + .../thirdparty/faiss/demos/demo_auto_tune.py | 15 +- .../thirdparty/faiss/demos/demo_sift1M.cpp | 2 +- .../faiss/demos/demo_weighted_kmeans.cpp | 185 ++++++ .../example_makefiles/makefile.inc.Linux | 2 +- .../example_makefiles/makefile.inc.Mac.brew | 2 +- .../example_makefiles/makefile.inc.Mac.port | 2 +- .../index/thirdparty/faiss/gpu/GpuCloner.cpp | 5 +- .../index/thirdparty/faiss/gpu/GpuCloner.h | 6 +- .../thirdparty/faiss/gpu/GpuClonerOptions.cpp | 2 +- .../index/thirdparty/faiss/gpu/GpuDistance.cu | 188 +++--- .../index/thirdparty/faiss/gpu/GpuDistance.h | 93 +++ .../index/thirdparty/faiss/gpu/GpuIndex.cu | 23 +- .../src/index/thirdparty/faiss/gpu/GpuIndex.h | 7 + .../thirdparty/faiss/gpu/GpuIndexFlat.cu | 51 +- .../index/thirdparty/faiss/gpu/GpuIndexFlat.h | 12 +- .../index/thirdparty/faiss/gpu/GpuIndexIVF.cu | 55 +- .../index/thirdparty/faiss/gpu/GpuIndexIVF.h | 1 + .../thirdparty/faiss/gpu/GpuIndexIVFFlat.cu | 49 +- .../thirdparty/faiss/gpu/GpuIndexIVFPQ.cu | 21 +- .../faiss/gpu/GpuIndexIVFSQHybrid.cu | 6 +- .../faiss/gpu/GpuIndexIVFScalarQuantizer.cu | 10 +- .../faiss/gpu/StandardGpuResources.cpp | 8 + .../thirdparty/faiss/gpu/impl/Distance.cu | 152 +---- .../thirdparty/faiss/gpu/impl/Distance.cuh | 153 ++++- .../faiss/gpu/impl/DistanceUtils.cuh | 343 ++++++++++ .../thirdparty/faiss/gpu/impl/FlatIndex.cu | 148 ++--- .../thirdparty/faiss/gpu/impl/FlatIndex.cuh | 25 +- .../faiss/gpu/impl/GeneralDistance.cuh | 432 +++++++++++++ .../faiss/gpu/impl/GpuScalarQuantizer.cuh | 10 +- .../thirdparty/faiss/gpu/impl/IVFBase.cu | 4 + .../thirdparty/faiss/gpu/impl/IVFBase.cuh | 9 + .../thirdparty/faiss/gpu/impl/IVFFlat.cu | 12 +- .../thirdparty/faiss/gpu/impl/IVFFlat.cuh | 7 +- .../thirdparty/faiss/gpu/impl/IVFFlatScan.cu | 37 +- .../thirdparty/faiss/gpu/impl/IVFFlatScan.cuh | 4 +- .../index/thirdparty/faiss/gpu/impl/IVFPQ.cu | 92 ++- .../index/thirdparty/faiss/gpu/impl/IVFPQ.cuh | 19 + .../index/thirdparty/faiss/gpu/impl/L2Norm.cu | 42 +- .../thirdparty/faiss/gpu/impl/L2Norm.cuh | 2 +- .../thirdparty/faiss/gpu/impl/L2Select.cu | 41 +- .../thirdparty/faiss/gpu/impl/L2Select.cuh | 8 - .../faiss/gpu/impl/PQCodeDistances-inl.cuh | 561 ++++++++++++++++ .../faiss/gpu/impl/PQCodeDistances.cu | 150 +++-- 
.../faiss/gpu/impl/PQCodeDistances.cuh | 9 +- .../impl/PQScanMultiPassNoPrecomputed-inl.cuh | 606 ++++++++++++++++++ .../gpu/impl/PQScanMultiPassNoPrecomputed.cu | 402 ++++++------ .../gpu/impl/PQScanMultiPassNoPrecomputed.cuh | 7 +- .../thirdparty/faiss/gpu/perf/PerfFlat.cu | 1 - .../faiss/gpu/test/TestGpuDistance.cu | 121 ++-- .../faiss/gpu/test/TestGpuIndexFlat.cpp | 76 ++- .../faiss/gpu/test/TestGpuIndexIVFPQ.cpp | 112 +++- .../faiss/gpu/test/test_gpu_index.py | 50 +- .../faiss/gpu/test/test_pytorch_faiss.py | 12 +- .../faiss/gpu/utils/BlockSelectImpl.cuh | 106 +++ .../faiss/gpu/utils/ConversionOperators.cuh | 6 +- .../thirdparty/faiss/gpu/utils/DeviceUtils.cu | 17 +- .../thirdparty/faiss/gpu/utils/DeviceUtils.h | 6 + .../faiss/gpu/utils/DeviceVector.cuh | 3 +- .../thirdparty/faiss/gpu/utils/Float16.cuh | 6 +- .../faiss/gpu/utils/MathOperators.cuh | 38 +- .../faiss/gpu/utils/MatrixMult-inl.cuh | 160 +++++ .../thirdparty/faiss/gpu/utils/MatrixMult.cu | 187 ------ .../thirdparty/faiss/gpu/utils/MatrixMult.cuh | 38 +- .../thirdparty/faiss/gpu/utils/Tensor-inl.cuh | 33 +- .../thirdparty/faiss/gpu/utils/Tensor.cuh | 5 + .../gpu/utils/blockselect/BlockSelectImpl.cuh | 10 +- .../faiss/impl/AuxIndexStructures.cpp | 17 + .../faiss/impl/AuxIndexStructures.h | 17 +- core/src/index/thirdparty/faiss/impl/HNSW.cpp | 1 - .../faiss/impl/PolysemousTraining.h | 10 +- .../faiss/impl/ProductQuantizer-inl.h | 138 ++++ .../faiss/impl/ProductQuantizer.cpp | 114 +--- .../thirdparty/faiss/impl/ProductQuantizer.h | 115 ++-- .../thirdparty/faiss/impl/ScalarQuantizer.h | 1 - .../thirdparty/faiss/impl/index_read.cpp | 110 +++- .../thirdparty/faiss/impl/index_write.cpp | 106 ++- core/src/index/thirdparty/faiss/impl/io.cpp | 112 +++- core/src/index/thirdparty/faiss/impl/io.h | 38 ++ .../index/thirdparty/faiss/index_factory.cpp | 2 - .../index/thirdparty/faiss/index_factory.h | 3 +- .../index/thirdparty/faiss/python/faiss.py | 140 +++- .../index/thirdparty/faiss/python/setup.py | 2 +- .../thirdparty/faiss/python/swigfaiss.swig | 128 +++- .../index/thirdparty/faiss/tests/common.py | 33 +- .../faiss/tests/test_binary_factory.py | 2 +- .../faiss/tests/test_binary_hashindex.py | 183 ++++++ .../thirdparty/faiss/tests/test_binary_io.py | 4 +- .../faiss/tests/test_build_blocks.py | 161 ++++- .../faiss/tests/test_extra_distances.py | 6 + .../thirdparty/faiss/tests/test_factory.py | 14 +- .../thirdparty/faiss/tests/test_index.py | 139 ++-- .../faiss/tests/test_index_accuracy.py | 32 +- .../faiss/tests/test_index_binary.py | 113 +++- .../tests/test_index_binary_from_float.py | 1 + .../faiss/tests/test_index_composite.py | 9 +- .../index/thirdparty/faiss/tests/test_io.py | 220 +++++++ .../thirdparty/faiss/tests/test_ivflib.py | 61 +- .../thirdparty/faiss/tests/test_merge.cpp | 1 - .../thirdparty/faiss/tests/test_meta_index.py | 2 +- .../faiss/tests/test_omp_threads_py.py | 1 + .../faiss/tests/test_pq_encoding.cpp | 12 +- .../faiss/tests/test_referenced_objects.py | 5 +- .../faiss/tutorial/cpp/5-Multiple-GPUs.cpp | 100 +++ .../thirdparty/faiss/tutorial/cpp/Makefile | 2 +- .../thirdparty/faiss/utils/distances.cpp | 9 +- .../faiss/utils/extra_distances.cpp | 24 +- .../index/thirdparty/faiss/utils/hamming.cpp | 98 ++- .../index/thirdparty/faiss/utils/hamming.h | 24 +- .../thirdparty/faiss/utils/jaccard-inl.h | 4 - .../index/thirdparty/faiss/utils/utils.cpp | 96 --- core/src/index/thirdparty/faiss/utils/utils.h | 15 - 197 files changed, 8168 insertions(+), 2206 deletions(-) create mode 100644 
core/src/index/thirdparty/faiss/DirectMap.cpp create mode 100644 core/src/index/thirdparty/faiss/DirectMap.h create mode 100644 core/src/index/thirdparty/faiss/IndexBinaryHash.cpp create mode 100644 core/src/index/thirdparty/faiss/IndexBinaryHash.h create mode 100644 core/src/index/thirdparty/faiss/MetricType.h create mode 100644 core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.cpp create mode 100644 core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.h create mode 100644 core/src/index/thirdparty/faiss/c_api/clone_index_c.cpp create mode 100644 core/src/index/thirdparty/faiss/c_api/clone_index_c.h rename core/src/index/thirdparty/faiss/c_api/{ => impl}/AuxIndexStructures_c.cpp (87%) rename core/src/index/thirdparty/faiss/c_api/{ => impl}/AuxIndexStructures_c.h (86%) create mode 100644 core/src/index/thirdparty/faiss/c_api/index_factory_c.cpp create mode 100644 core/src/index/thirdparty/faiss/c_api/index_factory_c.h create mode 100644 core/src/index/thirdparty/faiss/demos/demo_weighted_kmeans.cpp create mode 100644 core/src/index/thirdparty/faiss/gpu/impl/DistanceUtils.cuh create mode 100644 core/src/index/thirdparty/faiss/gpu/impl/GeneralDistance.cuh create mode 100644 core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh create mode 100644 core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh create mode 100644 core/src/index/thirdparty/faiss/gpu/utils/BlockSelectImpl.cuh create mode 100644 core/src/index/thirdparty/faiss/gpu/utils/MatrixMult-inl.cuh create mode 100644 core/src/index/thirdparty/faiss/impl/ProductQuantizer-inl.h create mode 100644 core/src/index/thirdparty/faiss/tests/test_binary_hashindex.py create mode 100644 core/src/index/thirdparty/faiss/tests/test_io.py create mode 100644 core/src/index/thirdparty/faiss/tutorial/cpp/5-Multiple-GPUs.cpp diff --git a/core/src/index/thirdparty/faiss/Clustering.cpp b/core/src/index/thirdparty/faiss/Clustering.cpp index eb414afa57..eba243d17d 100644 --- a/core/src/index/thirdparty/faiss/Clustering.cpp +++ b/core/src/index/thirdparty/faiss/Clustering.cpp @@ -10,11 +10,12 @@ #include #include - #include #include #include +#include + #include #include #include @@ -33,7 +34,8 @@ ClusteringParameters::ClusteringParameters (): frozen_centroids(false), min_points_per_centroid(39), max_points_per_centroid(256), - seed(1234) + seed(1234), + decode_block_size(32768) {} // 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k @@ -76,35 +78,233 @@ void Clustering::post_process_centroids () } -void Clustering::train (idx_t nx, const float *x_in, Index & index) { +void Clustering::train (idx_t nx, const float *x_in, Index & index, + const float *weights) { + train_encoded (nx, reinterpret_cast(x_in), nullptr, + index, weights); +} + + +namespace { + +using idx_t = Clustering::idx_t; + +idx_t subsample_training_set( + const Clustering &clus, idx_t nx, const uint8_t *x, + size_t line_size, const float * weights, + uint8_t **x_out, + float **weights_out +) +{ + if (clus.verbose) { + printf("Sampling a subset of %ld / %ld for training\n", + clus.k * clus.max_points_per_centroid, nx); + } + std::vector perm (nx); + rand_perm (perm.data (), nx, clus.seed); + nx = clus.k * clus.max_points_per_centroid; + uint8_t * x_new = new uint8_t [nx * line_size]; + *x_out = x_new; + for (idx_t i = 0; i < nx; i++) { + memcpy (x_new + i * line_size, x + perm[i] * line_size, line_size); + } + if (weights) { + float *weights_new = new float[nx]; + for (idx_t i = 0; i < nx; i++) { + weights_new[i] = 
weights[perm[i]];
+        }
+        *weights_out = weights_new;
+    } else {
+        *weights_out = nullptr;
+    }
+    return nx;
+}
+
+/** compute centroids as (weighted) sum of training points
+ *
+ * @param x            training vectors, size n * code_size (from codec)
+ * @param codec        how to decode the vectors (if NULL then cast to float*)
+ * @param weights      per-training vector weight, size n (or NULL)
+ * @param assign       nearest centroid for each training vector, size n
+ * @param k_frozen     do not update the k_frozen first centroids
+ * @param centroids    centroid vectors (output only), size k * d
+ * @param hassign      histogram of assignments per centroid (size k),
+ *                     should be 0 on input
+ *
+ */
+
+void compute_centroids (size_t d, size_t k, size_t n,
+                        size_t k_frozen,
+                        const uint8_t * x, const Index *codec,
+                        const int64_t * assign,
+                        const float * weights,
+                        float * hassign,
+                        float * centroids)
+{
+    k -= k_frozen;
+    centroids += k_frozen * d;
+
+    memset (centroids, 0, sizeof(*centroids) * d * k);
+
+    size_t line_size = codec ? codec->sa_code_size() : d * sizeof (float);
+
+#pragma omp parallel
+    {
+        int nt = omp_get_num_threads();
+        int rank = omp_get_thread_num();
+
+        // this thread is taking care of centroids c0:c1
+        size_t c0 = (k * rank) / nt;
+        size_t c1 = (k * (rank + 1)) / nt;
+        std::vector<float> decode_buffer (d);
+
+        for (size_t i = 0; i < n; i++) {
+            int64_t ci = assign[i];
+            assert (ci >= 0 && ci < k + k_frozen);
+            ci -= k_frozen;
+            if (ci >= c0 && ci < c1) {
+                float * c = centroids + ci * d;
+                const float * xi;
+                if (!codec) {
+                    xi = reinterpret_cast<const float *>(x + i * line_size);
+                } else {
+                    float *xif = decode_buffer.data();
+                    codec->sa_decode (1, x + i * line_size, xif);
+                    xi = xif;
+                }
+                if (weights) {
+                    float w = weights[i];
+                    hassign[ci] += w;
+                    for (size_t j = 0; j < d; j++) {
+                        c[j] += xi[j] * w;
+                    }
+                } else {
+                    hassign[ci] += 1.0;
+                    for (size_t j = 0; j < d; j++) {
+                        c[j] += xi[j];
+                    }
+                }
+            }
+        }
+
+    }
+
+#pragma omp parallel for
+    for (size_t ci = 0; ci < k; ci++) {
+        if (hassign[ci] == 0) {
+            continue;
+        }
+        float norm = 1 / hassign[ci];
+        float * c = centroids + ci * d;
+        for (size_t j = 0; j < d; j++) {
+            c[j] *= norm;
+        }
+    }
+
+}
+
+// a bit above machine epsilon for float16
+#define EPS (1 / 1024.)
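// Editor's sketch (not part of the patch): the serial, unweighted core of
// compute_centroids above, for raw float vectors and k_frozen == 0. Each
// centroid becomes the mean of its assigned points; with weights, both the
// accumulated sum and the count in hassign become weighted. The function
// name is illustrative, not faiss API.
void centroid_means_sketch (size_t d, size_t k, size_t n,
                            const float *x, const int64_t *assign,
                            float *hassign, float *centroids)
{
    for (size_t j = 0; j < d * k; j++) {
        centroids[j] = 0;               // same role as the memset above
    }
    for (size_t i = 0; i < n; i++) {    // accumulate per-centroid sums
        int64_t ci = assign[i];
        hassign[ci] += 1.0f;
        for (size_t j = 0; j < d; j++) {
            centroids[ci * d + j] += x[i * d + j];
        }
    }
    for (size_t ci = 0; ci < k; ci++) { // normalize by assignment count
        if (hassign[ci] > 0) {
            for (size_t j = 0; j < d; j++) {
                centroids[ci * d + j] /= hassign[ci];
            }
        }
    }
}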
+/** Handle empty clusters by splitting larger ones.
+ *
+ * It works by slightly changing the centroids to make 2 clusters from
+ * a single one. Takes the same arguments as compute_centroids.
+ *
+ * @return nb of splitting operations (larger is worse)
+ */
+int split_clusters (size_t d, size_t k, size_t n,
+                    size_t k_frozen,
+                    float * hassign,
+                    float * centroids)
+{
+    k -= k_frozen;
+    centroids += k_frozen * d;
+
+    /* Take care of void clusters */
+    size_t nsplit = 0;
+    RandomGenerator rng (1234);
+    for (size_t ci = 0; ci < k; ci++) {
+        if (hassign[ci] == 0) { /* need to redefine a centroid */
+            size_t cj;
+            for (cj = 0; 1; cj = (cj + 1) % k) {
+                /* probability to pick this cluster for split */
+                float p = (hassign[cj] - 1.0) / (float) (n - k);
+                float r = rng.rand_float ();
+                if (r < p) {
+                    break; /* found our cluster to be split */
+                }
+            }
+            memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
+
+            /* small symmetric perturbation */
+            for (size_t j = 0; j < d; j++) {
+                if (j % 2 == 0) {
+                    centroids[ci * d + j] *= 1 + EPS;
+                    centroids[cj * d + j] *= 1 - EPS;
+                } else {
+                    centroids[ci * d + j] *= 1 - EPS;
+                    centroids[cj * d + j] *= 1 + EPS;
+                }
+            }
+
+            /* assume even split of the cluster */
+            hassign[ci] = hassign[cj] / 2;
+            hassign[cj] -= hassign[ci];
+            nsplit++;
+        }
+    }
+
+    return nsplit;
+
+}
+
+
+
+};
+
+
+void Clustering::train_encoded (idx_t nx, const uint8_t *x_in,
+                                const Index * codec, Index & index,
+                                const float *weights) {
+
     FAISS_THROW_IF_NOT_FMT (nx >= k,
             "Number of training points (%ld) should be at least "
             "as large as number of clusters (%ld)", nx, k);
 
+    FAISS_THROW_IF_NOT_FMT ((!codec || codec->d == d),
+            "Codec dimension %d not the same as data dimension %d",
+            int(codec->d), int(d));
+
+    FAISS_THROW_IF_NOT_FMT (index.d == d,
+            "Index dimension %d not the same as data dimension %d",
+            int(index.d), int(d));
+
     double t0 = getmillisecs();
 
-    // yes it is the user's responsibility, but it may spare us some
-    // hard-to-debug reports.
-    for (size_t i = 0; i < nx * d; i++) {
-        FAISS_THROW_IF_NOT_MSG (finite (x_in[i]),
-                                "input contains NaN's or Inf's");
+    if (!codec) {
+        // Check for NaNs in input data. Normally it is the user's
+        // responsibility, but it may spare us some hard-to-debug
+        // reports.
+        const float *x = reinterpret_cast<const float *>(x_in);
+        for (size_t i = 0; i < nx * d; i++) {
+            FAISS_THROW_IF_NOT_MSG (finite (x[i]),
+                                    "input contains NaN's or Inf's");
+        }
     }
 
-    const float *x = x_in;
-    ScopeDeleter<float> del1;
+    const uint8_t *x = x_in;
+    std::unique_ptr<uint8_t []> del1;
+    std::unique_ptr<float []> del3;
+    size_t line_size = codec ?
codec->sa_code_size() : sizeof(float) * d; if (nx > k * max_points_per_centroid) { - if (verbose) - printf("Sampling a subset of %ld / %ld for training\n", - k * max_points_per_centroid, nx); - std::vector perm (nx); - rand_perm (perm.data (), nx, seed); - nx = k * max_points_per_centroid; - float * x_new = new float [nx * d]; - for (idx_t i = 0; i < nx; i++) - memcpy (x_new + i * d, x + perm[i] * d, sizeof(x_new[0]) * d); - x = x_new; - del1.set (x); + uint8_t *x_new; + float *weights_new; + nx = subsample_training_set (*this, nx, x, line_size, weights, + &x_new, &weights_new); + del1.reset (x_new); x = x_new; + del3.reset (weights_new); weights = weights_new; } else if (nx < k * min_points_per_centroid) { fprintf (stderr, "WARNING clustering %ld points to %ld centroids: " @@ -112,41 +312,53 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { nx, k, idx_t(k) * min_points_per_centroid); } - if (nx == k) { + // this is a corner case, just copy training set to clusters if (verbose) { printf("Number of training points (%ld) same as number of " "clusters, just copying\n", nx); } - // this is a corner case, just copy training set to clusters centroids.resize (d * k); - memcpy (centroids.data(), x_in, sizeof (*x_in) * d * k); + if (!codec) { + memcpy (centroids.data(), x_in, sizeof (float) * d * k); + } else { + codec->sa_decode (nx, x_in, centroids.data()); + } + + // one fake iteration... + ClusteringIterationStats stats = { 0.0, 0.0, 0.0, 1.0, 0 }; + iteration_stats.push_back (stats); + index.reset(); - index.add(k, x_in); + index.add(k, centroids.data()); return; } - if (verbose) + if (verbose) { printf("Clustering %d points in %ldD to %ld clusters, " "redo %d times, %d iterations\n", int(nx), d, k, nredo, niter); + if (codec) { + printf("Input data encoded in %ld bytes per vector\n", + codec->sa_code_size ()); + } + } - idx_t * assign = new idx_t[nx]; - ScopeDeleter del (assign); - float * dis = new float[nx]; - ScopeDeleter del2(dis); + std::unique_ptr assign(new idx_t[nx]); + std::unique_ptr dis(new float[nx]); - // for redo + // remember best iteration for redo float best_err = HUGE_VALF; - std::vector best_obj; + std::vector best_obj; std::vector best_centroids; // support input centroids FAISS_THROW_IF_NOT_MSG ( centroids.size() % d == 0, - "size of provided input centroids not a multiple of dimension"); + "size of provided input centroids not a multiple of dimension" + ); size_t n_input_centroids = centroids.size() / d; @@ -162,23 +374,36 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { } t0 = getmillisecs(); + // temporary buffer to decode vectors during the optimization + std::vector decode_buffer + (codec ? 
d * decode_block_size : 0); + for (int redo = 0; redo < nredo; redo++) { if (verbose && nredo > 1) { printf("Outer iteration %d / %d\n", redo, nredo); } - // initialize remaining centroids with random points from the dataset + // initialize (remaining) centroids with random points from the dataset centroids.resize (d * k); std::vector perm (nx); rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L); - for (int i = n_input_centroids; i < k ; i++) - memcpy (¢roids[i * d], x + perm[i] * d, - d * sizeof (float)); + + if (!codec) { + for (int i = n_input_centroids; i < k ; i++) { + memcpy (¢roids[i * d], x + perm[i] * line_size, line_size); + } + } else { + for (int i = n_input_centroids; i < k ; i++) { + codec->sa_decode (1, x + perm[i] * line_size, ¢roids[i * d]); + } + } post_process_centroids (); + // prepare the index + if (index.ntotal != 0) { index.reset(); } @@ -188,49 +413,89 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { } index.add (k, centroids.data()); + + // k-means iterations + float err = 0; for (int i = 0; i < niter; i++) { double t0s = getmillisecs(); - index.assign(nx, x, assign, dis); + + if (!codec) { + index.assign (nx, reinterpret_cast(x), + assign.get(), dis.get()); + } else { + // search by blocks of decode_block_size vectors + size_t code_size = codec->sa_code_size (); + for (size_t i0 = 0; i0 < nx; i0 += decode_block_size) { + size_t i1 = i0 + decode_block_size; + if (i1 > nx) { i1 = nx; } + codec->sa_decode (i1 - i0, x + code_size * i0, + decode_buffer.data ()); + index.search (i1 - i0, decode_buffer.data (), 1, + dis.get() + i0, assign.get() + i0); + } + } + InterruptCallback::check(); t_search_tot += getmillisecs() - t0s; + // accumulate error err = 0; - for (int j = 0; j < nx; j++) + for (int j = 0; j < nx; j++) { err += dis[j]; - obj.push_back (err); + } - int nsplit = km_update_centroids ( - x, centroids.data(), - assign, d, k, nx, frozen_centroids ? n_input_centroids : 0); + // update the centroids + std::vector hassign (k); + + size_t k_frozen = frozen_centroids ? 
n_input_centroids : 0; + compute_centroids ( + d, k, nx, k_frozen, + x, codec, assign.get(), weights, + hassign.data(), centroids.data() + ); + + int nsplit = split_clusters ( + d, k, nx, k_frozen, + hassign.data(), centroids.data() + ); + + // collect statistics + ClusteringIterationStats stats = + { err, (getmillisecs() - t0) / 1000.0, + t_search_tot / 1000, imbalance_factor (nx, k, assign.get()), + nsplit }; + iteration_stats.push_back(stats); if (verbose) { printf (" Iteration %d (%.2f s, search %.2f s): " "objective=%g imbalance=%.3f nsplit=%d \r", - i, (getmillisecs() - t0) / 1000.0, - t_search_tot / 1000, - err, imbalance_factor (nx, k, assign), - nsplit); + i, stats.time, stats.time_search, stats.obj, + stats.imbalance_factor, nsplit); fflush (stdout); } post_process_centroids (); - index.reset (); - if (update_index) - index.train (k, centroids.data()); + // add centroids to index for the next iteration (or for output) + + index.reset (); + if (update_index) { + index.train (k, centroids.data()); + } - assert (index.ntotal == 0); index.add (k, centroids.data()); InterruptCallback::check (); } + if (verbose) printf("\n"); if (nredo > 1) { if (err < best_err) { - if (verbose) + if (verbose) { printf ("Objective improved: keep new clusters\n"); + } best_centroids = centroids; - best_obj = obj; + best_obj = iteration_stats; best_err = err; } index.reset (); @@ -238,7 +503,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) { } if (nredo > 1) { centroids = best_centroids; - obj = best_obj; + iteration_stats = best_obj; index.reset(); index.add(k, best_centroids.data()); } @@ -255,7 +520,7 @@ float kmeans_clustering (size_t d, size_t n, size_t k, IndexFlatL2 index (d); clus.train (n, x, index); memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k); - return clus.obj.back(); + return clus.iteration_stats.back().obj; } } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/Clustering.h b/core/src/index/thirdparty/faiss/Clustering.h index fd51ef599b..46410af79f 100644 --- a/core/src/index/thirdparty/faiss/Clustering.h +++ b/core/src/index/thirdparty/faiss/Clustering.h @@ -26,7 +26,7 @@ struct ClusteringParameters { bool verbose; bool spherical; ///< do we want normalized centroids? bool int_centroids; ///< round centroids coordinates to integer - bool update_index; ///< update index after each iteration? + bool update_index; ///< re-train index after each iteration? bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations int min_points_per_centroid; ///< otherwise you get a warning @@ -34,12 +34,23 @@ struct ClusteringParameters { int seed; ///< seed for the random number generator + size_t decode_block_size; ///< how many vectors at a time to decode + /// sets reasonable defaults ClusteringParameters (); }; -/** clustering based on assignment - centroid update iterations +struct ClusteringIterationStats { + float obj; ///< objective values (sum of distances reported by index) + double time; ///< seconds for iteration + double time_search; ///< seconds for just search + double imbalance_factor; ///< imbalance factor of iteration + int nsplit; ///< number of cluster splits +}; + + +/** K-means clustering based on assignment - centroid update iterations * * The clustering is based on an Index object that assigns training * points to the centroids. 
Therefore, at each iteration the centroids
@@ -50,27 +61,44 @@ struct ClusteringParameters {
  * centroids table if it is not empty on input, it is also used for
  * initialization.
  *
- * To do several clusterings, just call train() several times on
- * different training sets, clearing the centroid table in between.
  */
 struct Clustering: ClusteringParameters {
     typedef Index::idx_t idx_t;
 
     size_t d;              ///< dimension of the vectors
     size_t k;              ///< nb of centroids
 
-    /// centroids (k * d)
+    /** centroids (k * d)
+     * if centroids are set on input to train, they will be used as initialization
+     */
     std::vector<float> centroids;
 
-    /// objective values (sum of distances reported by index) over
-    /// iterations
-    std::vector<float> obj;
+    /// stats at every iteration of clustering
+    std::vector<ClusteringIterationStats> iteration_stats;
 
-    /// the only mandatory parameters are k and d
     Clustering (int d, int k);
     Clustering (int d, int k, const ClusteringParameters &cp);
 
-    /// Index is used during the assignment stage
-    virtual void train (idx_t n, const float * x, faiss::Index & index);
+    /** run k-means training
+     *
+     * @param x          training vectors, size n * d
+     * @param index      index used for assignment
+     * @param x_weights  weight associated to each vector: NULL or size n
+     */
+    virtual void train (idx_t n, const float * x, faiss::Index & index,
+                        const float *x_weights = nullptr);
+
+
+    /** run with encoded vectors
+     *
+     * in addition to train()'s parameters, takes a codec as parameter
+     * to decode the input vectors.
+     *
+     * @param codec      codec used to decode the vectors (nullptr =
+     *                   vectors are in fact floats)
+     */
+    void train_encoded (idx_t nx, const uint8_t *x_in,
+                        const Index * codec, Index & index,
+                        const float *weights = nullptr);
 
     /// Post-process the centroids after each centroid update.
     /// includes optional L2 normalization and nearest integer rounding
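// Editor's sketch (not part of the patch): how the new `weights` argument
// of Clustering::train and the `iteration_stats` member shown above fit
// together. The calls follow the faiss 1.6.3 headers in this patch; the
// wrapper function itself is illustrative.
#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <cstdio>

void weighted_kmeans_sketch (size_t d, size_t k, size_t n,
                             const float *x, const float *weights)
{
    faiss::Clustering clus (d, k);
    faiss::IndexFlatL2 index (d);       // index used for assignment
    clus.train (n, x, index, weights);  // weights == nullptr: plain k-means
    // per-iteration statistics replace the former `obj` vector
    for (const auto &st : clus.iteration_stats) {
        printf ("obj=%g imbalance=%.3f nsplit=%d\n",
                st.obj, st.imbalance_factor, st.nsplit);
    }
}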
diff --git a/core/src/index/thirdparty/faiss/DirectMap.cpp b/core/src/index/thirdparty/faiss/DirectMap.cpp
new file mode 100644
index 0000000000..bd3cf5460f
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/DirectMap.cpp
@@ -0,0 +1,267 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#include <faiss/DirectMap.h>
+
+#include <cassert>
+#include <cstdio>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/AuxIndexStructures.h>
+
+namespace faiss {
+
+DirectMap::DirectMap(): type(NoMap)
+{}
+
+void DirectMap::set_type (Type new_type, const InvertedLists *invlists, size_t ntotal) {
+
+    FAISS_THROW_IF_NOT (new_type == NoMap || new_type == Array ||
+                        new_type == Hashtable);
+
+    if (new_type == type) {
+        // nothing to do
+        return;
+    }
+
+    array.clear ();
+    hashtable.clear ();
+    type = new_type;
+
+    if (new_type == NoMap) {
+        return;
+    } else if (new_type == Array) {
+        array.resize (ntotal, -1);
+    } else if (new_type == Hashtable) {
+        hashtable.reserve (ntotal);
+    }
+
+    for (size_t key = 0; key < invlists->nlist; key++) {
+        size_t list_size = invlists->list_size (key);
+        InvertedLists::ScopedIds idlist (invlists, key);
+
+        if (new_type == Array) {
+            for (long ofs = 0; ofs < list_size; ofs++) {
+                FAISS_THROW_IF_NOT_MSG (
+                    0 <= idlist [ofs] && idlist[ofs] < ntotal,
+                    "direct map supported only for sequential ids");
+                array [idlist [ofs]] = lo_build(key, ofs);
+            }
+        } else if (new_type == Hashtable) {
+            for (long ofs = 0; ofs < list_size; ofs++) {
+                hashtable [idlist [ofs]] = lo_build(key, ofs);
+            }
+        }
+    }
+}
+
+void DirectMap::clear()
+{
+    array.clear ();
+    hashtable.clear ();
+}
+
+
+DirectMap::idx_t DirectMap::get (idx_t key) const
+{
+    if (type == Array) {
+        FAISS_THROW_IF_NOT_MSG (
+            key >= 0 && key < array.size(), "invalid key"
+        );
+        idx_t lo = array[key];
+        FAISS_THROW_IF_NOT_MSG(lo >= 0, "-1 entry in direct_map");
+        return lo;
+    } else if (type == Hashtable) {
+        auto res = hashtable.find (key);
+        FAISS_THROW_IF_NOT_MSG (res != hashtable.end(), "key not found");
+        return res->second;
+    } else {
+        FAISS_THROW_MSG ("direct map not initialized");
+    }
+}
+
+
+
+void DirectMap::add_single_id (idx_t id, idx_t list_no, size_t offset)
+{
+    if (type == NoMap) return;
+
+    if (type == Array) {
+        assert (id == array.size());
+        if (list_no >= 0) {
+            array.push_back (lo_build (list_no, offset));
+        } else {
+            array.push_back (-1);
+        }
+    } else if (type == Hashtable) {
+        if (list_no >= 0) {
+            hashtable[id] = lo_build (list_no, offset);
+        }
+    }
+
+}
+
+void DirectMap::check_can_add (const idx_t *ids) {
+    if (type == Array && ids) {
+        FAISS_THROW_MSG ("cannot have array direct map and add with ids");
+    }
+}
+
+/********************* DirectMapAdd implementation */
+
+
+DirectMapAdd::DirectMapAdd (DirectMap &direct_map, size_t n, const idx_t *xids):
+    direct_map(direct_map), type(direct_map.type), n(n), xids(xids)
+{
+    if (type == DirectMap::Array) {
+        FAISS_THROW_IF_NOT (xids == nullptr);
+        ntotal = direct_map.array.size();
+        direct_map.array.resize (ntotal + n, -1);
+    } else if (type == DirectMap::Hashtable) {
+        // can't parallel update hashtable so use temp array
+        all_ofs.resize (n, -1);
+    }
+}
+
+
+void DirectMapAdd::add (size_t i, idx_t list_no, size_t ofs)
+{
+    if (type == DirectMap::Array) {
+        direct_map.array [ntotal + i] = lo_build (list_no, ofs);
+    } else if (type == DirectMap::Hashtable) {
+        all_ofs [i] = lo_build (list_no, ofs);
+    }
+}
+
+DirectMapAdd::~DirectMapAdd ()
+{
+    if (type == DirectMap::Hashtable) {
+        for (int i = 0; i < n; i++) {
+            idx_t id = xids ?
xids[i] : ntotal + i; + direct_map.hashtable [id] = all_ofs [i]; + } + } +} + +/********************************************************/ + +using ScopedCodes = InvertedLists::ScopedCodes; +using ScopedIds = InvertedLists::ScopedIds; + + +size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists *invlists) +{ + size_t nlist = invlists->nlist; + std::vector toremove(nlist); + + size_t nremove = 0; + + if (type == NoMap) { + // exhaustive scan of IVF +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + idx_t l0 = invlists->list_size (i), l = l0, j = 0; + ScopedIds idsi (invlists, i); + while (j < l) { + if (sel.is_member (idsi[j])) { + l--; + invlists->update_entry ( + i, j, + invlists->get_single_id (i, l), + ScopedCodes (invlists, i, l).get() + ); + } else { + j++; + } + } + toremove[i] = l0 - l; + } + // this will not run well in parallel on ondisk because of + // possible shrinks + for (idx_t i = 0; i < nlist; i++) { + if (toremove[i] > 0) { + nremove += toremove[i]; + invlists->resize(i, invlists->list_size(i) - toremove[i]); + } + } + } else if (type == Hashtable) { + const IDSelectorArray *sela = + dynamic_cast(&sel); + FAISS_THROW_IF_NOT_MSG ( + sela, + "remove with hashtable works only with IDSelectorArray" + ); + + for (idx_t i = 0; i < sela->n; i++) { + idx_t id = sela->ids[i]; + auto res = hashtable.find (id); + if (res != hashtable.end()) { + size_t list_no = lo_listno (res->second); + size_t offset = lo_offset (res->second); + idx_t last = invlists->list_size (list_no) - 1; + hashtable.erase (res); + if (offset < last) { + idx_t last_id = invlists->get_single_id (list_no, last); + invlists->update_entry ( + list_no, offset, + last_id, + ScopedCodes (invlists, list_no, last).get() + ); + // update hash entry for last element + hashtable [last_id] = list_no << 32 | offset; + } + invlists->resize(list_no, last); + nremove++; + } + } + + } else { + FAISS_THROW_MSG("remove not supported with this direct_map format"); + } + return nremove; +} + +void DirectMap::update_codes (InvertedLists *invlists, + int n, const idx_t *ids, + const idx_t *assign, + const uint8_t *codes) +{ + FAISS_THROW_IF_NOT (type == Array); + + size_t code_size = invlists->code_size; + + for (size_t i = 0; i < n; i++) { + idx_t id = ids[i]; + FAISS_THROW_IF_NOT_MSG (0 <= id && id < array.size(), + "id to update out of range"); + { // remove old one + idx_t dm = array [id]; + int64_t ofs = lo_offset (dm); + int64_t il = lo_listno (dm); + size_t l = invlists->list_size (il); + if (ofs != l - 1) { // move l - 1 to ofs + int64_t id2 = invlists->get_single_id (il, l - 1); + array[id2] = lo_build (il, ofs); + invlists->update_entry (il, ofs, id2, + invlists->get_single_code (il, l - 1)); + } + invlists->resize (il, l - 1); + } + { // insert new one + int64_t il = assign[i]; + size_t l = invlists->list_size (il); + idx_t dm = lo_build (il, l); + array [id] = dm; + invlists->add_entry (il, id, codes + i * code_size); + } + } +} + + +} diff --git a/core/src/index/thirdparty/faiss/DirectMap.h b/core/src/index/thirdparty/faiss/DirectMap.h new file mode 100644 index 0000000000..27ea1c7260 --- /dev/null +++ b/core/src/index/thirdparty/faiss/DirectMap.h @@ -0,0 +1,120 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#ifndef FAISS_DIRECT_MAP_H +#define FAISS_DIRECT_MAP_H + +#include +#include + + +namespace faiss { + +// When offsets list id + offset are encoded in an uint64 +// we call this LO = list-offset + +inline uint64_t lo_build (uint64_t list_id, uint64_t offset) { + return list_id << 32 | offset; +} + +inline uint64_t lo_listno (uint64_t lo) { + return lo >> 32; +} + +inline uint64_t lo_offset (uint64_t lo) { + return lo & 0xffffffff; +} + +/** + * Direct map: a way to map back from ids to inverted lists + */ +struct DirectMap { + typedef Index::idx_t idx_t; + + enum Type { + NoMap = 0, // default + Array = 1, // sequential ids (only for add, no add_with_ids) + Hashtable = 2 // arbitrary ids + }; + Type type; + + /// map for direct access to the elements. Map ids to LO-encoded entries. + std::vector array; + std::unordered_map hashtable; + + DirectMap(); + + /// set type and initialize + void set_type (Type new_type, const InvertedLists *invlists, size_t ntotal); + + /// get an entry + idx_t get (idx_t id) const; + + /// for quick checks + bool no () const {return type == NoMap; } + + /** + * update the direct_map + */ + + /// throw if Array and ids is not NULL + void check_can_add (const idx_t *ids); + + /// non thread-safe version + void add_single_id (idx_t id, idx_t list_no, size_t offset); + + /// remove all entries + void clear(); + + /** + * operations on inverted lists that require translation with a DirectMap + */ + + /// remove ids from the InvertedLists, possibly using the direct map + size_t remove_ids(const IDSelector& sel, InvertedLists *invlists); + + /// update entries, using the direct map + void update_codes (InvertedLists *invlists, + int n, const idx_t *ids, + const idx_t *list_nos, + const uint8_t *codes); + + + +}; + +/// Thread-safe way of updating the direct_map +struct DirectMapAdd { + + typedef Index::idx_t idx_t; + + using Type = DirectMap::Type; + + DirectMap &direct_map; + DirectMap::Type type; + size_t ntotal; + size_t n; + const idx_t *xids; + + std::vector all_ofs; + + DirectMapAdd (DirectMap &direct_map, size_t n, const idx_t *xids); + + /// add vector i (with id xids[i]) at list_no and offset + void add (size_t i, idx_t list_no, size_t offset); + + ~DirectMapAdd (); +}; + + + +} + + +#endif diff --git a/core/src/index/thirdparty/faiss/IVFlib.cpp b/core/src/index/thirdparty/faiss/IVFlib.cpp index 9af93e38dc..098b729357 100644 --- a/core/src/index/thirdparty/faiss/IVFlib.cpp +++ b/core/src/index/thirdparty/faiss/IVFlib.cpp @@ -13,6 +13,7 @@ #include #include +#include #include @@ -56,17 +57,35 @@ void check_compatible_for_merge (const Index * index0, } -const IndexIVF * extract_index_ivf (const Index * index) +const IndexIVF * try_extract_index_ivf (const Index * index) { if (auto *pt = dynamic_cast(index)) { index = pt->index; } + if (auto *idmap = + dynamic_cast(index)) { + index = idmap->index; + } + if (auto *idmap = + dynamic_cast(index)) { + index = idmap->index; + } + auto *ivf = dynamic_cast(index); - FAISS_THROW_IF_NOT (ivf); + return ivf; +} +IndexIVF * try_extract_index_ivf (Index * index) { + return const_cast (try_extract_index_ivf ((const Index*)(index))); +} + +const IndexIVF * extract_index_ivf (const Index * index) +{ + const IndexIVF *ivf = try_extract_index_ivf (index); + FAISS_THROW_IF_NOT (ivf); return ivf; } @@ -74,6 +93,7 @@ IndexIVF * extract_index_ivf (Index * index) { return const_cast (extract_index_ivf ((const Index*)(index))); } + void merge_into(faiss::Index *index0, faiss::Index *index1, bool shift_ids) { 
check_compatible_for_merge (index0, index1); @@ -146,8 +166,8 @@ void search_and_return_centroids(faiss::Index *index, if (result_centroid_ids) result_centroid_ids[i] = -1; } else { - long list_no = label >> 32; - long list_index = label & 0xffffffff; + long list_no = lo_listno (label); + long list_index = lo_offset (label); if (result_centroid_ids) result_centroid_ids[i] = list_no; labels[i] = index_ivf->invlists->get_single_id(list_no, list_index); diff --git a/core/src/index/thirdparty/faiss/IVFlib.h b/core/src/index/thirdparty/faiss/IVFlib.h index 7b6f3157ea..879fd19086 100644 --- a/core/src/index/thirdparty/faiss/IVFlib.h +++ b/core/src/index/thirdparty/faiss/IVFlib.h @@ -35,6 +35,10 @@ void check_compatible_for_merge (const Index * index1, const IndexIVF * extract_index_ivf (const Index * index); IndexIVF * extract_index_ivf (Index * index); +/// same as above but returns nullptr instead of throwing on failure +const IndexIVF * try_extract_index_ivf (const Index * index); +IndexIVF * try_extract_index_ivf (Index * index); + /** Merge index1 into index0. Works on IndexIVF's and IndexIVF's * embedded in a IndexPreTransform. On output, the index1 is empty. * diff --git a/core/src/index/thirdparty/faiss/Index.cpp b/core/src/index/thirdparty/faiss/Index.cpp index 72d7b76280..81e8baa5fb 100644 --- a/core/src/index/thirdparty/faiss/Index.cpp +++ b/core/src/index/thirdparty/faiss/Index.cpp @@ -36,7 +36,7 @@ void Index::range_search (idx_t , const float *, float, FAISS_THROW_MSG ("range search not implemented"); } -void Index::assign (idx_t n, const float *x, idx_t *labels, float *distance) +void Index::assign (idx_t n, const float* x, idx_t* labels, float* distance) { float *dis_inner = (distance == nullptr) ? new float[n] : distance; search (n, x, 1, dis_inner, labels); @@ -45,7 +45,10 @@ void Index::assign (idx_t n, const float *x, idx_t *labels, float *distance) } } -void Index::add_with_ids(idx_t n, const float* x, const idx_t* xids) { +void Index::add_with_ids( + idx_t /*n*/, + const float* /*x*/, + const idx_t* /*xids*/) { FAISS_THROW_MSG ("add_with_ids not implemented for this type of index"); } diff --git a/core/src/index/thirdparty/faiss/Index.h b/core/src/index/thirdparty/faiss/Index.h index 1d461d2d41..829d2a600b 100644 --- a/core/src/index/thirdparty/faiss/Index.h +++ b/core/src/index/thirdparty/faiss/Index.h @@ -10,17 +10,16 @@ #ifndef FAISS_INDEX_H #define FAISS_INDEX_H - +#include +#include #include #include #include #include -#include - #define FAISS_VERSION_MAJOR 1 #define FAISS_VERSION_MINOR 6 -#define FAISS_VERSION_PATCH 0 +#define FAISS_VERSION_PATCH 3 /** * @namespace faiss @@ -41,39 +40,15 @@ namespace faiss { - -/// Some algorithms support both an inner product version and a L2 search version. 
-enum MetricType { - METRIC_INNER_PRODUCT = 0, ///< maximum inner product search - METRIC_L2 = 1, ///< squared L2 search - METRIC_L1, ///< L1 (aka cityblock) - METRIC_Linf, ///< infinity distance - METRIC_Lp, ///< L_p distance, p is given by metric_arg - METRIC_Jaccard, - METRIC_Tanimoto, - METRIC_Hamming, - METRIC_Substructure, ///< Tversky case alpha = 0, beta = 1 - METRIC_Superstructure, ///< Tversky case alpha = 1, beta = 0 - - /// some additional metrics defined in scipy.spatial.distance - METRIC_Canberra = 20, - METRIC_BrayCurtis, - METRIC_JensenShannon, - -}; - - /// Forward declarations see AuxIndexStructures.h struct IDSelector; struct RangeSearchResult; struct DistanceComputer; -/** Abstract structure for an index +/** Abstract structure for an index, supports adding vectors and searching them. * - * Supports adding vertices and searching them. - * - * Currently only asymmetric queries are supported: - * database-to-database queries are not implemented. + * All vectors provided at add or search time are 32-bit float arrays, + * although the internal representation may vary. */ struct Index { using idx_t = int64_t; ///< all indices are this type @@ -138,7 +113,8 @@ struct Index { * @param distances output pairwise distances, size n*k * @param bitset flags to check the validity of vectors */ - virtual void search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, + virtual void search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const = 0; /** query n raw vectors from the index by ids. @@ -162,8 +138,8 @@ struct Index { * @param distances output pairwise distances, size n*k * @param bitset flags to check the validity of vectors */ - virtual void search_by_id (idx_t n, const idx_t *xid, idx_t k, float *distances, idx_t *labels, - ConcurrentBitsetPtr bitset = nullptr); + virtual void search_by_id (idx_t n, const idx_t *xid, idx_t k, float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr); /** query n vectors of dimension d to the index. * @@ -185,7 +161,7 @@ struct Index { * @param x input vectors to search, size n * d * @param labels output labels of the NNs, size n */ - virtual void assign (idx_t n, const float *x, idx_t *labels, float *distance = nullptr); + virtual void assign (idx_t n, const float* x, idx_t* labels, float* distance = nullptr); /// removes all elements from the database. 
virtual void reset() = 0; diff --git a/core/src/index/thirdparty/faiss/Index2Layer.cpp b/core/src/index/thirdparty/faiss/Index2Layer.cpp index ca2d9ea959..cbdfd75426 100644 --- a/core/src/index/thirdparty/faiss/Index2Layer.cpp +++ b/core/src/index/thirdparty/faiss/Index2Layer.cpp @@ -42,7 +42,6 @@ namespace faiss { -using idx_t = Index::idx_t; /************************************* * Index2Layer implementation @@ -167,7 +166,7 @@ void Index2Layer::search( idx_t /*k*/, float* /*distances*/, idx_t* /*labels*/, - ConcurrentBitsetPtr bitset) const { + ConcurrentBitsetPtr) const { FAISS_THROW_MSG("not implemented"); } diff --git a/core/src/index/thirdparty/faiss/IndexBinary.h b/core/src/index/thirdparty/faiss/IndexBinary.h index 41200a127e..86a45668f1 100644 --- a/core/src/index/thirdparty/faiss/IndexBinary.h +++ b/core/src/index/thirdparty/faiss/IndexBinary.h @@ -95,10 +95,11 @@ struct IndexBinary { * @param distances output pairwise distances, size n*k * @param bitset flags to check the validity of vectors */ - virtual void search (idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels, - ConcurrentBitsetPtr bitset = nullptr) const = 0; + virtual void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr) const = 0; - /** query n raw vectors from the index by ids. + /** Query n raw vectors from the index by ids. * * return n raw vectors. * @@ -122,12 +123,15 @@ struct IndexBinary { virtual void search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr); - - /** Query n vectors of dimension d to the index. + /** Query n vectors of dimension d to the index. * - * return all vectors with distance < radius. Note that many - * indexes do not implement the range_search (only the k-NN search - * is mandatory). + * return all vectors with distance < radius. Note that many indexes + * do not implement the range_search (only the k-NN search is + * mandatory). The distances are converted to float to reuse the + * RangeSearchResult structure, but they are integer. By convention, + * only distances < radius (strict comparison) are returned, + * ie. radius = 0 does not return any result and 1 returns only + * exact same vectors. 
* * @param x input vectors to search, size n * d / 8 * @param radius search radius diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp b/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp index 3b1a902b84..f301376cb7 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp +++ b/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp @@ -39,7 +39,8 @@ void IndexBinaryFlat::reset() { } void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const { + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const { const idx_t block_size = query_batch_size; if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) { float *D = reinterpret_cast(distances); @@ -63,7 +64,6 @@ void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k, D[i] = -log2(1-D[i]); } } - } else if (metric_type == METRIC_Substructure || metric_type == METRIC_Superstructure) { float *D = reinterpret_cast(distances); for (idx_t s = 0; s < n; s += block_size) { @@ -76,7 +76,6 @@ void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k, binary_distence_knn_mc(metric_type, x + s * code_size, xb.data(), nn, ntotal, k, code_size, D + s * k, labels + s * k, bitset); } - } else { for (idx_t s = 0; s < n; s += block_size) { idx_t nn = block_size; @@ -123,5 +122,11 @@ void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const { memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size); } +void IndexBinaryFlat::range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result, + ConcurrentBitsetPtr bitset) const +{ + hamming_range_search (x, xb.data(), n, ntotal, radius, code_size, result); +} } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFlat.h b/core/src/index/thirdparty/faiss/IndexBinaryFlat.h index 66a37d5aa2..012b9b43f4 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryFlat.h +++ b/core/src/index/thirdparty/faiss/IndexBinaryFlat.h @@ -37,8 +37,13 @@ struct IndexBinaryFlat : IndexBinary { void reset() override; - void search (idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr) const override; + + void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result, + ConcurrentBitsetPtr bitset = nullptr) const override; void reconstruct(idx_t key, uint8_t *recons) const override; diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp index 35b50c3e29..67bd9a28dc 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp +++ b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp @@ -50,7 +50,8 @@ void IndexBinaryFromFloat::reset() { } void IndexBinaryFromFloat::search(idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const { + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const { constexpr idx_t bs = 32768; std::unique_ptr xf(new float[bs * d]); std::unique_ptr df(new float[bs * k]); diff --git a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h index b2388d5a64..b630c832e4 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h +++ b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h @@ -41,7 
+41,8 @@ struct IndexBinaryFromFloat : IndexBinary { void reset() override; void search(idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr) const override; void train(idx_t n, const uint8_t *x) override; }; diff --git a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp index 8645813df6..87234e4aac 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp +++ b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp @@ -196,7 +196,8 @@ void IndexBinaryHNSW::train(idx_t n, const uint8_t *x) } void IndexBinaryHNSW::search(idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const { #pragma omp parallel { diff --git a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h index deb5b94a90..be10fee692 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h +++ b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h @@ -45,7 +45,8 @@ struct IndexBinaryHNSW : IndexBinary { /// entry point for search void search(idx_t n, const uint8_t *x, idx_t k, - int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr) const override; void reconstruct(idx_t key, uint8_t* recons) const override; diff --git a/core/src/index/thirdparty/faiss/IndexBinaryHash.cpp b/core/src/index/thirdparty/faiss/IndexBinaryHash.cpp new file mode 100644 index 0000000000..008da09455 --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryHash.cpp @@ -0,0 +1,496 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c++ -*- + +#include + +#include +#include + +#include +#include + +#include +#include + + +namespace faiss { + +void IndexBinaryHash::InvertedList::add ( + idx_t id, size_t code_size, const uint8_t *code) +{ + ids.push_back(id); + vecs.insert(vecs.end(), code, code + code_size); +} + +IndexBinaryHash::IndexBinaryHash(int d, int b): + IndexBinary(d), b(b), nflip(0) +{ + is_trained = true; +} + +IndexBinaryHash::IndexBinaryHash(): b(0), nflip(0) +{ + is_trained = true; +} + +void IndexBinaryHash::reset() +{ + invlists.clear(); + ntotal = 0; +} + + +void IndexBinaryHash::add(idx_t n, const uint8_t *x) +{ + add_with_ids(n, x, nullptr); +} + +void IndexBinaryHash::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) +{ + uint64_t mask = ((uint64_t)1 << b) - 1; + // simplistic add function. Cannot really be parallelized. + + for (idx_t i = 0; i < n; i++) { + idx_t id = xids ? 
xids[i] : ntotal + i; + const uint8_t * xi = x + i * code_size; + idx_t hash = *((uint64_t*)xi) & mask; + invlists[hash].add(id, code_size, xi); + } + ntotal += n; +} + +namespace { + + +/** Enumerate all bit vectors of size nbit with up to maxflip 1s + * test in P127257851 P127258235 + */ +struct FlipEnumerator { + int nbit, nflip, maxflip; + uint64_t mask, x; + + FlipEnumerator (int nbit, int maxflip): nbit(nbit), maxflip(maxflip) { + nflip = 0; + mask = 0; + x = 0; + } + + bool next() { + if (x == mask) { + if (nflip == maxflip) { + return false; + } + // increase Hamming radius + nflip++; + mask = (((uint64_t)1 << nflip) - 1); + x = mask << (nbit - nflip); + return true; + } + + int i = __builtin_ctzll(x); + + if (i > 0) { + x ^= (uint64_t)3 << (i - 1); + } else { + // nb of LSB 1s + int n1 = __builtin_ctzll(~x); + // clear them + x &= ((uint64_t)(-1) << n1); + int n2 = __builtin_ctzll(x); + x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1); + } + return true; + } + +}; + +using idx_t = Index::idx_t; + + +struct RangeSearchResults { + int radius; + RangeQueryResult &qres; + + inline void add (float dis, idx_t id) { + if (dis < radius) { + qres.add (dis, id); + } + } + +}; + +struct KnnSearchResults { + // heap params + idx_t k; + int32_t * heap_sim; + idx_t * heap_ids; + + using C = CMax; + + inline void add (float dis, idx_t id) { + if (dis < heap_sim[0]) { + heap_pop (k, heap_sim, heap_ids); + heap_push (k, heap_sim, heap_ids, dis, id); + } + } + +}; + +template +void +search_single_query_template(const IndexBinaryHash & index, const uint8_t *q, + SearchResults &res, + size_t &n0, size_t &nlist, size_t &ndis) +{ + size_t code_size = index.code_size; + uint64_t mask = ((uint64_t)1 << index.b) - 1; + uint64_t qhash = *((uint64_t*)q) & mask; + HammingComputer hc (q, code_size); + FlipEnumerator fe(index.b, index.nflip); + + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = index.invlists.find (hash); + + if (it == index.invlists.end()) { + continue; + } + + const IndexBinaryHash::InvertedList &il = it->second; + + size_t nv = il.ids.size(); + + if (nv == 0) { + n0++; + } else { + const uint8_t *codes = il.vecs.data(); + for (size_t i = 0; i < nv; i++) { + int dis = hc.hamming (codes); + res.add(dis, il.ids[i]); + codes += code_size; + } + ndis += nv; + nlist++; + } + } while(fe.next()); +} + +template +void +search_single_query(const IndexBinaryHash & index, const uint8_t *q, + SearchResults &res, + size_t &n0, size_t &nlist, size_t &ndis) +{ +#define HC(name) search_single_query_template(index, q, res, n0, nlist, ndis); + switch(index.code_size) { + case 4: HC(HammingComputer4); break; + case 8: HC(HammingComputer8); break; + case 16: HC(HammingComputer16); break; + case 20: HC(HammingComputer20); break; + case 32: HC(HammingComputer32); break; + default: + if (index.code_size % 8 == 0) { + HC(HammingComputerM8); + } else { + HC(HammingComputerDefault); + } + } +#undef HC +} + + +} // anonymous namespace + + + +void IndexBinaryHash::range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result, + ConcurrentBitsetPtr bitset) const +{ + + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist) + { + RangeSearchPartialResult pres (result); + +#pragma omp for + for (size_t i = 0; i < n; i++) { // loop queries + RangeQueryResult & qres = pres.new_result (i); + RangeSearchResults res = {radius, qres}; + const uint8_t *q = x + i * code_size; + + search_single_query 
(*this, q, res, n0, nlist, ndis); + + } + pres.finalize (); + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +void IndexBinaryHash::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const +{ + + using HeapForL2 = CMax; + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0) + for (size_t i = 0; i < n; i++) { + int32_t * simi = distances + k * i; + idx_t * idxi = labels + k * i; + + heap_heapify (k, simi, idxi); + KnnSearchResults res = {k, simi, idxi}; + const uint8_t *q = x + i * code_size; + + search_single_query (*this, q, res, n0, nlist, ndis); + + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +size_t IndexBinaryHash::hashtable_size() const +{ + return invlists.size(); +} + + +void IndexBinaryHash::display() const +{ + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + printf("%ld: [", it->first); + const std::vector & v = it->second.ids; + for (auto x: v) { + printf("%ld ", 0 + x); + } + printf("]\n"); + + } +} + + +void IndexBinaryHashStats::reset() +{ + memset ((void*)this, 0, sizeof (*this)); +} + +IndexBinaryHashStats indexBinaryHash_stats; + +/******************************************************* + * IndexBinaryMultiHash implementation + ******************************************************/ + + +IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b): + IndexBinary(d), + storage(new IndexBinaryFlat(d)), own_fields(true), + maps(nhash), nhash(nhash), b(b), nflip(0) +{ + FAISS_THROW_IF_NOT(nhash * b <= d); +} + +IndexBinaryMultiHash::IndexBinaryMultiHash(): + storage(nullptr), own_fields(true), + nhash(0), b(0), nflip(0) +{} + +IndexBinaryMultiHash::~IndexBinaryMultiHash() +{ + if (own_fields) { + delete storage; + } +} + + +void IndexBinaryMultiHash::reset() +{ + storage->reset(); + ntotal = 0; + for(auto map: maps) { + map.clear(); + } +} + +void IndexBinaryMultiHash::add(idx_t n, const uint8_t *x) +{ + storage->add(n, x); + // populate maps + uint64_t mask = ((uint64_t)1 << b) - 1; + + for(idx_t i = 0; i < n; i++) { + const uint8_t *xi = x + i * code_size; + int ho = 0; + for(int h = 0; h < nhash; h++) { + uint64_t hash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7); + hash &= mask; + maps[h][hash].push_back(i + ntotal); + ho += b; + } + } + ntotal += n; +} + + +namespace { + +template +static +void verify_shortlist( + const IndexBinaryFlat & index, + const uint8_t * q, + const std::unordered_set & shortlist, + SearchResults &res) +{ + size_t code_size = index.code_size; + size_t nlist = 0, ndis = 0, n0 = 0; + + HammingComputer hc (q, code_size); + const uint8_t *codes = index.xb.data(); + + for (auto i: shortlist) { + int dis = hc.hamming (codes + i * code_size); + res.add(dis, i); + } +} + +template +void +search_1_query_multihash(const IndexBinaryMultiHash & index, const uint8_t *xi, + SearchResults &res, + size_t &n0, size_t &nlist, size_t &ndis) +{ + + std::unordered_set shortlist; + int b = index.b; + uint64_t mask = ((uint64_t)1 << b) - 1; + + int ho = 0; + for(int h = 0; h < index.nhash; h++) { + uint64_t qhash = *(uint64_t*)(xi + (ho >> 3)) >> (ho & 7); + qhash &= mask; + const IndexBinaryMultiHash::Map & map = index.maps[h]; + + FlipEnumerator fe(index.b, index.nflip); + // loop over neighbors that are at most at nflip bits + 
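---- note ----------------------------------------------------------------
The do/while loop just below probes every bucket whose hash differs from
the query hash in at most nflip bits: FlipEnumerator (defined earlier in
this file) yields the XOR masks in order of increasing popcount, and
qhash ^ fe.x walks the corresponding buckets. A minimal standalone sketch
of that enumeration, with the struct trimmed from the code above and a
purely illustrative main() (the example values are made up):

    #include <cstdint>
    #include <cstdio>

    // Trimmed copy of the FlipEnumerator above: enumerates all
    // nbit-wide masks with popcount 0, 1, ..., maxflip.
    struct FlipEnumerator {
        int nbit, nflip, maxflip;
        uint64_t mask, x;
        FlipEnumerator(int nbit, int maxflip): nbit(nbit), maxflip(maxflip) {
            nflip = 0; mask = 0; x = 0;
        }
        bool next() {
            if (x == mask) {
                if (nflip == maxflip) return false;
                nflip++;                           // increase Hamming radius
                mask = ((uint64_t)1 << nflip) - 1;
                x = mask << (nbit - nflip);
                return true;
            }
            int i = __builtin_ctzll(x);
            if (i > 0) {
                x ^= (uint64_t)3 << (i - 1);       // slide lowest 1 down
            } else {
                int n1 = __builtin_ctzll(~x);      // nb of LSB 1s
                x &= ((uint64_t)(-1) << n1);       // clear them
                int n2 = __builtin_ctzll(x);
                x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);
            }
            return true;
        }
    };

    int main() {
        uint64_t qhash = 0x5;     // hypothetical query hash, b = 4 bits
        FlipEnumerator fe(4, 2);  // probe everything within 2 bit flips
        do {
            printf("probe bucket %llx\n", (unsigned long long)(qhash ^ fe.x));
        } while (fe.next());
        // prints bucket 5 first, then the 4 buckets at Hamming distance 1
        // (d, 1, 7, 4), then the 6 at distance 2
    }

For b = 4 and nflip = 2 this visits 1 + C(4,1) + C(4,2) = 11 buckets; the
bucket count grows combinatorially with nflip, which is why nflip is kept
small in practice.
--------------------------------------------------------------------------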
do { + uint64_t hash = qhash ^ fe.x; + auto it = map.find (hash); + + if (it != map.end()) { + const std::vector & v = it->second; + for (auto i: v) { + shortlist.insert(i); + } + nlist++; + } else { + n0++; + } + } while(fe.next()); + + ho += b; + } + ndis += shortlist.size(); + + // verify shortlist + +#define HC(name) verify_shortlist (*index.storage, xi, shortlist, res) + switch(index.code_size) { + case 4: HC(HammingComputer4); break; + case 8: HC(HammingComputer8); break; + case 16: HC(HammingComputer16); break; + case 20: HC(HammingComputer20); break; + case 32: HC(HammingComputer32); break; + default: + if (index.code_size % 8 == 0) { + HC(HammingComputerM8); + } else { + HC(HammingComputerDefault); + } + } +#undef HC +} + +} // anonymous namespace + +void IndexBinaryMultiHash::range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result, + ConcurrentBitsetPtr bitset) const +{ + + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist) + { + RangeSearchPartialResult pres (result); + +#pragma omp for + for (size_t i = 0; i < n; i++) { // loop queries + RangeQueryResult & qres = pres.new_result (i); + RangeSearchResults res = {radius, qres}; + const uint8_t *q = x + i * code_size; + + search_1_query_multihash (*this, q, res, n0, nlist, ndis); + + } + pres.finalize (); + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +void IndexBinaryMultiHash::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const +{ + + using HeapForL2 = CMax; + size_t nlist = 0, ndis = 0, n0 = 0; + +#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0) + for (size_t i = 0; i < n; i++) { + int32_t * simi = distances + k * i; + idx_t * idxi = labels + k * i; + + heap_heapify (k, simi, idxi); + KnnSearchResults res = {k, simi, idxi}; + const uint8_t *q = x + i * code_size; + + search_1_query_multihash (*this, q, res, n0, nlist, ndis); + + } + indexBinaryHash_stats.nq += n; + indexBinaryHash_stats.n0 += n0; + indexBinaryHash_stats.nlist += nlist; + indexBinaryHash_stats.ndis += ndis; +} + +size_t IndexBinaryMultiHash::hashtable_size() const +{ + size_t tot = 0; + for (auto map: maps) { + tot += map.size(); + } + + return tot; +} + + +} diff --git a/core/src/index/thirdparty/faiss/IndexBinaryHash.h b/core/src/index/thirdparty/faiss/IndexBinaryHash.h new file mode 100644 index 0000000000..5dbcad626d --- /dev/null +++ b/core/src/index/thirdparty/faiss/IndexBinaryHash.h @@ -0,0 +1,120 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+// -*- c++ -*-
+
+#ifndef FAISS_BINARY_HASH_H
+#define FAISS_BINARY_HASH_H
+
+
+
+#include
+#include
+
+#include
+#include
+#include
+
+
+namespace faiss {
+
+struct RangeSearchResult;
+
+
+/** just uses the b first bits as a hash value */
+struct IndexBinaryHash : IndexBinary {
+
+    struct InvertedList {
+        std::vector<idx_t> ids;
+        std::vector<uint8_t> vecs;
+
+        void add (idx_t id, size_t code_size, const uint8_t *code);
+    };
+
+    using InvertedListMap = std::unordered_map<uint64_t, InvertedList>;
+    InvertedListMap invlists;
+
+    int b, nflip;
+
+    IndexBinaryHash(int d, int b);
+
+    IndexBinaryHash();
+
+    void reset() override;
+
+    void add(idx_t n, const uint8_t *x) override;
+
+    void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result,
+                      ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels,
+                ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    void display() const;
+    size_t hashtable_size() const;
+
+};
+
+struct IndexBinaryHashStats {
+    size_t nq;       // nb of queries run
+    size_t n0;       // nb of empty lists
+    size_t nlist;    // nb of non-empty inverted lists scanned
+    size_t ndis;     // nb of distances computed
+
+    IndexBinaryHashStats () {reset (); }
+    void reset ();
+};
+
+extern IndexBinaryHashStats indexBinaryHash_stats;
+
+
+/** hashes the vector into nhash slices of b bits each, with one hash
+ *  table per slice */
+struct IndexBinaryMultiHash: IndexBinary {
+
+    // where the vectors are actually stored
+    IndexBinaryFlat *storage;
+    bool own_fields;
+
+    // maps hash values to the ids that hash to them
+    using Map = std::unordered_map<uint64_t, std::vector<idx_t> >;
+
+    // the different hashes, size nhash
+    std::vector<Map> maps;
+
+    int nhash;  ///< nb of hash maps
+    int b;      ///< nb bits per hash map
+    int nflip;  ///< nb bit flips to use at search time
+
+    IndexBinaryMultiHash(int d, int nhash, int b);
+
+    IndexBinaryMultiHash();
+
+    ~IndexBinaryMultiHash();
+
+    void reset() override;
+
+    void add(idx_t n, const uint8_t *x) override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result,
+                      ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels,
+                ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    size_t hashtable_size() const;
+
+};
+
+}
+
+#endif
diff --git a/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp b/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp
index f853933877..14eec85da2 100644
--- a/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp
+++ b/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp
@@ -12,17 +12,20 @@
 
 #include
 #include
 
-#include
-#include
 #include
+#include
+#include
+
+#include
 #include
 #include
 #include
 #include
-
 #include
 #include
+#include
+#include
 
 namespace faiss {
 
@@ -33,7 +36,6 @@ IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist)
       own_invlists(true),
       nprobe(1),
       max_codes(0),
-      maintain_direct_map(false),
       quantizer(quantizer),
       nlist(nlist),
       own_fields(false),
@@ -51,7 +53,6 @@ IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist, M
      own_invlists(true),
      nprobe(1),
      max_codes(0),
-      maintain_direct_map(false),
      quantizer(quantizer),
      nlist(nlist),
      own_fields(false),
@@ -68,7 +69,6 @@ IndexBinaryIVF::IndexBinaryIVF()
      own_invlists(false),
      nprobe(1),
      max_codes(0),
-      maintain_direct_map(false),
      quantizer(nullptr),
      nlist(0),
      own_fields(false),
@@ -87,8 +87,7 @@ void IndexBinaryIVF::add_core(idx_t n, const
uint8_t *x, const idx_t *xids, const idx_t *precomputed_idx) { FAISS_THROW_IF_NOT(is_trained); assert(invlists); - FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids), - "cannot have direct map and add with ids"); + direct_map.check_can_add (xids); const idx_t * idx; @@ -107,13 +106,15 @@ void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids, idx_t id = xids ? xids[i] : ntotal + i; idx_t list_no = idx[i]; - if (list_no < 0) - continue; - const uint8_t *xi = x + i * code_size; - size_t offset = invlists->add_entry(list_no, id, xi); + if (list_no < 0) { + direct_map.add_single_id (id, -1, 0); + } else { + const uint8_t *xi = x + i * code_size; + size_t offset = invlists->add_entry(list_no, id, xi); + + direct_map.add_single_id (id, list_no, offset); + } - if (maintain_direct_map) - direct_map.push_back(list_no << 32 | offset); n_add++; } if (verbose) { @@ -123,30 +124,23 @@ void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids, ntotal += n_add; } -void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) { - // nothing to do - if (new_maintain_direct_map == maintain_direct_map) - return; - - if (new_maintain_direct_map) { - direct_map.resize(ntotal, -1); - for (size_t key = 0; key < nlist; key++) { - size_t list_size = invlists->list_size(key); - const idx_t *idlist = invlists->get_ids(key); - - for (size_t ofs = 0; ofs < list_size; ofs++) { - FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal, - "direct map supported only for seuquential ids"); - direct_map[idlist[ofs]] = key << 32 | ofs; - } +void IndexBinaryIVF::make_direct_map (bool b) +{ + if (b) { + direct_map.set_type (DirectMap::Array, invlists, ntotal); + } else { + direct_map.set_type (DirectMap::NoMap, invlists, ntotal); } - } else { - direct_map.clear(); - } - maintain_direct_map = new_maintain_direct_map; } -void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels, +void IndexBinaryIVF::set_direct_map_type (DirectMap::Type type) +{ + direct_map.set_type (type, invlists, ntotal); +} + + +void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const { std::unique_ptr idx(new idx_t[n * nprobe]); std::unique_ptr coarse_dis(new int32_t[n * nprobe]); @@ -164,10 +158,7 @@ void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, int32_t *distanc } void IndexBinaryIVF::get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, ConcurrentBitsetPtr bitset) { - - if (!maintain_direct_map) { - make_direct_map(true); - } + make_direct_map(true); /* only get vector by 1 id */ FAISS_ASSERT(n == 1); @@ -180,9 +171,7 @@ void IndexBinaryIVF::get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, Con void IndexBinaryIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) { - if (!maintain_direct_map) { - make_direct_map(true); - } + make_direct_map(true); auto x = new uint8_t[n * d]; for (idx_t i = 0; i < n; ++i) { @@ -194,11 +183,8 @@ void IndexBinaryIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t * } void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const { - FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal, - "direct map is not initialized"); - idx_t list_no = direct_map[key] >> 32; - idx_t offset = direct_map[key] & 0xffffffff; - reconstruct_from_offset(list_no, offset, recons); + idx_t lo = direct_map.get (key); + reconstruct_from_offset (lo_listno(lo), lo_offset(lo), 
recons); } void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const { @@ -267,39 +253,9 @@ void IndexBinaryIVF::reset() { } size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) { - FAISS_THROW_IF_NOT_MSG(!maintain_direct_map, - "direct map remove not implemented"); - - std::vector toremove(nlist); - -#pragma omp parallel for - for (idx_t i = 0; i < nlist; i++) { - idx_t l0 = invlists->list_size (i), l = l0, j = 0; - const idx_t *idsi = invlists->get_ids(i); - while (j < l) { - if (sel.is_member(idsi[j])) { - l--; - invlists->update_entry( - i, j, - invlists->get_single_id(i, l), - invlists->get_single_code(i, l)); - } else { - j++; - } - } - toremove[i] = l0 - l; - } - // this will not run well in parallel on ondisk because of possible shrinks - size_t nremove = 0; - for (idx_t i = 0; i < nlist; i++) { - if (toremove[i] > 0) { - nremove += toremove[i]; - invlists->resize( - i, invlists->list_size(i) - toremove[i]); - } - } - ntotal -= nremove; - return nremove; + size_t nremove = direct_map.remove_ids (sel, invlists); + ntotal -= nremove; + return nremove; } void IndexBinaryIVF::train(idx_t n, const uint8_t *x) { @@ -319,9 +275,6 @@ void IndexBinaryIVF::train(idx_t n, const uint8_t *x) { Clustering clus(d, nlist, cp); quantizer->reset(); - std::unique_ptr x_f(new float[n * d]); - binary_to_real(n * d, x, x_f.get()); - IndexFlat index_tmp; if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) { @@ -338,8 +291,12 @@ void IndexBinaryIVF::train(idx_t n, const uint8_t *x) { clustering_index->d); } - clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp); + // LSH codec that is able to convert the binary vectors to floats. + IndexLSH codec(d, d, false, false); + clus.train_encoded (n, x, &codec, clustering_index ? *clustering_index : index_tmp); + + // convert clusters to binary std::unique_ptr x_b(new uint8_t[clus.k * code_size]); real_to_binary(d * clus.k, clus.centroids.data(), x_b.get()); @@ -355,8 +312,7 @@ void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) { FAISS_THROW_IF_NOT(other.d == d); FAISS_THROW_IF_NOT(other.nlist == nlist); FAISS_THROW_IF_NOT(other.code_size == code_size); - FAISS_THROW_IF_NOT_MSG((!maintain_direct_map && - !other.maintain_direct_map), + FAISS_THROW_IF_NOT_MSG(direct_map.no() && other.direct_map.no(), "direct map copy not implemented"); FAISS_THROW_IF_NOT_MSG(typeid (*this) == typeid (other), "can only merge indexes of the same type"); @@ -383,13 +339,15 @@ namespace { using idx_t = Index::idx_t; -template +template struct IVFBinaryScannerL2: BinaryInvertedListScanner { HammingComputer hc; size_t code_size; + bool store_pairs; - IVFBinaryScannerL2 (size_t code_size): code_size (code_size) + IVFBinaryScannerL2 (size_t code_size, bool store_pairs): + code_size (code_size), store_pairs(store_pairs) {} void set_query (const uint8_t *query_vector) override { @@ -418,7 +376,6 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner { for (size_t j = 0; j < n; j++) { if (!bitset || !bitset->test(ids[j])) { uint32_t dis = hc.hamming (codes); - if (dis < simi[0]) { idx_t id = store_pairs ? 
(list_no << 32 | j) : ids[j]; heap_swap_top (k, simi, idxi, dis, id); @@ -430,12 +387,26 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner { return nup; } - + void scan_codes_range (size_t n, + const uint8_t *codes, + const idx_t *ids, + int radius, + RangeQueryResult &result) const + { + size_t nup = 0; + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming (codes); + if (dis < radius) { + int64_t id = store_pairs ? lo_build (list_no, j) : ids[j]; + result.add (dis, id); + } + codes += code_size; + } + } }; template struct IVFBinaryScannerJaccard: BinaryInvertedListScanner { - DistanceComputer hc; size_t code_size; @@ -478,35 +449,11 @@ struct IVFBinaryScannerJaccard: BinaryInvertedListScanner { } return nup; } - }; template BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) { - switch (code_size) { -#define HANDLE_CS(cs) \ - case cs: \ - return new IVFBinaryScannerL2 (cs); - HANDLE_CS(4); - HANDLE_CS(8); - HANDLE_CS(16); - HANDLE_CS(20); - HANDLE_CS(32); - HANDLE_CS(64); -#undef HANDLE_CS - default: - if (code_size % 8 == 0) { - return new IVFBinaryScannerL2 (code_size); - } else if (code_size % 4 == 0) { - return new IVFBinaryScannerL2 (code_size); - } else { - return new IVFBinaryScannerL2 (code_size); - } - } } template @@ -703,7 +650,6 @@ void search_knn_binary_dis_heap(const IndexBinaryIVF& ivf, indexIVF_stats.nlist += nlistv; indexIVF_stats.ndis += ndis; indexIVF_stats.nheap_updates += nheap; - } template @@ -763,12 +709,11 @@ void search_knn_hamming_count(const IndexBinaryIVF& ivf, : ivf.invlists->get_ids(key); for (size_t j = 0; j < list_size; j++) { - if(!bitset || !bitset->test(ids[j])){ - const uint8_t * yj = list_vecs + ivf.code_size * j; - - idx_t id = store_pairs ? (key << 32 | j) : ids[j]; - csi.update_counter(yj, id); - } + if (!bitset || !bitset->test(ids[j])) { + const uint8_t *yj = list_vecs + ivf.code_size * j; + idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; + csi.update_counter(yj, id); + } } if (ids) ivf.invlists->release_ids (key, ids); @@ -816,7 +761,7 @@ void search_knn_hamming_count_1 ( #define HANDLE_CS(cs) \ case cs: \ search_knn_hamming_count( \ - ivf, nx, x, keys, k, distances, labels, params, bitset); \ + ivf, nx, x, keys, k, distances, labels, params, bitset); \ break; HANDLE_CS(4); HANDLE_CS(8); @@ -838,7 +783,6 @@ void search_knn_hamming_count_1 ( } break; } - } } // namespace @@ -846,25 +790,26 @@ void search_knn_hamming_count_1 ( BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner (bool store_pairs) const { - switch (metric_type) { - case METRIC_Jaccard: - case METRIC_Tanimoto: - if (store_pairs) { - return select_IVFBinaryScannerJaccard (code_size); - } else { - return select_IVFBinaryScannerJaccard (code_size); - } - case METRIC_Substructure: - case METRIC_Superstructure: - // unsupported - return nullptr; + +#define HC(name) return new IVFBinaryScannerL2 (code_size, store_pairs) + switch (code_size) { + case 4: HC(HammingComputer4); + case 8: HC(HammingComputer8); + case 16: HC(HammingComputer16); + case 20: HC(HammingComputer20); + case 32: HC(HammingComputer32); + case 64: HC(HammingComputer64); default: - if (store_pairs) { - return select_IVFBinaryScannerL2(code_size); + if (code_size % 8 == 0) { + HC(HammingComputerM8); + } else if (code_size % 4 == 0) { + HC(HammingComputerM4); } else { - return select_IVFBinaryScannerL2(code_size); + HC(HammingComputerDefault); } } +#undef HC + } void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k, @@ -875,7 +820,6 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k, const IVFSearchParameters *params, ConcurrentBitsetPtr bitset ) const { - if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) { if (use_heap) { float *D = new float[k * n]; @@ -914,6 +858,83 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k, } } +void IndexBinaryIVF::range_search( + idx_t n, const uint8_t *x, int radius, + RangeSearchResult *res, + ConcurrentBitsetPtr bitset) const +{ + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new int32_t[n * nprobe]); + + double t0 = getmillisecs(); + quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + indexIVF_stats.quantization_time += getmillisecs() - t0; + + t0 = getmillisecs(); + invlists->prefetch_lists(idx.get(), n * nprobe); + + bool store_pairs = false; + size_t nlistv = 0, ndis = 0; + + std::vector all_pres (omp_get_max_threads()); + +#pragma omp parallel reduction(+: nlistv, ndis) + { + RangeSearchPartialResult pres(res); + std::unique_ptr scanner + (get_InvertedListScanner(store_pairs)); + FAISS_THROW_IF_NOT (scanner.get ()); + + all_pres[omp_get_thread_num()] = &pres; + + auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres) + { + + idx_t key = idx[i * nprobe + ik]; /* select the list */ + if (key < 0) return; + FAISS_THROW_IF_NOT_FMT ( + key < (idx_t) nlist, + "Invalid key=%ld at ik=%ld nlist=%ld\n", + key, ik, nlist); + const size_t list_size = invlists->list_size(key); + + if (list_size == 0) return; + + InvertedLists::ScopedCodes scodes (invlists, key); + InvertedLists::ScopedIds ids (invlists, key); + + scanner->set_list (key, coarse_dis[i * nprobe + ik]); + nlistv++; + ndis += list_size; + scanner->scan_codes_range (list_size, scodes.get(), + ids.get(), radius, qres); + }; + +#pragma omp for + for (size_t i = 0; i < n; i++) { + scanner->set_query (x + i * code_size); + + 
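---- note ----------------------------------------------------------------
The scanner created above is specialized on code_size: the switch in
get_InvertedListScanner picks HammingComputer4/8/16/20/32/64 for common
code sizes and falls back to the M8/M4/default variants otherwise. All
variants compute the same quantity, the Hamming distance between packed
binary codes; a generic sketch of that computation (not the faiss
implementation, which fixes code_size at compile time per variant):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Hamming distance between two binary codes of code_size bytes:
    // XOR then popcount, 8 bytes at a time, then the trailing bytes.
    int hamming_generic(const uint8_t *a, const uint8_t *b, size_t code_size) {
        int dis = 0;
        size_t i = 0;
        for (; i + 8 <= code_size; i += 8) {
            uint64_t wa, wb;
            memcpy(&wa, a + i, 8);
            memcpy(&wb, b + i, 8);
            dis += __builtin_popcountll(wa ^ wb);
        }
        for (; i < code_size; i++) {
            dis += __builtin_popcount((unsigned)(a[i] ^ b[i]));
        }
        return dis;
    }

    int main() {
        uint8_t a[16] = {0xff, 0x0f}, b[16] = {0x0f, 0x0f};
        printf("%d\n", hamming_generic(a, b, 16));  // prints 4
    }

Knowing code_size at compile time lets the compiler fully unroll the
loop, which matters because this runs once per stored code.
--------------------------------------------------------------------------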
RangeQueryResult & qres = pres.new_result (i); + + for (size_t ik = 0; ik < nprobe; ik++) { + scan_list_func (i, ik, qres); + } + + } + + pres.finalize(); + + } + indexIVF_stats.nq += n; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; + indexIVF_stats.search_time += getmillisecs() - t0; + +} + + + + IndexBinaryIVF::~IndexBinaryIVF() { if (own_invlists) { delete invlists; diff --git a/core/src/index/thirdparty/faiss/IndexBinaryIVF.h b/core/src/index/thirdparty/faiss/IndexBinaryIVF.h index 2662ab4879..3a45b82ff6 100644 --- a/core/src/index/thirdparty/faiss/IndexBinaryIVF.h +++ b/core/src/index/thirdparty/faiss/IndexBinaryIVF.h @@ -46,8 +46,7 @@ struct IndexBinaryIVF : IndexBinary { bool use_heap = true; /// map for direct access to the elements. Enables reconstruct(). - bool maintain_direct_map; - std::vector direct_map; + DirectMap direct_map; IndexBinary *quantizer; ///< quantizer that maps vectors to inverted lists size_t nlist; ///< number of possible key values @@ -113,8 +112,8 @@ struct IndexBinaryIVF : IndexBinary { bool store_pairs=false) const; /** assign the vectors, then call search_preassign */ - void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels, - ConcurrentBitsetPtr bitset = nullptr) const override; + void search(idx_t n, const uint8_t *x, idx_t k, + int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; /** get raw vectors by ids */ void get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, ConcurrentBitsetPtr bitset = nullptr) override; @@ -122,6 +121,10 @@ struct IndexBinaryIVF : IndexBinary { void search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) override; + void range_search(idx_t n, const uint8_t *x, int radius, + RangeSearchResult *result, + ConcurrentBitsetPtr bitset = nullptr) const override; + void reconstruct(idx_t key, uint8_t *recons) const override; /** Reconstruct a subset of the indexed vectors. 
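---- note ----------------------------------------------------------------
Several call sites in this patch (store_pairs results, DirectMap,
reconstruct) now encode an inverted-list location as one 64-bit id via
lo_build, replacing the open-coded `key << 32 | j` shifts of the old
code. A sketch of the packing, inferred from the call sites (judging
from the diffstat, the lo_* helpers are added in MetricType.h):

    #include <cstdint>
    #include <cstdio>

    using idx_t = int64_t;

    // pack (inverted list number, offset within the list) into one id;
    // the offset must fit in 32 bits
    inline idx_t lo_build(idx_t list_id, idx_t offset) {
        return list_id << 32 | offset;
    }
    inline idx_t lo_listno(idx_t lo) { return lo >> 32; }
    inline idx_t lo_offset(idx_t lo) { return lo & 0xffffffff; }

    int main() {
        idx_t lo = lo_build(7, 42);
        printf("list %lld offset %lld\n",
               (long long)lo_listno(lo), (long long)lo_offset(lo));  // 7 42
    }

With this encoding a single idx_t is either a user-visible id or, when
store_pairs is set, a direct pointer into the inverted lists that skips
the id lookup.
--------------------------------------------------------------------------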
@@ -177,6 +180,8 @@ struct IndexBinaryIVF : IndexBinary { */ void make_direct_map(bool new_maintain_direct_map=true); + void set_direct_map_type (DirectMap::Type type); + void replace_invlists(InvertedLists *il, bool own=false); }; @@ -211,6 +216,12 @@ struct BinaryInvertedListScanner { size_t k, ConcurrentBitsetPtr bitset = nullptr) const = 0; + virtual void scan_codes_range (size_t n, + const uint8_t *codes, + const idx_t *ids, + int radius, + RangeQueryResult &result) const = 0; + virtual ~BinaryInvertedListScanner () {} }; diff --git a/core/src/index/thirdparty/faiss/IndexFlat.cpp b/core/src/index/thirdparty/faiss/IndexFlat.cpp index 7cc2304881..7780650da3 100644 --- a/core/src/index/thirdparty/faiss/IndexFlat.cpp +++ b/core/src/index/thirdparty/faiss/IndexFlat.cpp @@ -38,26 +38,28 @@ void IndexFlat::reset() { ntotal = 0; } -void IndexFlat::search(idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - ConcurrentBitsetPtr bitset) const + +void IndexFlat::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const { // we see the distances and labels as heaps if (metric_type == METRIC_INNER_PRODUCT) { float_minheap_array_t res = { - size_t(n), size_t(k), labels, distances}; + size_t(n), size_t(k), labels, distances}; knn_inner_product (x, xb.data(), d, n, ntotal, &res, bitset); } else if (metric_type == METRIC_L2) { float_maxheap_array_t res = { - size_t(n), size_t(k), labels, distances}; + size_t(n), size_t(k), labels, distances}; knn_L2sqr (x, xb.data(), d, n, ntotal, &res, bitset); } else if (metric_type == METRIC_Jaccard) { float_maxheap_array_t res = { size_t(n), size_t(k), labels, distances}; - knn_jaccard (x, xb.data(), d, n, ntotal, &res, bitset); + knn_jaccard(x, xb.data(), d, n, ntotal, &res, bitset); } else { float_maxheap_array_t res = { - size_t(n), size_t(k), labels, distances}; + size_t(n), size_t(k), labels, distances}; knn_extra_metrics (x, xb.data(), d, n, ntotal, metric_type, metric_arg, &res, bitset); @@ -67,7 +69,6 @@ void IndexFlat::search(idx_t n, const float* x, idx_t k, float* distances, idx_t void IndexFlat::assign(idx_t n, const float * x, idx_t * labels, float* distances) { // usually used in IVF k-means algorithm - float *dis_inner = (distances == nullptr) ? new float[n] : distances; switch (metric_type) { case METRIC_INNER_PRODUCT: diff --git a/core/src/index/thirdparty/faiss/IndexFlat.h b/core/src/index/thirdparty/faiss/IndexFlat.h index 6f63d22c2e..a04d32a614 100644 --- a/core/src/index/thirdparty/faiss/IndexFlat.h +++ b/core/src/index/thirdparty/faiss/IndexFlat.h @@ -19,6 +19,7 @@ namespace faiss { /** Index that stores the full vectors and performs exhaustive search */ struct IndexFlat: Index { + /// database vectors, size ntotal * d std::vector xb; @@ -154,7 +155,7 @@ struct IndexRefineFlat: Index { }; -/// optimized version for 1D "vectors" +/// optimized version for 1D "vectors". struct IndexFlat1D:IndexFlatL2 { bool continuous_update; ///< is the permutation updated continuously? 
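---- note ----------------------------------------------------------------
The float_maxheap_array_t / float_minheap_array_t wrappers used by
IndexFlat::search above implement k-nearest selection with one heap per
query: for L2 a max-heap keeps the k smallest distances seen so far (the
current worst sits on top and is evicted first), while inner-product
search uses a min-heap to keep the k largest scores. A minimal sketch of
the max-heap case, with std::priority_queue standing in for the faiss
heap API:

    #include <cstdio>
    #include <queue>
    #include <utility>
    #include <vector>

    int main() {
        std::vector<float> dist = {0.9f, 0.1f, 0.5f, 0.3f, 0.7f};
        size_t k = 3;
        // max-heap ordered on distance: top() is the worst kept candidate
        std::priority_queue<std::pair<float, long> > heap;
        for (size_t id = 0; id < dist.size(); id++) {
            if (heap.size() < k) {
                heap.push({dist[id], (long)id});
            } else if (dist[id] < heap.top().first) {
                heap.pop();                    // evict the current worst
                heap.push({dist[id], (long)id});
            }
        }
        while (!heap.empty()) {                // pops worst-to-best
            printf("id %ld dis %g\n", heap.top().second, heap.top().first);
            heap.pop();
        }
    }

This is also why the faiss code reorders the heap after scanning: popping
yields results worst-first, so they are reversed into ascending order
before being returned.
--------------------------------------------------------------------------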
diff --git a/core/src/index/thirdparty/faiss/IndexHNSW.cpp b/core/src/index/thirdparty/faiss/IndexHNSW.cpp
index 45fb595034..c06f9840e2 100644
--- a/core/src/index/thirdparty/faiss/IndexHNSW.cpp
+++ b/core/src/index/thirdparty/faiss/IndexHNSW.cpp
@@ -26,7 +26,6 @@
 #include
 
 #ifdef __SSE__
-#include
 #endif
 
 #include
@@ -55,7 +54,6 @@ namespace faiss {
 using idx_t = Index::idx_t;
 using MinimaxHeap = HNSW::MinimaxHeap;
 using storage_idx_t = HNSW::storage_idx_t;
-using NodeDistCloser = HNSW::NodeDistCloser;
 using NodeDistFarther = HNSW::NodeDistFarther;
 
 HNSWStats hnsw_stats;
@@ -67,6 +65,50 @@ HNSWStats hnsw_stats;
 
 namespace {
 
+/* Wrap the distance computer into one that negates the
+   distances. This makes supporting INNER_PRODUCT search easier */
+
+struct NegativeDistanceComputer: DistanceComputer {
+
+    /// owned by this
+    DistanceComputer *basedis;
+
+    explicit NegativeDistanceComputer(DistanceComputer *basedis):
+        basedis(basedis)
+    {}
+
+    void set_query(const float *x) override {
+        basedis->set_query(x);
+    }
+
+    /// compute distance of vector i to current query
+    float operator () (idx_t i) override {
+        return -(*basedis)(i);
+    }
+
+    /// compute distance between two stored vectors
+    float symmetric_dis (idx_t i, idx_t j) override {
+        return -basedis->symmetric_dis(i, j);
+    }
+
+    virtual ~NegativeDistanceComputer ()
+    {
+        delete basedis;
+    }
+
+};
+
+DistanceComputer *storage_distance_computer(const Index *storage)
+{
+    if (storage->metric_type == METRIC_INNER_PRODUCT) {
+        return new NegativeDistanceComputer(storage->get_distance_computer());
+    } else {
+        return storage->get_distance_computer();
+    }
+}
+
+
+
 void hnsw_add_vertices(IndexHNSW &index_hnsw,
                        size_t n0, size_t n,
                        const float *x,
@@ -152,7 +194,7 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
         VisitedTable vt (ntotal);
 
         DistanceComputer *dis =
-            index_hnsw.storage->get_distance_computer();
+            storage_distance_computer (index_hnsw.storage);
         ScopeDeleter1 del(dis);
 
         int prev_display = verbose && omp_get_thread_num() == 0 ?
0 : -1; size_t counter = 0; @@ -210,8 +252,8 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw, * IndexHNSW implementation **************************************************************/ -IndexHNSW::IndexHNSW(int d, int M): - Index(d, METRIC_L2), +IndexHNSW::IndexHNSW(int d, int M, MetricType metric): + Index(d, metric), hnsw(M), own_fields(false), storage(nullptr), @@ -258,7 +300,8 @@ void IndexHNSW::search (idx_t n, const float *x, idx_t k, #pragma omp parallel reduction(+ : nreorder) { VisitedTable vt (ntotal); - DistanceComputer *dis = storage->get_distance_computer(); + + DistanceComputer *dis = storage_distance_computer(storage); ScopeDeleter1 del(dis); #pragma omp for @@ -290,6 +333,14 @@ void IndexHNSW::search (idx_t n, const float *x, idx_t k, } InterruptCallback::check (); } + + if (metric_type == METRIC_INNER_PRODUCT) { + // we need to revert the negated distances + for (size_t i = 0; i < k * n; i++) { + distances[i] = -distances[i]; + } + } + hnsw_stats.nreorder += nreorder; } @@ -323,7 +374,7 @@ void IndexHNSW::shrink_level_0_neighbors(int new_size) { #pragma omp parallel { - DistanceComputer *dis = storage->get_distance_computer(); + DistanceComputer *dis = storage_distance_computer(storage); ScopeDeleter1 del(dis); #pragma omp for @@ -367,7 +418,7 @@ void IndexHNSW::search_level_0( storage_idx_t ntotal = hnsw.levels.size(); #pragma omp parallel { - DistanceComputer *qdis = storage->get_distance_computer(); + DistanceComputer *qdis = storage_distance_computer(storage); ScopeDeleter1 del(qdis); VisitedTable vt (ntotal); @@ -436,7 +487,7 @@ void IndexHNSW::init_level_0_from_knngraph( #pragma omp parallel for for (idx_t i = 0; i < ntotal; i++) { - DistanceComputer *qdis = storage->get_distance_computer(); + DistanceComputer *qdis = storage_distance_computer(storage); float vec[d]; storage->reconstruct(i, vec); qdis->set_query(vec); @@ -480,7 +531,7 @@ void IndexHNSW::init_level_0_from_entry_points( { VisitedTable vt (ntotal); - DistanceComputer *dis = storage->get_distance_computer(); + DistanceComputer *dis = storage_distance_computer(storage); ScopeDeleter1 del(dis); float vec[storage->d]; @@ -518,7 +569,7 @@ void IndexHNSW::reorder_links() std::vector distances (M); std::vector order (M); std::vector tmp (M); - DistanceComputer *dis = storage->get_distance_computer(); + DistanceComputer *dis = storage_distance_computer(storage); ScopeDeleter1 del(dis); #pragma omp for @@ -826,8 +877,8 @@ IndexHNSWFlat::IndexHNSWFlat() is_trained = true; } -IndexHNSWFlat::IndexHNSWFlat(int d, int M): - IndexHNSW(new IndexFlatL2(d), M) +IndexHNSWFlat::IndexHNSWFlat(int d, int M, MetricType metric): + IndexHNSW(new IndexFlat(d, metric), M) { own_fields = true; is_trained = true; @@ -860,8 +911,9 @@ void IndexHNSWPQ::train(idx_t n, const float* x) **************************************************************/ -IndexHNSWSQ::IndexHNSWSQ(int d, QuantizerType qtype, int M): - IndexHNSW (new IndexScalarQuantizer (d, qtype), M) +IndexHNSWSQ::IndexHNSWSQ(int d, QuantizerType qtype, int M, + MetricType metric): + IndexHNSW (new IndexScalarQuantizer (d, qtype, metric), M) { is_trained = false; own_fields = true; @@ -986,7 +1038,7 @@ void IndexHNSW2Level::search (idx_t n, const float *x, idx_t k, #pragma omp parallel { VisitedTable vt (ntotal); - DistanceComputer *dis = storage->get_distance_computer(); + DistanceComputer *dis = storage_distance_computer(storage); ScopeDeleter1 del(dis); int candidates_size = hnsw.upper_beam; diff --git a/core/src/index/thirdparty/faiss/IndexHNSW.h 
b/core/src/index/thirdparty/faiss/IndexHNSW.h index 354a991f8b..a8cb10512f 100644 --- a/core/src/index/thirdparty/faiss/IndexHNSW.h +++ b/core/src/index/thirdparty/faiss/IndexHNSW.h @@ -79,7 +79,7 @@ struct IndexHNSW : Index { ReconstructFromNeighbors *reconstruct_from_neighbors; - explicit IndexHNSW (int d = 0, int M = 32); + explicit IndexHNSW (int d = 0, int M = 32, MetricType metric = METRIC_L2); explicit IndexHNSW (Index *storage, int M = 32); ~IndexHNSW() override; @@ -91,7 +91,8 @@ struct IndexHNSW : Index { /// entry point for search void search (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; + float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr) const override; void reconstruct(idx_t key, float* recons) const override; @@ -132,7 +133,7 @@ struct IndexHNSW : Index { struct IndexHNSWFlat : IndexHNSW { IndexHNSWFlat(); - IndexHNSWFlat(int d, int M); + IndexHNSWFlat(int d, int M, MetricType metric = METRIC_L2); }; /** PQ index topped with with a HNSW structure to access elements @@ -149,7 +150,7 @@ struct IndexHNSWPQ : IndexHNSW { */ struct IndexHNSWSQ : IndexHNSW { IndexHNSWSQ(); - IndexHNSWSQ(int d, QuantizerType qtype, int M); + IndexHNSWSQ(int d, QuantizerType qtype, int M, MetricType metric = METRIC_L2); }; /** 2-level code structure with fast random access @@ -162,8 +163,8 @@ struct IndexHNSW2Level : IndexHNSW { /// entry point for search void search (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; - + float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset = nullptr) const override; }; diff --git a/core/src/index/thirdparty/faiss/IndexIVF.cpp b/core/src/index/thirdparty/faiss/IndexIVF.cpp index 6f8871034f..f7687cfe65 100644 --- a/core/src/index/thirdparty/faiss/IndexIVF.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVF.cpp @@ -174,8 +174,7 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, code_size (code_size), nprobe (1), max_codes (0), - parallel_mode (0), - maintain_direct_map (false) + parallel_mode (0) { FAISS_THROW_IF_NOT (d == quantizer->d); is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); @@ -189,8 +188,7 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d, IndexIVF::IndexIVF (): invlists (nullptr), own_invlists (false), code_size (0), - nprobe (1), max_codes (0), parallel_mode (0), - maintain_direct_map (false) + nprobe (1), max_codes (0), parallel_mode (0) {} void IndexIVF::add (idx_t n, const float * x) @@ -216,6 +214,8 @@ void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids) } FAISS_THROW_IF_NOT (is_trained); + direct_map.check_can_add (xids); + std::unique_ptr idx(new idx_t[n]); quantizer->assign (n, x, idx.get()); size_t nadd = 0, nminus1 = 0; @@ -227,6 +227,8 @@ void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids) std::unique_ptr flat_codes(new uint8_t [n * code_size]); encode_vectors (n, x, idx.get(), flat_codes.get()); + DirectMapAdd dm_adder(direct_map, n, xids); + #pragma omp parallel reduction(+: nadd) { int nt = omp_get_num_threads(); @@ -237,13 +239,21 @@ void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids) idx_t list_no = idx [i]; if (list_no >= 0 && list_no % nt == rank) { idx_t id = xids ? 
xids[i] : ntotal + i; - invlists->add_entry (list_no, id, - flat_codes.get() + i * code_size); + size_t ofs = invlists->add_entry ( + list_no, id, + flat_codes.get() + i * code_size + ); + + dm_adder.add (i, list_no, ofs); + nadd++; + } else if (rank == 0 && list_no == -1) { + dm_adder.add (i, -1, 0); } } } + if (verbose) { printf(" added %ld / %ld vectors (%ld -1s)\n", nadd, n, nminus1); } @@ -272,33 +282,25 @@ void IndexIVF::restore_quantizer() { } } -void IndexIVF::make_direct_map (bool new_maintain_direct_map) +void IndexIVF::make_direct_map (bool b) { - // nothing to do - if (new_maintain_direct_map == maintain_direct_map) - return; - - if (new_maintain_direct_map) { - direct_map.resize (ntotal, -1); - for (size_t key = 0; key < nlist; key++) { - size_t list_size = invlists->list_size (key); - ScopedIds idlist (invlists, key); - - for (long ofs = 0; ofs < list_size; ofs++) { - FAISS_THROW_IF_NOT_MSG ( - 0 <= idlist [ofs] && idlist[ofs] < ntotal, - "direct map supported only for seuquential ids"); - direct_map [idlist [ofs]] = key << 32 | ofs; - } - } + if (b) { + direct_map.set_type (DirectMap::Array, invlists, ntotal); } else { - direct_map.clear (); + direct_map.set_type (DirectMap::NoMap, invlists, ntotal); } - maintain_direct_map = new_maintain_direct_map; } -void IndexIVF::search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, - ConcurrentBitsetPtr bitset) const { +void IndexIVF::set_direct_map_type (DirectMap::Type type) +{ + direct_map.set_type (type, invlists, ntotal); +} + + +void IndexIVF::search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const +{ std::unique_ptr idx(new idx_t[n * nprobe]); std::unique_ptr coarse_dis(new float[n * nprobe]); @@ -315,10 +317,7 @@ void IndexIVF::search (idx_t n, const float *x, idx_t k, float *distances, idx_t } void IndexIVF::get_vector_by_id (idx_t n, const idx_t *xid, float *x, ConcurrentBitsetPtr bitset) { - - if (!maintain_direct_map) { - make_direct_map(true); - } + make_direct_map(true); /* only get vector by 1 id */ FAISS_ASSERT(n == 1); @@ -331,9 +330,7 @@ void IndexIVF::get_vector_by_id (idx_t n, const idx_t *xid, float *x, Concurrent void IndexIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, float *distances, idx_t *labels, ConcurrentBitsetPtr bitset) { - if (!maintain_direct_map) { - make_direct_map(true); - } + make_direct_map(true); auto x = new float[n * d]; for (idx_t i = 0; i < n; ++i) { @@ -362,10 +359,13 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, bool interrupt = false; + int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; + bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); + // don't start parallel section if single query bool do_parallel = - parallel_mode == 0 ? n > 1 : - parallel_mode == 1 ? nprobe > 1 : + pmode == 0 ? n > 1 : + pmode == 1 ? 
nprobe > 1 : nprobe * n > 1; #pragma omp parallel if(do_parallel) reduction(+: nlistv, ndis, nheap) @@ -382,6 +382,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, // intialize + reorder a result heap auto init_result = [&](float *simi, idx_t *idxi) { + if (!do_heap_init) return; if (metric_type == METRIC_INNER_PRODUCT) { heap_heapify (k, simi, idxi); } else { @@ -390,6 +391,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, }; auto reorder_result = [&] (float *simi, idx_t *idxi) { + if (!do_heap_init) return; if (metric_type == METRIC_INNER_PRODUCT) { heap_reorder (k, simi, idxi); } else { @@ -400,7 +402,8 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, // single list scan using the current scanner (with query // set porperly) and storing results in simi and idxi auto scan_one_list = [&] (idx_t key, float coarse_dis_i, - float *simi, idx_t *idxi, ConcurrentBitsetPtr bitset) { + float *simi, idx_t *idxi, + ConcurrentBitsetPtr bitset) { if (key < 0) { // not enough centroids for multiprobe @@ -441,7 +444,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, * Actual loops, depending on parallel_mode ****************************************************/ - if (parallel_mode == 0) { + if (pmode == 0) { #pragma omp for for (size_t i = 0; i < n; i++) { @@ -481,7 +484,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, } } // parallel for - } else if (parallel_mode == 1) { + } else if (pmode == 1) { std::vector local_idx (k); std::vector local_dis (k); @@ -524,7 +527,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k, } } else { FAISS_THROW_FMT ("parallel_mode %d not supported\n", - parallel_mode); + pmode); } } // parallel section @@ -674,13 +677,8 @@ InvertedListScanner *IndexIVF::get_InvertedListScanner ( void IndexIVF::reconstruct (idx_t key, float* recons) const { - FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal, - "direct map is not initialized"); - FAISS_THROW_IF_NOT_MSG (key >= 0 && key < direct_map.size(), - "invalid key"); - idx_t list_no = direct_map[key] >> 32; - idx_t offset = direct_map[key] & 0xffffffff; - reconstruct_from_offset (list_no, offset, recons); + idx_t lo = direct_map.get (key); + reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons); } @@ -748,8 +746,8 @@ void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k, // Fill with NaNs memset(reconstructed, -1, sizeof(*reconstructed) * d); } else { - int list_no = key >> 32; - int offset = key & 0xffffffff; + int list_no = lo_listno (key); + int offset = lo_offset (key); // Update label to the actual id labels[ij] = invlists->get_single_id (list_no, offset); @@ -777,42 +775,41 @@ void IndexIVF::reset () size_t IndexIVF::remove_ids (const IDSelector & sel) { - FAISS_THROW_IF_NOT_MSG (!maintain_direct_map, - "direct map remove not implemented"); - - std::vector toremove(nlist); - -#pragma omp parallel for - for (idx_t i = 0; i < nlist; i++) { - idx_t l0 = invlists->list_size (i), l = l0, j = 0; - ScopedIds idsi (invlists, i); - while (j < l) { - if (sel.is_member (idsi[j])) { - l--; - invlists->update_entry ( - i, j, - invlists->get_single_id (i, l), - ScopedCodes (invlists, i, l).get()); - } else { - j++; - } - } - toremove[i] = l0 - l; - } - // this will not run well in parallel on ondisk because of possible shrinks - size_t nremove = 0; - for (idx_t i = 0; i < nlist; i++) { - if (toremove[i] > 0) { - nremove += toremove[i]; - invlists->resize( - i, 
invlists->list_size(i) - toremove[i]); - } - } + size_t nremove = direct_map.remove_ids (sel, invlists); ntotal -= nremove; return nremove; } +void IndexIVF::update_vectors (int n, const idx_t *new_ids, const float *x) +{ + + if (direct_map.type == DirectMap::Hashtable) { + // just remove then add + IDSelectorArray sel(n, new_ids); + size_t nremove = remove_ids (sel); + FAISS_THROW_IF_NOT_MSG (nremove == n, + "did not find all entries to remove"); + add_with_ids (n, x, new_ids); + return; + } + + FAISS_THROW_IF_NOT (direct_map.type == DirectMap::Array); + // here it is more tricky because we don't want to introduce holes + // in continuous range of ids + + FAISS_THROW_IF_NOT (is_trained); + std::vector assign (n); + quantizer->assign (n, x, assign.data()); + + std::vector flat_codes (n * code_size); + encode_vectors (n, x, assign.data(), flat_codes.data()); + + direct_map.update_codes (invlists, n, new_ids, assign.data(), flat_codes.data()); + +} + + void IndexIVF::train (idx_t n, const float *x) @@ -845,15 +842,14 @@ void IndexIVF::check_compatible_for_merge (const IndexIVF &other) const FAISS_THROW_IF_NOT (other.code_size == code_size); FAISS_THROW_IF_NOT_MSG (typeid (*this) == typeid (other), "can only merge indexes of the same type"); + FAISS_THROW_IF_NOT_MSG (this->direct_map.no() && other.direct_map.no(), + "merge direct_map not implemented"); } void IndexIVF::merge_from (IndexIVF &other, idx_t add_id) { check_compatible_for_merge (other); - FAISS_THROW_IF_NOT_MSG ((!maintain_direct_map && - !other.maintain_direct_map), - "direct map copy not implemented"); invlists->merge_from (other.invlists, add_id); @@ -883,7 +879,7 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type, FAISS_THROW_IF_NOT (nlist == other.nlist); FAISS_THROW_IF_NOT (code_size == other.code_size); - FAISS_THROW_IF_NOT (!other.maintain_direct_map); + FAISS_THROW_IF_NOT (other.direct_map.no()); FAISS_THROW_IF_NOT_FMT ( subset_type == 0 || subset_type == 1 || subset_type == 2, "subset type %d not implemented", subset_type); @@ -950,6 +946,7 @@ IndexIVF::dump() { auto codes = invlists->get_codes(i); int code_size = invlists->code_size; + std::cout << "Bucket ID: " << i << ", with code size: " << code_size << ", vectors number: " << numVecs << std::endl; if(code_size == 8) { // int8 types @@ -965,6 +962,7 @@ IndexIVF::dump() { } } + IndexIVF::~IndexIVF() { if (own_invlists) { diff --git a/core/src/index/thirdparty/faiss/IndexIVF.h b/core/src/index/thirdparty/faiss/IndexIVF.h index dd90c9ca96..2409f9aab8 100644 --- a/core/src/index/thirdparty/faiss/IndexIVF.h +++ b/core/src/index/thirdparty/faiss/IndexIVF.h @@ -12,15 +12,16 @@ #include +#include #include #include #include +#include #include #include #include - namespace faiss { @@ -34,7 +35,6 @@ struct Level1Quantizer { Index * quantizer_backup = nullptr; ///< quantizer for backup size_t nlist; ///< number of possible key values - /** * = 0: use the quantizer as index in a kmeans training * = 1: just pass on the training set to the train() of the quantizer @@ -109,14 +109,18 @@ struct IndexIVF: Index, Level1Quantizer { /** Parallel mode determines how queries are parallelized with OpenMP * * 0 (default): parallelize over queries - * 1: parallelize over over inverted lists + * 1: parallelize over inverted lists * 2: parallelize over both + * + * PARALLEL_MODE_NO_HEAP_INIT: binary or with the previous to + * prevent the heap to be initialized and finalized */ int parallel_mode; + const int PARALLEL_MODE_NO_HEAP_INIT = 1024; - /// map for direct access to the 
elements. Enables reconstruct(). - bool maintain_direct_map; - std::vector direct_map; + /** optional map that maps back ids to invlist entries. This + * enables reconstruct() */ + DirectMap direct_map; /** The Inverted file takes a quantizer (an Index) on input, * which implements the function mapping a vector to a list @@ -179,12 +183,13 @@ struct IndexIVF: Index, Level1Quantizer { const float *centroid_dis, float *distances, idx_t *labels, bool store_pairs, - const IVFSearchParameters *params = nullptr, + const IVFSearchParameters *params=nullptr, ConcurrentBitsetPtr bitset = nullptr ) const; /** assign the vectors, then call search_preassign */ - void search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels, + void search (idx_t n, const float *x, idx_t k, + float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override; /** get raw vectors by ids */ @@ -206,8 +211,19 @@ struct IndexIVF: Index, Level1Quantizer { virtual InvertedListScanner *get_InvertedListScanner ( bool store_pairs=false) const; + /** reconstruct a vector. Works only if maintain_direct_map is set to 1 or 2 */ void reconstruct (idx_t key, float* recons) const override; + /** Update a subset of vectors. + * + * The index must have a direct_map + * + * @param nv nb of vectors to update + * @param idx vector indices to update, size nv + * @param v vectors of new values, size nv*d + */ + virtual void update_vectors (int nv, const idx_t *idx, const float *v); + /** Reconstruct a subset of the indexed vectors. * * Overrides default implementation to bypass reconstruct() which requires @@ -286,6 +302,9 @@ struct IndexIVF: Index, Level1Quantizer { */ void make_direct_map (bool new_maintain_direct_map=true); + void set_direct_map_type (DirectMap::Type type); + + /// replace the inverted lists, old one is deallocated if own_invlists void replace_invlists (InvertedLists *il, bool own=false); diff --git a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp index 4e531be758..2846990f9f 100644 --- a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp @@ -45,8 +45,7 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids, { FAISS_THROW_IF_NOT (is_trained); assert (invlists); - FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids), - "cannot have direct map and add with ids"); + direct_map.check_can_add (xids); const int64_t * idx; ScopeDeleter del; @@ -60,19 +59,21 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids, } int64_t n_add = 0; for (size_t i = 0; i < n; i++) { - int64_t id = xids ? xids[i] : ntotal + i; - int64_t list_no = idx [i]; + idx_t id = xids ? 
xids[i] : ntotal + i; + idx_t list_no = idx [i]; + size_t offset; - if (list_no < 0) - continue; - const float *xi = x + i * d; - size_t offset = invlists->add_entry ( - list_no, id, (const uint8_t*) xi); - - if (maintain_direct_map) - direct_map.push_back (list_no << 32 | offset); - n_add++; + if (list_no >= 0) { + const float *xi = x + i * d; + offset = invlists->add_entry ( + list_no, id, (const uint8_t*) xi); + n_add++; + } else { + offset = 0; + } + direct_map.add_single_id (id, list_no, offset); } + if (verbose) { printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n", n_add, n); @@ -154,7 +155,7 @@ struct IVFFlatScanner: InvertedListScanner { const float *list_vecs = (const float*)codes; size_t nup = 0; for (size_t j = 0; j < list_size; j++) { - if(!bitset || !bitset->test(ids[j])){ + if (!bitset || !bitset->test(ids[j])) { const float * yj = list_vecs + d * j; float dis = metric == METRIC_INNER_PRODUCT ? fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d); @@ -181,7 +182,7 @@ struct IVFFlatScanner: InvertedListScanner { float dis = metric == METRIC_INNER_PRODUCT ? fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d); if (C::cmp (radius, dis)) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + int64_t id = store_pairs ? lo_build (list_no, j) : ids[j]; res.add (dis, id); } } @@ -212,41 +213,6 @@ InvertedListScanner* IndexIVFFlat::get_InvertedListScanner -void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x) -{ - - FAISS_THROW_IF_NOT (maintain_direct_map); - FAISS_THROW_IF_NOT (is_trained); - std::vector assign (n); - quantizer->assign (n, x, assign.data()); - - for (size_t i = 0; i < n; i++) { - idx_t id = new_ids[i]; - FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal, - "id to update out of range"); - { // remove old one - int64_t dm = direct_map[id]; - int64_t ofs = dm & 0xffffffff; - int64_t il = dm >> 32; - size_t l = invlists->list_size (il); - if (ofs != l - 1) { // move l - 1 to ofs - int64_t id2 = invlists->get_single_id (il, l - 1); - direct_map[id2] = (il << 32) | ofs; - invlists->update_entry (il, ofs, id2, - invlists->get_single_code (il, l - 1)); - } - invlists->resize (il, l - 1); - } - { // insert new one - int64_t il = assign[i]; - size_t l = invlists->list_size (il); - int64_t dm = (il << 32) | l; - direct_map[id] = dm; - invlists->add_entry (il, id, (const uint8_t*)(x + i * d)); - } - } - -} void IndexIVFFlat::reconstruct_from_offset (int64_t list_no, int64_t offset, float* recons) const @@ -298,8 +264,7 @@ void IndexIVFFlatDedup::add_with_ids( FAISS_THROW_IF_NOT (is_trained); assert (invlists); - FAISS_THROW_IF_NOT_MSG ( - !maintain_direct_map, + FAISS_THROW_IF_NOT_MSG (direct_map.no(), "IVFFlatDedup not implemented with direct_map"); int64_t * idx = new int64_t [na]; ScopeDeleter del (idx); @@ -435,7 +400,7 @@ size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel) // mostly copied from IndexIVF.cpp - FAISS_THROW_IF_NOT_MSG (!maintain_direct_map, + FAISS_THROW_IF_NOT_MSG (direct_map.no(), "direct map remove not implemented"); std::vector toremove(nlist); @@ -489,7 +454,7 @@ void IndexIVFFlatDedup::range_search( FAISS_THROW_MSG ("not implemented"); } -void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *) +void IndexIVFFlatDedup::update_vectors (int , const idx_t *, const float *) { FAISS_THROW_MSG ("not implemented"); } diff --git a/core/src/index/thirdparty/faiss/IndexIVFFlat.h b/core/src/index/thirdparty/faiss/IndexIVFFlat.h index 08b2075044..3c5777a1c2 100644 --- 
a/core/src/index/thirdparty/faiss/IndexIVFFlat.h +++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.h @@ -44,15 +44,6 @@ struct IndexIVFFlat: IndexIVF { InvertedListScanner *get_InvertedListScanner (bool store_pairs) const override; - /** Update a subset of vectors. - * - * The index must have a direct_map - * - * @param nv nb of vectors to update - * @param idx vector indices to update, size nv - * @param v vectors of new values, size nv*d - */ - virtual void update_vectors (int nv, idx_t *idx, const float *v); void reconstruct_from_offset (int64_t list_no, int64_t offset, float* recons) const override; @@ -101,8 +92,7 @@ struct IndexIVFFlatDedup: IndexIVFFlat { ConcurrentBitsetPtr bitset = nullptr) const override; /// not implemented - void update_vectors (int nv, idx_t *idx, const float *v) override; - + void update_vectors (int nv, const idx_t *idx, const float *v) override; /// not implemented void reconstruct_from_offset (int64_t list_no, int64_t offset, diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp b/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp index 6b47cde4da..fb786cc375 100644 --- a/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp @@ -37,8 +37,8 @@ namespace faiss { ******************************************/ IndexIVFPQ::IndexIVFPQ (Index * quantizer, size_t d, size_t nlist, - size_t M, size_t nbits_per_idx): - IndexIVF (quantizer, d, nlist, 0, METRIC_L2), + size_t M, size_t nbits_per_idx, MetricType metric): + IndexIVF (quantizer, d, nlist, 0, metric), pq (d, M, nbits_per_idx) { FAISS_THROW_IF_NOT (nbits_per_idx <= 8); @@ -279,6 +279,8 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids, InterruptCallback::check(); + direct_map.check_can_add (xids); + FAISS_THROW_IF_NOT (is_trained); double t0 = getmillisecs (); const idx_t * idx; @@ -313,13 +315,14 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids, size_t n_ignore = 0; for (size_t i = 0; i < n; i++) { idx_t key = idx[i]; + idx_t id = xids ? xids[i] : ntotal + i; if (key < 0) { + direct_map.add_single_id (id, -1, 0); n_ignore ++; if (residuals_2) memset (residuals_2, 0, sizeof(*residuals_2) * d); continue; } - idx_t id = xids ? xids[i] : ntotal + i; uint8_t *code = xcodes + i * code_size; size_t offset = invlists->add_entry (key, id, code); @@ -332,11 +335,9 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids, res2[j] = xi[j] - res2[j]; } - if (maintain_direct_map) - direct_map.push_back (key << 32 | offset); + direct_map.add_single_id (id, key, offset); } - double t3 = getmillisecs (); if(verbose) { char comment[100] = {0}; @@ -800,9 +801,9 @@ struct KnnSearchResults { size_t nup; - inline void add (idx_t j, float dis, faiss::ConcurrentBitsetPtr bitset = nullptr) { + inline void add (idx_t j, float dis, ConcurrentBitsetPtr bitset = nullptr) { if (C::cmp (heap_sim[0], dis)) { - idx_t id = ids ? ids[j] : (key << 32 | j); + idx_t id = ids ? ids[j] : lo_build (key, j); if (bitset != nullptr && bitset->test((faiss::ConcurrentBitset::id_type_t)id)) return; heap_swap_top (k, heap_sim, heap_ids, dis, id); @@ -823,7 +824,7 @@ struct RangeSearchResults { inline void add (idx_t j, float dis, faiss::ConcurrentBitsetPtr bitset = nullptr) { if (C::cmp (radius, dis)) { - idx_t id = ids ? ids[j] : (key << 32 | j); + idx_t id = ids ? 
ids[j] : lo_build (key, j); rres.add (dis, id); } } @@ -836,7 +837,7 @@ struct RangeSearchResults { * The scanning functions call their favorite precompute_* * function to precompute the tables they need. *****************************************************/ -template +template struct IVFPQScannerT: QueryTables { const uint8_t * list_codes; @@ -846,7 +847,6 @@ struct IVFPQScannerT: QueryTables { IVFPQScannerT (const IndexIVFPQ & ivfpq, const IVFSearchParameters *params): QueryTables (ivfpq, params) { - FAISS_THROW_IF_NOT (pq.nbits == 8); assert(METRIC_TYPE == metric_type); } @@ -872,15 +872,16 @@ struct IVFPQScannerT: QueryTables { template void scan_list_with_table (size_t ncode, const uint8_t *codes, SearchResultType & res, - faiss::ConcurrentBitsetPtr bitset = nullptr) const + ConcurrentBitsetPtr bitset = nullptr) const { for (size_t j = 0; j < ncode; j++) { - + PQDecoder decoder(codes, pq.nbits); + codes += pq.code_size; float dis = dis0; const float *tab = sim_table; for (size_t m = 0; m < pq.M; m++) { - dis += tab[*codes++]; + dis += tab[decoder.decode()]; tab += pq.ksub; } @@ -897,12 +898,14 @@ struct IVFPQScannerT: QueryTables { faiss::ConcurrentBitsetPtr bitset = nullptr) const { for (size_t j = 0; j < ncode; j++) { + PQDecoder decoder(codes, pq.nbits); + codes += pq.code_size; float dis = dis0; const float *tab = sim_table_2; for (size_t m = 0; m < pq.M; m++) { - int ci = *codes++; + int ci = decoder.decode(); dis += sim_table_ptrs [m][ci] - 2 * tab [ci]; tab += pq.ksub; } @@ -914,8 +917,8 @@ struct IVFPQScannerT: QueryTables { /// nothing is precomputed: access residuals on-the-fly template void scan_on_the_fly_dist (size_t ncode, const uint8_t *codes, - SearchResultType &res, - faiss::ConcurrentBitsetPtr bitset = nullptr) const + SearchResultType &res, + faiss::ConcurrentBitsetPtr bitset = nullptr) const { const float *dvec; float dis0 = 0; @@ -969,12 +972,13 @@ struct IVFPQScannerT: QueryTables { int hd = hc.hamming (b_code); if (hd < ht) { n_hamming_pass ++; + PQDecoder decoder(codes, pq.nbits); float dis = dis0; const float *tab = sim_table; for (size_t m = 0; m < pq.M; m++) { - dis += tab[*b_code++]; + dis += tab[decoder.decode()]; tab += pq.ksub; } @@ -999,7 +1003,7 @@ struct IVFPQScannerT: QueryTables { case cs: \ scan_list_polysemous_hc \ \ - (ncode, codes, res, bitset); \ + (ncode, codes, res, bitset); \ break HANDLE_CODE_SIZE(4); HANDLE_CODE_SIZE(8); @@ -1030,16 +1034,18 @@ struct IVFPQScannerT: QueryTables { * much we precompute (2 = precompute distance tables, 1 = precompute * pointers to distances, 0 = compute distances one by one). 
 * Currently only 2 is supported */
-template<MetricType METRIC_TYPE, class C, int precompute_mode>
+template<MetricType METRIC_TYPE, class C, class PQDecoder>
 struct IVFPQScanner:
-    IVFPQScannerT<Index::idx_t, METRIC_TYPE>,
+    IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
     InvertedListScanner
 {
     bool store_pairs;
+    int precompute_mode;

-    IVFPQScanner(const IndexIVFPQ & ivfpq, bool store_pairs):
-        IVFPQScannerT<Index::idx_t, METRIC_TYPE>(ivfpq, nullptr),
-        store_pairs(store_pairs)
+    IVFPQScanner(const IndexIVFPQ & ivfpq, bool store_pairs,
+                 int precompute_mode):
+        IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>(ivfpq, nullptr),
+        store_pairs(store_pairs), precompute_mode(precompute_mode)
     {
     }

@@ -1055,9 +1061,10 @@ struct IVFPQScanner:
         assert(precompute_mode == 2);
         float dis = this->dis0;
         const float *tab = this->sim_table;
+        PQDecoder decoder(code, this->pq.nbits);

         for (size_t m = 0; m < this->pq.M; m++) {
-            dis += tab[*code++];
+            dis += tab[decoder.decode()];
             tab += this->pq.ksub;
         }

         return dis;
     }
@@ -1124,7 +1131,22 @@ struct IVFPQScanner:
     }
 };

+template<class PQDecoder>
+InvertedListScanner *get_InvertedListScanner1 (const IndexIVFPQ &index,
+                                               bool store_pairs)
+{
+    if (index.metric_type == METRIC_INNER_PRODUCT) {
+        return new IVFPQScanner
+            <METRIC_INNER_PRODUCT, CMin<float, idx_t>, PQDecoder>
+            (index, store_pairs, 2);
+    } else if (index.metric_type == METRIC_L2) {
+        return new IVFPQScanner
+            <METRIC_L2, CMax<float, idx_t>, PQDecoder>
+            (index, store_pairs, 2);
+    }
+    return nullptr;
+}

 } // anonymous namespace

@@ -1132,12 +1154,13 @@ struct IVFPQScanner:
 InvertedListScanner * IndexIVFPQ::get_InvertedListScanner
     (bool store_pairs) const
 {
-    if (metric_type == METRIC_INNER_PRODUCT) {
-        return new IVFPQScanner<METRIC_INNER_PRODUCT, CMin<float, idx_t>, 2>
-            (*this, store_pairs);
-    } else if (metric_type == METRIC_L2) {
-        return new IVFPQScanner<METRIC_L2, CMax<float, idx_t>, 2>
-            (*this, store_pairs);
+
+    if (pq.nbits == 8) {
+        return get_InvertedListScanner1<PQDecoder8> (*this, store_pairs);
+    } else if (pq.nbits == 16) {
+        return get_InvertedListScanner1<PQDecoder16> (*this, store_pairs);
+    } else {
+        return get_InvertedListScanner1<PQDecoderGeneric> (*this, store_pairs);
     }
     return nullptr;
diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQ.h b/core/src/index/thirdparty/faiss/IndexIVFPQ.h
index f556043087..4ca04e9ef9 100644
--- a/core/src/index/thirdparty/faiss/IndexIVFPQ.h
+++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.h
@@ -42,14 +42,14 @@ struct IndexIVFPQ: IndexIVF {
     int polysemous_ht;     ///< Hamming thresh for polysemous filtering

     /** Precompute table that speed up query preprocessing at some
-     * memory cost
+     * memory cost (used only for by_residual with L2 metric)
      * =-1: force disable
      * =0: decide heuristically (default: use tables only if they are
      *     < precomputed_tables_max_bytes)
      * =1: tables that work for all quantizers (size 256 * nlist * M)
      * =2: specific version for MultiIndexQuantizer (much more compact)
      */
-    int use_precomputed_table;     ///< if by_residual, build precompute tables
+    int use_precomputed_table;
     static size_t precomputed_table_max_bytes;

     /// if use_precompute_table
@@ -58,7 +58,7 @@ struct IndexIVFPQ: IndexIVF {
     IndexIVFPQ (
         Index * quantizer, size_t d, size_t nlist,
-        size_t M, size_t nbits_per_idx);
+        size_t M, size_t nbits_per_idx, MetricType metric = METRIC_L2);

     void add_with_ids(idx_t n, const float* x, const idx_t* xids = nullptr)
         override;
@@ -93,9 +93,9 @@ struct IndexIVFPQ: IndexIVF {
     * the duplicates are returned in pre-allocated arrays (see the
     * max sizes).
* - * @params lims limits between groups of duplicates + * @param lims limits between groups of duplicates * (max size ntotal / 2 + 1) - * @params ids ids[lims[i]] : ids[lims[i+1]-1] is a group of + * @param ids ids[lims[i]] : ids[lims[i+1]-1] is a group of * duplicates (max size ntotal) * @return n number of groups found */ @@ -135,15 +135,14 @@ struct IndexIVFPQ: IndexIVF { /// statistics are robust to internal threading, but not if /// IndexIVFPQ::search_preassigned is called by multiple threads struct IndexIVFPQStats { - size_t nrefine; // nb of refines (IVFPQR) + size_t nrefine; ///< nb of refines (IVFPQR) size_t n_hamming_pass; - // nb of passed Hamming distance tests (for polysemous) + ///< nb of passed Hamming distance tests (for polysemous) - // timings measured with the CPU RTC - // on all threads + // timings measured with the CPU RTC on all threads size_t search_cycles; - size_t refine_cycles; // only for IVFPQR + size_t refine_cycles; ///< only for IVFPQR IndexIVFPQStats () {reset (); } void reset (); diff --git a/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp b/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp index b94e16eac0..20d849210c 100644 --- a/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp @@ -145,8 +145,8 @@ void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k, if (sl == -1) continue; - int list_no = sl >> 32; - int ofs = sl & 0xffffffff; + int list_no = lo_listno(sl); + int ofs = lo_offset(sl); assert (list_no >= 0 && list_no < nlist); assert (ofs >= 0 && ofs < invlists->list_size (list_no)); diff --git a/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp index ddea7f6d87..4e27500a34 100644 --- a/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp @@ -266,7 +266,7 @@ struct IVFScanner: InvertedListScanner { { size_t nup = 0; for (size_t j = 0; j < list_size; j++) { - if(!bitset || !bitset->test(ids[j])){ + if (!bitset || !bitset->test(ids[j])) { float dis = hc.hamming (codes); if (dis < simi [0]) { @@ -290,7 +290,7 @@ struct IVFScanner: InvertedListScanner { for (size_t j = 0; j < list_size; j++) { float dis = hc.hamming (codes); if (dis < radius) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + int64_t id = store_pairs ? lo_build (list_no, j) : ids[j]; res.add (dis, id); } codes += code_size; diff --git a/core/src/index/thirdparty/faiss/IndexLSH.h b/core/src/index/thirdparty/faiss/IndexLSH.h index db7f9bfc71..7bcc9c5f84 100644 --- a/core/src/index/thirdparty/faiss/IndexLSH.h +++ b/core/src/index/thirdparty/faiss/IndexLSH.h @@ -70,7 +70,10 @@ struct IndexLSH:Index { IndexLSH (); - /* standalone codec interface */ + /* standalone codec interface. 
+ * + * The vectors are decoded to +/- 1 (not 0, 1) */ + size_t sa_code_size () const override; void sa_encode (idx_t n, const float *x, diff --git a/core/src/index/thirdparty/faiss/IndexLattice.cpp b/core/src/index/thirdparty/faiss/IndexLattice.cpp index 6ec49825a3..5c7be9fcbc 100644 --- a/core/src/index/thirdparty/faiss/IndexLattice.cpp +++ b/core/src/index/thirdparty/faiss/IndexLattice.cpp @@ -128,7 +128,7 @@ void IndexLattice::add(idx_t , const float* ) void IndexLattice::search(idx_t , const float* , idx_t , - float* , idx_t* , ConcurrentBitsetPtr bitset) const + float* , idx_t* , ConcurrentBitsetPtr ) const { FAISS_THROW_MSG("not implemented"); } diff --git a/core/src/index/thirdparty/faiss/IndexLattice.h b/core/src/index/thirdparty/faiss/IndexLattice.h index 333092ecfb..e946fac40a 100644 --- a/core/src/index/thirdparty/faiss/IndexLattice.h +++ b/core/src/index/thirdparty/faiss/IndexLattice.h @@ -58,7 +58,8 @@ struct IndexLattice: Index { /// not implemented void add(idx_t n, const float* x) override; void search(idx_t n, const float* x, idx_t k, - float* distances, idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override; + float* distances, idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; void reset() override; }; diff --git a/core/src/index/thirdparty/faiss/IndexPQ.cpp b/core/src/index/thirdparty/faiss/IndexPQ.cpp index 49ba8ce675..6e50ba1a2c 100644 --- a/core/src/index/thirdparty/faiss/IndexPQ.cpp +++ b/core/src/index/thirdparty/faiss/IndexPQ.cpp @@ -204,8 +204,8 @@ DistanceComputer * IndexPQ::get_distance_computer() const { void IndexPQ::search (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels, - ConcurrentBitsetPtr bitset) const + float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const { FAISS_THROW_IF_NOT (is_trained); if (search_type == ST_PQ) { // Simple PQ search diff --git a/core/src/index/thirdparty/faiss/IndexPQ.h b/core/src/index/thirdparty/faiss/IndexPQ.h index c681b8bfbe..25a643efe2 100644 --- a/core/src/index/thirdparty/faiss/IndexPQ.h +++ b/core/src/index/thirdparty/faiss/IndexPQ.h @@ -156,7 +156,8 @@ struct MultiIndexQuantizer: Index { void search( idx_t n, const float* x, idx_t k, - float* distances, idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override; + float* distances, idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; /// add and reset will crash at runtime void add(idx_t n, const float* x) override; diff --git a/core/src/index/thirdparty/faiss/IndexPreTransform.cpp b/core/src/index/thirdparty/faiss/IndexPreTransform.cpp index 6db0f3e48c..9172978df9 100644 --- a/core/src/index/thirdparty/faiss/IndexPreTransform.cpp +++ b/core/src/index/thirdparty/faiss/IndexPreTransform.cpp @@ -14,7 +14,6 @@ #include #include -#include #include namespace faiss { @@ -181,7 +180,8 @@ void IndexPreTransform::add_with_ids (idx_t n, const float * x, void IndexPreTransform::search (idx_t n, const float *x, idx_t k, - float *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const + float *distances, idx_t *labels, + ConcurrentBitsetPtr bitset) const { FAISS_THROW_IF_NOT (is_trained); const float *xt = apply_chain (n, x); diff --git a/core/src/index/thirdparty/faiss/IndexReplicas.h b/core/src/index/thirdparty/faiss/IndexReplicas.h index 29c4803ba2..a98c28cea5 100644 --- a/core/src/index/thirdparty/faiss/IndexReplicas.h +++ b/core/src/index/thirdparty/faiss/IndexReplicas.h @@ -60,7 +60,8 @@ class IndexReplicasTemplate : public ThreadedIndex { const component_t* x, idx_t k, 
distance_t* distances, - idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override; + idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; /// reconstructs from the first index void reconstruct(idx_t, component_t *v) const override; diff --git a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp index f3a897dcd6..d96612daef 100644 --- a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp +++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp @@ -254,6 +254,8 @@ void IndexIVFScalarQuantizer::add_with_ids size_t nadd = 0; std::unique_ptr squant(sq.select_quantizer ()); + DirectMapAdd dm_add (direct_map, n, xids); + #pragma omp parallel reduction(+: nadd) { std::vector residual (d); @@ -276,13 +278,18 @@ void IndexIVFScalarQuantizer::add_with_ids memset (one_code.data(), 0, code_size); squant->encode_vector (xi, one_code.data()); - invlists->add_entry (list_no, id, one_code.data()); + size_t ofs = invlists->add_entry (list_no, id, one_code.data()); + dm_add.add (i, list_no, ofs); nadd++; + } else if (rank == 0 && list_no == -1) { + dm_add.add (i, -1, 0); } } } + + ntotal += n; } diff --git a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h index c254ab3ade..feb0e8314f 100644 --- a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h +++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h @@ -17,7 +17,6 @@ #include #include - namespace faiss { /** diff --git a/core/src/index/thirdparty/faiss/IndexShards.cpp b/core/src/index/thirdparty/faiss/IndexShards.cpp index d6471e6f28..0e0ac16264 100644 --- a/core/src/index/thirdparty/faiss/IndexShards.cpp +++ b/core/src/index/thirdparty/faiss/IndexShards.cpp @@ -264,7 +264,8 @@ IndexShardsTemplate::search(idx_t n, const component_t *x, idx_t k, distance_t *distances, - idx_t *labels, ConcurrentBitsetPtr bitset) const { + idx_t *labels, + ConcurrentBitsetPtr bitset) const { long nshard = this->count(); std::vector all_distances(nshard * k * n); diff --git a/core/src/index/thirdparty/faiss/IndexShards.h b/core/src/index/thirdparty/faiss/IndexShards.h index 2cb6bc692d..6fbca6778a 100644 --- a/core/src/index/thirdparty/faiss/IndexShards.h +++ b/core/src/index/thirdparty/faiss/IndexShards.h @@ -75,7 +75,8 @@ struct IndexShardsTemplate : public ThreadedIndex { void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override; void search(idx_t n, const component_t* x, idx_t k, - distance_t* distances, idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override; + distance_t* distances, idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; void train(idx_t n, const component_t* x) override; diff --git a/core/src/index/thirdparty/faiss/InvertedLists.cpp b/core/src/index/thirdparty/faiss/InvertedLists.cpp index 93f1ba9bd2..59f5d1e7cb 100644 --- a/core/src/index/thirdparty/faiss/InvertedLists.cpp +++ b/core/src/index/thirdparty/faiss/InvertedLists.cpp @@ -64,8 +64,6 @@ PageLockMemory::PageLockMemory(PageLockMemory &&other) { namespace faiss { -using ScopedIds = InvertedLists::ScopedIds; -using ScopedCodes = InvertedLists::ScopedCodes; /***************************************** diff --git a/core/src/index/thirdparty/faiss/InvertedLists.h b/core/src/index/thirdparty/faiss/InvertedLists.h index 9df0cb63dc..ec77d2cb18 100644 --- a/core/src/index/thirdparty/faiss/InvertedLists.h +++ b/core/src/index/thirdparty/faiss/InvertedLists.h @@ -19,7 +19,6 @@ #include 
#include - #ifndef USE_CPU namespace faiss { @@ -276,6 +275,7 @@ struct ReadOnlyArrayInvertedLists: InvertedLists { bool is_valid(); }; + /***************************************************************** * Meta-inverted lists * diff --git a/core/src/index/thirdparty/faiss/Makefile b/core/src/index/thirdparty/faiss/Makefile index 520e9527ef..f81e67914c 100644 --- a/core/src/index/thirdparty/faiss/Makefile +++ b/core/src/index/thirdparty/faiss/Makefile @@ -12,7 +12,7 @@ AVX512_SRC = $(wildcard *avx512.cpp impl/*avx512.cpp utils/*avx512.cpp) OBJ = $(SRC:.cpp=.o) INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss -GPU_HEADERS = $(wildcard gpu/*.h gpu/impl/*.h gpu/utils/*.h) +GPU_HEADERS = $(wildcard gpu/*.h gpu/impl/*.h gpu/impl/*.cuh gpu/utils/*.h gpu/utils/*.cuh) GPU_CPPSRC = $(wildcard gpu/*.cpp gpu/impl/*.cpp gpu/utils/*.cpp) GPU_CUSRC = $(wildcard gpu/*.cu gpu/impl/*.cu gpu/utils/*.cu \ gpu/utils/nvidia/*.cu gpu/utils/blockselect/*.cu gpu/utils/warpselect/*.cu) diff --git a/core/src/index/thirdparty/faiss/MetaIndexes.cpp b/core/src/index/thirdparty/faiss/MetaIndexes.cpp index d9031a85b7..0851733f37 100644 --- a/core/src/index/thirdparty/faiss/MetaIndexes.cpp +++ b/core/src/index/thirdparty/faiss/MetaIndexes.cpp @@ -22,7 +22,6 @@ namespace faiss { namespace { -typedef Index::idx_t idx_t; } // namespace @@ -83,9 +82,10 @@ void IndexIDMapTemplate::add_with_ids template void IndexIDMapTemplate::search (idx_t n, const typename IndexT::component_t *x, idx_t k, - typename IndexT::distance_t *distances, typename IndexT::idx_t *labels, ConcurrentBitsetPtr bitset) const + typename IndexT::distance_t *distances, typename IndexT::idx_t *labels, + ConcurrentBitsetPtr bitset) const { - index->search(n, x, k, distances, labels, bitset); + index->search (n, x, k, distances, labels, bitset); idx_t *li = labels; #pragma omp parallel for for (idx_t i = 0; i < n * k; i++) { @@ -121,7 +121,8 @@ void IndexIDMapTemplate::search_by_id (idx_t n, const idx_t *xid, idx_t template void IndexIDMapTemplate::range_search (typename IndexT::idx_t n, const typename IndexT::component_t *x, - typename IndexT::distance_t radius, RangeSearchResult *result, ConcurrentBitsetPtr bitset) const + typename IndexT::distance_t radius, RangeSearchResult *result, + ConcurrentBitsetPtr bitset) const { index->range_search(n, x, radius, result, bitset); #pragma omp parallel for diff --git a/core/src/index/thirdparty/faiss/MetaIndexes.h b/core/src/index/thirdparty/faiss/MetaIndexes.h index 5ff3c5530a..adba16e16c 100644 --- a/core/src/index/thirdparty/faiss/MetaIndexes.h +++ b/core/src/index/thirdparty/faiss/MetaIndexes.h @@ -37,8 +37,10 @@ struct IndexIDMapTemplate : IndexT { /// this will fail. Use add_with_ids void add(idx_t n, const component_t* x) override; - void search (idx_t n, const component_t *x, idx_t k, distance_t *distances, idx_t *labels, - ConcurrentBitsetPtr bitset = nullptr) const override; + void search( + idx_t n, const component_t* x, idx_t k, + distance_t* distances, idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; void get_vector_by_id(idx_t n, const idx_t *xid, component_t *x, ConcurrentBitsetPtr bitset = nullptr) override; diff --git a/core/src/index/thirdparty/faiss/MetricType.h b/core/src/index/thirdparty/faiss/MetricType.h new file mode 100644 index 0000000000..5248f5b801 --- /dev/null +++ b/core/src/index/thirdparty/faiss/MetricType.h @@ -0,0 +1,41 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#ifndef FAISS_METRIC_TYPE_H +#define FAISS_METRIC_TYPE_H + +namespace faiss { + +/// The metric space for vector comparison for Faiss indices and algorithms. +/// +/// Most algorithms support both inner product and L2, with the flat +/// (brute-force) indices supporting additional metric types for vector +/// comparison. +enum MetricType { + METRIC_INNER_PRODUCT = 0, ///< maximum inner product search + METRIC_L2 = 1, ///< squared L2 search + METRIC_L1, ///< L1 (aka cityblock) + METRIC_Linf, ///< infinity distance + METRIC_Lp, ///< L_p distance, p is given by a faiss::Index + /// metric_arg + METRIC_Jaccard, + METRIC_Tanimoto, + METRIC_Hamming, + METRIC_Substructure, ///< Tversky case alpha = 0, beta = 1 + METRIC_Superstructure, ///< Tversky case alpha = 1, beta = 0 + + /// some additional metrics defined in scipy.spatial.distance + METRIC_Canberra = 20, + METRIC_BrayCurtis, + METRIC_JensenShannon, +}; + +} + +#endif diff --git a/core/src/index/thirdparty/faiss/README.md b/core/src/index/thirdparty/faiss/README.md index 039005aa28..299ad809da 100644 --- a/core/src/index/thirdparty/faiss/README.md +++ b/core/src/index/thirdparty/faiss/README.md @@ -4,6 +4,10 @@ Faiss is a library for efficient similarity search and clustering of dense vecto ## NEWS +*NEW: version 1.6.1 (2019-11-29) bugfix.* + +*NEW: version 1.6.0 (2019-10-15) code structure reorg, support for codec interface.* + *NEW: version 1.5.3 (2019-06-24) fix performance regression in IndexIVF.* *NEW: version 1.5.2 (2019-05-27) the license was relaxed to MIT from BSD+Patents. Read LICENSE for details.* @@ -24,7 +28,7 @@ Faiss is a library for efficient similarity search and clustering of dense vecto ## Introduction -Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors. +Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 (Euclidean) distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors. Most of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require to keep the original vectors. This generally comes at the cost of a less precise search but these methods can scale to billions of vectors in main memory on a single server. 
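[Editor's note, not part of the patch: a minimal sketch of the brute-force search the README paragraph above describes. All dimensions and data here are made up for illustration; swapping IndexFlatL2 for IndexFlatIP on L2-normalized vectors gives cosine similarity, as the README notes.]

    #include <faiss/IndexFlat.h>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main() {
        int d = 64, k = 4;                        // vector dim, neighbors per query
        size_t nb = 1000, nq = 2;                 // database and query sizes
        std::mt19937 rng(42);
        std::uniform_real_distribution<float> u(0.f, 1.f);
        std::vector<float> xb(nb * d), xq(nq * d);
        for (auto& v : xb) v = u(rng);            // database vectors
        for (auto& v : xq) v = u(rng);            // query vectors

        faiss::IndexFlatL2 index(d);              // exact L2 search, no training needed
        index.add(nb, xb.data());
        std::vector<float> dist(nq * k);
        std::vector<faiss::Index::idx_t> ids(nq * k);
        index.search(nq, xq.data(), k, dist.data(), ids.data());
        printf("query 0: nearest id %ld at distance %g\n",
               (long)ids[0], (double)dist[0]);
        return 0;
    }

The flat index keeps the raw vectors, so it is the accuracy baseline that the compressed (PQ, binary) indexes in this patch trade off against.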
diff --git a/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 b/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 index fb1e080f19..fc61fc91e9 100644 --- a/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 +++ b/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4 @@ -8,7 +8,7 @@ AC_MSG_CHECKING([for cpu arch]) case $target in amd64-* | x86_64-*) - ARCH_CPUFLAGS="-mavx2 -mf16c -msse4 -mpopcnt" + ARCH_CPUFLAGS="-mpopcnt -msse4" ARCH_CXXFLAGS="-m64" ;; aarch64*-*) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py index 63377bc9a8..9f90643217 100644 --- a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py +++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py @@ -64,6 +64,7 @@ def fvecs_write(fname, m): ivecs_write(fname, m.view('int32')) + ################################################################# # Dataset ################################################################# diff --git a/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py b/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py index f404605a22..c676f7c793 100644 --- a/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py +++ b/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py @@ -170,7 +170,8 @@ def dataset_iterator(x, preproc, bs): block_ranges = [(i0, min(nb, i0 + bs)) for i0 in range(0, nb, bs)] - def prepare_block((i0, i1)): + def prepare_block(i01): + i0, i1 = i01 xb = sanitize(x[i0:i1]) return i0, preproc.apply_py(xb) @@ -575,7 +576,8 @@ def compute_populated_index_2(preproc): coarse_quantizer_gpu = faiss.index_cpu_to_gpu_multiple( vres, vdev, indexall.quantizer) - def quantize((i0, xs)): + def quantize(args): + (i0, xs) = args _, assign = coarse_quantizer_gpu.search(xs, 1) return i0, xs, assign.ravel() diff --git a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py index 0445c4a8be..0cf3b723a1 100644 --- a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py +++ b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py @@ -160,7 +160,7 @@ def matrix_slice_iterator(x, bs): for i0 in range(0, nb, bs)] return rate_limited_imap( - lambda (i0, i1): x[i0:i1].astype('float32').copy(), + lambda i01: x[i01[0]:i01[1]].astype('float32').copy(), block_ranges) @@ -203,6 +203,7 @@ xq = xq.astype('float32').copy() # a static C++ object that collects statistics about searches ivfpq_stats = faiss.cvar.indexIVFPQ_stats +ivf_stats = faiss.cvar.indexIVF_stats if parametersets == ['autotune'] or parametersets == ['autotuneMT']: @@ -243,10 +244,11 @@ else: ps.set_index_parameters(index, param) t0 = time.time() ivfpq_stats.reset() + ivf_stats.reset() D, I = index.search(xq, 100) t1 = time.time() for rank in 1, 10, 100: n_ok = (I[:, :rank] == gt[:, :1]).sum() print("%.4f" % (n_ok / float(nq)), end=' ') print("%8.3f " % ((t1 - t0) * 1000.0 / nq), end=' ') - print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivfpq_stats.ncode)) + print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivf_stats.ndis)) diff --git a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py index 7dbb79ec0c..f54c66bc2b 100644 --- a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py +++ b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py @@ -36,7 +36,8 @@ faiss.omp_set_num_threads(1) print("PQ 
baseline", end=' ') index.search_type = faiss.IndexPQ.ST_PQ -evaluate() +t, r = evaluate(index, xq, gt, 1) +print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1])) for ht in 64, 62, 58, 54, 50, 46, 42, 38, 34, 30: print("Polysemous", ht, end=' ') diff --git a/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py b/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py index aed1083d46..331a9923e2 100644 --- a/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py +++ b/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py @@ -38,7 +38,7 @@ for d in 3, 4, 12, 36, 64: distances = np.empty((xd, yd), dtype='float32') t0 = time.time() - for i in xrange(xd): + for i in range(xd): faiss.fvec_inner_products_ny(swig_ptr(distances[i]), swig_ptr(x[i]), swig_ptr(y), @@ -66,7 +66,7 @@ for d in 3, 4, 12, 36, 64: distances = np.empty((xd, yd), dtype='float32') t0 = time.time() - for i in xrange(xd): + for i in range(xd): faiss.fvec_L2sqr_ny(swig_ptr(distances[i]), swig_ptr(x[i]), swig_ptr(y), diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md index 643a99a1dd..c2c792992b 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md @@ -1,3 +1,4 @@ + # Distributed on-disk index for 1T-scale datasets This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py index c2583bc450..3df2a0180a 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +#!/usr/bin/env python3 + import os import faiss import numpy as np @@ -29,7 +31,7 @@ class CombinedIndex: indexes.append(index) il = faiss.extract_index_ivf(index).invlists else: - assert False + raise AssertionError ilv.push_back(il) print() diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py index 423f88127c..ae7a292d3d 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + #! 
/usr/bin/env python3 """ @@ -356,7 +357,7 @@ def main(): elif args.indata.endswith('.npy'): x = np.load(args.indata, mmap_mode='r') else: - assert False + raise AssertionError if args.i1 == -1: args.i1 = len(x) @@ -386,7 +387,8 @@ def main(): True ) else: - assert False + raise AssertionError + if args.server: print('starting server') diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py index 401f056056..9453c0ec27 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import os import faiss import numpy as np diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py index 3364919403..ca58425b25 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + import os import time import numpy as np diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py index 735c92b2a2..5c8f3ace94 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py @@ -49,7 +49,7 @@ if __name__ == '__main__': index0 = None - for fname, index in pool.imap(load_index, args.inputs): + for _, index in pool.imap(load_index, args.inputs): if index is None: continue index_ivf = faiss.extract_index_ivf(index) diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py index 401d0d5bcc..7b248ea0a1 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py @@ -2,6 +2,9 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python3 + """ Simplistic RPC implementation. Exposes all functions of a Server object. @@ -163,7 +166,7 @@ class Server: except EOFError: self.log("EOF during communication") traceback.print_exc(50,self.logf) - except: + except BaseException: # unexpected traceback.print_exc(50,sys.stderr) sys.exit(1) diff --git a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/search_server.py b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/search_server.py index 28c5efbdde..9239afd59d 100644 --- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/search_server.py +++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/search_server.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+ import os import time import rpc diff --git a/core/src/index/thirdparty/faiss/build.sh b/core/src/index/thirdparty/faiss/build.sh index 9dbdd9ea73..ea6b4c0c7d 100755 --- a/core/src/index/thirdparty/faiss/build.sh +++ b/core/src/index/thirdparty/faiss/build.sh @@ -1,2 +1,3 @@ -./configure CPUFLAGS='-mavx -mf16c -msse4 -mpopcnt' CXXFLAGS='-O0 -g -fPIC -m64 -Wno-sign-compare -Wall -Wextra' --prefix=$PWD --with-cuda-arch=-gencode=arch=compute_75,code=sm_75 --with-cuda=/usr/local/cuda -make install -j +#./configure CPUFLAGS='-mavx -mf16c -msse4 -mpopcnt' CXXFLAGS='-O0 -g -fPIC -m64 -Wno-sign-compare -Wall -Wextra' --prefix=$PWD --with-cuda-arch=-gencode=arch=compute_75,code=sm_75 --with-cuda=/usr/local/cuda +./configure --prefix=$PWD CFLAGS='-g -fPIC' CXXFLAGS='-O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp -g -fPIC -mf16c -O3' --without-python --with-cuda=/usr/local/cuda --with-cuda-arch='-gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75' +make install -j8 diff --git a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp index 008d6f8482..2f412d6aaa 100644 --- a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp +++ b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp @@ -17,16 +17,6 @@ using faiss::Index; using faiss::ParameterRange; using faiss::ParameterSpace; -/** Build and index with the sequence of processing steps described in - * the string. - */ -int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric) { - try { - *p_index = reinterpret_cast(faiss::index_factory( - d, description, static_cast(metric))); - } CATCH_AND_HANDLE -} - const char* faiss_ParameterRange_name(const FaissParameterRange* range) { return reinterpret_cast(range)->name.c_str(); } @@ -90,4 +80,4 @@ int faiss_ParameterSpace_add_range(FaissParameterSpace* space, const char* name, *p_range = reinterpret_cast(&range); } } CATCH_AND_HANDLE -} +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h index 908f355a4d..d870921c04 100644 --- a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h +++ b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h @@ -18,11 +18,6 @@ extern "C" { #endif -/** Build and index with the sequence of processing steps described in - * the string. 
- */
-int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric);
-
 /// possible values of a parameter, sorted from least to most expensive/accurate
 FAISS_DECLARE_CLASS(ParameterRange)
@@ -66,4 +61,4 @@ int faiss_ParameterSpace_add_range(FaissParameterSpace*, const char*, FaissParam
 }
 #endif

-#endif
+#endif
\ No newline at end of file
diff --git a/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp b/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp
index 1687ed1e45..e4541458c0 100644
--- a/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp
@@ -19,6 +19,7 @@ extern "C" {
 using faiss::Clustering;
 using faiss::ClusteringParameters;
 using faiss::Index;
+using faiss::ClusteringIterationStats;

 DEFINE_GETTER(Clustering, int, niter)
 DEFINE_GETTER(Clustering, int, nredo)
@@ -38,6 +39,12 @@ DEFINE_GETTER(Clustering, size_t, d)
 /// getter for k
 DEFINE_GETTER(Clustering, size_t, k)

+DEFINE_GETTER(ClusteringIterationStats, float, obj)
+DEFINE_GETTER(ClusteringIterationStats, double, time)
+DEFINE_GETTER(ClusteringIterationStats, double, time_search)
+DEFINE_GETTER(ClusteringIterationStats, double, imbalance_factor)
+DEFINE_GETTER(ClusteringIterationStats, int, nsplit)
+
 void faiss_ClusteringParameters_init(FaissClusteringParameters* params) {
     ClusteringParameters d;
     params->frozen_centroids = d.frozen_centroids;
@@ -78,13 +85,12 @@ void faiss_Clustering_centroids(
     }
 }

-/// getter for objective values (sum of distances reported by index)
-/// over iterations
-void faiss_Clustering_obj(
-    FaissClustering* clustering, float** obj, size_t* size) {
-    std::vector<float>& v = reinterpret_cast<Clustering*>(clustering)->obj;
-    if (obj) {
-        *obj = v.data();
+/// getter for iteration stats
+void faiss_Clustering_iteration_stats(
+    FaissClustering* clustering, FaissClusteringIterationStats** iteration_stats, size_t* size) {
+    std::vector<ClusteringIterationStats>& v = reinterpret_cast<Clustering*>(clustering)->iteration_stats;
+    if (iteration_stats) {
+        *iteration_stats = reinterpret_cast<FaissClusteringIterationStats*>(v.data());
     }
     if (size) {
         *size = v.size();
diff --git a/core/src/index/thirdparty/faiss/c_api/Clustering_c.h b/core/src/index/thirdparty/faiss/c_api/Clustering_c.h
index 75f25ba4f5..af82152e60 100644
--- a/core/src/index/thirdparty/faiss/c_api/Clustering_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/Clustering_c.h
@@ -47,7 +47,7 @@ void faiss_ClusteringParameters_init(FaissClusteringParameters* params);
  * points to the centroids. Therefore, at each iteration the centroids
  * are added to the index.
  *
- * On output, the centoids table is set to the latest version
+ * On output, the centroids table is set to the latest version
  * of the centroids and they are also added to the index. If the
  * centroids table is not empty on input, it is also used for
  * initialization.
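[Editor's note, not part of the patch: these hunks replace the old `faiss_Clustering_obj` getter with per-iteration statistics, mirroring the C++ side where 1.6.3 replaces `Clustering::obj` with a `std::vector<ClusteringIterationStats>`. A minimal sketch of the C++ API that the new C getters expose; the training data is synthetic and purely illustrative.]

    #include <faiss/Clustering.h>
    #include <faiss/IndexFlat.h>
    #include <cstdio>
    #include <vector>

    int main() {
        int d = 32;
        size_t n = 10000, k = 100;
        std::vector<float> x(n * d);
        for (size_t i = 0; i < x.size(); i++) {
            x[i] = (i % 1000) * 0.001f;           // dummy training vectors
        }
        faiss::Clustering clus(d, k);
        faiss::IndexFlatL2 assigner(d);           // index used to assign points
        clus.train(n, x.data(), assigner);
        // 1.6.3: the objective (and timings) now live in iteration_stats,
        // one entry per k-means iteration; the last entry is the final state.
        const faiss::ClusteringIterationStats& st = clus.iteration_stats.back();
        printf("obj=%g imbalance=%g nsplit=%d\n",
               (double)st.obj, st.imbalance_factor, st.nsplit);
        return 0;
    }

The C getters declared below simply expose the same `obj`, `time`, `time_search`, `imbalance_factor`, and `nsplit` fields to C callers.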
@@ -75,14 +75,20 @@ FAISS_DECLARE_GETTER(Clustering, size_t, d)
 /// getter for k
 FAISS_DECLARE_GETTER(Clustering, size_t, k)

+FAISS_DECLARE_CLASS(ClusteringIterationStats)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, float, obj)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time_search)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, double, imbalance_factor)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, int, nsplit)
+
 /// getter for centroids (size = k * d)
 void faiss_Clustering_centroids(
     FaissClustering* clustering, float** centroids, size_t* size);

-/// getter for objective values (sum of distances reported by index)
-/// over iterations
-void faiss_Clustering_obj(
-    FaissClustering* clustering, float** obj, size_t* size);
+/// getter for iteration stats
+void faiss_Clustering_iteration_stats(
+    FaissClustering* clustering, FaissClusteringIterationStats** iteration_stats, size_t* size);

 /// the only mandatory parameters are k and d
 int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k);
diff --git a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp
index a4d4acd4c1..4f7983723b 100644
--- a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp
@@ -87,6 +87,13 @@ void faiss_IndexIVF_print_stats (const FaissIndexIVF* index) {
     reinterpret_cast<const IndexIVF*>(index)->invlists->print_stats();
 }

+/// get inverted lists ids
+void faiss_IndexIVF_invlists_get_ids (const FaissIndexIVF* index, size_t list_no, idx_t* invlist) {
+    const idx_t* list = reinterpret_cast<const IndexIVF*>(index)->invlists->get_ids(list_no);
+    size_t list_size = reinterpret_cast<const IndexIVF*>(index)->get_list_size(list_no);
+    memcpy(invlist, list, list_size*sizeof(idx_t));
+}
+
 void faiss_IndexIVFStats_reset(FaissIndexIVFStats* stats) {
     reinterpret_cast<IndexIVFStats*>(stats)->reset();
 }
diff --git a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h
index b2176aac58..5aa907c8c2 100644
--- a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h
@@ -114,6 +114,13 @@ double faiss_IndexIVF_imbalance_factor (const FaissIndexIVF* index);
 /// display some stats about the inverted lists of the index
 void faiss_IndexIVF_print_stats (const FaissIndexIVF* index);

+/// Get the IDs in an inverted list. IDs are written to `invlist`, which must be large enough
+/// to accommodate the full list.
+///
+/// @param list_no the list ID
+/// @param invlist output pointer to a slice of memory, at least as long as the list's size
+/// @see faiss_IndexIVF_get_list_size(size_t)
+void faiss_IndexIVF_invlists_get_ids (const FaissIndexIVF* index, size_t list_no, idx_t* invlist);

 typedef struct FaissIndexIVFStats {
     size_t nq;       // nb of queries run
diff --git a/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.cpp b/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.cpp
new file mode 100644
index 0000000000..7d99602edd
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.cpp
@@ -0,0 +1,21 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved.
+// -*- c++ -*- + +#include "IndexPreTransform_c.h" +#include "IndexPreTransform.h" +#include "macros_impl.h" + +using faiss::Index; +using faiss::IndexPreTransform; + +DEFINE_DESTRUCTOR(IndexPreTransform) +DEFINE_INDEX_DOWNCAST(IndexPreTransform) + +DEFINE_GETTER_PERMISSIVE(IndexPreTransform, FaissIndex*, index) diff --git a/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.h b/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.h new file mode 100644 index 0000000000..c6d34b23c7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.h @@ -0,0 +1,32 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_INDEX_PRETRANSFORM_C_H +#define FAISS_INDEX_PRETRANSFORM_C_H + +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +FAISS_DECLARE_CLASS(IndexPreTransform) +FAISS_DECLARE_DESTRUCTOR(IndexPreTransform) +FAISS_DECLARE_INDEX_DOWNCAST(IndexPreTransform) + +FAISS_DECLARE_GETTER(IndexPreTransform, FaissIndex*, index) + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/core/src/index/thirdparty/faiss/c_api/Index_c.cpp b/core/src/index/thirdparty/faiss/c_api/Index_c.cpp index 21d175a15c..38263f4333 100644 --- a/core/src/index/thirdparty/faiss/c_api/Index_c.cpp +++ b/core/src/index/thirdparty/faiss/c_api/Index_c.cpp @@ -97,10 +97,9 @@ int faiss_Index_compute_residual(const FaissIndex* index, const float* x, float* } CATCH_AND_HANDLE } -int faiss_Index_display(const FaissIndex* index) { +int faiss_Index_compute_residual_n(const FaissIndex* index, idx_t n, const float* x, float* residuals, const idx_t* keys) { try { - reinterpret_cast(index)->display(); + reinterpret_cast(index)->compute_residual_n(n, x, residuals, keys); } CATCH_AND_HANDLE } - -} +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/Index_c.h b/core/src/index/thirdparty/faiss/c_api/Index_c.h index 34215036d0..4b6a30c7cd 100644 --- a/core/src/index/thirdparty/faiss/c_api/Index_c.h +++ b/core/src/index/thirdparty/faiss/c_api/Index_c.h @@ -26,8 +26,16 @@ typedef struct FaissIDSelector_H FaissIDSelector; /// Some algorithms support both an inner product version and a L2 search version. typedef enum FaissMetricType { - METRIC_INNER_PRODUCT = 0, - METRIC_L2 = 1, + METRIC_INNER_PRODUCT = 0, ///< maximum inner product search + METRIC_L2 = 1, ///< squared L2 search + METRIC_L1, ///< L1 (aka cityblock) + METRIC_Linf, ///< infinity distance + METRIC_Lp, ///< L_p distance, p is given by metric_arg + + /// some additional metrics defined in scipy.spatial.distance + METRIC_Canberra = 20, + METRIC_BrayCurtis, + METRIC_JensenShannon, } FaissMetricType; /// Opaque type for referencing to an index object @@ -152,13 +160,24 @@ int faiss_Index_reconstruct_n (const FaissIndex* index, idx_t i0, idx_t ni, floa */ int faiss_Index_compute_residual(const FaissIndex* index, const float* x, float* residual, idx_t key); -/** Display the actual class name and some more info +/** Computes a residual vector after indexing encoding. + * + * The residual vector is the difference between a vector and the + * reconstruction that can be decoded from its representation in + * the index. The residual can be used for multiple-stage indexing + * methods, like IndexIVF's methods. 
+ * * @param index opaque pointer to index object + * @param n number of vectors + * @param x input vector, size (n x d) + * @param residuals output residual vectors, size (n x d) + * @param keys encoded index, as returned by search and assign */ -int faiss_Index_display(const FaissIndex* index); +int faiss_Index_compute_residual_n(const FaissIndex* index, idx_t n, const float* x, float* residuals, const idx_t* keys); + #ifdef __cplusplus } #endif -#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/Makefile b/core/src/index/thirdparty/faiss/c_api/Makefile index 04c84e68d2..c47c465f00 100644 --- a/core/src/index/thirdparty/faiss/c_api/Makefile +++ b/core/src/index/thirdparty/faiss/c_api/Makefile @@ -13,8 +13,9 @@ DEBUGFLAG=-DNDEBUG # no debugging LIBNAME=libfaiss CLIBNAME=libfaiss_c LIBCOBJ=error_impl.o Index_c.o IndexFlat_c.o Clustering_c.o AutoTune_c.o \ - AuxIndexStructures_c.o IndexIVF_c.o IndexIVFFlat_c.o IndexLSH_c.o \ - index_io_c.o MetaIndexes_c.o IndexShards_c.o + impl/AuxIndexStructures_c.o IndexIVF_c.o IndexIVFFlat_c.o IndexLSH_c.o \ + index_io_c.o MetaIndexes_c.o IndexShards_c.o index_factory_c.o \ + clone_index_c.o IndexPreTransform_c.o CFLAGS=-fPIC -m64 -Wno-sign-compare -g -O3 -Wall -Wextra # Build static and shared object files by default @@ -42,38 +43,47 @@ clean: # Dependencies -error_impl.o: CXXFLAGS += -I.. $(DEBUGFLAG) +error_impl.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) error_impl.o: error_impl.cpp error_c.h error_impl.h macros_impl.h -index_io_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +index_io_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) index_io_c.o: index_io_c.cpp error_impl.cpp ../index_io.h macros_impl.h -Index_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +index_factory_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) +index_factory_c.o: index_factory_c.cpp error_impl.cpp ../index_io.h macros_impl.h + +clone_index_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) +clone_index_c.o: index_factory_c.cpp error_impl.cpp ../index_io.h macros_impl.h + +Index_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) Index_c.o: Index_c.cpp Index_c.h ../Index.h macros_impl.h -IndexFlat_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexFlat_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) IndexFlat_c.o: IndexFlat_c.cpp IndexFlat_c.h ../IndexFlat.h macros_impl.h -IndexIVF_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexIVF_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) IndexIVF_c.o: IndexIVF_c.cpp IndexIVF_c.h ../IndexIVF.h macros_impl.h -IndexIVFFlat_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexIVFFlat_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) IndexIVFFlat_c.o: IndexIVFFlat_c.cpp IndexIVFFlat_c.h ../IndexIVFFlat.h macros_impl.h -IndexLSH_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexLSH_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) IndexLSH_c.o: IndexLSH_c.cpp IndexLSH_c.h ../IndexLSH.h macros_impl.h -IndexShards_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +IndexShards_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) IndexShards_c.o: IndexShards_c.cpp IndexShards_c.h ../Index.h ../IndexShards.h macros_impl.h -Clustering_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +Clustering_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) Clustering_c.o: Clustering_c.cpp Clustering_c.h ../Clustering.h macros_impl.h -AutoTune_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +AutoTune_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) AutoTune_c.o: AutoTune_c.cpp AutoTune_c.h ../AutoTune.h macros_impl.h -AuxIndexStructures_c.o: CXXFLAGS += -I.. 
$(DEBUGFLAG) -AuxIndexStructures_c.o: AuxIndexStructures_c.cpp AuxIndexStructures_c.h ../AuxIndexStructures.h macros_impl.h +impl/AuxIndexStructures_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) +impl/AuxIndexStructures_c.o: impl/AuxIndexStructures_c.cpp impl/AuxIndexStructures_c.h ../impl/AuxIndexStructures.h macros_impl.h -MetaIndexes_c.o: CXXFLAGS += -I.. $(DEBUGFLAG) +MetaIndexes_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) MetaIndexes_c.o: MetaIndexes_c.cpp MetaIndexes_c.h ../MetaIndexes.h macros_impl.h + +IndexPreTransform_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG) +IndexPreTransform_c.o: IndexPreTransform_c.cpp IndexPreTransform_c.h ../IndexPreTransform.h macros_impl.h diff --git a/core/src/index/thirdparty/faiss/c_api/clone_index_c.cpp b/core/src/index/thirdparty/faiss/c_api/clone_index_c.cpp new file mode 100644 index 0000000000..999b139a7c --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/clone_index_c.cpp @@ -0,0 +1,23 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c++ -*- +// I/O code for indexes + +#include "clone_index_c.h" +#include "clone_index.h" +#include "macros_impl.h" + +using faiss::Index; + +int faiss_clone_index (const FaissIndex *idx, FaissIndex **p_out) { + try { + auto out = faiss::clone_index(reinterpret_cast(idx)); + *p_out = reinterpret_cast(out); + } CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/clone_index_c.h b/core/src/index/thirdparty/faiss/c_api/clone_index_c.h new file mode 100644 index 0000000000..3cf7e1a658 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/clone_index_c.h @@ -0,0 +1,32 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved +// -*- c++ -*- +// I/O code for indexes + + +#ifndef FAISS_CLONE_INDEX_C_H +#define FAISS_CLONE_INDEX_C_H + +#include +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* cloning functions */ + +/** Clone an index. 
This is equivalent to `faiss::clone_index` */ +int faiss_clone_index (const FaissIndex *, FaissIndex ** p_out); + +#ifdef __cplusplus +} +#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/example_c.c b/core/src/index/thirdparty/faiss/c_api/example_c.c index 597c2920ee..2e9a78a1ad 100644 --- a/core/src/index/thirdparty/faiss/c_api/example_c.c +++ b/core/src/index/thirdparty/faiss/c_api/example_c.c @@ -17,6 +17,7 @@ #include "Index_c.h" #include "IndexFlat_c.h" #include "AutoTune_c.h" +#include "clone_index_c.h" #define FAISS_TRY(C) \ { \ diff --git a/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.cpp b/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.cpp similarity index 87% rename from core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.cpp rename to core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.cpp index d0a0d380ee..4b3fec8fb7 100644 --- a/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.cpp +++ b/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.cpp @@ -9,8 +9,8 @@ // -*- c++ -*- #include "AuxIndexStructures_c.h" -#include "AuxIndexStructures.h" -#include "macros_impl.h" +#include "../../impl/AuxIndexStructures.h" +#include "../macros_impl.h" #include using faiss::BufferList; @@ -20,6 +20,7 @@ using faiss::IDSelectorRange; using faiss::RangeSearchResult; using faiss::RangeSearchPartialResult; using faiss::RangeQueryResult; +using faiss::DistanceComputer; DEFINE_GETTER(RangeSearchResult, size_t, nq) @@ -191,3 +192,29 @@ int faiss_RangeSearchPartialResult_new_result( return 0; } CATCH_AND_HANDLE } + +DEFINE_DESTRUCTOR(DistanceComputer) + +int faiss_DistanceComputer_set_query(FaissDistanceComputer *dc, const float *x) { + try { + reinterpret_cast(dc)->set_query(x); + return 0; + } + CATCH_AND_HANDLE +} + +int faiss_DistanceComputer_vector_to_query_dis(FaissDistanceComputer *dc, idx_t i, float *qd) { + try { + *qd = reinterpret_cast(dc)->operator()(i); + return 0; + } + CATCH_AND_HANDLE +} + +int faiss_DistanceComputer_symmetric_dis(FaissDistanceComputer *dc, idx_t i, idx_t j, float *vd) { + try { + *vd = reinterpret_cast(dc)->symmetric_dis(i, j); + return 0; + } + CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.h b/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.h similarity index 86% rename from core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.h rename to core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.h index ebcbc1cc34..1d66b0aac0 100644 --- a/core/src/index/thirdparty/faiss/c_api/AuxIndexStructures_c.h +++ b/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.h @@ -11,8 +11,8 @@ #ifndef FAISS_AUX_INDEX_STRUCTURES_C_H #define FAISS_AUX_INDEX_STRUCTURES_C_H -#include "Index_c.h" -#include "faiss_c.h" +#include "../Index_c.h" +#include "../faiss_c.h" #ifdef __cplusplus extern "C" { @@ -126,6 +126,22 @@ int faiss_RangeSearchPartialResult_set_lims( int faiss_RangeSearchPartialResult_new_result( FaissRangeSearchPartialResult* res, idx_t qno, FaissRangeQueryResult** qr); + +FAISS_DECLARE_CLASS(DistanceComputer) +/// called before computing distances +int faiss_DistanceComputer_set_query(FaissDistanceComputer *dc, const float *x); + +/** + * Compute distance of vector i to current query. 
+ * This function corresponds to the function call operator: DistanceComputer::operator() + */ +int faiss_DistanceComputer_vector_to_query_dis( FaissDistanceComputer *dc, idx_t i, float *qd); +/// compute distance between two stored vectors +int faiss_DistanceComputer_symmetric_dis(FaissDistanceComputer *dc, idx_t i, idx_t j, float *vd); + +FAISS_DECLARE_DESTRUCTOR(DistanceComputer) + + #ifdef __cplusplus } #endif diff --git a/core/src/index/thirdparty/faiss/c_api/index_factory_c.cpp b/core/src/index/thirdparty/faiss/c_api/index_factory_c.cpp new file mode 100644 index 0000000000..f7f00c4132 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/index_factory_c.cpp @@ -0,0 +1,26 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c++ -*- + +#include +#include "index_factory.h" +#include "index_factory_c.h" +#include "macros_impl.h" + +using faiss::Index; + +/** Build and index with the sequence of processing steps described in + * the string. + */ +int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric) { + try { + *p_index = reinterpret_cast(faiss::index_factory( + d, description, static_cast(metric))); + } CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/index_factory_c.h b/core/src/index/thirdparty/faiss/c_api/index_factory_c.h new file mode 100644 index 0000000000..4262fe09a2 --- /dev/null +++ b/core/src/index/thirdparty/faiss/c_api/index_factory_c.h @@ -0,0 +1,30 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Copyright 2004-present Facebook. All Rights Reserved. +// -*- c -*- + +#ifndef FAISS_INDEX_FACTORY_C_H +#define FAISS_INDEX_FACTORY_C_H + +#include "faiss_c.h" +#include "Index_c.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Build and index with the sequence of processing steps described in + * the string. 
+ */ +int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp b/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp index 479045e1fb..8c0ca4420e 100644 --- a/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp +++ b/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp @@ -39,11 +39,4 @@ int faiss_read_index_fname(const char *fname, int io_flags, FaissIndex **p_out) auto out = faiss::read_index(fname, io_flags); *p_out = reinterpret_cast(out); } CATCH_AND_HANDLE -} - -int faiss_clone_index (const FaissIndex *idx, FaissIndex **p_out) { - try { - auto out = faiss::clone_index(reinterpret_cast(idx)); - *p_out = reinterpret_cast(out); - } CATCH_AND_HANDLE -} +} \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/c_api/index_io_c.h b/core/src/index/thirdparty/faiss/c_api/index_io_c.h index f20ce2e644..f703e491ca 100644 --- a/core/src/index/thirdparty/faiss/c_api/index_io_c.h +++ b/core/src/index/thirdparty/faiss/c_api/index_io_c.h @@ -44,12 +44,7 @@ int faiss_read_index(FILE *f, int io_flags, FaissIndex **p_out); */ int faiss_read_index_fname(const char *fname, int io_flags, FaissIndex **p_out); -/* cloning functions */ - -/** Clone an index. This is equivalent to `faiss::clone_index` */ -int faiss_clone_index (const FaissIndex *, FaissIndex ** p_out); - #ifdef __cplusplus } #endif -#endif +#endif \ No newline at end of file diff --git a/core/src/index/thirdparty/faiss/clone_index.cpp b/core/src/index/thirdparty/faiss/clone_index.cpp index 8258d3fa6f..ca9809d284 100644 --- a/core/src/index/thirdparty/faiss/clone_index.cpp +++ b/core/src/index/thirdparty/faiss/clone_index.cpp @@ -116,6 +116,12 @@ Index *Cloner::clone_Index (const Index *index) dynamic_cast (index)) { IndexPreTransform *res = new IndexPreTransform (); res->d = ipt->d; + res->ntotal = ipt->ntotal; + res->is_trained = ipt->is_trained; + res->metric_type = ipt->metric_type; + res->metric_arg = ipt->metric_arg; + + res->index = clone_Index (ipt->index); for (int i = 0; i < ipt->chain.size(); i++) res->chain.push_back (clone_VectorTransform (ipt->chain[i])); diff --git a/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py b/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py index 3eb6421019..eb7c709a1b 100644 --- a/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py +++ b/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py @@ -25,14 +25,9 @@ import faiss ################################################################# def ivecs_read(fname): - f = open(fname) - d, = np.fromfile(f, count = 1, dtype = 'int32') - sz = os.stat(fname).st_size - assert sz % (4 * (d + 1)) == 0 - n = sz / (4 * (d + 1)) - f.seek(0) - a = np.fromfile(f, count = n * (d +1), dtype = 'int32').reshape(n, d + 1) - return a[:, 1:].copy() + a = np.fromfile(fname, dtype="int32") + d = a[0] + return a.reshape(-1, d + 1)[:, 1:].copy() def fvecs_read(fname): return ivecs_read(fname).view('float32') @@ -41,8 +36,8 @@ def fvecs_read(fname): def plot_OperatingPoints(ops, nq, **kwargs): ops = ops.optimal_pts n = ops.size() * 2 - 1 - pyplot.plot([ops.at( i / 2).perf for i in range(n)], - [ops.at((i + 1) / 2).t / nq * 1000 for i in range(n)], + pyplot.plot([ops.at( i // 2).perf for i in range(n)], + [ops.at((i + 1) // 2).t / nq * 1000 for i in range(n)], **kwargs) diff --git a/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp 
b/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp index 8b6fe0f4f4..dd91c59080 100644 --- a/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp +++ b/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp @@ -20,7 +20,7 @@ #include #include - +#include /** * To run this demo, please download the ANN_SIFT1M dataset from diff --git a/core/src/index/thirdparty/faiss/demos/demo_weighted_kmeans.cpp b/core/src/index/thirdparty/faiss/demos/demo_weighted_kmeans.cpp new file mode 100644 index 0000000000..eee188e4b3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/demos/demo_weighted_kmeans.cpp @@ -0,0 +1,185 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include +#include + + +namespace { + + +enum WeightedKMeansType { + WKMT_FlatL2, + WKMT_FlatIP, + WKMT_FlatIP_spherical, + WKMT_HNSW, +}; + + +float weighted_kmeans_clustering (size_t d, size_t n, size_t k, + const float *input, + const float *weights, + float *centroids, + WeightedKMeansType index_num) +{ + using namespace faiss; + Clustering clus (d, k); + clus.verbose = true; + + std::unique_ptr index; + + switch (index_num) { + case WKMT_FlatL2: + index.reset(new IndexFlatL2 (d)); + break; + case WKMT_FlatIP: + index.reset(new IndexFlatIP (d)); + break; + case WKMT_FlatIP_spherical: + index.reset(new IndexFlatIP (d)); + clus.spherical = true; + break; + case WKMT_HNSW: + IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32); + ihnsw->hnsw.efSearch = 128; + index.reset(ihnsw); + break; + } + + clus.train(n, input, *index.get(), weights); + // on output the index contains the centroids. + memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k); + return clus.iteration_stats.back().obj; +} + + +int d = 32; +float sigma = 0.1; + +#define BIGTEST + +#ifdef BIGTEST +// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs +int nc = 200000; +int n_big = 4; +int n_small = 2; +#else +int nc = 5; +int n_big = 100; +int n_small = 10; +#endif + +int n; // number of training points + +void generate_trainset (std::vector & ccent, + std::vector & x, + std::vector & weights) +{ + // same sampling as test_build_blocks.py test_weighted + + ccent.resize (d * 2 * nc); + faiss::float_randn (ccent.data(), d * 2 * nc, 123); + faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data()); + n = nc * n_big + nc * n_small; + x.resize(d * n); + weights.resize(n); + faiss::float_randn (x.data(), x.size(), 1234); + + float *xi = x.data(); + float *w = weights.data(); + for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids + int np = ci < nc ? n_big : n_small; // nb of points around this centroid + for (int i = 0; i < np; i++) { + for (int j = 0; j < d; j++) { + xi[j] = xi[j] * sigma + ccent[ci * d + j]; + } + *w++ = ci < nc ? 
0.1 : 10; + xi += d; + } + } +} + +} + + +int main(int argc, char **argv) { + std::vector<float> ccent; + std::vector<float> x; + std::vector<float> weights; + + printf("generate training set\n"); + generate_trainset(ccent, x, weights); + + std::vector<float> centroids; + centroids.resize(nc * d); + + int the_index_num = -1; + int the_with_weights = -1; + + if (argc == 3) { + the_index_num = atoi(argv[1]); + the_with_weights = atoi(argv[2]); + } + + + for (int index_num = WKMT_FlatL2; + index_num <= WKMT_HNSW; + index_num++) { + + if (the_index_num >= 0 && index_num != the_index_num) { + continue; + } + + for (int with_weights = 0; with_weights <= 1; with_weights++) { + if (the_with_weights >= 0 && with_weights != the_with_weights) { + continue; + } + + printf("=================== index_num=%d Run %s weights\n", + index_num, with_weights ? "with" : "without"); + + weighted_kmeans_clustering ( + d, n, nc, x.data(), + with_weights ? weights.data() : nullptr, + centroids.data(), (WeightedKMeansType)index_num + ); + + { // compute distance of points to centroids + faiss::IndexFlatL2 cent_index(d); + cent_index.add(nc, centroids.data()); + std::vector<float> dis (n); + std::vector<faiss::Index::idx_t> idx (n); + + cent_index.search (nc * 2, ccent.data(), 1, + dis.data(), idx.data()); + + float dis1 = 0, dis2 = 0; + for (int i = 0; i < nc ; i++) { + dis1 += dis[i]; + } + printf("average distance of points from big clusters: %g\n", + dis1 / nc); + + for (int i = 0; i < nc ; i++) { + dis2 += dis[i + nc]; + } + + printf("average distance of points from small clusters: %g\n", + dis2 / nc); + + } + + } + } + return 0; +} diff --git a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux index 409e99ccdb..12da227039 100644 --- a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux +++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux @@ -9,7 +9,7 @@ CXX = g++ -std=c++11 CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare -CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt +CPUFLAGS = -mavx -msse4 -mpopcnt LDFLAGS = -fPIC -fopenmp # common linux flags diff --git a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew index 9152f6a1ac..8fa6fe7616 100644 --- a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew +++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew @@ -9,7 +9,7 @@ # brew install llvm CXX = /usr/local/opt/llvm/bin/clang++ -std=c++11 CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare -I/usr/local/opt/llvm/include -CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt +CPUFLAGS = -msse4 -mpopcnt LLVM_VERSION_PATH=$(shell ls -rt /usr/local/Cellar/llvm/ | tail -n1) LDFLAGS = -fPIC -fopenmp -L/usr/local/opt/llvm/lib -L/usr/local/Cellar/llvm/${LLVM_VERSION_PATH}/lib diff --git a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port index 1ed397bfcc..6b2c292220 100644 --- a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port +++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port @@ -12,7 +12,7 @@ # port install g++-mp-6 CXX = /opt/local/bin/g++-mp-6 -std=c++11 CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare -CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt +CPUFLAGS = -msse4 -mpopcnt LDFLAGS = -g -fPIC -fopenmp # common linux flags diff --git
a/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp index 1610aaf1cd..192c02db42 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp +++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp @@ -279,7 +279,6 @@ faiss::Index * index_cpu_to_gpu( return cl.clone_Index(index_composition); } - /********************************************************** * Cloning to multiple GPUs **********************************************************/ @@ -372,6 +371,7 @@ Index * ToGpuClonerMultiple::clone_Index_to_shards (const Index *index) index_ivfflat->quantizer, index->d, index_ivfflat->nlist, index_ivfflat->metric_type); idx2.nprobe = index_ivfflat->nprobe; + idx2.is_trained = index->is_trained; copy_ivf_shard (index_ivfflat, &idx2, n, i); shards[i] = sub_cloners[i].clone_Index(&idx2); } else if (index_ivfsq) { @@ -380,7 +380,10 @@ Index * ToGpuClonerMultiple::clone_Index_to_shards (const Index *index) index_ivfsq->sq.qtype, index_ivfsq->metric_type, index_ivfsq->by_residual); + idx2.nprobe = index_ivfsq->nprobe; + idx2.is_trained = index->is_trained; + idx2.sq = index_ivfsq->sq; copy_ivf_shard (index_ivfsq, &idx2, n, i); shards[i] = sub_cloners[i].clone_Index(&idx2); } else if (index_flat) { diff --git a/core/src/index/thirdparty/faiss/gpu/GpuCloner.h b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h index 5c687cee20..f2c5388d93 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuCloner.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h @@ -73,9 +73,9 @@ faiss::Index * index_cpu_to_gpu( const GpuClonerOptions *options = nullptr); faiss::Index * index_cpu_to_gpu( - GpuResources* resources, int device, - IndexComposition* index_composition, - const GpuClonerOptions *options = nullptr); + GpuResources* resources, int device, + IndexComposition* index_composition, + const GpuClonerOptions *options = nullptr); faiss::Index * index_cpu_to_gpu_multiple( std::vector & resources, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp index a6abee6f3a..4e0b40bd84 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp +++ b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp @@ -13,7 +13,7 @@ GpuClonerOptions::GpuClonerOptions() : indicesOptions(INDICES_64_BIT), useFloat16CoarseQuantizer(false), useFloat16(false), - usePrecomputed(true), + usePrecomputed(false), reserveVecs(0), storeTransposed(false), storeInCpu(false), diff --git a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu index 9ae5fdad5d..f5ce8aa24e 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu @@ -17,88 +17,75 @@ namespace faiss { namespace gpu { -void bruteForceKnn(GpuResources* resources, - faiss::MetricType metric, - // A region of memory size numVectors x dims, with dims - // innermost - const float* vectors, - bool vectorsRowMajor, - int numVectors, - // A region of memory size numQueries x dims, with dims - // innermost - const float* queries, - bool queriesRowMajor, - int numQueries, - int dims, - int k, - // A region of memory size numQueries x k, with k - // innermost - float* outDistances, - // A region of memory size numQueries x k, with k - // innermost - faiss::Index::idx_t* outIndices) { +template +void bfKnnConvert(GpuResources* resources, const GpuDistanceParams& args) { auto device = getCurrentDevice(); auto stream = 
resources->getDefaultStreamCurrentDevice(); auto& mem = resources->getMemoryManagerCurrentDevice(); - auto tVectors = toDevice(resources, - device, - const_cast(vectors), - stream, - {vectorsRowMajor ? numVectors : dims, - vectorsRowMajor ? dims : numVectors}); - auto tQueries = toDevice(resources, - device, - const_cast(queries), - stream, - {queriesRowMajor ? numQueries : dims, - queriesRowMajor ? dims : numQueries}); + auto tVectors = + toDevice(resources, + device, + const_cast(reinterpret_cast(args.vectors)), + stream, + {args.vectorsRowMajor ? args.numVectors : args.dims, + args.vectorsRowMajor ? args.dims : args.numVectors}); + auto tQueries = + toDevice(resources, + device, + const_cast(reinterpret_cast(args.queries)), + stream, + {args.queriesRowMajor ? args.numQueries : args.dims, + args.queriesRowMajor ? args.dims : args.numQueries}); - auto tOutDistances = toDevice(resources, - device, - outDistances, - stream, - {numQueries, k}); + DeviceTensor tVectorNorms; + if (args.vectorNorms) { + tVectorNorms = toDevice(resources, + device, + const_cast(args.vectorNorms), + stream, + {args.numVectors}); + } - // FlatIndex only supports an interface returning int indices, allocate - // temporary memory for it - DeviceTensor tOutIntIndices(mem, {numQueries, k}, stream); + auto tOutDistances = + toDevice(resources, + device, + args.outDistances, + stream, + {args.numQueries, args.k}); + + // The brute-force API only supports an interface for integer indices + DeviceTensor + tOutIntIndices(mem, {args.numQueries, args.k}, stream); // Empty bitset auto bitsetDevice = toDevice(resources, device, nullptr, stream, {0}); - // Do the work - if (metric == faiss::MetricType::METRIC_L2) { - runL2Distance(resources, - tVectors, - vectorsRowMajor, - nullptr, // compute norms in temp memory - tQueries, - queriesRowMajor, - bitsetDevice, - k, - tOutDistances, - tOutIntIndices); - } else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) { - runIPDistance(resources, - tVectors, - vectorsRowMajor, - tQueries, - queriesRowMajor, - bitsetDevice, - k, - tOutDistances, - tOutIntIndices); - } else { - FAISS_THROW_MSG("metric should be METRIC_L2 or METRIC_INNER_PRODUCT"); - } + // Since we've guaranteed that all arguments are on device, call the + // implementation + bfKnnOnDevice(resources, + device, + stream, + tVectors, + args.vectorsRowMajor, + args.vectorNorms ? 
&tVectorNorms : nullptr, + tQueries, + args.queriesRowMajor, + bitsetDevice, + args.k, + args.metric, + args.metricArg, + tOutDistances, + tOutIntIndices, + args.ignoreOutDistances); // Convert and copy int indices out - auto tOutIndices = toDevice<faiss::Index::idx_t, 2>(resources, - device, - outIndices, - stream, - {numQueries, k}); + auto tOutIndices = + toDevice<faiss::Index::idx_t, 2>(resources, + device, + args.outIndices, + stream, + {args.numQueries, args.k}); // Convert int to idx_t convertTensor(stream, @@ -106,8 +93,65 @@ void bruteForceKnn(GpuResources* resources, tOutIndices); // Copy back if necessary - fromDevice(tOutDistances, outDistances, stream); - fromDevice(tOutIndices, outIndices, stream); + fromDevice(tOutDistances, args.outDistances, stream); + fromDevice(tOutIndices, args.outIndices, stream); +} + +void +bfKnn(GpuResources* resources, const GpuDistanceParams& args) { + // For now, both vectors and queries must be of the same data type + FAISS_THROW_IF_NOT_MSG( + args.vectorType == args.queryType, + "limitation: both vectorType and queryType must currently " + "be the same (F32 or F16)"); + + if (args.vectorType == DistanceDataType::F32) { + bfKnnConvert<float>(resources, args); + } else if (args.vectorType == DistanceDataType::F16) { + bfKnnConvert<half>(resources, args); + } else { + FAISS_THROW_MSG("unknown vectorType"); + } +} + +// legacy version +void +bruteForceKnn(GpuResources* resources, + faiss::MetricType metric, + // A region of memory size numVectors x dims, with dims + // innermost + const float* vectors, + bool vectorsRowMajor, + int numVectors, + // A region of memory size numQueries x dims, with dims + // innermost + const float* queries, + bool queriesRowMajor, + int numQueries, + int dims, + int k, + // A region of memory size numQueries x k, with k + // innermost + float* outDistances, + // A region of memory size numQueries x k, with k + // innermost + faiss::Index::idx_t* outIndices) { + std::cerr << "bruteForceKnn is deprecated; call bfKnn instead" << std::endl; + + GpuDistanceParams args; + args.metric = metric; + args.k = k; + args.dims = dims; + args.vectors = vectors; + args.vectorsRowMajor = vectorsRowMajor; + args.numVectors = numVectors; + args.queries = queries; + args.queriesRowMajor = queriesRowMajor; + args.numQueries = numQueries; + args.outDistances = outDistances; + args.outIndices = outIndices; + + bfKnn(resources, args); } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuDistance.h b/core/src/index/thirdparty/faiss/gpu/GpuDistance.h index 5002a91407..05667e70f7 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuDistance.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.h @@ -14,6 +14,96 @@ namespace faiss { namespace gpu { class GpuResources; +// Scalar type of the vector data +enum class DistanceDataType { + F32 = 1, + F16, +}; + +/// Arguments to brute-force GPU k-nearest neighbor searching +struct GpuDistanceParams { + GpuDistanceParams() + : metric(faiss::MetricType::METRIC_L2), + metricArg(0), + k(0), + dims(0), + vectors(nullptr), + vectorType(DistanceDataType::F32), + vectorsRowMajor(true), + numVectors(0), + vectorNorms(nullptr), + queries(nullptr), + queryType(DistanceDataType::F32), + queriesRowMajor(true), + numQueries(0), + outDistances(nullptr), + ignoreOutDistances(false), + outIndices(nullptr) { + } + + // + // Search parameters + // + + // Search parameter: distance metric + faiss::MetricType metric; + + // Search parameter: distance metric argument (if applicable) + // For metric == METRIC_Lp, this is the p-value + float metricArg; + + //
Search parameter: return k nearest neighbors + int k; + + // Vector dimensionality + int dims; + + // + // Vectors being queried + // + + // If vectorsRowMajor is true, this is + // numVectors x dims, with dims innermost; otherwise, + // dims x numVectors, with numVectors innermost + const void* vectors; + DistanceDataType vectorType; + bool vectorsRowMajor; + int numVectors; + + // Precomputed L2 norms for each vector in `vectors`, which can be optionally + // provided in advance to speed computation for METRIC_L2 + const float* vectorNorms; + + // + // The query vectors (i.e., find k-nearest neighbors in `vectors` for each of + // the `queries`) + // + + // If queriesRowMajor is true, this is + // numQueries x dims, with dims innermost; otherwise, + // dims x numQueries, with numQueries innermost + const void* queries; + DistanceDataType queryType; + bool queriesRowMajor; + int numQueries; + + // + // Output results + // + + // A region of memory size numQueries x k, with k + // innermost (row major) + float* outDistances; + + // Do we only care about the indices reported, rather than the output + // distances? + bool ignoreOutDistances; + + // A region of memory size numQueries x k, with k + // innermost (row major) + faiss::Index::idx_t* outIndices; +}; + /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest /// neighbor searches on an externally-provided region of memory (e.g., from a /// pytorch tensor). @@ -26,6 +116,9 @@ class GpuResources; /// /// For each vector in `queries`, searches all of `vectors` to find its k /// nearest neighbors with respect to the given metric +void bfKnn(GpuResources* resources, const GpuDistanceParams& args); + +/// Deprecated legacy implementation void bruteForceKnn(GpuResources* resources, faiss::MetricType metric, // If vectorsRowMajor is true, this is diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu index 9ae1662055..173b3206f2 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ constexpr size_t kSearchVecSize = (size_t) 32 * 1024; GpuIndex::GpuIndex(GpuResources* resources, int dims, faiss::MetricType metric, + float metricArg, GpuIndexConfig config) : Index(dims, metric), resources_(resources), @@ -62,13 +62,30 @@ GpuIndex::GpuIndex(GpuResources* resources, "Must compile with CUDA 8+ for Unified Memory support"); #endif - FAISS_THROW_IF_NOT_MSG(isMetricSupported(metric), - "Unsupported metric type on GPU"); + metric_arg = metricArg; FAISS_ASSERT(resources_); resources_->initializeForDevice(device_); } +void +GpuIndex::copyFrom(const faiss::Index* index) { + d = index->d; + metric_type = index->metric_type; + metric_arg = index->metric_arg; + ntotal = index->ntotal; + is_trained = index->is_trained; +} + +void +GpuIndex::copyTo(faiss::Index* index) const { + index->d = d; + index->metric_type = metric_type; + index->metric_arg = metric_arg; + index->ntotal = ntotal; + index->is_trained = is_trained; +} + void GpuIndex::setMinPagingSize(size_t size) { minPagedSize_ = size; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h index 294c1fb703..ae902f57a8 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h @@ -36,6 +36,7 @@ class GpuIndex : public faiss::Index { GpuIndex(GpuResources*
resources, int dims, faiss::MetricType metric, + float metricArg, GpuIndexConfig config); inline int getDevice() const { @@ -88,6 +89,12 @@ class GpuIndex : public faiss::Index { const Index::idx_t* keys) const override; protected: + /// Copy what we need from the CPU equivalent + void copyFrom(const faiss::Index* index); + + /// Copy what we have to the CPU equivalent + void copyTo(faiss::Index* index) const; + /// Does addImpl_ require IDs? If so, and no IDs are provided, we will /// generate them sequentially based on the order in which the IDs are added virtual bool addImplRequiresIDs_() const = 0; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu index 75d96cfa3b..5f4893586e 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu @@ -22,11 +22,13 @@ namespace faiss { namespace gpu { GpuIndexFlat::GpuIndexFlat(GpuResources* resources, const faiss::IndexFlat* index, GpuIndexFlatConfig config) : - GpuIndex(resources, index->d, index->metric_type, config), + GpuIndex(resources, + index->d, + index->metric_type, + index->metric_arg, + config), config_(std::move(config)), data_(nullptr) { - verifySettings_(); - // Flat index doesn't need training this->is_trained = true; @@ -37,11 +39,9 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources, int dims, faiss::MetricType metric, GpuIndexFlatConfig config) : - GpuIndex(resources, dims, metric, config), + GpuIndex(resources, dims, metric, 0, config), config_(std::move(config)), data_(nullptr) { - verifySettings_(); - // Flat index doesn't need training this->is_trained = true; @@ -49,9 +49,7 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources, DeviceScope scope(device_); data_ = new FlatIndex(resources, dims, - metric == faiss::METRIC_L2, config_.useFloat16, - config_.useFloat16Accumulator, config_.storeTransposed, memorySpace_); } @@ -64,8 +62,7 @@ void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { DeviceScope scope(device_); - this->d = index->d; - this->metric_type = index->metric_type; + GpuIndex::copyFrom(index); // GPU code has 32 bit indices FAISS_THROW_IF_NOT_FMT(index->ntotal <= @@ -74,14 +71,11 @@ GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { "attempting to copy CPU index with %zu parameters", (size_t) std::numeric_limits::max(), (size_t) index->ntotal); - this->ntotal = index->ntotal; delete data_; data_ = new FlatIndex(resources_, this->d, - index->metric_type == faiss::METRIC_L2, config_.useFloat16, - config_.useFloat16Accumulator, config_.storeTransposed, memorySpace_); @@ -95,7 +89,7 @@ GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { xb_.clear(); if (config_.storeInCpu) { - xb_ = index->xb; + xb_ = index->xb; } } @@ -103,9 +97,7 @@ void GpuIndexFlat::copyTo(faiss::IndexFlat* index) const { DeviceScope scope(device_); - index->d = this->d; - index->ntotal = this->ntotal; - index->metric_type = this->metric_type; + GpuIndex::copyTo(index); FAISS_ASSERT(data_); FAISS_ASSERT(data_->getSize() == this->ntotal); @@ -219,12 +211,12 @@ GpuIndexFlat::searchImpl_(int n, // Copy bitset to GPU if (!bitset) { auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); - data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true); + data_->query(queries, bitsetDevice, k, metric_type, metric_arg, outDistances, outIntLabels, true); } else { auto bitsetDevice = toDevice(resources_, device_, const_cast(bitset->data()), stream, {(int) bitset->size()}); - 
data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true); + data_->query(queries, bitsetDevice, k, metric_type, metric_arg, outDistances, outIntLabels, true); } // Convert int to idx_t @@ -236,9 +228,9 @@ GpuIndexFlat::searchImpl_(int n, void GpuIndexFlat::reconstruct(faiss::Index::idx_t key, float* out) const { - if(config_.storeInCpu && xb_.size() > 0) { - memcpy (out, &(this->xb_[key * this->d]), sizeof(*out) * this->d); - return; + if (config_.storeInCpu && xb_.size() > 0) { + memcpy (out, &(this->xb_[key * this->d]), sizeof(*out) * this->d); + return; } DeviceScope scope(device_); @@ -322,21 +314,6 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n, fromDevice(residualDevice, residuals, stream); } -void -GpuIndexFlat::verifySettings_() const { - // If we want Hgemm, ensure that it is supported on this device - if (config_.useFloat16Accumulator) { - FAISS_THROW_IF_NOT_MSG(config_.useFloat16, - "useFloat16Accumulator can only be enabled " - "with useFloat16"); - - FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device), - "Device %d does not support Hgemm " - "(useFloat16Accumulator)", - config_.device); - } -} - // // GpuIndexFlatL2 // diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h index 3050caa4c8..90823d69a4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h @@ -27,19 +27,13 @@ struct FlatIndex; struct GpuIndexFlatConfig : public GpuIndexConfig { inline GpuIndexFlatConfig() : useFloat16(false), - useFloat16Accumulator(false), storeTransposed(false), - storeInCpu(false){ + storeInCpu(false) { } /// Whether or not data is stored as float16 bool useFloat16; - /// Whether or not all math is performed in float16, if useFloat16 is - /// specified. If true, we use cublasHgemm, supported only on CC - /// 5.3+. Otherwise, we use cublasSgemmEx. - bool useFloat16Accumulator; - /// Whether or not data is stored (transparently) in a transposed /// layout, enabling use of the NN GEMM call, which is ~10% faster. 
/// This will improve the speed of the flat index, but will @@ -130,10 +124,6 @@ class GpuIndexFlat : public GpuIndex { faiss::Index::idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override; - private: - /// Checks user settings for consistency - void verifySettings_() const; - protected: /// Our config object const GpuIndexFlatConfig config_; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu index 3c2fcd83e4..8e873c1914 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu @@ -19,29 +19,36 @@ namespace faiss { namespace gpu { GpuIndexIVF::GpuIndexIVF(GpuResources* resources, int dims, faiss::MetricType metric, + float metricArg, int nlistIn, GpuIndexIVFConfig config) : - GpuIndex(resources, dims, metric, config), + GpuIndex(resources, dims, metric, metricArg, config), ivfConfig_(std::move(config)), nlist(nlistIn), nprobe(1), quantizer(nullptr) { init_(); + + // Only IP and L2 are supported for now + if (!(metric_type == faiss::METRIC_L2 || + metric_type == faiss::METRIC_INNER_PRODUCT)) { + FAISS_THROW_FMT("unsupported metric type %d", (int) metric_type); + } } void GpuIndexIVF::init_() { - FAISS_ASSERT(nlist > 0); + FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be > 0"); // Spherical by default if the metric is inner_product - if (this->metric_type == faiss::METRIC_INNER_PRODUCT) { - this->cp.spherical = true; + if (metric_type == faiss::METRIC_INNER_PRODUCT) { + cp.spherical = true; } // here we set a low # iterations because this is typically used // for large clusterings - this->cp.niter = 10; - this->cp.verbose = this->verbose; + cp.niter = 10; + cp.verbose = verbose; if (!quantizer) { // Construct an empty quantizer @@ -49,20 +56,21 @@ GpuIndexIVF::init_() { // FIXME: inherit our same device config.device = device_; - if (this->metric_type == faiss::METRIC_L2) { - quantizer = new GpuIndexFlatL2(resources_, this->d, config); - } else if (this->metric_type == faiss::METRIC_INNER_PRODUCT) { - quantizer = new GpuIndexFlatIP(resources_, this->d, config); + if (metric_type == faiss::METRIC_L2) { + quantizer = new GpuIndexFlatL2(resources_, d, config); + } else if (metric_type == faiss::METRIC_INNER_PRODUCT) { + quantizer = new GpuIndexFlatIP(resources_, d, config); } else { // unknown metric type - FAISS_THROW_IF_NOT_MSG(false, "unsupported metric type"); + FAISS_THROW_FMT("unsupported metric type %d", (int) metric_type); } } } GpuIndexIVF::~GpuIndexIVF() { - if(remove_quantizer == 1) - delete quantizer; + if (remove_quantizer == 1) { + delete quantizer; + } } GpuIndexFlat* @@ -74,8 +82,7 @@ void GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { DeviceScope scope(device_); - this->d = index->d; - this->metric_type = index->metric_type; + GpuIndex::copyFrom(index); FAISS_ASSERT(index->nlist > 0); FAISS_THROW_IF_NOT_FMT(index->nlist <= @@ -113,17 +120,15 @@ GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { } if (!index->is_trained) { - this->is_trained = false; - this->ntotal = 0; + // copied in GpuIndex::copyFrom + FAISS_ASSERT(!is_trained && ntotal == 0); return; } - // Otherwise, we can populate ourselves from the other index - this->is_trained = true; - + // copied in GpuIndex::copyFrom // ntotal can exceed max int, but the number of vectors per inverted // list cannot exceed this. We check this in the subclasses. 
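For orientation, a minimal caller-side sketch of the path this hunk serves (illustrative only; assumes `cpu_index` is a trained faiss::IndexIVFFlat, and uses the GpuIndexIVFFlat constructor and StandardGpuResources that appear elsewhere in this patch):

    faiss::gpu::StandardGpuResources res;
    faiss::gpu::GpuIndexIVFFlatConfig config;
    config.device = 0;
    // The CPU-to-GPU constructor invokes copyFrom(), which now delegates the
    // shared header fields (d, metric_type, metric_arg, ntotal, is_trained)
    // to GpuIndex::copyFrom() before copying the quantizer and inverted lists.
    faiss::gpu::GpuIndexIVFFlat gpu_index(&res, &cpu_index, config);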
- this->ntotal = index->ntotal; + FAISS_ASSERT(is_trained && (ntotal == index->ntotal)); // Since we're trained, the quantizer must have data FAISS_ASSERT(index->quantizer->ntotal > 0); @@ -222,10 +227,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { // // Index information // - index->ntotal = this->ntotal; - index->d = this->d; - index->metric_type = this->metric_type; - index->is_trained = this->is_trained; + GpuIndex::copyTo(index); // // IndexIVF information @@ -258,8 +260,7 @@ GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { index->quantizer_trains_alone = 0; index->own_fields = true; index->cp = this->cp; - index->maintain_direct_map = false; - index->direct_map.clear(); + index->make_direct_map(false); } int diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h index ca9a386641..bc0dddc9a6 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.h @@ -37,6 +37,7 @@ class GpuIndexIVF : public GpuIndex { GpuIndexIVF(GpuResources* resources, int dims, faiss::MetricType metric, + float metricArg, int nlist, GpuIndexIVFConfig config = GpuIndexIVFConfig()); diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu index bc9bac0be0..6ca7c70ffb 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu @@ -26,6 +26,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, GpuIndexIVF(resources, index->d, index->metric_type, + index->metric_arg, index->nlist, config), ivfFlatConfig_(config), @@ -39,7 +40,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat(GpuResources* resources, int nlist, faiss::MetricType metric, GpuIndexIVFFlatConfig config) : - GpuIndexIVF(resources, dims, metric, nlist, config), + GpuIndexIVF(resources, dims, metric, 0, nlist, config), ivfFlatConfig_(config), reserveMemoryVecs_(0), index_(nullptr) { @@ -59,6 +60,7 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { reserveMemoryVecs_ = numVecs; if (index_) { + DeviceScope scope(device_); index_->reserveMemory(numVecs); } } @@ -75,45 +77,47 @@ GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // The other index might not be trained if (!index->is_trained) { + FAISS_ASSERT(!is_trained); return; } // Otherwise, we can populate ourselves from the other index - this->is_trained = true; + FAISS_ASSERT(is_trained); // Copy our lists as well index_ = new IVFFlat(resources_, quantizer->getGpuData(), index->metric_type, + index->metric_arg, false, // no residual nullptr, // no scalar quantizer ivfFlatConfig_.indicesOptions, memorySpace_); InvertedLists *ivf = index->invlists; - if (ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { - index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), - (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); - /* double t0 = getmillisecs(); */ - /* std::cout << "Readonly Takes " << getmillisecs() - t0 << " ms" << std::endl; */ - } else { - for (size_t i = 0; i < ivf->nlist; ++i) { - auto numVecs = ivf->list_size(i); + if (ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { + index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), + (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); + /* double t0 = getmillisecs(); */ + /* std::cout << "Readonly Takes " << getmillisecs() - t0 << " ms" << std::endl; */ + } else { + for (size_t i = 0; i < 
ivf->nlist; ++i) { + auto numVecs = ivf->list_size(i); - // GPU index can only support max int entries per list - FAISS_THROW_IF_NOT_FMT(numVecs <= - (size_t) std::numeric_limits::max(), - "GPU inverted list can only support " - "%zu entries; %zu found", - (size_t) std::numeric_limits::max(), - numVecs); + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT(numVecs <= + (size_t) std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t) std::numeric_limits::max(), + numVecs); - index_->addCodeVectorsFromCpu(i, - (const unsigned char*)(ivf->get_codes(i)), - ivf->get_ids(i), - numVecs); - } + index_->addCodeVectorsFromCpu(i, + (const unsigned char*)(ivf->get_codes(i)), + ivf->get_ids(i), + numVecs); } + } } void @@ -187,6 +191,7 @@ GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { index_ = new IVFFlat(resources_, quantizer->getGpuData(), this->metric_type, + this->metric_arg, false, // no residual nullptr, // no scalar quantizer ivfFlatConfig_.indicesOptions, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu index 0f969c2ac9..254c0c4104 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu @@ -26,6 +26,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, GpuIndexIVF(resources, index->d, index->metric_type, + index->metric_arg, index->nlist, config), ivfpqConfig_(config), @@ -46,6 +47,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, GpuIndexIVF(resources, dims, metric, + 0, nlist, config), ivfpqConfig_(config), @@ -55,9 +57,6 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, index_(nullptr) { verifySettings_(); - // FIXME make IP work fully - FAISS_ASSERT(this->metric_type == faiss::METRIC_L2); - // We haven't trained ourselves, so don't construct the PQ index yet this->is_trained = false; } @@ -70,9 +69,6 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { DeviceScope scope(device_); - // FIXME: support this - FAISS_THROW_IF_NOT_MSG(index->metric_type == faiss::METRIC_L2, - "GPU: inner product unsupported"); GpuIndexIVF::copyFrom(index); // Clear out our old data @@ -94,16 +90,17 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { // The other index might not be trained if (!index->is_trained) { + // copied in GpuIndex::copyFrom + FAISS_ASSERT(!is_trained); return; } - // Otherwise, we can populate ourselves from the other index - this->is_trained = true; - // Copy our lists as well // The product quantizer must have data in it FAISS_ASSERT(index->pq.centroids.size() > 0); index_ = new IVFPQ(resources_, + index->metric_type, + index->metric_arg, quantizer->getGpuData(), subQuantizers_, bitsPerCode_, @@ -280,6 +277,8 @@ GpuIndexIVFPQ::trainResidualQuantizer_(Index::idx_t n, const float* x) { pq.train(n, residuals.data()); index_ = new IVFPQ(resources_, + metric_type, + metric_arg, quantizer->getGpuData(), subQuantizers_, bitsPerCode_, @@ -457,10 +456,6 @@ GpuIndexIVFPQ::verifySettings_() const { "Precomputed codes supports any number of dimensions, but " "will involve memory overheads.", this->d / subQuantizers_); - - // TODO: fully implement METRIC_INNER_PRODUCT - FAISS_THROW_IF_NOT_MSG(this->metric_type == faiss::METRIC_L2, - "METRIC_INNER_PRODUCT is currently unsupported"); } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu 
index 975f72d70d..27da743cb4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu @@ -25,6 +25,7 @@ GpuIndexIVFSQHybrid::GpuIndexIVFSQHybrid( GpuIndexIVF(resources, index->d, index->metric_type, + index->metric_arg, index->nlist, config), ivfSQConfig_(config), @@ -47,7 +48,7 @@ GpuIndexIVFSQHybrid::GpuIndexIVFSQHybrid( faiss::MetricType metric, bool encodeResidual, GpuIndexIVFSQHybridConfig config) : - GpuIndexIVF(resources, dims, metric, nlist, config), + GpuIndexIVF(resources, dims, metric, 0, nlist, config), ivfSQConfig_(config), sq(dims, qtype), by_residual(encodeResidual), @@ -102,6 +103,7 @@ GpuIndexIVFSQHybrid::copyFrom( index_ = new IVFFlat(resources_, quantizer->getGpuData(), index->metric_type, + index->metric_arg, by_residual, &sq, ivfSQConfig_.indicesOptions, @@ -165,6 +167,7 @@ GpuIndexIVFSQHybrid::copyFrom( index_ = new IVFFlat(resources_, quantizer->getGpuData(), index->metric_type, + index->metric_arg, by_residual, &sq, ivfSQConfig_.indicesOptions, @@ -284,6 +287,7 @@ GpuIndexIVFSQHybrid::train(Index::idx_t n, const float* x) { index_ = new IVFFlat(resources_, quantizer->getGpuData(), this->metric_type, + this->metric_arg, by_residual, &sq, ivfSQConfig_.indicesOptions, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu index c0e194d45b..6be3bb1f79 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu @@ -24,6 +24,7 @@ GpuIndexIVFScalarQuantizer::GpuIndexIVFScalarQuantizer( GpuIndexIVF(resources, index->d, index->metric_type, + index->metric_arg, index->nlist, config), ivfSQConfig_(config), @@ -45,7 +46,7 @@ GpuIndexIVFScalarQuantizer::GpuIndexIVFScalarQuantizer( faiss::MetricType metric, bool encodeResidual, GpuIndexIVFScalarQuantizerConfig config) : - GpuIndexIVF(resources, dims, metric, nlist, config), + GpuIndexIVF(resources, dims, metric, 0, nlist, config), ivfSQConfig_(config), sq(dims, qtype), by_residual(encodeResidual), @@ -69,6 +70,7 @@ void GpuIndexIVFScalarQuantizer::reserveMemory(size_t numVecs) { reserveMemoryVecs_ = numVecs; if (index_) { + DeviceScope scope(device_); index_->reserveMemory(numVecs); } } @@ -100,12 +102,14 @@ GpuIndexIVFScalarQuantizer::copyFrom( index_ = new IVFFlat(resources_, quantizer->getGpuData(), index->metric_type, + index->metric_arg, by_residual, &sq, ivfSQConfig_.indicesOptions, memorySpace_); InvertedLists* ivf = index->invlists; + if(ReadOnlyArrayInvertedLists* rol = dynamic_cast(ivf)) { index_->copyCodeVectorsFromCpu((const float* )(rol->pin_readonly_codes->data), (const long *)(rol->pin_readonly_ids->data), rol->readonly_length); @@ -127,7 +131,7 @@ GpuIndexIVFScalarQuantizer::copyFrom( ivf->get_ids(i), numVecs); } - } + } } void @@ -143,6 +147,7 @@ GpuIndexIVFScalarQuantizer::copyTo( GpuIndexIVF::copyTo(index); index->sq = sq; + index->code_size = sq.code_size; index->by_residual = by_residual; index->code_size = sq.code_size; @@ -219,6 +224,7 @@ GpuIndexIVFScalarQuantizer::train(Index::idx_t n, const float* x) { index_ = new IVFFlat(resources_, quantizer->getGpuData(), this->metric_type, + this->metric_arg, by_residual, &sq, ivfSQConfig_.indicesOptions, diff --git a/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp b/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp index 63ed9ef316..e564f8e367 100644 --- 
a/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp +++ b/core/src/index/thirdparty/faiss/gpu/StandardGpuResources.cpp @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -247,6 +248,13 @@ StandardGpuResources::initializeForDevice(int device) { FAISS_ASSERT(blasStatus == CUBLAS_STATUS_SUCCESS); blasHandles_[device] = blasHandle; + // Enable tensor core support if available +#if CUDA_VERSION >= 9000 + if (getTensorCoreSupport(device)) { + cublasSetMathMode(blasHandle, CUBLAS_TENSOR_OP_MATH); + } +#endif + FAISS_ASSERT(memory_.count(device) == 0); auto mem = std::unique_ptr( diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu index 67b112434f..e4aa3af1fc 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -28,113 +29,18 @@ namespace faiss { namespace gpu { -namespace { - -template -Tensor sliceCentroids(Tensor& centroids, - bool centroidsRowMajor, - int startCentroid, - int num) { - // Row major is (num, dim) - // Col major is (dim, num) - if (startCentroid == 0 && - num == centroids.getSize(centroidsRowMajor ? 0 : 1)) { - return centroids; - } - - return centroids.narrow(centroidsRowMajor ? 0 : 1, startCentroid, num); -} - -// For each chunk of k indices, increment the index by chunk * increment -template -__global__ void incrementIndex(Tensor indices, - int k, - int increment) { - for (int i = threadIdx.x; i < k; i += blockDim.x) { - indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment; - } -} - -// Used to update result indices in distance computation where the number of -// centroids is high, and is tiled -template -void runIncrementIndex(Tensor& indices, - int k, - int increment, - cudaStream_t stream) { - dim3 grid(indices.getSize(1) / k, indices.getSize(0)); - int block = std::min(k, 512); - - // should be exact - FAISS_ASSERT(grid.x * k == indices.getSize(1)); - - incrementIndex<<>>(indices, k, increment); - - cudaDeviceSynchronize(); -} - -// If the inner size (dim) of the vectors is small, we want a larger query tile -// size, like 1024 - -void chooseTileSize(int numQueries, - int numCentroids, - int dim, - int elementSize, - size_t tempMemAvailable, - int& tileRows, - int& tileCols) { - // The matrix multiplication should be large enough to be efficient, but if it - // is too large, we seem to lose efficiency as opposed to double-streaming. - // Each tile size here defines 1/2 of the memory use due to double streaming. - // We ignore available temporary memory, as that is adjusted independently by - // the user and can thus meet these requirements (or not). - // For <= 4 GB GPUs, prefer 512 MB of usage. - // For <= 8 GB GPUs, prefer 768 MB of usage. - // Otherwise, prefer 1 GB of usage. - auto totalMem = getCurrentDeviceProperties().totalGlobalMem; - - int targetUsage = 0; - - if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) { - targetUsage = 512 * 1024 * 1024; - } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) { - targetUsage = 768 * 1024 * 1024; - } else { - targetUsage = 1024 * 1024 * 1024; - } - - targetUsage /= 2 * elementSize; - - // 512 seems to be a batch size sweetspot for float32. - // If we are on float16, increase to 512. - // If the k size (vec dim) of the matrix multiplication is small (<= 32), - // increase to 1024. 
- int preferredTileRows = 512; - if (dim <= 32) { - preferredTileRows = 1024; - } - - tileRows = std::min(preferredTileRows, numQueries); - - // tileCols is the remainder size - tileCols = std::min(targetUsage / preferredTileRows, numCentroids); -} - -} - template void runDistance(bool computeL2, GpuResources* resources, Tensor& centroids, bool centroidsRowMajor, - Tensor* centroidNorms, + Tensor* centroidNorms, Tensor& queries, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, + Tensor& outDistances, Tensor& outIndices, - bool useHgemm, bool ignoreOutDistances) { // The # of centroids in `centroids` based on memory layout auto numCentroids = centroids.getSize(centroidsRowMajor ? 0 : 1); @@ -169,11 +75,11 @@ void runDistance(bool computeL2, } // L2: If ||c||^2 is not pre-computed, calculate it - DeviceTensor cNorms; + DeviceTensor cNorms; if (computeL2 && !centroidNorms) { cNorms = - std::move(DeviceTensor(mem, - {numCentroids}, defaultStream)); + std::move(DeviceTensor( + mem, {numCentroids}, defaultStream)); runL2Norm(centroids, centroidsRowMajor, cNorms, true, defaultStream); centroidNorms = &cNorms; } @@ -182,7 +88,7 @@ void runDistance(bool computeL2, // Prepare norm vector ||q||^2; ||c||^2 is already pre-computed // int qNormSize[1] = {numQueries}; - DeviceTensor queryNorms(mem, qNormSize, defaultStream); + DeviceTensor queryNorms(mem, qNormSize, defaultStream); // ||q||^2 if (computeL2) { @@ -208,18 +114,18 @@ void runDistance(bool computeL2, FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); // select limitation // Temporary output memory space we'll use - DeviceTensor distanceBuf1( + DeviceTensor distanceBuf1( mem, {tileRows, tileCols}, defaultStream); - DeviceTensor distanceBuf2( + DeviceTensor distanceBuf2( mem, {tileRows, tileCols}, defaultStream); - DeviceTensor* distanceBufs[2] = + DeviceTensor* distanceBufs[2] = {&distanceBuf1, &distanceBuf2}; - DeviceTensor outDistanceBuf1( + DeviceTensor outDistanceBuf1( mem, {tileRows, numColTiles * k}, defaultStream); - DeviceTensor outDistanceBuf2( + DeviceTensor outDistanceBuf2( mem, {tileRows, numColTiles * k}, defaultStream); - DeviceTensor* outDistanceBufs[2] = + DeviceTensor* outDistanceBufs[2] = {&outDistanceBuf1, &outDistanceBuf2}; DeviceTensor outIndexBuf1( @@ -291,7 +197,6 @@ void runDistance(bool computeL2, centroidsRowMajor, // transposed MM if row major computeL2 ? 
-2.0f : 1.0f, 0.0f, - useHgemm, resources->getBlasHandleCurrentDevice(), streams[curStream]); @@ -390,19 +295,17 @@ void runDistance(bool computeL2, } } -// Bitset added template void runL2Distance(GpuResources* resources, Tensor& centroids, bool centroidsRowMajor, - Tensor* centroidNorms, + Tensor* centroidNorms, Tensor& queries, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, + Tensor& outDistances, Tensor& outIndices, - bool useHgemm, bool ignoreOutDistances = false) { runDistance(true, // L2 resources, @@ -415,7 +318,6 @@ void runL2Distance(GpuResources* resources, k, outDistances, outIndices, - useHgemm, ignoreOutDistances); } @@ -427,9 +329,8 @@ void runIPDistance(GpuResources* resources, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, - Tensor& outIndices, - bool useHgemm) { + Tensor& outDistances, + Tensor& outIndices) { runDistance(false, // IP resources, centroids, @@ -441,7 +342,6 @@ void runIPDistance(GpuResources* resources, k, outDistances, outIndices, - useHgemm, false); } @@ -467,8 +367,7 @@ runIPDistance(GpuResources* resources, bitset, k, outDistances, - outIndices, - false); + outIndices); } void @@ -479,9 +378,8 @@ runIPDistance(GpuResources* resources, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, - Tensor& outIndices, - bool useHgemm) { + Tensor& outDistances, + Tensor& outIndices) { runIPDistance(resources, vectors, vectorsRowMajor, @@ -490,8 +388,7 @@ runIPDistance(GpuResources* resources, bitset, k, outDistances, - outIndices, - useHgemm); + outIndices); } void @@ -516,7 +413,6 @@ runL2Distance(GpuResources* resources, k, outDistances, outIndices, - false, ignoreOutDistances); } @@ -524,14 +420,13 @@ void runL2Distance(GpuResources* resources, Tensor& vectors, bool vectorsRowMajor, - Tensor* vectorNorms, + Tensor* vectorNorms, Tensor& queries, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, + Tensor& outDistances, Tensor& outIndices, - bool useHgemm, bool ignoreOutDistances) { runL2Distance(resources, vectors, @@ -543,7 +438,6 @@ runL2Distance(GpuResources* resources, k, outDistances, outIndices, - useHgemm, ignoreOutDistances); } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh index 097dfbc9f6..844d420aea 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh @@ -10,6 +10,7 @@ #include #include +#include #include namespace faiss { namespace gpu { @@ -53,22 +54,162 @@ void runIPDistance(GpuResources* resources, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, - Tensor& outIndices, - bool useHgemm); + Tensor& outDistances, + Tensor& outIndices); void runL2Distance(GpuResources* resources, Tensor& vectors, bool vectorsRowMajor, - Tensor* vectorNorms, + Tensor* vectorNorms, Tensor& queries, bool queriesRowMajor, Tensor& bitset, int k, - Tensor& outDistances, + Tensor& outDistances, Tensor& outIndices, - bool useHgemm, bool ignoreOutDistances = false); +// +// General distance implementation, assumes that all arguments are on the +// device. This is the top-level internal distance function to call to dispatch +// based on metric type. 
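+//
+// Caller-side sketch of the public entry point that funnels into
+// bfKnnOnDevice below (illustrative only; GpuDistanceParams and bfKnn are
+// declared in GpuDistance.h, and the field assignments mirror the legacy
+// bruteForceKnn wrapper in GpuDistance.cu):
+//
+//   faiss::gpu::StandardGpuResources res;
+//   faiss::gpu::GpuDistanceParams args;
+//   args.metric       = faiss::MetricType::METRIC_L2;
+//   args.k            = 10;
+//   args.dims         = d;
+//   args.vectors      = xb;         // numVectors x dims, row-major float32
+//   args.numVectors   = nb;
+//   args.queries      = xq;         // numQueries x dims, row-major float32
+//   args.numQueries   = nq;
+//   args.outDistances = distances;  // numQueries x k
+//   args.outIndices   = labels;     // numQueries x k
+//   faiss::gpu::bfKnn(&res, args);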
+// +template +void bfKnnOnDevice(GpuResources* resources, + int device, + cudaStream_t stream, + Tensor& vectors, + bool vectorsRowMajor, + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, + Tensor& outIndices, + bool ignoreOutDistances) { + // We are guaranteed that all data arguments are resident on our preferred + // `device` here, and are ordered wrt `stream` + + // L2 and IP are specialized to use GEMM and an optimized L2 + selection or + // pure k-selection kernel. + if ((metric == faiss::MetricType::METRIC_L2) || + (metric == faiss::MetricType::METRIC_Lp && + metricArg == 2)) { + runL2Distance(resources, + vectors, + vectorsRowMajor, + vectorNorms, + queries, + queriesRowMajor, + bitset, + k, + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) { + runIPDistance(resources, + vectors, + vectorsRowMajor, + queries, + queriesRowMajor, + bitset, + k, + outDistances, + outIndices); + } else { + // + // General pairwise distance kernel + // + // The general distance kernel does not have specializations for + // transpositions (NN, NT, TN); instead, the transposition is just handled + // upon data load for now, which could result in poor data loading behavior + // for NT / TN. This can be fixed at a later date if desired, but efficiency + // is low versus GEMM anyways. + // + + Tensor tVectorsDimInnermost = + vectorsRowMajor ? + vectors.transposeInnermost(1) : + vectors.transposeInnermost(0); + Tensor tQueriesDimInnermost = + queriesRowMajor ? + queries.transposeInnermost(1) : + queries.transposeInnermost(0); + + if ((metric == faiss::MetricType::METRIC_L1) || + (metric == faiss::MetricType::METRIC_Lp && + metricArg == 1)) { + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + L1Distance(), + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_Lp && + metricArg == -1) { + // A way to test L2 distance + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + L2Distance(), + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_Lp) { + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + LpDistance(metricArg), + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_Linf) { + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + LinfDistance(), + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_Canberra) { + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + CanberraDistance(), + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_BrayCurtis) { + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + BrayCurtisDistance(), + outDistances, + outIndices); + } else if (metric == faiss::MetricType::METRIC_JensenShannon) { + runGeneralDistance(resources, + tVectorsDimInnermost, + tQueriesDimInnermost, + bitset, + k, + JensenShannonDistance(), + outDistances, + outIndices); + } else { + FAISS_THROW_FMT("unsupported metric type %d", metric); + } + } +} + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/DistanceUtils.cuh b/core/src/index/thirdparty/faiss/gpu/impl/DistanceUtils.cuh new file mode 100644 index 0000000000..42d815a5f3 --- /dev/null +++ 
b/core/src/index/thirdparty/faiss/gpu/impl/DistanceUtils.cuh @@ -0,0 +1,343 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include + +// +// Shared utilities for brute-force distance calculations +// + +namespace faiss { namespace gpu { + +struct IPDistance { + __host__ __device__ IPDistance() : dist(0) {} + + static constexpr bool kDirection = true; // maximize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = -std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + dist += a * b; + } + + __host__ __device__ float reduce() { + return dist; + } + + __host__ __device__ void combine(const IPDistance& v) { + dist += v.dist; + } + + __host__ __device__ IPDistance zero() const { + return IPDistance(); + } + + float dist; +}; + +struct L1Distance { + __host__ __device__ L1Distance() : dist(0) {} + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + dist += fabsf(a - b); + } + + __host__ __device__ float reduce() { + return dist; + } + + __host__ __device__ void combine(const L1Distance& v) { + dist += v.dist; + } + + __host__ __device__ L1Distance zero() const { + return L1Distance(); + } + + float dist; +}; + +struct L2Distance { + __host__ __device__ L2Distance() : dist(0) {} + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + float v = a - b; + dist += v * v; + } + + __host__ __device__ float reduce() { + return dist; + } + + __host__ __device__ void combine(const L2Distance& v) { + dist += v.dist; + } + + __host__ __device__ L2Distance zero() const { + return L2Distance(); + } + + float dist; +}; + +struct LpDistance { + __host__ __device__ LpDistance() + : p(2), dist(0) {} + + __host__ __device__ LpDistance(float arg) + : p(arg), dist(0) {} + + __host__ __device__ LpDistance(const LpDistance& v) + : p(v.p), dist(v.dist) {} + + __host__ __device__ LpDistance& operator=(const LpDistance& v) { + p = v.p; + dist = v.dist; + return *this; + } + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + dist += powf(fabsf(a - b), p); + } + + __host__ __device__ float reduce() { + return dist; + } + + __host__ __device__ void combine(const LpDistance& v) { + dist += v.dist; + } + + __host__ __device__ LpDistance zero() const { + return LpDistance(p); + } + + float p; + float dist; +}; + +struct LinfDistance { + __host__ __device__ LinfDistance() : dist(0) {} + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + dist = fmaxf(dist, fabsf(a - b)); + } + + __host__ __device__ float reduce() { + return dist; + } + + __host__ __device__ void combine(const LinfDistance& v) { + dist = fmaxf(dist, v.dist); + } + + __host__ __device__ LinfDistance zero() const { + return LinfDistance(); + } + +
float dist; +}; + +struct CanberraDistance { + __host__ __device__ CanberraDistance() : dist(0) {} + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + float denom = fabsf(a) + fabsf(b); + dist += fabsf(a - b) / denom; + } + + __host__ __device__ float reduce() { + return dist; + } + + __host__ __device__ void combine(const CanberraDistance& v) { + dist += v.dist; + } + + __host__ __device__ CanberraDistance zero() const { + return CanberraDistance(); + } + + float dist; +}; + +struct BrayCurtisDistance { + __host__ __device__ BrayCurtisDistance() + : numerator(0), denominator(0) {} + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + numerator += fabsf(a - b); + denominator += fabsf(a + b); + } + + __host__ __device__ float reduce() { + return (numerator / denominator); + } + + __host__ __device__ void combine(const BrayCurtisDistance& v) { + numerator += v.numerator; + denominator += v.denominator; + } + + __host__ __device__ BrayCurtisDistance zero() const { + return BrayCurtisDistance(); + } + + float numerator; + float denominator; +}; + +struct JensenShannonDistance { + __host__ __device__ JensenShannonDistance() + : dist(0) {} + + static constexpr bool kDirection = false; // minimize + static constexpr float kIdentityData = 0; + static constexpr float kMaxDistance = std::numeric_limits<float>::max(); + + __host__ __device__ void handle(float a, float b) { + float m = 0.5f * (a + b); + + float x = m / a; + float y = m / b; + + float kl1 = -a * log(x); + float kl2 = -b * log(y); + + dist += kl1 + kl2; + } + + __host__ __device__ float reduce() { + return 0.5 * dist; + } + + __host__ __device__ void combine(const JensenShannonDistance& v) { + dist += v.dist; + } + + __host__ __device__ JensenShannonDistance zero() const { + return JensenShannonDistance(); + } + + float dist; +}; + +template <typename T> +Tensor<T, 2, true> sliceCentroids(Tensor<T, 2, true>& centroids, + bool centroidsRowMajor, + int startCentroid, + int num) { + // Row major is (num, dim) + // Col major is (dim, num) + if (startCentroid == 0 && + num == centroids.getSize(centroidsRowMajor ? 0 : 1)) { + return centroids; + } + + return centroids.narrow(centroidsRowMajor ?
0 : 1, startCentroid, num); +} + +// For each chunk of k indices, increment the index by chunk * increment +template <typename IndexT> +__global__ void incrementIndex(Tensor<IndexT, 2, true> indices, + int k, + int increment) { + for (int i = threadIdx.x; i < k; i += blockDim.x) { + indices[blockIdx.y][blockIdx.x * k + i] += blockIdx.x * increment; + } +} + +// Used to update result indices in distance computation where the number of +// centroids is high, and is tiled +template <typename IndexT> +void runIncrementIndex(Tensor<IndexT, 2, true>& indices, + int k, + int increment, + cudaStream_t stream) { + dim3 grid(indices.getSize(1) / k, indices.getSize(0)); + int block = std::min(k, 512); + + // should be exact + FAISS_ASSERT(grid.x * k == indices.getSize(1)); + + incrementIndex<<<grid, block, 0, stream>>>(indices, k, increment); +} + +// If the inner size (dim) of the vectors is small, we want a larger query tile +// size, like 1024 +inline void chooseTileSize(int numQueries, + int numCentroids, + int dim, + int elementSize, + size_t tempMemAvailable, + int& tileRows, + int& tileCols) { + // The matrix multiplication should be large enough to be efficient, but if it + // is too large, we seem to lose efficiency as opposed to double-streaming. + // Each tile size here defines 1/2 of the memory use due to double streaming. + // We ignore available temporary memory, as that is adjusted independently by + // the user and can thus meet these requirements (or not). + // For <= 4 GB GPUs, prefer 512 MB of usage. + // For <= 8 GB GPUs, prefer 768 MB of usage. + // Otherwise, prefer 1 GB of usage. + auto totalMem = getCurrentDeviceProperties().totalGlobalMem; + + int targetUsage = 0; + + if (totalMem <= ((size_t) 4) * 1024 * 1024 * 1024) { + targetUsage = 512 * 1024 * 1024; + } else if (totalMem <= ((size_t) 8) * 1024 * 1024 * 1024) { + targetUsage = 768 * 1024 * 1024; + } else { + targetUsage = 1024 * 1024 * 1024; + } + + targetUsage /= 2 * elementSize; + + // 512 seems to be a batch size sweetspot for float32. + // If we are on float16, increase to 512. + // If the k size (vec dim) of the matrix multiplication is small (<= 32), + // increase to 1024.
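+  // Worked example of the sizing above (illustrative): on a GPU with more
+  // than 8 GB, targetUsage starts at 1 GiB = 1073741824 bytes and, for
+  // float32 (elementSize = 4), becomes 1073741824 / 8 = 134217728 elements.
+  // With dim > 32 the preferred tile is 512 rows, so
+  // tileRows = min(512, numQueries) and
+  // tileCols = min(134217728 / 512, numCentroids) = min(262144, numCentroids).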
+ int preferredTileRows = 512; + if (dim <= 32) { + preferredTileRows = 1024; + } + + tileRows = std::min(preferredTileRows, numQueries); + + // tileCols is the remainder size + tileCols = std::min(targetUsage / preferredTileRows, numCentroids); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu index 510b2182fc..e7545df767 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu @@ -19,17 +19,13 @@ namespace faiss { namespace gpu { FlatIndex::FlatIndex(GpuResources* res, int dim, - bool l2Distance, bool useFloat16, - bool useFloat16Accumulator, bool storeTransposed, MemorySpace space) : resources_(res), dim_(dim), useFloat16_(useFloat16), - useFloat16Accumulator_(useFloat16Accumulator), storeTransposed_(storeTransposed), - l2Distance_(l2Distance), space_(space), num_(0), rawData_(space) { @@ -66,6 +62,22 @@ FlatIndex::reserve(size_t numVecs, cudaStream_t stream) { } } +template <> +Tensor& +FlatIndex::getVectorsRef() { + // Should not call this unless we are in float32 mode + FAISS_ASSERT(!useFloat16_); + return getVectorsFloat32Ref(); +} + +template <> +Tensor& +FlatIndex::getVectorsRef() { + // Should not call this unless we are in float16 mode + FAISS_ASSERT(useFloat16_); + return getVectorsFloat16Ref(); +} + Tensor& FlatIndex::getVectorsFloat32Ref() { // Should not call this unless we are in float32 mode @@ -105,6 +117,8 @@ void FlatIndex::query(Tensor& input, Tensor& bitset, int k, + faiss::MetricType metric, + float metricArg, Tensor& outDistances, Tensor& outIndices, bool exactDistance) { @@ -112,46 +126,28 @@ FlatIndex::query(Tensor& input, auto& mem = resources_->getMemoryManagerCurrentDevice(); if (useFloat16_) { - // We need to convert to float16 - auto inputHalf = convertTensor(resources_, - stream, - input); + // We need to convert the input to float16 for comparison to ourselves + auto inputHalf = + convertTensor(resources_, stream, input); - DeviceTensor outDistancesHalf( - mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); - - query(inputHalf, bitset, k, outDistancesHalf, outIndices, exactDistance); - - if (exactDistance) { - // Convert outDistances back - convertTensor(stream, - outDistancesHalf, - outDistances); - } + query(inputHalf, bitset, k, metric, metricArg, + outDistances, outIndices, exactDistance); } else { - if (l2Distance_) { - runL2Distance(resources_, - storeTransposed_ ? vectorsTransposed_ : vectors_, - !storeTransposed_, // is vectors row major? - &norms_, - input, - true, // input is row major - bitset, - k, - outDistances, - outIndices, - !exactDistance); - } else { - runIPDistance(resources_, - storeTransposed_ ? vectorsTransposed_ : vectors_, - !storeTransposed_, // is vectors row major? - input, - true, // input is row major - bitset, - k, - outDistances, - outIndices); - } + bfKnnOnDevice(resources_, + getCurrentDevice(), + stream, + storeTransposed_ ? vectorsTransposed_ : vectors_, + !storeTransposed_, // is vectors row major? 
+ &norms_, + input, + true, // input is row major + bitset, + k, + metric, + metricArg, + outDistances, + outIndices, + !exactDistance); } } @@ -159,37 +155,28 @@ void FlatIndex::query(Tensor& input, Tensor& bitset, int k, - Tensor& outDistances, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, Tensor& outIndices, bool exactDistance) { FAISS_ASSERT(useFloat16_); - if (l2Distance_) { - runL2Distance(resources_, - storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_, - !storeTransposed_, // is vectors row major? - &normsHalf_, - input, - true, // input is row major - bitset, - k, - outDistances, - outIndices, - useFloat16Accumulator_, - // FIXME - !exactDistance); - } else { - runIPDistance(resources_, - storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_, - !storeTransposed_, // is vectors row major? - input, - true, // input is row major - bitset, - k, - outDistances, - outIndices, - useFloat16Accumulator_); - } + bfKnnOnDevice(resources_, + getCurrentDevice(), + resources_->getDefaultStreamCurrentDevice(), + storeTransposed_ ? vectorsHalfTransposed_ : vectorsHalf_, + !storeTransposed_, // is vectors row major? + &norms_, + input, + true, // input is row major + bitset, + k, + metric, + metricArg, + outDistances, + outIndices, + !exactDistance); } void @@ -289,17 +276,15 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { } } - if (l2Distance_) { - // Precompute L2 norms of our database - if (useFloat16_) { - DeviceTensor normsHalf({(int) num_}, space_); - runL2Norm(vectorsHalf_, true, normsHalf, true, stream); - normsHalf_ = std::move(normsHalf); - } else { - DeviceTensor norms({(int) num_}, space_); - runL2Norm(vectors_, true, norms, true, stream); - norms_ = std::move(norms); - } + // Precompute L2 norms of our database + if (useFloat16_) { + DeviceTensor norms({(int) num_}, space_); + runL2Norm(vectorsHalf_, true, norms, true, stream); + norms_ = std::move(norms); + } else { + DeviceTensor norms({(int) num_}, space_); + runL2Norm(vectors_, true, norms, true, stream); + norms_ = std::move(norms); } } @@ -307,6 +292,9 @@ void FlatIndex::reset() { rawData_.clear(); vectors_ = std::move(DeviceTensor()); + vectorsTransposed_ = std::move(DeviceTensor()); + vectorsHalf_ = std::move(DeviceTensor()); + vectorsHalfTransposed_ = std::move(DeviceTensor()); norms_ = std::move(DeviceTensor()); num_ = 0; } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh index 03be3a2d4a..5bc97441c4 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -21,22 +22,27 @@ class FlatIndex { public: FlatIndex(GpuResources* res, int dim, - bool l2Distance, bool useFloat16, - bool useFloat16Accumulator, bool storeTransposed, MemorySpace space); + /// Whether or not this flat index primarily stores data in float16 bool getUseFloat16() const; /// Returns the number of vectors we contain int getSize() const; + /// Returns the dimensionality of the vectors int getDim() const; /// Reserve storage that can contain at least this many vectors void reserve(size_t numVecs, cudaStream_t stream); + /// Returns the vectors based on the type desired; the FlatIndex must be of + /// the same type (float16 or float32) to not assert + template + Tensor& getVectorsRef(); + /// Returns a reference to our vectors currently in use Tensor& getVectorsFloat32Ref(); @@ -55,6 +61,8 
@@ class FlatIndex { void query(Tensor& vecs, Tensor& bitset, int k, + faiss::MetricType metric, + float metricArg, Tensor& outDistances, Tensor& outIndices, bool exactDistance); @@ -62,7 +70,9 @@ class FlatIndex { void query(Tensor& vecs, Tensor& bitset, int k, - Tensor& outDistances, + faiss::MetricType metric, + float metricArg, + Tensor& outDistances, Tensor& outIndices, bool exactDistance); @@ -95,16 +105,10 @@ class FlatIndex { /// Float16 data format const bool useFloat16_; - /// For supporting hardware, whether or not we use Hgemm - const bool useFloat16Accumulator_; - /// Store vectors in transposed layout for speed; makes addition to /// the index slower const bool storeTransposed_; - /// L2 or inner product distance? - bool l2Distance_; - /// Memory space for our allocations MemorySpace space_; @@ -124,9 +128,6 @@ class FlatIndex { /// Precomputed L2 norms DeviceTensor norms_; - - /// Precomputed L2 norms, float16 form - DeviceTensor normsHalf_; }; } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/GeneralDistance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/GeneralDistance.cuh new file mode 100644 index 0000000000..5dae58638c --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/GeneralDistance.cuh @@ -0,0 +1,432 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +// +// Kernels for non-L2 / inner product distances +// + +namespace faiss { namespace gpu { + +// Reduction tree operator +template +struct ReduceDistanceOp { + __device__ static DistanceOp reduce(DistanceOp ops[N]) { + DistanceOp vals[N/2]; +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + vals[i] = ops[i * 2]; + vals[i].combine(ops[i * 2 + 1]); + } + + return ReduceDistanceOp::reduce(vals); + } +}; + +template +struct ReduceDistanceOp { + __device__ static DistanceOp reduce(DistanceOp ops[1]) { + return ops[0]; + } +}; + +// Implements a pairwise reduction tree +template +inline __device__ DistanceOp +reduce(const DistanceOp& in, + const T queryTile[kWarpSize][DimMultiple * kWarpSize + 1], + const T vecTile[kWarpSize][DimMultiple * kWarpSize + 1]) { + DistanceOp accs[Unroll]; +#pragma unroll + for (int i = 0; i < Unroll; ++i) { + accs[i] = in.zero(); + } + + auto vecTileBase = vecTile[threadIdx.x]; + auto queryTileBase = queryTile[threadIdx.y]; + +#pragma unroll + for (int i = 0; i < Unroll; ++i) { +#pragma unroll + for (int j = 0; j < (kWarpSize * DimMultiple / Unroll); ++j) { + int idx = i * (kWarpSize * DimMultiple / Unroll) + j; + accs[i].handle(ConvertTo::to(queryTileBase[idx]), + ConvertTo::to(vecTileBase[idx])); + } + } + + return ReduceDistanceOp::reduce(accs); +} + +// Our general distance matrix "multiplication" kernel +template +__launch_bounds__(kWarpSize * kWarpSize) +__global__ void +generalDistance(Tensor query, // m x k + Tensor vec, // n x k + DistanceOp op, + Tensor out) { // m x n + constexpr int kDimMultiple = 1; + + __shared__ T queryTile[kWarpSize][kWarpSize * kDimMultiple + 1]; + __shared__ T vecTile[kWarpSize][kWarpSize * kDimMultiple + 1]; + + // block y -> query + // block x -> vector + + int queryBlock = blockIdx.y * kWarpSize; + int queryThread = queryBlock + threadIdx.y; + + int vecBlock = blockIdx.x * kWarpSize; + int vecThreadLoad = vecBlock + 
threadIdx.y; + int vecThreadSave = vecBlock + threadIdx.x; + + DistanceOp acc = op.zero(); + + auto queryTileBase = queryTile[threadIdx.y]; + auto vecTileBase = vecTile[threadIdx.y]; + + auto queryBase = query[queryThread]; + auto vecBase = vec[vecThreadLoad]; + + if ((blockIdx.x != (gridDim.x - 1)) && (blockIdx.y != (gridDim.y - 1))) { + // + // Interior tile + // + int limit = utils::roundDown(query.getSize(1), kWarpSize * kDimMultiple); + + for (int k = threadIdx.x; k < limit; k += kWarpSize * kDimMultiple) { + // Load query tile +#pragma unroll + for (int i = 0; i < kDimMultiple; ++i) { + queryTileBase[threadIdx.x + i * kWarpSize] = + queryBase[k + i * kWarpSize]; + vecTileBase[threadIdx.x + i * kWarpSize] = + vecBase[k + i * kWarpSize]; + } + + __syncthreads(); + + // thread (y, x) does (query y, vec x) + acc.combine( + reduce(op, queryTile, vecTile)); + + __syncthreads(); + } + + // Handle remainder + if (limit < query.getSize(1)) { +#pragma unroll + for (int i = 0; i < kDimMultiple; ++i) { + int k = limit + threadIdx.x + i * kWarpSize; + bool kInBounds = k < query.getSize(1); + + queryTileBase[threadIdx.x + i * kWarpSize] = + kInBounds ? + queryBase[k] : (T) 0; //DistanceOp::kIdentityData; + + vecTileBase[threadIdx.x + i * kWarpSize] = + kInBounds ? + vecBase[k] : (T) 0; // DistanceOp::kIdentityData; + } + + __syncthreads(); + + int remainder = query.getSize(1) - limit; + + // thread (y, x) does (query y, vec x) +#pragma unroll + for (int i = 0; i < remainder; ++i) { + acc.handle(ConvertTo::to(queryTileBase[i]), + ConvertTo::to(vecTile[threadIdx.x][i])); + } + } + + // Write out results + out[queryThread][vecThreadSave] = acc.reduce(); + } else { + // + // Otherwise, we're an exterior tile + // + + bool queryThreadInBounds = queryThread < query.getSize(0); + bool vecThreadInBoundsLoad = vecThreadLoad < vec.getSize(0); + bool vecThreadInBoundsSave = vecThreadSave < vec.getSize(0); + int limit = utils::roundDown(query.getSize(1), kWarpSize); + + for (int k = threadIdx.x; k < limit; k += kWarpSize) { + // Load query tile + queryTileBase[threadIdx.x] = + queryThreadInBounds ? + queryBase[k] : (T) 0; // DistanceOp::kIdentityData; + + vecTileBase[threadIdx.x] = + vecThreadInBoundsLoad ? + vecBase[k] : (T) 0; // DistanceOp::kIdentityData; + + __syncthreads(); + + // thread (y, x) does (query y, vec x) +#pragma unroll + for (int i = 0; i < kWarpSize; ++i) { + acc.handle(ConvertTo::to(queryTileBase[i]), + ConvertTo::to(vecTile[threadIdx.x][i])); + } + + __syncthreads(); + } + + // Handle remainder + if (limit < query.getSize(1)) { + int k = limit + threadIdx.x; + bool kInBounds = k < query.getSize(1); + + // Load query tile + queryTileBase[threadIdx.x] = + queryThreadInBounds && kInBounds ? + queryBase[k] : (T) 0; // DistanceOp::kIdentityData; + + vecTileBase[threadIdx.x] = + vecThreadInBoundsLoad && kInBounds ? 
+ vecBase[k] : (T) 0; // DistanceOp::kIdentityData; + + __syncthreads(); + + int remainder = query.getSize(1) - limit; + + // thread (y, x) does (query y, vec x) + for (int i = 0; i < remainder; ++i) { + acc.handle(ConvertTo::to(queryTileBase[i]), + ConvertTo::to(vecTile[threadIdx.x][i])); + } + } + + // Write out results + if (queryThreadInBounds && vecThreadInBoundsSave) { + out[queryThread][vecThreadSave] = acc.reduce(); + } + } +} + + +template +void runGeneralDistanceKernel(Tensor& vecs, + Tensor& query, + Tensor& out, + const DistanceOp& op, + cudaStream_t stream) { + FAISS_ASSERT(vecs.getSize(1) == query.getSize(1)); + FAISS_ASSERT(out.getSize(0) == query.getSize(0)); + FAISS_ASSERT(out.getSize(1) == vecs.getSize(0)); + + dim3 grid(utils::divUp(vecs.getSize(0), kWarpSize), + utils::divUp(query.getSize(0), kWarpSize)); + dim3 block(kWarpSize, kWarpSize); + + generalDistance<<>>(query, vecs, op, out); +} + +template +void runGeneralDistance(GpuResources* resources, + Tensor& centroids, + Tensor& queries, + Tensor& bitset, + int k, + const DistanceOp& op, + Tensor& outDistances, + Tensor& outIndices) { + // The # of centroids in `centroids` based on memory layout + auto numCentroids = centroids.getSize(0); + + // The # of queries in `queries` based on memory layout + auto numQueries = queries.getSize(0); + + // The dimensions of the vectors to consider + auto dim = queries.getSize(1); + FAISS_ASSERT((numQueries == 0 || numCentroids == 0) || + dim == centroids.getSize(1)); + + FAISS_ASSERT(outDistances.getSize(0) == numQueries); + FAISS_ASSERT(outIndices.getSize(0) == numQueries); + FAISS_ASSERT(outDistances.getSize(1) == k); + FAISS_ASSERT(outIndices.getSize(1) == k); + + auto& mem = resources->getMemoryManagerCurrentDevice(); + auto defaultStream = resources->getDefaultStreamCurrentDevice(); + + // If we're quering against a 0 sized set, just return empty results + if (centroids.numElements() == 0) { + thrust::fill(thrust::cuda::par.on(defaultStream), + outDistances.data(), outDistances.end(), + Limits::getMax()); + + thrust::fill(thrust::cuda::par.on(defaultStream), + outIndices.data(), outIndices.end(), + -1); + + return; + } + + // By default, aim to use up to 512 MB of memory for the processing, with both + // number of queries and number of centroids being at least 512. 
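Concretely, on a hypothetical device with more than 8 GB of memory holding float32 vectors with dim > 32, the budget works out as follows (a worked sketch; the helper name and numbers are illustrative, not a faiss API):

#include <cstddef>

// Illustrative only: the arithmetic chooseTileSize performs for a > 8 GB GPU
// and float32 data with dim > 32.
void tileSizeExample(int& tileRows, int& tileCols) {
  size_t targetUsage = (size_t) 1024 * 1024 * 1024; // prefer 1 GB of usage
  targetUsage /= 2 * sizeof(float);                 // two double-streamed float buffers
  tileRows = 512;                                   // dim > 32 keeps the 512 sweet spot
  tileCols = (int) (targetUsage / tileRows);        // 262144 centroids per tile
  // One distance tile is 512 x 262144 floats = 512 MB, so double buffering
  // across the two streams lands on the 1 GB target.
}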
+ int tileRows = 0; + int tileCols = 0; + chooseTileSize(numQueries, + numCentroids, + dim, + sizeof(T), + mem.getSizeAvailable(), + tileRows, + tileCols); + + int numColTiles = utils::divUp(numCentroids, tileCols); + + // We can have any number of vectors to query against, even less than k, in + // which case we'll return -1 for the index + FAISS_ASSERT(k <= GPU_MAX_SELECTION_K); // select limitation + + // Temporary output memory space we'll use + DeviceTensor distanceBuf1( + mem, {tileRows, tileCols}, defaultStream); + DeviceTensor distanceBuf2( + mem, {tileRows, tileCols}, defaultStream); + DeviceTensor* distanceBufs[2] = + {&distanceBuf1, &distanceBuf2}; + + DeviceTensor outDistanceBuf1( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor outDistanceBuf2( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor* outDistanceBufs[2] = + {&outDistanceBuf1, &outDistanceBuf2}; + + DeviceTensor outIndexBuf1( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor outIndexBuf2( + mem, {tileRows, numColTiles * k}, defaultStream); + DeviceTensor* outIndexBufs[2] = + {&outIndexBuf1, &outIndexBuf2}; + + auto streams = resources->getAlternateStreamsCurrentDevice(); + streamWait(streams, {defaultStream}); + + int curStream = 0; + bool interrupt = false; + + // Tile over the input queries + for (int i = 0; i < numQueries; i += tileRows) { + if (interrupt || InterruptCallback::is_interrupted()) { + interrupt = true; + break; + } + + int curQuerySize = std::min(tileRows, numQueries - i); + + auto outDistanceView = + outDistances.narrow(0, i, curQuerySize); + auto outIndexView = + outIndices.narrow(0, i, curQuerySize); + + auto queryView = + queries.narrow(0, i, curQuerySize); + + auto outDistanceBufRowView = + outDistanceBufs[curStream]->narrow(0, 0, curQuerySize); + auto outIndexBufRowView = + outIndexBufs[curStream]->narrow(0, 0, curQuerySize); + + // Tile over the centroids + for (int j = 0; j < numCentroids; j += tileCols) { + if (InterruptCallback::is_interrupted()) { + interrupt = true; + break; + } + + int curCentroidSize = std::min(tileCols, numCentroids - j); + int curColTile = j / tileCols; + + auto centroidsView = + sliceCentroids(centroids, true, j, curCentroidSize); + + auto distanceBufView = distanceBufs[curStream]-> + narrow(0, 0, curQuerySize).narrow(1, 0, curCentroidSize); + + auto outDistanceBufColView = + outDistanceBufRowView.narrow(1, k * curColTile, k); + auto outIndexBufColView = + outIndexBufRowView.narrow(1, k * curColTile, k); + + runGeneralDistanceKernel(centroidsView, + queryView, + distanceBufView, + op, + streams[curStream]); + + // For IP, just k-select the output for this tile + if (tileCols == numCentroids) { + // Write into the final output + runBlockSelect(distanceBufView, + bitset, + outDistanceView, + outIndexView, + DistanceOp::kDirection, k, streams[curStream]); + } else { + // Write into the intermediate output + runBlockSelect(distanceBufView, + bitset, + outDistanceBufColView, + outIndexBufColView, + DistanceOp::kDirection, k, streams[curStream]); + } + } + + // As we're finished with processing a full set of centroids, perform the + // final k-selection + if (tileCols != numCentroids) { + // The indices are tile-relative; for each tile of k, we need to add + // tileCols to the index + runIncrementIndex(outIndexBufRowView, k, tileCols, streams[curStream]); + + runBlockSelectPair(outDistanceBufRowView, + outIndexBufRowView, + bitset, + outDistanceView, + outIndexView, + DistanceOp::kDirection, k, streams[curStream]); + } + + 
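The tile-relative index fix-up that runIncrementIndex performs can be mirrored on the host. In this sketch (illustrative, not a faiss API), chunk c of a row's numColTiles * k entries holds the top-k of centroid tile c with tile-local indices, so entry i is shifted by (i / k) * tileCols to become a global centroid id before the final selection:

#include <vector>

// Host-side analogue of the incrementIndex kernel for one row of the
// [numColTiles * k] intermediate index buffer.
void incrementIndexRef(std::vector<int>& rowIndices, int k, int tileCols) {
  for (size_t i = 0; i < rowIndices.size(); ++i) {
    rowIndices[i] += (int) (i / k) * tileCols; // chunk i/k starts at (i/k) * tileCols
  }
}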
curStream = (curStream + 1) % 2; + } + + // Have the desired ordering stream wait on the multi-stream + streamWait({defaultStream}, streams); + + if (interrupt) { + FAISS_THROW_MSG("interrupted"); + } + + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh b/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh index ad41143291..32675a5a4e 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/GpuScalarQuantizer.cuh @@ -159,8 +159,8 @@ struct Codec<(int)QuantizerType::QT_fp16, 2> { half2* p = (half2*) &((uint8_t*) data)[vec * bytesPerVec]; half2 pd = p[d]; - out[0] = Convert()(pd.x); - out[1] = Convert()(pd.y); + out[0] = Convert()(__low2half(pd)); + out[1] = Convert()(__high2half(pd)); } inline __device__ float decodePartial(void* data, int vec, int d, @@ -176,11 +176,7 @@ struct Codec<(int)QuantizerType::QT_fp16, 2> { half h0 = Convert()(v[0]); half h1 = Convert()(v[1]); - half2 h; - h.x = h0; - h.y = h1; - - p[d] = h; + p[d] = __halves2half2(h0, h1); } inline __device__ void encodePartial(void* data, int vec, int d, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu index 2c8b7fbcac..48c362e36e 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cu @@ -22,11 +22,15 @@ namespace faiss { namespace gpu { IVFBase::IVFBase(GpuResources* resources, + faiss::MetricType metric, + float metricArg, FlatIndex* quantizer, int bytesPerVector, IndicesOptions indicesOptions, MemorySpace space) : resources_(resources), + metric_(metric), + metricArg_(metricArg), quantizer_(quantizer), bytesPerVector_(bytesPerVector), indicesOptions_(indicesOptions), diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh index f1c39867fa..987439269d 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFBase.cuh @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -25,6 +26,8 @@ struct FlatIndex; class IVFBase { public: IVFBase(GpuResources* resources, + faiss::MetricType metric, + float metricArg, /// We do not own this reference FlatIndex* quantizer, int bytesPerVector, @@ -89,6 +92,12 @@ class IVFBase { /// Collection of GPU resources that we use GpuResources* resources_; + /// Metric type of the index + faiss::MetricType metric_; + + /// Metric arg + float metricArg_; + /// Quantizer object FlatIndex* quantizer_; diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu index 8de3964e1d..acebb5799e 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu @@ -30,17 +30,19 @@ namespace faiss { namespace gpu { IVFFlat::IVFFlat(GpuResources* resources, FlatIndex* quantizer, faiss::MetricType metric, + float metricArg, bool useResidual, faiss::ScalarQuantizer* scalarQ, IndicesOptions indicesOptions, MemorySpace space) : IVFBase(resources, + metric, + metricArg, quantizer, scalarQ ? scalarQ->code_size : sizeof(float) * quantizer->getDim(), indicesOptions, space), - metric_(metric), useResidual_(useResidual), scalarQ_(scalarQ ? 
new GpuScalarQuantizer(*scalarQ) : nullptr) { } @@ -48,7 +50,6 @@ IVFFlat::IVFFlat(GpuResources* resources, IVFFlat::~IVFFlat() { } - void IVFFlat::copyCodeVectorsFromCpu(const float* vecs, const long* indices, @@ -176,7 +177,8 @@ IVFFlat::classifyAndAddVectors(Tensor& vecs, /* pseudo bitset */ DeviceTensor bitset(mem, {0}, stream); - quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false); + quantizer_->query(vecs, bitset, 1, metric_, metricArg_, + listDistance2d, listIds2d, false); // Calculate residuals for these vectors, if needed DeviceTensor @@ -352,12 +354,14 @@ IVFFlat::query(Tensor& queries, DeviceTensor coarseIndices(mem, {queries.getSize(0), nprobe}, stream); + DeviceTensor coarseBitset(mem, {0}, stream); // Find the `nprobe` closest lists; we can use int indices both // internally and externally - DeviceTensor coarseBitset(mem, {0}, stream); quantizer_->query(queries, coarseBitset, nprobe, + metric_, + metricArg_, coarseDistances, coarseIndices, false); diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh index 1697a7b1ee..6b29419121 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh @@ -20,6 +20,7 @@ class IVFFlat : public IVFBase { /// We do not own this reference FlatIndex* quantizer, faiss::MetricType metric, + float metricArg, bool useResidual, /// Optional ScalarQuantizer faiss::ScalarQuantizer* scalarQ, @@ -39,14 +40,13 @@ class IVFFlat : public IVFBase { const long* indices, const std::vector& list_length); - /// Adds the given vectors to this index. + /// Adds the given vectors to this index. /// The input data must be on our current device. /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. int classifyAndAddVectors(Tensor& vecs, Tensor& indices); - /// Find the approximate k nearest neigbors for `queries` against /// our database void query(Tensor& queries, @@ -61,9 +61,6 @@ class IVFFlat : public IVFBase { size_t getVectorMemorySize() const; private: - /// Metric type used - faiss::MetricType metric_; - /// Do we encode the residual from a coarse quantizer or not? 
bool useResidual_; diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu index deb2720db4..2b76e0a09b 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cu @@ -7,8 +7,8 @@ #include +#include #include -#include #include #include #include @@ -25,6 +25,26 @@ namespace faiss { namespace gpu { +namespace { + +/// Sort direction per each metric +inline bool metricToSortDirection(MetricType mt) { + switch (mt) { + case MetricType::METRIC_INNER_PRODUCT: + // highest + return true; + case MetricType::METRIC_L2: + // lowest + return false; + default: + // unhandled metric + FAISS_ASSERT(false); + return false; + } +} + +} + // Number of warps we create per block of IVFFlatScan constexpr int kIVFFlatScanWarps = 4; @@ -56,8 +76,7 @@ struct IVFFlatScan { // Walk the list of vectors for this warp for (int vec = vecStart; vec < vecEnd; ++vec) { - // Reduce in dist - float dist = 0.0f; + Metric dist = metric.zero(); // Scan the dimensions availabe that have whole units for the decoder, // as the decoder may handle more than one dimension at once (leaving the @@ -76,7 +95,7 @@ struct IVFFlatScan { #pragma unroll for (int j = 0; j < Codec::kDimPerIter; ++j) { - dist += metric.distance(query[realDim + j], vecVal[j]); + dist.handle(query[realDim + j], vecVal[j]); } } @@ -94,16 +113,16 @@ struct IVFFlatScan { float vecVal = codec.decodePartial(vecData, vec, limit, laneId); vecVal += useResidual ? residualBaseSlice[remainderDim] : 0.0f; - dist += metric.distance(query[remainderDim], vecVal); + dist.handle(query[remainderDim], vecVal); } } } // Reduce distance within warp - dist = warpReduceAllSum(dist); + auto warpDist = warpReduceAllSum(dist.reduce()); if (laneId == 0) { - distanceOut[vec] = dist; + distanceOut[vec] = warpDist; } } } @@ -221,9 +240,9 @@ runIVFFlatScanTile(Tensor& queries, #define HANDLE_METRICS \ do { \ if (metricType == MetricType::METRIC_L2) { \ - L2Metric metric; RUN_IVF_FLAT; \ + L2Distance metric; RUN_IVF_FLAT; \ } else { \ - IPMetric metric; RUN_IVF_FLAT; \ + IPDistance metric; RUN_IVF_FLAT; \ } \ } while (0) diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh index ac63579dd2..2b67cba06f 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlatScan.cuh @@ -8,10 +8,10 @@ #pragma once -#include +#include #include +#include #include -#include #include namespace faiss { namespace gpu { diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu index 6f48f3a6db..01c91f9c3f 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu @@ -32,6 +32,8 @@ namespace faiss { namespace gpu { IVFPQ::IVFPQ(GpuResources* resources, + faiss::MetricType metric, + float metricArg, FlatIndex* quantizer, int numSubQuantizers, int bitsPerSubQuantizer, @@ -40,6 +42,8 @@ IVFPQ::IVFPQ(GpuResources* resources, bool useFloat16LookupTables, MemorySpace space) : IVFBase(resources, + metric, + metricArg, quantizer, numSubQuantizers, indicesOptions, @@ -95,6 +99,11 @@ IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) { void IVFPQ::setPrecomputedCodes(bool enable) { + if (enable && metric_ == MetricType::METRIC_INNER_PRODUCT) { + FAISS_THROW_MSG("Precomputed codes are not needed for GpuIndexIVFPQ " + "with 
METRIC_INNER_PRODUCT"); + } + if (precomputedCodes_ != enable) { precomputedCodes_ = enable; @@ -114,11 +123,9 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); - FAISS_ASSERT(!quantizer_->getUseFloat16()); - auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); auto& mem = resources_->getMemoryManagerCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice(); - + // Number of valid vectors that we actually add; we return this int numAdded = 0; @@ -130,7 +137,14 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, /* pseudo bitset */ DeviceTensor bitset(mem, {0}, stream); - quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false); + quantizer_->query(vecs, + bitset, + 1, + metric_, + metricArg_, + listDistance, + listIds2d, + false); // Copy the lists that we wish to append to back to the CPU // FIXME: really this can be into pinned memory and a true async @@ -142,7 +156,13 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, DeviceTensor residuals( mem, {vecs.getSize(0), vecs.getSize(1)}, stream); - runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + if (quantizer_->getUseFloat16()) { + auto& coarseCentroids = quantizer_->getVectorsFloat16Ref(); + runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + } else { + auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); + } // Residuals are in the form // (vec x numSubQuantizer x dimPerSubQuantizer) @@ -425,8 +445,11 @@ IVFPQ::setPQCentroids_(float* data) { pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode); } +template void -IVFPQ::precomputeCodes_() { +IVFPQ::precomputeCodesT_() { + FAISS_ASSERT(metric_ == MetricType::METRIC_L2); + // // d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R) // --------------- --------------------------- ------- @@ -435,8 +458,6 @@ IVFPQ::precomputeCodes_() { // Terms 1 and 3 are available only at query time. We compute term 2 // here. 
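Spelling the comment out, the decomposition follows from expanding the reconstruction $y = y_C + y_R$ of a database vector from its coarse centroid and PQ residual:

$$\|x - (y_C + y_R)\|^2 = \underbrace{\|x - y_C\|^2}_{\text{term 1}} + \underbrace{\|y_R\|^2 + 2\langle y_C, y_R\rangle}_{\text{term 2}} - \underbrace{2\langle x, y_R\rangle}_{\text{term 3}}$$

Term 1 is the coarse distance already produced by the quantizer query, term 2 depends only on the coarse centroids and the PQ codebooks and is what gets tabulated here, and term 3 is built per query as a lookup table over sub-quantizer codes.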
- FAISS_ASSERT(!quantizer_->getUseFloat16()); - auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); // Compute ||y_R||^2 by treating // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim) @@ -459,9 +480,10 @@ IVFPQ::precomputeCodes_() { // (centroid id)(sub q)(dim) // Transpose (centroid id)(sub q)(sub dim) to // (sub q)(centroid id)(sub dim) - auto centroidView = coarseCentroids.view<3>( + auto& coarseCentroids = quantizer_->template getVectorsRef(); + auto centroidView = coarseCentroids.template view<3>( {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_}); - DeviceTensor centroidsTransposed( + DeviceTensor centroidsTransposed( {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_}); runTransposeAny(centroidView, 0, 1, centroidsTransposed, @@ -507,6 +529,15 @@ IVFPQ::precomputeCodes_() { } } +void +IVFPQ::precomputeCodes_() { + if (quantizer_->getUseFloat16()) { + precomputeCodesT_(); + } else { + precomputeCodesT_(); + } +} + void IVFPQ::query(Tensor& queries, Tensor& bitset, @@ -538,11 +569,15 @@ IVFPQ::query(Tensor& queries, quantizer_->query(queries, coarseBitset, nprobe, + metric_, + metricArg_, coarseDistances, coarseIndices, true); if (precomputedCodes_) { + FAISS_ASSERT(metric_ == MetricType::METRIC_L2); + runPQPrecomputedCodes_(queries, bitset, coarseDistances, @@ -601,6 +636,8 @@ IVFPQ::runPQPrecomputedCodes_( int k, Tensor& outDistances, Tensor& outIndices) { + FAISS_ASSERT(metric_ == MetricType::METRIC_L2); + auto& mem = resources_->getMemoryManagerCurrentDevice(); auto stream = resources_->getDefaultStreamCurrentDevice(); @@ -675,8 +712,9 @@ IVFPQ::runPQPrecomputedCodes_( resources_); } +template void -IVFPQ::runPQNoPrecomputedCodes_( +IVFPQ::runPQNoPrecomputedCodesT_( Tensor& queries, Tensor& bitset, DeviceTensor& coarseDistances, @@ -684,8 +722,7 @@ IVFPQ::runPQNoPrecomputedCodes_( int k, Tensor& outDistances, Tensor& outIndices) { - FAISS_ASSERT(!quantizer_->getUseFloat16()); - auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); + auto& coarseCentroids = quantizer_->template getVectorsRef(); runPQScanMultiPassNoPrecomputed(queries, coarseCentroids, @@ -702,9 +739,38 @@ IVFPQ::runPQNoPrecomputedCodes_( deviceListLengths_, maxListLength_, k, + metric_, outDistances, outIndices, resources_); } +void +IVFPQ::runPQNoPrecomputedCodes_( + Tensor& queries, + Tensor& bitset, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices) { + if (quantizer_->getUseFloat16()) { + runPQNoPrecomputedCodesT_(queries, + bitset, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); + } else { + runPQNoPrecomputedCodesT_(queries, + bitset, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); + } +} + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh index 8771e7a507..db8cbb68aa 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh @@ -8,6 +8,7 @@ #pragma once +#include #include #include @@ -17,6 +18,8 @@ namespace faiss { namespace gpu { class IVFPQ : public IVFBase { public: IVFPQ(GpuResources* resources, + faiss::MetricType metric, + float metricArg, /// We do not own this reference FlatIndex* quantizer, int numSubQuantizers, @@ -81,6 +84,11 @@ class IVFPQ : public IVFBase { /// Calculate precomputed residual distance information void precomputeCodes_(); + /// Calculate precomputed residual 
distance information (for different coarse + /// centroid type) + template + void precomputeCodesT_(); + /// Runs kernels for scanning inverted lists with precomputed codes void runPQPrecomputedCodes_(Tensor& queries, Tensor& bitset, @@ -99,6 +107,17 @@ class IVFPQ : public IVFBase { Tensor& outDistances, Tensor& outIndices); + /// Runs kernels for scanning inverted lists without precomputed codes (for + /// different coarse centroid type) + template + void runPQNoPrecomputedCodesT_(Tensor& queries, + Tensor& bitset, + DeviceTensor& coarseDistances, + DeviceTensor& coarseIndices, + int k, + Tensor& outDistances, + Tensor& outIndices); + private: /// Number of sub-quantizers per vector const int numSubQuantizers_; diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu index c8e7228095..96bcd8e95b 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu @@ -33,9 +33,9 @@ template __global__ void l2NormRowMajor(Tensor input, - Tensor output) { + Tensor output) { extern __shared__ char smemByte[]; // #warps * RowTileSize elements - T* smem = (T*) smemByte; + float* smem = (float*) smemByte; IndexType numWarps = utils::divUp(blockDim.x, kWarpSize); IndexType laneId = getLaneId(); @@ -43,19 +43,20 @@ l2NormRowMajor(Tensor input, bool lastRowTile = (blockIdx.x == (gridDim.x - 1)); IndexType rowStart = RowTileSize * blockIdx.x; - T rowNorm[RowTileSize]; + // accumulate in f32 + float rowNorm[RowTileSize]; if (lastRowTile) { // We are handling the very end of the input matrix rows for (IndexType row = 0; row < input.getSize(0) - rowStart; ++row) { if (NormLoop) { - rowNorm[0] = Math::zero(); + rowNorm[0] = 0; for (IndexType col = threadIdx.x; col < input.getSize(1); col += blockDim.x) { TVec val = input[rowStart + row][col]; val = Math::mul(val, val); - rowNorm[0] = Math::add(rowNorm[0], Math::reduceAdd(val)); + rowNorm[0] = rowNorm[0] + Math::reduceAdd(val); } } else { TVec val = input[rowStart + row][threadIdx.x]; @@ -79,7 +80,7 @@ l2NormRowMajor(Tensor input, #pragma unroll for (int row = 0; row < RowTileSize; ++row) { - rowNorm[row] = Math::zero(); + rowNorm[row] = 0; } for (IndexType col = threadIdx.x; @@ -96,8 +97,8 @@ l2NormRowMajor(Tensor input, #pragma unroll for (int row = 0; row < RowTileSize; ++row) { - rowNorm[row] = Math::add(rowNorm[row], - Math::reduceAdd(tmp[row])); + rowNorm[row] = rowNorm[row] + + Math::reduceAdd(tmp[row]); } } } else { @@ -140,8 +141,7 @@ l2NormRowMajor(Tensor input, if (warpId == 0) { #pragma unroll for (int row = 0; row < RowTileSize; ++row) { - rowNorm[row] = laneId < numWarps ? - smem[row * numWarps + laneId] : Math::zero(); + rowNorm[row] = laneId < numWarps ? smem[row * numWarps + laneId] : 0; } #pragma unroll @@ -158,15 +158,13 @@ l2NormRowMajor(Tensor input, if (lastRowTile) { if (outCol < output.getSize(0)) { output[outCol] = - NormSquared ? rowNorm[row] : - ConvertTo::to( - sqrtf(ConvertTo::to(rowNorm[row]))); + NormSquared ? ConvertTo::to(rowNorm[row]) : + sqrtf(ConvertTo::to(rowNorm[row])); } } else { output[outCol] = - NormSquared ? rowNorm[row] : - ConvertTo::to( - sqrtf(ConvertTo::to(rowNorm[row]))); + NormSquared ? 
ConvertTo::to(rowNorm[row]) : + sqrtf(ConvertTo::to(rowNorm[row])); } } } @@ -180,7 +178,7 @@ l2NormRowMajor(Tensor input, template __global__ void l2NormColMajor(Tensor input, - Tensor output) { + Tensor output) { // grid-stride loop to handle all batch elements for (IndexType batch = blockIdx.x * blockDim.x + threadIdx.x; batch < input.getSize(1); @@ -198,14 +196,14 @@ l2NormColMajor(Tensor input, sum = sqrtf(sum); } - output[batch] = ConvertTo::to(sum); + output[batch] = ConvertTo::to(sum); } } template void runL2Norm(Tensor& input, bool inputRowMajor, - Tensor& output, + Tensor& output, bool normSquared, cudaStream_t stream) { IndexType maxThreads = (IndexType) getMaxThreadsCurrentDevice(); @@ -248,7 +246,7 @@ void runL2Norm(Tensor& input, auto grid = dim3(utils::divUp(inputV.getSize(0), rowTileSize)); auto block = dim3(numThreads); - auto smem = sizeof(T) * rowTileSize * utils::divUp(numThreads, kWarpSize); + auto smem = sizeof(float) * rowTileSize * utils::divUp(numThreads, kWarpSize); RUN_L2_ROW_MAJOR(T, TVec, inputV); } else { @@ -261,7 +259,7 @@ void runL2Norm(Tensor& input, auto grid = dim3(utils::divUp(input.getSize(0), rowTileSize)); auto block = dim3(numThreads); - auto smem = sizeof(T) * rowTileSize * utils::divUp(numThreads, kWarpSize); + auto smem = sizeof(float) * rowTileSize * utils::divUp(numThreads, kWarpSize); RUN_L2_ROW_MAJOR(T, T, input); } @@ -313,7 +311,7 @@ void runL2Norm(Tensor& input, void runL2Norm(Tensor& input, bool inputRowMajor, - Tensor& output, + Tensor& output, bool normSquared, cudaStream_t stream) { if (input.canUseIndexType()) { diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh index 1841f4b3a3..c4d5850802 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh @@ -20,7 +20,7 @@ void runL2Norm(Tensor& input, void runL2Norm(Tensor& input, bool inputRowMajor, - Tensor& output, + Tensor& output, bool normSquared, cudaStream_t stream); diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu index a46138c69f..ed83c058ca 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu @@ -59,7 +59,7 @@ __global__ void l2SelectMin1(Tensor productDistances, col += blockDim.x) { if (bitsetEmpty || (!(bitset[col >> 3] & (0x1 << (col & 0x7))))) { distance[0] = Math::add(centroidDistances[col], - productDistances[row][col]); + productDistances[row][col]); } else { distance[0] = (T)(1.0 / 0.0); } @@ -72,8 +72,8 @@ __global__ void l2SelectMin1(Tensor productDistances, // Reduce within the block threadMin[0] = - blockReduceAll, Min >, false, false>( - threadMin[0], Min >(), blockMin); + blockReduceAll, Min>, false, false>( + threadMin[0], Min>(), blockMin); if (threadIdx.x == 0) { outDistances[row][0] = threadMin[0].k; @@ -111,8 +111,13 @@ __global__ void l2SelectMin1(Tensor productDistances, } // Reduce within the block - blockReduceAll, Min >, false, false>( - threadMin, Min >(), blockMin); + blockReduceAll, + Min >, + false, + false>(threadMin, + Min >(), + blockMin); if (threadIdx.x == 0) { #pragma unroll @@ -159,7 +164,6 @@ __global__ void l2SelectMinK(Tensor productDistances, } else { v = (T)(1.0 / 0.0); } - heap.add(v, i); } @@ -170,7 +174,6 @@ __global__ void l2SelectMinK(Tensor productDistances, } else { v = (T)(1.0 / 0.0); } - heap.addThreadQ(v, i); } @@ -181,7 +184,6 @@ __global__ void 
l2SelectMinK(Tensor productDistances, } } - template void runL2SelectMin(Tensor& productDistances, Tensor& centroidDistances, @@ -213,9 +215,9 @@ void runL2SelectMin(Tensor& productDistances, #define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ do { \ l2SelectMinK \ - <<>>(productDistances, centroidDistances, \ - bitset, outDistances, outIndices, \ - k, Limits::getMax()); \ + <<>>(productDistances, centroidDistances, bitset, \ + outDistances, outIndices, \ + k, Limits::getMax()); \ } while (0) // block size 128 for everything <= 1024 @@ -237,6 +239,7 @@ void runL2SelectMin(Tensor& productDistances, // smaller block for less shared memory RUN_L2_SELECT(64, 2048, 8); #endif + } else { FAISS_ASSERT(false); } @@ -261,20 +264,4 @@ void runL2SelectMin(Tensor& productDistances, stream); } -void runL2SelectMin(Tensor& productDistances, - Tensor& centroidDistances, - Tensor& bitset, - Tensor& outDistances, - Tensor& outIndices, - int k, - cudaStream_t stream) { - runL2SelectMin(productDistances, - centroidDistances, - bitset, - outDistances, - outIndices, - k, - stream); -} - } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh index 705b7c3355..b29552d786 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh @@ -20,12 +20,4 @@ void runL2SelectMin(Tensor& productDistances, int k, cudaStream_t stream); -void runL2SelectMin(Tensor& productDistances, - Tensor& centroidDistances, - Tensor& bitset, - Tensor& outDistances, - Tensor& outIndices, - int k, - cudaStream_t stream); - } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh new file mode 100644 index 0000000000..c3ef87f2e7 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh @@ -0,0 +1,561 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +// Kernel responsible for calculating distance from residual vector to +// each product quantizer code centroid +template +__global__ void +__launch_bounds__(288, 4) +pqCodeDistances(Tensor queries, + int queriesPerBlock, + Tensor coarseCentroids, + Tensor pqCentroids, + Tensor topQueryToCentroid, + // (query id)(coarse)(subquantizer)(code) -> dist + Tensor outCodeDistances) { + const auto numSubQuantizers = pqCentroids.getSize(0); + const auto dimsPerSubQuantizer = pqCentroids.getSize(1); + assert(DimsPerSubQuantizer == dimsPerSubQuantizer); + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + bool isLoadingThread = threadIdx.x >= codesPerSubQuantizer; + int loadingThreadId = threadIdx.x - codesPerSubQuantizer; + + extern __shared__ float smem[]; + + // Each thread calculates a single code + float subQuantizerData[DimsPerSubQuantizer]; + + auto code = threadIdx.x; + auto subQuantizer = blockIdx.y; + + // Each thread will load the pq centroid data for the code that it + // is processing +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer; ++i) { + subQuantizerData[i] = pqCentroids[subQuantizer][i][code].ldg(); + } + + // Where we store our query vector + float* smemQuery = smem; + + // Where we store our residual vector; this is double buffered so we + // can be loading the next one while processing the current one + float* smemResidual1 = &smemQuery[DimsPerSubQuantizer]; + float* smemResidual2 = &smemResidual1[DimsPerSubQuantizer]; + + // Where we pre-load the coarse centroid IDs + int* coarseIds = (int*) &smemResidual2[DimsPerSubQuantizer]; + + // Each thread is calculating the distance for a single code, + // performing the reductions locally + + // Handle multiple queries per block + auto startQueryId = blockIdx.x * queriesPerBlock; + auto numQueries = queries.getSize(0) - startQueryId; + if (numQueries > queriesPerBlock) { + numQueries = queriesPerBlock; + } + + for (int query = 0; query < numQueries; ++query) { + auto queryId = startQueryId + query; + + auto querySubQuantizer = + queries[queryId][subQuantizer * DimsPerSubQuantizer].data(); + + // Load current query vector + for (int i = threadIdx.x; i < DimsPerSubQuantizer; i += blockDim.x) { + smemQuery[i] = querySubQuantizer[i]; + } + + // Load list of coarse centroids found + for (int i = threadIdx.x; + i < topQueryToCentroid.getSize(1); i += blockDim.x) { + coarseIds[i] = topQueryToCentroid[queryId][i]; + } + + // We need coarseIds below + // FIXME: investigate loading separately, so we don't need this + __syncthreads(); + + // Preload first buffer of residual data + if (isLoadingThread) { + for (int i = loadingThreadId; + i < DimsPerSubQuantizer; + i += blockDim.x - codesPerSubQuantizer) { + auto coarseId = coarseIds[0]; + // In case NaNs were in the original query data + coarseId = coarseId == -1 ? 
0 : coarseId; + auto coarseCentroidSubQuantizer = + coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); + + if (L2Distance) { + smemResidual1[i] = smemQuery[i] - + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } else { + smemResidual1[i] = + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } + } + } + + // The block walks the list for a single query + for (int coarse = 0; coarse < topQueryToCentroid.getSize(1); ++coarse) { + // Wait for smemResidual1 to be loaded + __syncthreads(); + + if (isLoadingThread) { + // Preload second buffer of residual data + for (int i = loadingThreadId; + i < DimsPerSubQuantizer; + i += blockDim.x - codesPerSubQuantizer) { + // FIXME: try always making this centroid id 0 so we can + // terminate + if (coarse != (topQueryToCentroid.getSize(1) - 1)) { + auto coarseId = coarseIds[coarse + 1]; + // In case NaNs were in the original query data + coarseId = coarseId == -1 ? 0 : coarseId; + + auto coarseCentroidSubQuantizer = + coarseCentroids[coarseId] + [subQuantizer * dimsPerSubQuantizer].data(); + + if (L2Distance) { + smemResidual2[i] = smemQuery[i] - + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } else { + smemResidual2[i] = + ConvertTo::to(coarseCentroidSubQuantizer[i]); + } + } + } + } else { + // These are the processing threads + float dist = 0.0f; + + constexpr int kUnroll = 4; + constexpr int kRemainder = DimsPerSubQuantizer % kUnroll; + constexpr int kRemainderBase = DimsPerSubQuantizer - kRemainder; + float vals[kUnroll]; + + // Calculate residual - pqCentroid for each dim that we're + // processing + + // Unrolled loop + if (L2Distance) { +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] -= subQuantizerData[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= vals[j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } + } + } else { + // Inner product: query slice against the reconstructed sub-quantizer + // for this coarse cell (query o (centroid + subQCentroid)) +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] += subQuantizerData[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= smemQuery[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } + } + } + + // Remainder loop + if (L2Distance) { +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] -= subQuantizerData[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= vals[j]; + } + } else { + // Inner product + // Inner product: query slice against the reconstructed sub-quantizer + // for this coarse cell (query o (centroid + subQCentroid)) +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] += subQuantizerData[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= smemQuery[kRemainderBase + j]; + } + } + +#pragma 
unroll + for (int j = 0; j < kRemainder; ++j) { + dist += vals[j]; + } + + // We have the distance for our code; write it out + outCodeDistances[queryId][coarse][subQuantizer][code] = + ConvertTo::to(dist); + } // !isLoadingThread + + // Swap residual buffers + float* tmp = smemResidual1; + smemResidual1 = smemResidual2; + smemResidual2 = tmp; + } + } +} + +template +__global__ void +residualVector(Tensor queries, + Tensor coarseCentroids, + Tensor topQueryToCentroid, + int numSubDim, + // output is transposed: + // (sub q)(query id)(centroid id)(sub dim) + Tensor residual) { + // block x is query id + // block y is centroid id + // thread x is dim + auto queryId = blockIdx.x; + auto centroidId = blockIdx.y; + + int realCentroidId = topQueryToCentroid[queryId][centroidId]; + + for (int dim = threadIdx.x; dim < queries.getSize(1); dim += blockDim.x) { + float q = queries[queryId][dim]; + float c = ConvertTo::to(coarseCentroids[realCentroidId][dim]); + + residual[dim / numSubDim][queryId][centroidId][dim % numSubDim] = q - c; + } +} + +template +void +runResidualVector(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + Tensor& residual, + cudaStream_t stream) { + auto grid = + dim3(topQueryToCentroid.getSize(0), topQueryToCentroid.getSize(1)); + auto block = dim3(std::min(queries.getSize(1), getMaxThreadsCurrentDevice())); + + residualVector<<>>( + queries, coarseCentroids, topQueryToCentroid, pqCentroids.getSize(1), + residual); + + CUDA_TEST_ERROR(); +} + +template +void +runPQCodeDistancesMM(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool useFloat16Lookup, + DeviceMemory& mem, + cublasHandle_t handle, + cudaStream_t stream) { + // Calculate (q - c) residual vector + // (sub q)(query id)(centroid id)(sub dim) + DeviceTensor residual( + mem, + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0), + topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}, + stream); + + runResidualVector(pqCentroids, queries, + coarseCentroids, topQueryToCentroid, + residual, stream); + + // Calculate ||q - c||^2 + DeviceTensor residualNorms( + mem, + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1)}, + stream); + + auto residualView2 = residual.view<2>( + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}); + + runL2Norm(residualView2, true, residualNorms, true, stream); + + // Perform a batch MM: + // (sub q) x {(q * c)(sub dim) x (sub dim)(code)} => + // (sub q) x {(q * c)(code)} + auto residualView3 = residual.view<3>( + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + pqCentroids.getSize(1)}); + + DeviceTensor residualDistance( + mem, + {pqCentroids.getSize(0), + topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + pqCentroids.getSize(2)}, + stream); + + runIteratedMatrixMult(residualDistance, false, + residualView3, false, + pqCentroids, false, + -2.0f, 0.0f, + handle, + stream); + + // Sum ||q - c||^2 along rows + auto residualDistanceView2 = residualDistance.view<2>( + {pqCentroids.getSize(0) * + topQueryToCentroid.getSize(0) * + topQueryToCentroid.getSize(1), + pqCentroids.getSize(2)}); + + runSumAlongRows(residualNorms, residualDistanceView2, false, stream); + + Tensor outCodeDistancesF; + DeviceTensor outCodeDistancesFloatMem; + + if (useFloat16Lookup) { + outCodeDistancesFloatMem = 
DeviceTensor( + mem, {outCodeDistances.getSize(0), + outCodeDistances.getSize(1), + outCodeDistances.getSize(2), + outCodeDistances.getSize(3)}, + stream); + + outCodeDistancesF = outCodeDistancesFloatMem; + } else { + outCodeDistancesF = outCodeDistances.toTensor(); + } + + // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which + // is where we build our output distances) + auto outCodeDistancesView = outCodeDistancesF.view<3>( + {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + outCodeDistances.getSize(2), + outCodeDistances.getSize(3)}); + + runTransposeAny(residualDistance, 0, 1, outCodeDistancesView, stream); + + // Calculate code norms per each sub-dim + // (sub q)(sub dim)(code) is pqCentroids + // transpose to (sub q)(code)(sub dim) + DeviceTensor pqCentroidsTranspose( + mem, + {pqCentroids.getSize(0), pqCentroids.getSize(2), pqCentroids.getSize(1)}, + stream); + + runTransposeAny(pqCentroids, 1, 2, pqCentroidsTranspose, stream); + + auto pqCentroidsTransposeView = pqCentroidsTranspose.view<2>( + {pqCentroids.getSize(0) * pqCentroids.getSize(2), + pqCentroids.getSize(1)}); + + DeviceTensor pqCentroidsNorm( + mem, + {pqCentroids.getSize(0) * pqCentroids.getSize(2)}, + stream); + + runL2Norm(pqCentroidsTransposeView, true, pqCentroidsNorm, true, stream); + + // View output as (q * c)(sub q * code), and add centroid norm to + // each row + auto outDistancesCodeViewCols = outCodeDistancesView.view<2>( + {topQueryToCentroid.getSize(0) * topQueryToCentroid.getSize(1), + outCodeDistances.getSize(2) * outCodeDistances.getSize(3)}); + + runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); + + if (useFloat16Lookup) { + // Need to convert back + auto outCodeDistancesH = outCodeDistances.toTensor(); + convertTensor(stream, + outCodeDistancesF, + outCodeDistancesH); + } +} + +template +void +runPQCodeDistances(Tensor& pqCentroids, + Tensor& queries, + Tensor& coarseCentroids, + Tensor& topQueryToCentroid, + NoTypeTensor<4, true>& outCodeDistances, + bool l2Distance, + bool useFloat16Lookup, + cudaStream_t stream) { + const auto numSubQuantizers = pqCentroids.getSize(0); + const auto dimsPerSubQuantizer = pqCentroids.getSize(1); + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + // FIXME: tune + // Reuse of pq centroid data is based on both # of queries * nprobe, + // and we should really be tiling in both dimensions + constexpr int kQueriesPerBlock = 8; + + auto grid = dim3(utils::divUp(queries.getSize(0), kQueriesPerBlock), + numSubQuantizers); + + // Reserve one block of threads for double buffering + // FIXME: probably impractical for large # of dims? 
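As a sanity check on the launch shape, worked numbers for the common 8-bit PQ case (illustrative, assuming 256 codes per sub-quantizer):

// With codesPerSubQuantizer = 256 and dimsPerSubQuantizer = 32:
//   loadingThreads = roundUp(32, kWarpSize = 32) = 32
//   block          = 256 + 32                    = 288 threads
// matching the __launch_bounds__(288, 4) annotation on pqCodeDistances.
// Shared memory holds the query slice, two double-buffered residual slices,
// and the coarse id list: 3 * 32 * sizeof(float) + nprobe * sizeof(int) bytes.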
+ auto loadingThreads = utils::roundUp(dimsPerSubQuantizer, kWarpSize); + auto block = dim3(codesPerSubQuantizer + loadingThreads); + + auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) + + topQueryToCentroid.getSize(1) * sizeof(int); + +#define RUN_CODE(DIMS, L2) \ + do { \ + if (useFloat16Lookup) { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } else { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } \ + } while (0) + +#define CODE_L2(DIMS) \ + do { \ + if (l2Distance) { \ + RUN_CODE(DIMS, true); \ + } else { \ + RUN_CODE(DIMS, false); \ + } \ + } while (0) + + switch (dimsPerSubQuantizer) { + case 1: + CODE_L2(1); + break; + case 2: + CODE_L2(2); + break; + case 3: + CODE_L2(3); + break; + case 4: + CODE_L2(4); + break; + case 6: + CODE_L2(6); + break; + case 8: + CODE_L2(8); + break; + case 10: + CODE_L2(10); + break; + case 12: + CODE_L2(12); + break; + case 16: + CODE_L2(16); + break; + case 20: + CODE_L2(20); + break; + case 24: + CODE_L2(24); + break; + case 28: + CODE_L2(28); + break; + case 32: + CODE_L2(32); + break; + // FIXME: larger sizes require too many registers - we need the + // MM implementation working + default: + FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer " + "not currently supported"); + } + +#undef RUN_CODE +#undef CODE_L2 + + CUDA_TEST_ERROR(); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu index 73a6952dcc..817990b4a6 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu @@ -38,7 +38,7 @@ struct Converter { // Kernel responsible for calculating distance from residual vector to // each product quantizer code centroid -template +template __global__ void __launch_bounds__(288, 4) pqCodeDistances(Tensor queries, @@ -124,7 +124,11 @@ pqCodeDistances(Tensor queries, auto coarseCentroidSubQuantizer = coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); - smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; + if (L2Distance) { + smemResidual1[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; + } else { + smemResidual1[i] = coarseCentroidSubQuantizer[i]; + } } } @@ -146,9 +150,14 @@ pqCodeDistances(Tensor queries, coarseId = coarseId == -1 ? 
0 : coarseId; auto coarseCentroidSubQuantizer = - coarseCentroids[coarseId][subQuantizer * dimsPerSubQuantizer].data(); + coarseCentroids[coarseId] + [subQuantizer * dimsPerSubQuantizer].data(); - smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; + if (L2Distance) { + smemResidual2[i] = smemQuery[i] - coarseCentroidSubQuantizer[i]; + } else { + smemResidual2[i] = coarseCentroidSubQuantizer[i]; + } } } } else { @@ -164,44 +173,90 @@ pqCodeDistances(Tensor queries, // processing // Unrolled loop + if (L2Distance) { #pragma unroll - for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = smemResidual1[i * kUnroll + j]; + for (int j = 0; j < kUnroll; ++j) { + vals[j] -= subQuantizerData[i * kUnroll + j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= vals[j]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } } + } else { + // Inner product: query slice against the reconstructed sub-quantizer + // for this coarse cell (query o (centroid + subQCentroid)) +#pragma unroll + for (int i = 0; i < DimsPerSubQuantizer / kUnroll; ++i) { +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = smemResidual1[i * kUnroll + j]; + } #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] -= subQuantizerData[i * kUnroll + j]; - } + for (int j = 0; j < kUnroll; ++j) { + vals[j] += subQuantizerData[i * kUnroll + j]; + } #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] *= vals[j]; - } + for (int j = 0; j < kUnroll; ++j) { + vals[j] *= smemQuery[i * kUnroll + j]; + } #pragma unroll - for (int j = 0; j < kUnroll; ++j) { - dist += vals[j]; + for (int j = 0; j < kUnroll; ++j) { + dist += vals[j]; + } } } // Remainder loop + if (L2Distance) { #pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] = smemResidual1[kRemainderBase + j]; - } + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } #pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] -= subQuantizerData[kRemainderBase + j]; - } + for (int j = 0; j < kRemainder; ++j) { + vals[j] -= subQuantizerData[kRemainderBase + j]; + } #pragma unroll - for (int j = 0; j < kRemainder; ++j) { - vals[j] *= vals[j]; + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= vals[j]; + } + } else { + // Inner product + // Inner product: query slice against the reconstructed sub-quantizer + // for this coarse cell (query o (centroid + subQCentroid)) +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] = smemResidual1[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] += subQuantizerData[kRemainderBase + j]; + } + +#pragma unroll + for (int j = 0; j < kRemainder; ++j) { + vals[j] *= smemQuery[kRemainderBase + j]; + } } #pragma unroll @@ -405,6 +460,7 @@ runPQCodeDistances(Tensor& pqCentroids, Tensor& coarseCentroids, Tensor& topQueryToCentroid, NoTypeTensor<4, true>& outCodeDistances, + bool l2Distance, bool useFloat16Lookup, cudaStream_t stream) { const auto numSubQuantizers = pqCentroids.getSize(0); @@ -427,73 +483,83 @@ runPQCodeDistances(Tensor& pqCentroids, auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) + topQueryToCentroid.getSize(1) * sizeof(int); -#define CODE_DISTANCE(DIMS) \ +#define RUN_CODE(DIMS, L2) \ do { \ if 
(useFloat16Lookup) { \ auto outCodeDistancesT = outCodeDistances.toTensor(); \ \ - pqCodeDistances<<>>( \ + pqCodeDistances<<>>( \ queries, kQueriesPerBlock, \ coarseCentroids, pqCentroids, \ topQueryToCentroid, outCodeDistancesT); \ } else { \ auto outCodeDistancesT = outCodeDistances.toTensor(); \ \ - pqCodeDistances<<>>( \ + pqCodeDistances<<>>( \ queries, kQueriesPerBlock, \ coarseCentroids, pqCentroids, \ topQueryToCentroid, outCodeDistancesT); \ } \ } while (0) +#define CODE_L2(DIMS) \ + do { \ + if (l2Distance) { \ + RUN_CODE(DIMS, true); \ + } else { \ + RUN_CODE(DIMS, false); \ + } \ + } while (0) + switch (dimsPerSubQuantizer) { case 1: - CODE_DISTANCE(1); + CODE_L2(1); break; case 2: - CODE_DISTANCE(2); + CODE_L2(2); break; case 3: - CODE_DISTANCE(3); + CODE_L2(3); break; case 4: - CODE_DISTANCE(4); + CODE_L2(4); break; case 6: - CODE_DISTANCE(6); + CODE_L2(6); break; case 8: - CODE_DISTANCE(8); + CODE_L2(8); break; case 10: - CODE_DISTANCE(10); + CODE_L2(10); break; case 12: - CODE_DISTANCE(12); + CODE_L2(12); break; case 16: - CODE_DISTANCE(16); + CODE_L2(16); break; case 20: - CODE_DISTANCE(20); + CODE_L2(20); break; case 24: - CODE_DISTANCE(24); + CODE_L2(24); break; case 28: - CODE_DISTANCE(28); + CODE_L2(28); break; case 32: - CODE_DISTANCE(32); + CODE_L2(32); break; // FIXME: larger sizes require too many registers - we need the // MM implementation working default: - FAISS_ASSERT(false); - break; + FAISS_THROW_MSG("Too many dimensions (>32) per subquantizer " + "not currently supported"); } -#undef CODE_DISTANCE +#undef RUN_CODE +#undef CODE_L2 CUDA_TEST_ERROR(); } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh index 67f9159178..0add947f2c 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cuh @@ -20,17 +20,20 @@ class DeviceMemory; /// Calculates the distance from the (query - centroid) residual to /// each sub-code vector, for the given list of query results in /// topQueryToCentroid +template void runPQCodeDistances(Tensor& pqCentroids, Tensor& queries, - Tensor& coarseCentroids, + Tensor& coarseCentroids, Tensor& topQueryToCentroid, NoTypeTensor<4, true>& outCodeDistances, + bool l2Distance, bool useFloat16Lookup, cudaStream_t stream); +template void runPQCodeDistancesMM(Tensor& pqCentroids, Tensor& queries, - Tensor& coarseCentroids, + Tensor& coarseCentroids, Tensor& topQueryToCentroid, NoTypeTensor<4, true>& outCodeDistances, bool useFloat16Lookup, @@ -39,3 +42,5 @@ void runPQCodeDistancesMM(Tensor& pqCentroids, cudaStream_t stream); } } // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh new file mode 100644 index 0000000000..ffc81b1f8c --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh @@ -0,0 +1,606 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { namespace gpu { + +// This must be kept in sync with PQCodeDistances.cu +inline bool isSupportedNoPrecomputedSubDimSize(int dims) { + switch (dims) { + case 1: + case 2: + case 3: + case 4: + case 6: + case 8: + case 10: + case 12: + case 16: + case 20: + case 24: + case 28: + case 32: + return true; + default: + // FIXME: larger sizes require too many registers - we need the + // MM implementation working + return false; + } +} + +template +struct LoadCodeDistances { + static inline __device__ void load(LookupT* smem, + LookupT* codes, + int numCodes) { + constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); + + // We can only use the vector type if the data is guaranteed to be + // aligned. The codes are innermost, so if it is evenly divisible, + // then any slice will be aligned. + if (numCodes % kWordSize == 0) { + // Load the data by float4 for efficiency, and then handle any remainder + // limitVec is the number of whole vec words we can load, in terms + // of whole blocks performing the load + constexpr int kUnroll = 2; + int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); + limitVec *= kUnroll * blockDim.x; + + LookupVecT* smemV = (LookupVecT*) smem; + LookupVecT* codesV = (LookupVecT*) codes; + + for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { + LookupVecT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = + LoadStore::load(&codesV[i + j * blockDim.x]); + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); + } + } + + // This is where we start loading the remainder that does not evenly + // fit into kUnroll x blockDim.x + int remainder = limitVec * kWordSize; + + for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { + smem[i] = codes[i]; + } + } else { + // Potential unaligned load + constexpr int kUnroll = 4; + + int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); + + int i = threadIdx.x; + for (; i < limit; i += kUnroll * blockDim.x) { + LookupT vals[kUnroll]; + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + vals[j] = codes[i + j * blockDim.x]; + } + +#pragma unroll + for (int j = 0; j < kUnroll; ++j) { + smem[i + j * blockDim.x] = vals[j]; + } + } + + for (; i < numCodes; i += blockDim.x) { + smem[i] = codes[i]; + } + } + } +}; + +template +__global__ void +pqScanNoPrecomputedMultiPass(Tensor queries, + Tensor pqCentroids, + Tensor topQueryToCentroid, + Tensor codeDistances, + void** listCodes, + int* listLengths, + Tensor prefixSumOffsets, + Tensor distance) { + const auto codesPerSubQuantizer = pqCentroids.getSize(2); + + // Where the pq code -> residual distance is stored + extern __shared__ char smemCodeDistances[]; + LookupT* codeDist = (LookupT*) smemCodeDistances; + + // Each block handles a single query + auto queryId = blockIdx.y; + auto probeId = blockIdx.x; + + // This is where we start writing out data + // We ensure that before the array (at offset -1), there is a 0 value + int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); + float* distanceOut = distance[outBase].data(); + + auto listId = topQueryToCentroid[queryId][probeId]; + // Safety guard in case NaNs in input cause no list ID to be generated + if (listId == -1) { + return; + } + + unsigned char* codeList = (unsigned char*) listCodes[listId]; + int limit = listLengths[listId]; + 
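+  // Each PQ code is a single byte, so the list is consumed as packed
+  // 32-bit words of up to four codes; kNumCode32 below is the per-thread
+  // register count of such words (e.g. NumSubQuantizers = 8 gives 2 words,
+  // NumSubQuantizers <= 4 gives 1).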
+ constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 1 : + (NumSubQuantizers / 4); + unsigned int code32[kNumCode32]; + unsigned int nextCode32[kNumCode32]; + + // We double-buffer the code loading, which improves memory utilization + if (threadIdx.x < limit) { + LoadCode32::load(code32, codeList, threadIdx.x); + } + + LoadCodeDistances::load( + codeDist, + codeDistances[queryId][probeId].data(), + codeDistances.getSize(2) * codeDistances.getSize(3)); + + // Prevent WAR dependencies + __syncthreads(); + + // Each thread handles one code element in the list, with a + // block-wide stride + for (int codeIndex = threadIdx.x; + codeIndex < limit; + codeIndex += blockDim.x) { + // Prefetch next codes + if (codeIndex + blockDim.x < limit) { + LoadCode32::load( + nextCode32, codeList, codeIndex + blockDim.x); + } + + float dist = 0.0f; + +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + constexpr int kBytesPerCode32 = + NumSubQuantizers < 4 ? NumSubQuantizers : 4; + + if (kBytesPerCode32 == 1) { + auto code = code32[0]; + dist = ConvertTo::to(codeDist[code]); + + } else { +#pragma unroll + for (int byte = 0; byte < kBytesPerCode32; ++byte) { + auto code = getByte(code32[word], byte * 8, 8); + + auto offset = + codesPerSubQuantizer * (word * kBytesPerCode32 + byte); + + dist += ConvertTo::to(codeDist[offset + code]); + } + } + } + + // Write out intermediate distance result + // We do not maintain indices here, in order to reduce global + // memory traffic. Those are recovered in the final selection step. + distanceOut[codeIndex] = dist; + + // Rotate buffers +#pragma unroll + for (int word = 0; word < kNumCode32; ++word) { + code32[word] = nextCode32[word]; + } + } +} + +template +void +runMultiPassTile(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + NoTypeTensor<4, true>& codeDistances, + Tensor& topQueryToCentroid, + Tensor& bitset, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + Tensor& thrustMem, + Tensor& prefixSumOffsets, + Tensor& allDistances, + Tensor& heapDistances, + Tensor& heapIndices, + int k, + faiss::MetricType metric, + Tensor& outDistances, + Tensor& outIndices, + cudaStream_t stream) { + // We only support two metrics at the moment + FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT || + metric == MetricType::METRIC_L2); + + bool l2Distance = metric == MetricType::METRIC_L2; + + // Calculate offset lengths, so we know where to write out + // intermediate results + runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, + thrustMem, stream); + + // Calculate residual code distances, since this is without + // precomputed codes + runPQCodeDistances(pqCentroidsInnermostCode, + queries, + centroids, + topQueryToCentroid, + codeDistances, + l2Distance, + useFloat16Lookup, + stream); + + // Convert all codes to a distance, and write out (distance, + // index) values for all intermediate results + { + auto kThreadsPerBlock = 256; + + auto grid = dim3(topQueryToCentroid.getSize(1), + topQueryToCentroid.getSize(0)); + auto block = dim3(kThreadsPerBlock); + + // pq centroid distances + auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + + smem *= numSubQuantizers * numSubQuantizerCodes; + FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); + +#define RUN_PQ_OPT(NUM_SUB_Q, LOOKUP_T, LOOKUP_VEC_T) \ + do { \ + auto codeDistancesT = codeDistances.toTensor(); \ + \ + pqScanNoPrecomputedMultiPass \ + <<>>( \ + queries, \ + pqCentroidsInnermostCode, \ + topQueryToCentroid, \ + codeDistancesT, \ + listCodes.data().get(), \ + listLengths.data().get(), \ + prefixSumOffsets, \ + allDistances); \ + } while (0) + +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + if (useFloat16Lookup) { \ + RUN_PQ_OPT(NUM_SUB_Q, half, Half8); \ + } else { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } \ + } while (0) + + switch (bytesPerCode) { + case 1: + RUN_PQ(1); + break; + case 2: + RUN_PQ(2); + break; + case 3: + RUN_PQ(3); + break; + case 4: + RUN_PQ(4); + break; + case 8: + RUN_PQ(8); + break; + case 12: + RUN_PQ(12); + break; + case 16: + RUN_PQ(16); + break; + case 20: + RUN_PQ(20); + break; + case 24: + RUN_PQ(24); + break; + case 28: + RUN_PQ(28); + break; + case 32: + RUN_PQ(32); + break; + case 40: + RUN_PQ(40); + break; + case 48: + RUN_PQ(48); + break; + case 56: + RUN_PQ(56); + break; + case 64: + RUN_PQ(64); + break; + case 96: + RUN_PQ(96); + break; + default: + FAISS_ASSERT(false); + break; + } + +#undef RUN_PQ +#undef RUN_PQ_OPT + } + + CUDA_TEST_ERROR(); + + // k-select the output in chunks, to increase parallelism + runPass1SelectLists(listIndices, + indicesOptions, + prefixSumOffsets, + topQueryToCentroid, + bitset, + allDistances, + topQueryToCentroid.getSize(1), + k, + !l2Distance, // L2 distance chooses smallest + heapDistances, + heapIndices, + stream); + + // k-select final output + auto flatHeapDistances = heapDistances.downcastInner<2>(); + auto flatHeapIndices = heapIndices.downcastInner<2>(); + + runPass2SelectLists(flatHeapDistances, + flatHeapIndices, + listIndices, + indicesOptions, + prefixSumOffsets, + topQueryToCentroid, + k, + !l2Distance, // L2 distance chooses smallest + outDistances, + outIndices, + stream); +} + +template +void +runPQScanMultiPassNoPrecomputed(Tensor& queries, + Tensor& centroids, + Tensor& pqCentroidsInnermostCode, + Tensor& topQueryToCentroid, + Tensor& bitset, + bool useFloat16Lookup, + int bytesPerCode, + int numSubQuantizers, + int numSubQuantizerCodes, + thrust::device_vector& listCodes, + thrust::device_vector& listIndices, + IndicesOptions indicesOptions, + thrust::device_vector& listLengths, + int maxListLength, + int k, + faiss::MetricType metric, + // output + Tensor& outDistances, + // output + Tensor& outIndices, + GpuResources* res) { + constexpr int kMinQueryTileSize = 8; + constexpr int kMaxQueryTileSize = 128; + constexpr int kThrustMemSize = 16384; + + int nprobe = topQueryToCentroid.getSize(1); + + auto& mem = res->getMemoryManagerCurrentDevice(); + auto stream = res->getDefaultStreamCurrentDevice(); + + // Make a reservation for Thrust to do its dirty work (global memory + // cross-block reduction space); hopefully this is large enough. + DeviceTensor thrustMem1( + mem, {kThrustMemSize}, stream); + DeviceTensor thrustMem2( + mem, {kThrustMemSize}, stream); + DeviceTensor* thrustMem[2] = + {&thrustMem1, &thrustMem2}; + + // How much temporary storage is available? + // If possible, we'd like to fit within the space available. 
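+  // The per-query figure computed below is doubled because two alternate
+  // streams stay in flight; it accounts for the prefix-sum offsets, the
+  // per-list distance scratch, the (sub q, code) lookup tables and the
+  // first-pass k-select heaps.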
+ size_t sizeAvailable = mem.getSizeAvailable(); + + // We run two passes of heap selection + // This is the size of the first-level heap passes + constexpr int kNProbeSplit = 8; + int pass2Chunks = std::min(nprobe, kNProbeSplit); + + size_t sizeForFirstSelectPass = + pass2Chunks * k * (sizeof(float) + sizeof(int)); + + // How much temporary storage we need per each query + size_t sizePerQuery = + 2 * // streams + ((nprobe * sizeof(int) + sizeof(int)) + // prefixSumOffsets + nprobe * maxListLength * sizeof(float) + // allDistances + // residual distances + nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) + + sizeForFirstSelectPass); + + int queryTileSize = (int) (sizeAvailable / sizePerQuery); + + if (queryTileSize < kMinQueryTileSize) { + queryTileSize = kMinQueryTileSize; + } else if (queryTileSize > kMaxQueryTileSize) { + queryTileSize = kMaxQueryTileSize; + } + + // FIXME: we should adjust queryTileSize to deal with this, since + // indexing is in int32 + FAISS_ASSERT(queryTileSize * nprobe * maxListLength < + std::numeric_limits::max()); + + // Temporary memory buffers + // Make sure there is space prior to the start which will be 0, and + // will handle the boundary condition without branches + DeviceTensor prefixSumOffsetSpace1( + mem, {queryTileSize * nprobe + 1}, stream); + DeviceTensor prefixSumOffsetSpace2( + mem, {queryTileSize * nprobe + 1}, stream); + + DeviceTensor prefixSumOffsets1( + prefixSumOffsetSpace1[1].data(), + {queryTileSize, nprobe}); + DeviceTensor prefixSumOffsets2( + prefixSumOffsetSpace2[1].data(), + {queryTileSize, nprobe}); + DeviceTensor* prefixSumOffsets[2] = + {&prefixSumOffsets1, &prefixSumOffsets2}; + + // Make sure the element before prefixSumOffsets is 0, since we + // depend upon simple, boundary-less indexing to get proper results + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace1.data(), + 0, + sizeof(int), + stream)); + CUDA_VERIFY(cudaMemsetAsync(prefixSumOffsetSpace2.data(), + 0, + sizeof(int), + stream)); + + int codeDistanceTypeSize = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + + int totalCodeDistancesSize = + queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * + codeDistanceTypeSize; + + DeviceTensor codeDistances1Mem( + mem, {totalCodeDistancesSize}, stream); + NoTypeTensor<4, true> codeDistances1( + codeDistances1Mem.data(), + codeDistanceTypeSize, + {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); + + DeviceTensor codeDistances2Mem( + mem, {totalCodeDistancesSize}, stream); + NoTypeTensor<4, true> codeDistances2( + codeDistances2Mem.data(), + codeDistanceTypeSize, + {queryTileSize, nprobe, numSubQuantizers, numSubQuantizerCodes}); + + NoTypeTensor<4, true>* codeDistances[2] = + {&codeDistances1, &codeDistances2}; + + DeviceTensor allDistances1( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor allDistances2( + mem, {queryTileSize * nprobe * maxListLength}, stream); + DeviceTensor* allDistances[2] = + {&allDistances1, &allDistances2}; + + DeviceTensor heapDistances1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapDistances2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapDistances[2] = + {&heapDistances1, &heapDistances2}; + + DeviceTensor heapIndices1( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor heapIndices2( + mem, {queryTileSize, pass2Chunks, k}, stream); + DeviceTensor* heapIndices[2] = + {&heapIndices1, &heapIndices2}; + + auto streams = res->getAlternateStreamsCurrentDevice(); + streamWait(streams, {stream}); + + int curStream = 0; + + for (int query = 0; query < queries.getSize(0); query += queryTileSize) { + int numQueriesInTile = + std::min(queryTileSize, queries.getSize(0) - query); + + auto prefixSumOffsetsView = + prefixSumOffsets[curStream]->narrowOutermost(0, numQueriesInTile); + + auto codeDistancesView = + codeDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto coarseIndicesView = + topQueryToCentroid.narrowOutermost(query, numQueriesInTile); + auto queryView = + queries.narrowOutermost(query, numQueriesInTile); + + auto heapDistancesView = + heapDistances[curStream]->narrowOutermost(0, numQueriesInTile); + auto heapIndicesView = + heapIndices[curStream]->narrowOutermost(0, numQueriesInTile); + + auto outDistanceView = + outDistances.narrowOutermost(query, numQueriesInTile); + auto outIndicesView = + outIndices.narrowOutermost(query, numQueriesInTile); + + runMultiPassTile(queryView, + centroids, + pqCentroidsInnermostCode, + codeDistancesView, + coarseIndicesView, + bitset, + useFloat16Lookup, + bytesPerCode, + numSubQuantizers, + numSubQuantizerCodes, + listCodes, + listIndices, + indicesOptions, + listLengths, + *thrustMem[curStream], + prefixSumOffsetsView, + *allDistances[curStream], + heapDistancesView, + heapIndicesView, + k, + metric, + outDistanceView, + outIndicesView, + streams[curStream]); + + curStream = (curStream + 1) % 2; + } + + streamWait({stream}, streams); +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu index 57030c9e34..ecf35fffdb 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu @@ -23,200 +23,200 @@ namespace faiss { namespace gpu { -// This must be kept in sync with PQCodeDistances.cu -bool isSupportedNoPrecomputedSubDimSize(int dims) { - switch (dims) { - case 1: - case 2: - case 3: - case 4: - case 6: - case 8: 
- case 10: - case 12: - case 16: - case 20: - case 24: - case 28: - case 32: - return true; - default: - // FIXME: larger sizes require too many registers - we need the - // MM implementation working - return false; - } -} - -template -struct LoadCodeDistances { - static inline __device__ void load(LookupT* smem, - LookupT* codes, - int numCodes) { - constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); - - // We can only use the vector type if the data is guaranteed to be - // aligned. The codes are innermost, so if it is evenly divisible, - // then any slice will be aligned. - if (numCodes % kWordSize == 0) { - // Load the data by float4 for efficiency, and then handle any remainder - // limitVec is the number of whole vec words we can load, in terms - // of whole blocks performing the load - constexpr int kUnroll = 2; - int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); - limitVec *= kUnroll * blockDim.x; - - LookupVecT* smemV = (LookupVecT*) smem; - LookupVecT* codesV = (LookupVecT*) codes; - - for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { - LookupVecT vals[kUnroll]; - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = - LoadStore::load(&codesV[i + j * blockDim.x]); - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); - } - } - - // This is where we start loading the remainder that does not evenly - // fit into kUnroll x blockDim.x - int remainder = limitVec * kWordSize; - - for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { - smem[i] = codes[i]; - } - } else { - // Potential unaligned load - constexpr int kUnroll = 4; - - int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); - - int i = threadIdx.x; - for (; i < limit; i += kUnroll * blockDim.x) { - LookupT vals[kUnroll]; - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - vals[j] = codes[i + j * blockDim.x]; - } - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - smem[i + j * blockDim.x] = vals[j]; - } - } - - for (; i < numCodes; i += blockDim.x) { - smem[i] = codes[i]; - } - } - } -}; - -template -__global__ void -pqScanNoPrecomputedMultiPass(Tensor queries, - Tensor pqCentroids, - Tensor topQueryToCentroid, - Tensor codeDistances, - void** listCodes, - int* listLengths, - Tensor prefixSumOffsets, - Tensor distance) { - const auto codesPerSubQuantizer = pqCentroids.getSize(2); - - // Where the pq code -> residual distance is stored - extern __shared__ char smemCodeDistances[]; - LookupT* codeDist = (LookupT*) smemCodeDistances; - - // Each block handles a single query - auto queryId = blockIdx.y; - auto probeId = blockIdx.x; - - // This is where we start writing out data - // We ensure that before the array (at offset -1), there is a 0 value - int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); - float* distanceOut = distance[outBase].data(); - - auto listId = topQueryToCentroid[queryId][probeId]; - // Safety guard in case NaNs in input cause no list ID to be generated - if (listId == -1) { - return; - } - - unsigned char* codeList = (unsigned char*) listCodes[listId]; - int limit = listLengths[listId]; - - constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 
1 : - (NumSubQuantizers / 4); - unsigned int code32[kNumCode32]; - unsigned int nextCode32[kNumCode32]; - - // We double-buffer the code loading, which improves memory utilization - if (threadIdx.x < limit) { - LoadCode32::load(code32, codeList, threadIdx.x); - } - - LoadCodeDistances::load( - codeDist, - codeDistances[queryId][probeId].data(), - codeDistances.getSize(2) * codeDistances.getSize(3)); - - // Prevent WAR dependencies - __syncthreads(); - - // Each thread handles one code element in the list, with a - // block-wide stride - for (int codeIndex = threadIdx.x; - codeIndex < limit; - codeIndex += blockDim.x) { - // Prefetch next codes - if (codeIndex + blockDim.x < limit) { - LoadCode32::load( - nextCode32, codeList, codeIndex + blockDim.x); - } - - float dist = 0.0f; - -#pragma unroll - for (int word = 0; word < kNumCode32; ++word) { - constexpr int kBytesPerCode32 = - NumSubQuantizers < 4 ? NumSubQuantizers : 4; - - if (kBytesPerCode32 == 1) { - auto code = code32[0]; - dist = ConvertTo::to(codeDist[code]); - - } else { -#pragma unroll - for (int byte = 0; byte < kBytesPerCode32; ++byte) { - auto code = getByte(code32[word], byte * 8, 8); - - auto offset = - codesPerSubQuantizer * (word * kBytesPerCode32 + byte); - - dist += ConvertTo::to(codeDist[offset + code]); - } - } - } - - // Write out intermediate distance result - // We do not maintain indices here, in order to reduce global - // memory traffic. Those are recovered in the final selection step. - distanceOut[codeIndex] = dist; - - // Rotate buffers -#pragma unroll - for (int word = 0; word < kNumCode32; ++word) { - code32[word] = nextCode32[word]; - } - } -} +//// This must be kept in sync with PQCodeDistances.cu +//bool isSupportedNoPrecomputedSubDimSize(int dims) { +// switch (dims) { +// case 1: +// case 2: +// case 3: +// case 4: +// case 6: +// case 8: +// case 10: +// case 12: +// case 16: +// case 20: +// case 24: +// case 28: +// case 32: +// return true; +// default: +// // FIXME: larger sizes require too many registers - we need the +// // MM implementation working +// return false; +// } +//} +// +//template +//struct LoadCodeDistances { +// static inline __device__ void load(LookupT* smem, +// LookupT* codes, +// int numCodes) { +// constexpr int kWordSize = sizeof(LookupVecT) / sizeof(LookupT); +// +// // We can only use the vector type if the data is guaranteed to be +// // aligned. The codes are innermost, so if it is evenly divisible, +// // then any slice will be aligned. 
+// if (numCodes % kWordSize == 0) { +// // Load the data by float4 for efficiency, and then handle any remainder +// // limitVec is the number of whole vec words we can load, in terms +// // of whole blocks performing the load +// constexpr int kUnroll = 2; +// int limitVec = numCodes / (kUnroll * kWordSize * blockDim.x); +// limitVec *= kUnroll * blockDim.x; +// +// LookupVecT* smemV = (LookupVecT*) smem; +// LookupVecT* codesV = (LookupVecT*) codes; +// +// for (int i = threadIdx.x; i < limitVec; i += kUnroll * blockDim.x) { +// LookupVecT vals[kUnroll]; +// +//#pragma unroll +// for (int j = 0; j < kUnroll; ++j) { +// vals[j] = +// LoadStore::load(&codesV[i + j * blockDim.x]); +// } +// +//#pragma unroll +// for (int j = 0; j < kUnroll; ++j) { +// LoadStore::store(&smemV[i + j * blockDim.x], vals[j]); +// } +// } +// +// // This is where we start loading the remainder that does not evenly +// // fit into kUnroll x blockDim.x +// int remainder = limitVec * kWordSize; +// +// for (int i = remainder + threadIdx.x; i < numCodes; i += blockDim.x) { +// smem[i] = codes[i]; +// } +// } else { +// // Potential unaligned load +// constexpr int kUnroll = 4; +// +// int limit = utils::roundDown(numCodes, kUnroll * blockDim.x); +// +// int i = threadIdx.x; +// for (; i < limit; i += kUnroll * blockDim.x) { +// LookupT vals[kUnroll]; +// +//#pragma unroll +// for (int j = 0; j < kUnroll; ++j) { +// vals[j] = codes[i + j * blockDim.x]; +// } +// +//#pragma unroll +// for (int j = 0; j < kUnroll; ++j) { +// smem[i + j * blockDim.x] = vals[j]; +// } +// } +// +// for (; i < numCodes; i += blockDim.x) { +// smem[i] = codes[i]; +// } +// } +// } +//}; +// +//template +//__global__ void +//pqScanNoPrecomputedMultiPass(Tensor queries, +// Tensor pqCentroids, +// Tensor topQueryToCentroid, +// Tensor codeDistances, +// void** listCodes, +// int* listLengths, +// Tensor prefixSumOffsets, +// Tensor distance) { +// const auto codesPerSubQuantizer = pqCentroids.getSize(2); +// +// // Where the pq code -> residual distance is stored +// extern __shared__ char smemCodeDistances[]; +// LookupT* codeDist = (LookupT*) smemCodeDistances; +// +// // Each block handles a single query +// auto queryId = blockIdx.y; +// auto probeId = blockIdx.x; +// +// // This is where we start writing out data +// // We ensure that before the array (at offset -1), there is a 0 value +// int outBase = *(prefixSumOffsets[queryId][probeId].data() - 1); +// float* distanceOut = distance[outBase].data(); +// +// auto listId = topQueryToCentroid[queryId][probeId]; +// // Safety guard in case NaNs in input cause no list ID to be generated +// if (listId == -1) { +// return; +// } +// +// unsigned char* codeList = (unsigned char*) listCodes[listId]; +// int limit = listLengths[listId]; +// +// constexpr int kNumCode32 = NumSubQuantizers <= 4 ? 
1 : +// (NumSubQuantizers / 4); +// unsigned int code32[kNumCode32]; +// unsigned int nextCode32[kNumCode32]; +// +// // We double-buffer the code loading, which improves memory utilization +// if (threadIdx.x < limit) { +// LoadCode32::load(code32, codeList, threadIdx.x); +// } +// +// LoadCodeDistances::load( +// codeDist, +// codeDistances[queryId][probeId].data(), +// codeDistances.getSize(2) * codeDistances.getSize(3)); +// +// // Prevent WAR dependencies +// __syncthreads(); +// +// // Each thread handles one code element in the list, with a +// // block-wide stride +// for (int codeIndex = threadIdx.x; +// codeIndex < limit; +// codeIndex += blockDim.x) { +// // Prefetch next codes +// if (codeIndex + blockDim.x < limit) { +// LoadCode32::load( +// nextCode32, codeList, codeIndex + blockDim.x); +// } +// +// float dist = 0.0f; +// +//#pragma unroll +// for (int word = 0; word < kNumCode32; ++word) { +// constexpr int kBytesPerCode32 = +// NumSubQuantizers < 4 ? NumSubQuantizers : 4; +// +// if (kBytesPerCode32 == 1) { +// auto code = code32[0]; +// dist = ConvertTo::to(codeDist[code]); +// +// } else { +//#pragma unroll +// for (int byte = 0; byte < kBytesPerCode32; ++byte) { +// auto code = getByte(code32[word], byte * 8, 8); +// +// auto offset = +// codesPerSubQuantizer * (word * kBytesPerCode32 + byte); +// +// dist += ConvertTo::to(codeDist[offset + code]); +// } +// } +// } +// +// // Write out intermediate distance result +// // We do not maintain indices here, in order to reduce global +// // memory traffic. Those are recovered in the final selection step. +// distanceOut[codeIndex] = dist; +// +// // Rotate buffers +//#pragma unroll +// for (int word = 0; word < kNumCode32; ++word) { +// code32[word] = nextCode32[word]; +// } +// } +//} void runMultiPassTile(Tensor& queries, @@ -239,9 +239,16 @@ runMultiPassTile(Tensor& queries, Tensor& heapDistances, Tensor& heapIndices, int k, + faiss::MetricType metric, Tensor& outDistances, Tensor& outIndices, cudaStream_t stream) { + // We only support two metrics at the moment + FAISS_ASSERT(metric == MetricType::METRIC_INNER_PRODUCT || + metric == MetricType::METRIC_L2); + + bool l2Distance = metric == MetricType::METRIC_L2; + // Calculate offset lengths, so we know where to write out // intermediate results runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, @@ -254,6 +261,7 @@ runMultiPassTile(Tensor& queries, centroids, topQueryToCentroid, codeDistances, + l2Distance, useFloat16Lookup, stream); @@ -366,7 +374,7 @@ runMultiPassTile(Tensor& queries, allDistances, topQueryToCentroid.getSize(1), k, - false, // L2 distance chooses smallest + !l2Distance, // L2 distance chooses smallest heapDistances, heapIndices, stream); @@ -382,7 +390,7 @@ runMultiPassTile(Tensor& queries, prefixSumOffsets, topQueryToCentroid, k, - false, // L2 distance chooses smallest + !l2Distance, // L2 distance chooses smallest outDistances, outIndices, stream); @@ -403,6 +411,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor& queries, thrust::device_vector& listLengths, int maxListLength, int k, + faiss::MetricType metric, // output Tensor& outDistances, // output @@ -581,6 +590,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor& queries, heapDistancesView, heapIndicesView, k, + metric, outDistanceView, outIndicesView, streams[curStream]); diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh index 50c017c04f..d3c0cc53d5 100644 --- 
a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -20,8 +21,9 @@ class GpuResources; /// per subquantizer? bool isSupportedNoPrecomputedSubDimSize(int dims); +template void runPQScanMultiPassNoPrecomputed(Tensor& queries, - Tensor& centroids, + Tensor& centroids, Tensor& pqCentroidsInnermostCode, Tensor& topQueryToCentroid, Tensor& bitset, @@ -35,6 +37,7 @@ void runPQScanMultiPassNoPrecomputed(Tensor& queries, thrust::device_vector& listLengths, int maxListLength, int k, + faiss::MetricType metric, // output Tensor& outDistances, // output @@ -42,3 +45,5 @@ void runPQScanMultiPassNoPrecomputed(Tensor& queries, GpuResources* res); } } // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu index 3b0e36ba13..20a16382f1 100644 --- a/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfFlat.cu @@ -76,7 +76,6 @@ int main(int argc, char** argv) { GpuIndexFlatConfig config; config.device = dev; config.useFloat16 = FLAGS_use_float16; - config.useFloat16Accumulator = FLAGS_use_float16_math; config.storeTransposed = FLAGS_transposed; config.memorySpace = FLAGS_use_unified_mem ? MemorySpace::Unified : MemorySpace::Device; diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu b/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu index a287ef8444..f188a1b7d3 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuDistance.cu @@ -19,7 +19,8 @@ void testTransposition(bool colMajorVecs, bool colMajorQueries, - faiss::MetricType metric) { + faiss::MetricType metric, + float metricArg = 0) { int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); faiss::gpu::StandardGpuResources res; @@ -28,30 +29,39 @@ void testTransposition(bool colMajorVecs, int dim = faiss::gpu::randVal(20, 150); int numVecs = faiss::gpu::randVal(10, 30000); int numQuery = faiss::gpu::randVal(1, 1024); - int k = faiss::gpu::randVal(20, 70); + int k = std::min(numVecs, faiss::gpu::randVal(20, 70)); // Input data for CPU std::vector vecs = faiss::gpu::randVecs(numVecs, dim); std::vector queries = faiss::gpu::randVecs(numQuery, dim); + if (metric == faiss::MetricType::METRIC_JensenShannon) { + // make values positive + for (auto& v : vecs) { + v = std::abs(v); + if (v == 0) { + v = 1e-6; + } + } + + for (auto& q : queries) { + q = std::abs(q); + if (q == 0) { + q = 1e-6; + } + } + } + // The CPU index is our reference for the results - faiss::IndexFlatL2 cpuIndexL2(dim); - cpuIndexL2.add(numVecs, vecs.data()); + faiss::IndexFlat cpuIndex(dim, metric); + cpuIndex.metric_arg = metricArg; + cpuIndex.add(numVecs, vecs.data()); - std::vector cpuDistanceL2(numQuery * k, 0); - std::vector cpuIndicesL2(numQuery * k, -1); + std::vector cpuDistance(numQuery * k, 0); + std::vector cpuIndices(numQuery * k, -1); - cpuIndexL2.search(numQuery, queries.data(), k, - cpuDistanceL2.data(), cpuIndicesL2.data()); - - faiss::IndexFlatIP cpuIndexIP(dim); - cpuIndexIP.add(numVecs, vecs.data()); - - std::vector cpuDistanceIP(numQuery * k, 0); - std::vector cpuIndicesIP(numQuery * k, -1); - - cpuIndexIP.search(numQuery, queries.data(), k, - cpuDistanceIP.data(), cpuIndicesIP.data()); + cpuIndex.search(numQuery, queries.data(), k, + 
cpuDistance.data(), cpuIndices.data()); // The transpose and distance code assumes the desired device is already set faiss::gpu::DeviceScope scope(device); @@ -73,29 +83,29 @@ void testTransposition(bool colMajorVecs, std::vector gpuDistance(numQuery * k, 0); std::vector gpuIndices(numQuery * k, -1); - faiss::gpu::bruteForceKnn( - &res, - metric, - colMajorVecs ? vecsT.data() : gpuVecs.data(), - !colMajorVecs, - numVecs, - colMajorQueries ? queriesT.data() : gpuQueries.data(), - !colMajorQueries, - numQuery, - dim, - k, - gpuDistance.data(), - gpuIndices.data()); + faiss::gpu::GpuDistanceParams args; + args.metric = metric; + args.metricArg = metricArg; + args.k = k; + args.dims = dim; + args.vectors = colMajorVecs ? vecsT.data() : gpuVecs.data(); + args.vectorsRowMajor = !colMajorVecs; + args.numVectors = numVecs; + args.queries = colMajorQueries ? queriesT.data() : gpuQueries.data(); + args.queriesRowMajor = !colMajorQueries; + args.numQueries = numQuery; + args.outDistances = gpuDistance.data(); + args.outIndices = gpuIndices.data(); + + faiss::gpu::bfKnn(&res, args); std::stringstream str; str << "metric " << metric << " colMajorVecs " << colMajorVecs << " colMajorQueries " << colMajorQueries; - faiss::gpu::compareLists(metric == faiss::MetricType::METRIC_L2 ? - cpuDistanceL2.data() : cpuDistanceIP.data(), - metric == faiss::MetricType::METRIC_L2 ? - cpuIndicesL2.data() : cpuIndicesIP.data(), + faiss::gpu::compareLists(cpuDistance.data(), + cpuIndices.data(), gpuDistance.data(), gpuIndices.data(), numQuery, k, @@ -107,22 +117,57 @@ void testTransposition(bool colMajorVecs, // Test different memory layouts for brute-force k-NN TEST(TestGpuDistance, Transposition_RR) { testTransposition(false, false, faiss::MetricType::METRIC_L2); -// testTransposition(false, false, faiss::MetricType::METRIC_INNER_PRODUCT); + testTransposition(false, false, faiss::MetricType::METRIC_INNER_PRODUCT); } TEST(TestGpuDistance, Transposition_RC) { testTransposition(false, true, faiss::MetricType::METRIC_L2); -// testTransposition(false, true, faiss::MetricType::METRIC_INNER_PRODUCT); } TEST(TestGpuDistance, Transposition_CR) { testTransposition(true, false, faiss::MetricType::METRIC_L2); -// testTransposition(true, false, faiss::MetricType::METRIC_INNER_PRODUCT); } TEST(TestGpuDistance, Transposition_CC) { testTransposition(true, true, faiss::MetricType::METRIC_L2); -// testTransposition(true, true, faiss::MetricType::METRIC_INNER_PRODUCT); +} + +TEST(TestGpuDistance, L1) { + testTransposition(false, false, faiss::MetricType::METRIC_L1); +} + +// Test other transpositions with the general distance kernel +TEST(TestGpuDistance, L1_RC) { + testTransposition(false, true, faiss::MetricType::METRIC_L1); +} + +TEST(TestGpuDistance, L1_CR) { + testTransposition(true, false, faiss::MetricType::METRIC_L1); +} + +TEST(TestGpuDistance, L1_CC) { + testTransposition(true, true, faiss::MetricType::METRIC_L1); +} + +// Test remainder of metric types +TEST(TestGpuDistance, Linf) { + testTransposition(false, false, faiss::MetricType::METRIC_Linf); +} + +TEST(TestGpuDistance, Lp) { + testTransposition(false, false, faiss::MetricType::METRIC_Lp, 3); +} + +TEST(TestGpuDistance, Canberra) { + testTransposition(false, false, faiss::MetricType::METRIC_Canberra); +} + +TEST(TestGpuDistance, BrayCurtis) { + testTransposition(false, false, faiss::MetricType::METRIC_BrayCurtis); +} + +TEST(TestGpuDistance, JensenShannon) { + testTransposition(false, false, faiss::MetricType::METRIC_JensenShannon); } int main(int argc, char** argv) { 
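The rewritten test above drives the GpuDistanceParams/bfKnn entry point that replaces the positional bruteForceKnn call. A minimal host-side sketch of the same flow, assuming row-major float data already on the host (the struct fields are those assigned in the test; the helper name, its arguments, and the choice of METRIC_L1 are illustrative assumptions):

#include <faiss/Index.h>
#include <faiss/gpu/GpuDistance.h>
#include <faiss/gpu/StandardGpuResources.h>

// Hypothetical helper: brute-force k-NN over row-major host data.
void bruteForceSearch(const float* vecs, int numVecs,
                      const float* queries, int numQueries,
                      int dim, int k,
                      float* outDistances, faiss::Index::idx_t* outIndices) {
  faiss::gpu::StandardGpuResources res;

  faiss::gpu::GpuDistanceParams args;
  args.metric = faiss::MetricType::METRIC_L1;  // e.g. one of the new metrics
  args.k = k;
  args.dims = dim;
  args.vectors = vecs;
  args.vectorsRowMajor = true;
  args.numVectors = numVecs;
  args.queries = queries;
  args.queriesRowMajor = true;
  args.numQueries = numQueries;
  args.outDistances = outDistances;
  args.outIndices = outIndices;

  faiss::gpu::bfKnn(&res, args);
}

Relative to the removed bruteForceKnn, the struct form makes the row-major flags explicit and carries metricArg, which the Lp test above exercises with an argument of 3.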
diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp index 7847b63e21..73cfe20542 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -21,7 +21,8 @@ constexpr float kF32MaxRelErr = 6e-3f; struct TestFlatOptions { TestFlatOptions() - : useL2(true), + : metric(faiss::MetricType::METRIC_L2), + metricArg(0), useFloat16(false), useTransposed(false), numVecsOverride(-1), @@ -30,7 +31,9 @@ struct TestFlatOptions { dimOverride(-1) { } - bool useL2; + faiss::MetricType metric; + float metricArg; + bool useFloat16; bool useTransposed; int numVecsOverride; @@ -41,7 +44,7 @@ struct TestFlatOptions { void testFlat(const TestFlatOptions& opt) { int numVecs = opt.numVecsOverride > 0 ? - opt.numVecsOverride : faiss::gpu::randVal(1000, 20000); + opt.numVecsOverride : faiss::gpu::randVal(1000, 5000); int dim = opt.dimOverride > 0 ? opt.dimOverride : faiss::gpu::randVal(50, 800); int numQuery = opt.numQueriesOverride > 0 ? @@ -57,12 +60,8 @@ void testFlat(const TestFlatOptions& opt) { k = opt.kOverride; } - faiss::IndexFlatIP cpuIndexIP(dim); - faiss::IndexFlatL2 cpuIndexL2(dim); - - faiss::IndexFlat* cpuIndex = - opt.useL2 ? (faiss::IndexFlat*) &cpuIndexL2 : - (faiss::IndexFlat*) &cpuIndexIP; + faiss::IndexFlat cpuIndex(dim, opt.metric); + cpuIndex.metric_arg = opt.metricArg; // Construct on a random device to test multi-device, if we have // multiple devices @@ -71,25 +70,22 @@ void testFlat(const TestFlatOptions& opt) { faiss::gpu::StandardGpuResources res; res.noTempMemory(); - faiss::gpu::GpuIndexFlatConfig config; config.device = device; config.useFloat16 = opt.useFloat16; config.storeTransposed = opt.useTransposed; - faiss::gpu::GpuIndexFlatIP gpuIndexIP(&res, dim, config); - faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config); - - faiss::gpu::GpuIndexFlat* gpuIndex = - opt.useL2 ? (faiss::gpu::GpuIndexFlat*) &gpuIndexL2 : - (faiss::gpu::GpuIndexFlat*) &gpuIndexIP; + faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config); + gpuIndex.metric_arg = opt.metricArg; std::vector vecs = faiss::gpu::randVecs(numVecs, dim); - cpuIndex->add(numVecs, vecs.data()); - gpuIndex->add(numVecs, vecs.data()); + cpuIndex.add(numVecs, vecs.data()); + gpuIndex.add(numVecs, vecs.data()); std::stringstream str; - str << (opt.useL2 ? "L2" : "IP") << " numVecs " << numVecs + str << "metric " << opt.metric + << " marg " << opt.metricArg + << " numVecs " << numVecs << " dim " << dim << " useFloat16 " << opt.useFloat16 << " transposed " << opt.useTransposed @@ -98,7 +94,7 @@ void testFlat(const TestFlatOptions& opt) { // To some extent, we depend upon the relative error for the test // for float16 - faiss::gpu::compareIndices(*cpuIndex, *gpuIndex, numQuery, dim, k, str.str(), + faiss::gpu::compareIndices(cpuIndex, gpuIndex, numQuery, dim, k, str.str(), opt.useFloat16 ? 
kF16MaxRelErr : kF32MaxRelErr, // FIXME: the fp16 bounds are // useless when math (the accumulator) is @@ -110,7 +106,7 @@ void testFlat(const TestFlatOptions& opt) { TEST(TestGpuIndexFlat, IP_Float32) { for (int tries = 0; tries < 3; ++tries) { TestFlatOptions opt; - opt.useL2 = false; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; opt.useFloat16 = false; opt.useTransposed = false; @@ -121,10 +117,36 @@ TEST(TestGpuIndexFlat, IP_Float32) { } } +TEST(TestGpuIndexFlat, L1_Float32) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_L1; + opt.useFloat16 = false; + opt.useTransposed = false; + + testFlat(opt); + + opt.useTransposed = true; + testFlat(opt); +} + +TEST(TestGpuIndexFlat, Lp_Float32) { + TestFlatOptions opt; + opt.metric = faiss::MetricType::METRIC_Lp; + opt.metricArg = 5; + opt.useFloat16 = false; + opt.useTransposed = false; + + testFlat(opt); + + // Don't bother testing the transposed version, the L1 test should be good + // enough for that +} + TEST(TestGpuIndexFlat, L2_Float32) { for (int tries = 0; tries < 3; ++tries) { TestFlatOptions opt; - opt.useL2 = true; + opt.metric = faiss::MetricType::METRIC_L2; + opt.useFloat16 = false; opt.useTransposed = false; @@ -139,7 +161,7 @@ TEST(TestGpuIndexFlat, L2_Float32) { TEST(TestGpuIndexFlat, L2_Float32_K1) { for (int tries = 0; tries < 3; ++tries) { TestFlatOptions opt; - opt.useL2 = true; + opt.metric = faiss::MetricType::METRIC_L2; opt.useFloat16 = false; opt.useTransposed = false; opt.kOverride = 1; @@ -151,7 +173,7 @@ TEST(TestGpuIndexFlat, L2_Float32_K1) { TEST(TestGpuIndexFlat, IP_Float16) { for (int tries = 0; tries < 3; ++tries) { TestFlatOptions opt; - opt.useL2 = false; + opt.metric = faiss::MetricType::METRIC_INNER_PRODUCT; opt.useFloat16 = true; opt.useTransposed = false; @@ -165,7 +187,7 @@ TEST(TestGpuIndexFlat, IP_Float16) { TEST(TestGpuIndexFlat, L2_Float16) { for (int tries = 0; tries < 3; ++tries) { TestFlatOptions opt; - opt.useL2 = true; + opt.metric = faiss::MetricType::METRIC_L2; opt.useFloat16 = true; opt.useTransposed = false; @@ -180,7 +202,7 @@ TEST(TestGpuIndexFlat, L2_Float16) { TEST(TestGpuIndexFlat, L2_Float16_K1) { for (int tries = 0; tries < 3; ++tries) { TestFlatOptions opt; - opt.useL2 = true; + opt.metric = faiss::MetricType::METRIC_L2; opt.useFloat16 = true; opt.useTransposed = false; opt.kOverride = 1; @@ -193,7 +215,7 @@ TEST(TestGpuIndexFlat, L2_Float16_K1) { TEST(TestGpuIndexFlat, L2_Tiling) { for (int tries = 0; tries < 2; ++tries) { TestFlatOptions opt; - opt.useL2 = true; + opt.metric = faiss::MetricType::METRIC_L2; opt.useFloat16 = false; opt.useTransposed = false; opt.numVecsOverride = 1000000; diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp index 0a461b63c3..1bee6b4bbf 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuIndexIVFPQ.cpp @@ -117,7 +117,7 @@ struct Options { int device; }; -TEST(TestGpuIndexIVFPQ, Query) { +TEST(TestGpuIndexIVFPQ, Query_L2) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ -151,7 +151,78 @@ TEST(TestGpuIndexIVFPQ, Query) { } } -TEST(TestGpuIndexIVFPQ, Add) { +TEST(TestGpuIndexIVFPQ, Query_IP) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatIP coarseQuantizer(opt.dim); + 
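+  // Inner-product IVFPQ pairs an IP coarse quantizer with
+  // metric_type = METRIC_INNER_PRODUCT (set below); precomputed tables
+  // are an L2-only optimization, hence the GPU config disables them.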
faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT; + + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = false; // not supported/required for IP + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.setNumProbes(opt.nprobe); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); + } +} + +TEST(TestGpuIndexIVFPQ, Float16Coarse) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.flatConfig.useFloat16 = true; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.setNumProbes(opt.nprobe); + + gpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, Add_L2) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ -187,6 +258,43 @@ TEST(TestGpuIndexIVFPQ, Add) { } } +TEST(TestGpuIndexIVFPQ, Add_IP) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatIP coarseQuantizer(opt.dim); + faiss::IndexIVFPQ cpuIndex(&coarseQuantizer, opt.dim, opt.numCentroids, + opt.codes, opt.bitsPerCode); + cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT; + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.setNumProbes(opt.nprobe); + + gpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::compareIndices(cpuIndex, gpuIndex, + opt.numQuery, opt.dim, opt.k, opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); + } +} + TEST(TestGpuIndexIVFPQ, CopyTo) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); diff --git a/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py 
b/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py index 4b291febcb..8b17b4801f 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py +++ b/core/src/index/thirdparty/faiss/gpu/test/test_gpu_index.py @@ -3,9 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals -from __future__ import print_function import time import unittest import numpy as np @@ -99,7 +98,9 @@ class EvalIVFPQAccuracy(unittest.TestCase): D, Inew = gpu_index.search(xq, 10) - self.assertGreaterEqual((Iref == Inew).sum(), Iref.size) + # 0.99: allow some tolerance in results otherwise test + # fails occasionally (not reproducible) + self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99) def test_cpu_to_gpu_IVFPQ(self): self.do_cpu_to_gpu('IVF128,PQ4') @@ -126,7 +127,7 @@ class ReferencedObject(unittest.TestCase): def test_proxy(self): index = faiss.IndexReplicas() - for i in range(3): + for _i in range(3): sub_index = faiss.IndexFlatL2(self.d) sub_index.add(self.xb) index.addIndex(sub_index) @@ -196,7 +197,7 @@ class ReferencedObject(unittest.TestCase): index = faiss.IndexReplicas() size, dim = target.shape num_gpu = 4 - for i in range(num_gpu): + for _i in range(num_gpu): config = faiss.GpuIndexFlatConfig() config.device = 0 # simulate on a single GPU sub_index = faiss.GpuIndexFlatIP(faiss.StandardGpuResources(), dim, config) @@ -268,6 +269,45 @@ class TestGPUKmeans(unittest.TestCase): assert np.allclose(obj1, obj2) +class TestAlternativeDistances(unittest.TestCase): + + def do_test(self, metric, metric_arg=0): + res = faiss.StandardGpuResources() + d = 32 + nb = 1000 + nq = 100 + + rs = np.random.RandomState(123) + xb = rs.rand(nb, d).astype('float32') + xq = rs.rand(nq, d).astype('float32') + + index_ref = faiss.IndexFlat(d, metric) + index_ref.metric_arg = metric_arg + index_ref.add(xb) + Dref, Iref = index_ref.search(xq, 10) + + # build from other index + index = faiss.GpuIndexFlat(res, index_ref) + Dnew, Inew = index.search(xq, 10) + np.testing.assert_array_equal(Inew, Iref) + np.testing.assert_allclose(Dnew, Dref, rtol=1e-6) + + # build from scratch + index = faiss.GpuIndexFlat(res, d, metric) + index.metric_arg = metric_arg + index.add(xb) + + Dnew, Inew = index.search(xq, 10) + np.testing.assert_array_equal(Inew, Iref) + + def test_L1(self): + self.do_test(faiss.METRIC_L1) + + def test_Linf(self): + self.do_test(faiss.METRIC_Linf) + + def test_Lp(self): + self.do_test(faiss.METRIC_Lp, 0.7) if __name__ == '__main__': diff --git a/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py b/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py index 3348e104b2..f59f711b82 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py +++ b/core/src/index/thirdparty/faiss/gpu/test/test_pytorch_faiss.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! 
/usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals import numpy as np import unittest @@ -98,6 +98,12 @@ def search_raw_array_pytorch(res, xb, xq, k, D=None, I=None, return D, I +def to_column_major(x): + if hasattr(torch, 'contiguous_format'): + return x.t().clone(memory_format=torch.contiguous_format).t() + else: + # was default setting before memory_format was introduced + return x.t().clone().t() class PytorchFaissInterop(unittest.TestCase): @@ -165,11 +171,11 @@ class PytorchFaissInterop(unittest.TestCase): xb_t = torch.from_numpy(xb).cuda() if not xq_row_major: - xq_t = xq_t.t().clone().t() + xq_t = to_column_major(xq_t) assert not xq_t.is_contiguous() if not xb_row_major: - xb_t = xb_t.t().clone().t() + xb_t = to_column_major(xb_t) assert not xb_t.is_contiguous() D, I = search_raw_array_pytorch(res, xb_t, xq_t, k) diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectImpl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectImpl.cuh new file mode 100644 index 0000000000..4c32b75194 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectImpl.cuh @@ -0,0 +1,106 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ + extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& in, \ + Tensor& bitset, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream); \ + \ + extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& inK, \ + Tensor& inV, \ + Tensor& bitset, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream); + +#define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ + void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& in, \ + Tensor& bitset, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) { \ + FAISS_ASSERT(in.getSize(0) == outK.getSize(0)); \ + FAISS_ASSERT(in.getSize(0) == outV.getSize(0)); \ + FAISS_ASSERT(outK.getSize(1) == k); \ + FAISS_ASSERT(outV.getSize(1) == k); \ + \ + auto grid = dim3(in.getSize(0)); \ + \ + constexpr int kBlockSelectNumThreads = (WARP_Q <= 1024) ? 128 : 64; \ + auto block = dim3(kBlockSelectNumThreads); \ + \ + FAISS_ASSERT(k <= WARP_Q); \ + FAISS_ASSERT(dir == DIR); \ + \ + auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ + auto vInit = -1; \ + \ + if (bitset.getSize(0) == 0) \ + blockSelect \ + <<>>(in, outK, outV, kInit, vInit, k); \ + else \ + blockSelect \ + <<>>(in, bitset, outK, outV, kInit, vInit, k); \ + CUDA_TEST_ERROR(); \ + } \ + \ + void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + Tensor& inK, \ + Tensor& inV, \ + Tensor& bitset, \ + Tensor& outK, \ + Tensor& outV, \ + bool dir, \ + int k, \ + cudaStream_t stream) { \ + FAISS_ASSERT(inK.isSameSize(inV)); \ + FAISS_ASSERT(outK.isSameSize(outV)); \ + \ + auto grid = dim3(inK.getSize(0)); \ + \ + constexpr int kBlockSelectNumThreads = (WARP_Q <= 1024) ? 128 : 64; \ + auto block = dim3(kBlockSelectNumThreads); \ + \ + FAISS_ASSERT(k <= WARP_Q); \ + FAISS_ASSERT(dir == DIR); \ + \ + auto kInit = dir ? 
Limits::getMin() : Limits::getMax(); \ + auto vInit = -1; \ + \ + if (bitset.getSize(0) == 0) \ + blockSelectPair \ + <<>>(inK, inV, outK, outV, kInit, vInit, k); \ + else \ + blockSelectPair \ + <<>>(inK, inV, bitset, outK, outV, kInit, vInit, k); \ + CUDA_TEST_ERROR(); \ + } + + +#define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ + runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + in, bitset, outK, outV, dir, k, stream) + +#define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \ + runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ + inK, inV, bitset, outK, outV, dir, k, stream) diff --git a/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh index a53e6fc2ed..ddc30af173 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh @@ -8,11 +8,11 @@ #pragma once -#include -#include -#include +#include #include +#include +#include #include #include diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu index 5d8254a09b..a8195c9ca6 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.cu @@ -101,13 +101,15 @@ int getDeviceForAddress(const void* p) { cudaPointerAttributes att; cudaError_t err = cudaPointerGetAttributes(&att, p); - FAISS_ASSERT(err == cudaSuccess || - err == cudaErrorInvalidValue); + FAISS_ASSERT_FMT(err == cudaSuccess || + err == cudaErrorInvalidValue, + "unknown error %d", (int) err); if (err == cudaErrorInvalidValue) { // Make sure the current thread error status has been reset err = cudaGetLastError(); - FAISS_ASSERT(err == cudaErrorInvalidValue); + FAISS_ASSERT_FMT(err == cudaErrorInvalidValue, + "unknown error %d", (int) err); return -1; } else if (att.memoryType == cudaMemoryTypeHost) { return -1; @@ -125,6 +127,15 @@ bool getFullUnifiedMemSupportCurrentDevice() { return getFullUnifiedMemSupport(getCurrentDevice()); } +bool getTensorCoreSupport(int device) { + const auto& prop = getDeviceProperties(device); + return (prop.major >= 7); +} + +bool getTensorCoreSupportCurrentDevice() { + return getTensorCoreSupport(getCurrentDevice()); +} + int getMaxKSelection() { // Don't use the device at the moment, just base this based on the CUDA SDK // that we were compiled with diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h index 02fccfc6bb..e9b5426ae4 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceUtils.h @@ -64,6 +64,12 @@ bool getFullUnifiedMemSupport(int device); /// Equivalent to getFullUnifiedMemSupport(getCurrentDevice()) bool getFullUnifiedMemSupportCurrentDevice(); +/// Does the given device support tensor core operations? +bool getTensorCoreSupport(int device); + +/// Equivalent to getTensorCoreSupport(getCurrentDevice()) +bool getTensorCoreSupportCurrentDevice(); + /// Returns the maximum k-selection value supported based on the CUDA SDK that /// we were compiled with. 
.cu files can use DeviceDefs.cuh, but this is for /// non-CUDA files diff --git a/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh b/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh index 041db76510..dac73679fd 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/DeviceVector.cuh @@ -46,9 +46,10 @@ class DeviceVector { num_ = num; capacity_ = capacity_; } + // Clear all allocated memory; reset to zero size void clear() { - if(owner) { + if (owner) { freeMemorySpace(space_, data_); } data_ = nullptr; diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh index 4954f27b64..09566eaa94 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh @@ -12,9 +12,9 @@ #include #include -// We require at least CUDA 7.5 for compilation -#if CUDA_VERSION < 7050 -#error "CUDA >= 7.5 is required" +// We require at least CUDA 8.0 for compilation +#if CUDA_VERSION < 8000 +#error "CUDA >= 8.0 is required" #endif // Some compute capabilities have full float16 ALUs. diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh index f62971bdd3..68ccbd5686 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh @@ -8,6 +8,7 @@ #pragma once +#include #include // @@ -39,8 +40,8 @@ struct Math { } /// For a vector type, this is a horizontal add, returning sum(v_i) - static inline __device__ T reduceAdd(T v) { - return v; + static inline __device__ float reduceAdd(T v) { + return ConvertTo::to(v); } static inline __device__ bool lt(T a, T b) { @@ -252,8 +253,8 @@ struct Math { #endif } - static inline __device__ half reduceAdd(half v) { - return v; + static inline __device__ float reduceAdd(half v) { + return ConvertTo::to(v); } static inline __device__ bool lt(half a, half b) { @@ -394,18 +395,11 @@ struct Math { #endif } - static inline __device__ half reduceAdd(half2 v) { -#ifdef FAISS_USE_FULL_FLOAT16 - half hv = __high2half(v); - half lv = __low2half(v); + static inline __device__ float reduceAdd(half2 v) { + float2 vf = __half22float2(v); + vf.x += vf.y; - return __hadd(hv, lv); -#else - float2 vf = __half22float2(v); - vf.x += vf.y; - - return __float2half(vf.x); -#endif + return vf.x; } // not implemented for vector types @@ -471,10 +465,10 @@ struct Math { return h; } - static inline __device__ half reduceAdd(Half4 v) { - half hx = Math::reduceAdd(v.a); - half hy = Math::reduceAdd(v.b); - return Math::add(hx, hy); + static inline __device__ float reduceAdd(Half4 v) { + float x = Math::reduceAdd(v.a); + float y = Math::reduceAdd(v.b); + return x + y; } // not implemented for vector types @@ -544,9 +538,9 @@ struct Math { } static inline __device__ half reduceAdd(Half8 v) { - half hx = Math::reduceAdd(v.a); - half hy = Math::reduceAdd(v.b); - return Math::add(hx, hy); + float x = Math::reduceAdd(v.a); + float y = Math::reduceAdd(v.b); + return x + y; } // not implemented for vector types diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult-inl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult-inl.cuh new file mode 100644 index 0000000000..ede225e035 --- /dev/null +++ b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult-inl.cuh @@ -0,0 +1,160 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { namespace gpu { + +class DeviceMemory; + +template +struct GetCudaType; + +template <> +struct GetCudaType { + static constexpr cudaDataType_t Type = CUDA_R_32F; +}; + +template <> +struct GetCudaType { + static constexpr cudaDataType_t Type = CUDA_R_16F; +}; + +template +cublasStatus_t +rawGemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float fAlpha, + const AT *A, + int lda, + const BT *B, + int ldb, + const float fBeta, + float *C, + int ldc) { + auto cAT = GetCudaType::Type; + auto cBT = GetCudaType::Type; + + // Always accumulate in f32 + return cublasSgemmEx(handle, transa, transb, m, n, k, + &fAlpha, A, cAT, lda, + B, cBT, ldb, + &fBeta, + C, CUDA_R_32F, ldc); +} + +template +void +runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream) { + cublasSetStream(handle, stream); + + // Check that we have (m x k) * (k x n) = (m x n) + // using the input row-major layout + int aM = transA ? a.getSize(1) : a.getSize(0); + int aK = transA ? a.getSize(0) : a.getSize(1); + + int bK = transB ? b.getSize(1) : b.getSize(0); + int bN = transB ? b.getSize(0) : b.getSize(1); + + int cM = transC ? c.getSize(1) : c.getSize(0); + int cN = transC ? c.getSize(0) : c.getSize(1); + + FAISS_ASSERT(aM == cM); + FAISS_ASSERT(aK == bK); + FAISS_ASSERT(bN == cN); + + FAISS_ASSERT(a.getStride(1) == 1); + FAISS_ASSERT(b.getStride(1) == 1); + FAISS_ASSERT(c.getStride(1) == 1); + + // Now, we have to represent the matrix multiplication in + // column-major layout + float* pC = c.data(); + + int m = c.getSize(1); // stride 1 size + int n = c.getSize(0); // other size + int k = transA ? a.getSize(0) : a.getSize(1); + + int lda = transC ? a.getStride(0) : b.getStride(0); + int ldb = transC ? b.getStride(0) : a.getStride(0); + int ldc = c.getStride(0); + + auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + + if (transC) { + gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T; + gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T; + } + + cublasStatus_t err; + + if (transC) { + err = rawGemm(handle, + gemmTrA, gemmTrB, + m, n, k, alpha, + a.data(), lda, b.data(), ldb, beta, + pC, ldc); + } else { + err = rawGemm(handle, + gemmTrA, gemmTrB, + m, n, k, alpha, + b.data(), lda, a.data(), ldb, beta, + pC, ldc); + } + + FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS, + "cublas failed (%d): " + "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s", + (int) err, + a.getSize(0), a.getSize(1), transA ? "'" : "", + b.getSize(0), b.getSize(1), transB ? "'" : "", + c.getSize(0), c.getSize(1), transC ? 
"'" : ""); + CUDA_TEST_ERROR(); +} + +template +void runIteratedMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream) { + FAISS_ASSERT(c.getSize(0) == a.getSize(0)); + FAISS_ASSERT(a.getSize(0) == b.getSize(0)); + + for (int i = 0; i < a.getSize(0); ++i) { + auto cView = c[i].view(); + auto aView = a[i].view(); + auto bView = b[i].view(); + + runMatrixMult(cView, transC, + aView, transA, + bView, transB, + alpha, beta, handle, stream); + } +} + +} } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu index 42c031119e..2afb5017b2 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cu @@ -8,196 +8,9 @@ #include #include -#include -#include -#include -#include namespace faiss { namespace gpu { -template -struct CublasGemm { -}; - -template <> -struct CublasGemm { - static cublasStatus_t gemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - float fAlpha, - const float *A, - int lda, - const float *B, - int ldb, - float fBeta, - float *C, - int ldc, - bool useHgemm) { - return cublasSgemm(handle, transa, transb, m, n, k, - &fAlpha, A, lda, B, ldb, &fBeta, C, ldc); - } -}; - -template <> -struct CublasGemm { - static cublasStatus_t gemm(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float fAlpha, - const half *A, - int lda, - const half *B, - int ldb, - const float fBeta, - half *C, - int ldc, - bool useHgemm) { - if (getDeviceSupportsFloat16Math(getCurrentDevice()) && useHgemm) { - half hAlpha = hostFloat2Half(fAlpha); - half hBeta = hostFloat2Half(fBeta); - - return cublasHgemm(handle, transa, transb, m, n, k, - &hAlpha, A, lda, B, ldb, &hBeta, C, ldc); - } - - // CUDA 8.0 changes the half datatype specifier -#if CUDA_VERSION == 7050 - auto halfType = CUBLAS_DATA_HALF; -#else - auto halfType = CUDA_R_16F; -#endif // CUDA_VERSION - - return cublasSgemmEx(handle, transa, transb, m, n, k, - &fAlpha, A, halfType, lda, - B, halfType, ldb, - &fBeta, - C, halfType, ldc); - } -}; - -template -void -runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - bool useHgemm, - cublasHandle_t handle, - cudaStream_t stream) { - cublasSetStream(handle, stream); - - // Check that we have (m x k) * (k x n) = (m x n) - // using the input row-major layout - int aM = transA ? a.getSize(1) : a.getSize(0); - int aK = transA ? a.getSize(0) : a.getSize(1); - - int bK = transB ? b.getSize(1) : b.getSize(0); - int bN = transB ? b.getSize(0) : b.getSize(1); - - int cM = transC ? c.getSize(1) : c.getSize(0); - int cN = transC ? c.getSize(0) : c.getSize(1); - - FAISS_ASSERT(aM == cM); - FAISS_ASSERT(aK == bK); - FAISS_ASSERT(bN == cN); - - FAISS_ASSERT(a.getStride(1) == 1); - FAISS_ASSERT(b.getStride(1) == 1); - FAISS_ASSERT(c.getStride(1) == 1); - - // Now, we have to represent the matrix multiplication in - // column-major layout - T* pA = transC ? a.data() : b.data(); - T* pB = transC ? b.data() : a.data(); - T* pC = c.data(); - - int m = c.getSize(1); // stride 1 size - int n = c.getSize(0); // other size - int k = transA ? a.getSize(0) : a.getSize(1); - - int lda = transC ? a.getStride(0) : b.getStride(0); - int ldb = transC ? 
b.getStride(0) : a.getStride(0); - int ldc = c.getStride(0); - - auto gemmTrA = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - auto gemmTrB = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - - if (transC) { - gemmTrA = transA ? CUBLAS_OP_N : CUBLAS_OP_T; - gemmTrB = transB ? CUBLAS_OP_N : CUBLAS_OP_T; - } - - auto err = CublasGemm::gemm(handle, - gemmTrA, gemmTrB, - m, n, k, alpha, - pA, lda, pB, ldb, beta, - pC, ldc, useHgemm); - - FAISS_ASSERT_FMT(err == CUBLAS_STATUS_SUCCESS, - "cublas failed (%d): %s " - "(%d, %d)%s x (%d, %d)%s = (%d, %d)%s", - (int) err, - useHgemm ? "Hgemm" : "Sgemm", - a.getSize(0), a.getSize(1), transA ? "'" : "", - b.getSize(0), b.getSize(1), transB ? "'" : "", - c.getSize(0), c.getSize(1), transC ? "'" : ""); - CUDA_TEST_ERROR(); -} - -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - bool useHgemm, - cublasHandle_t handle, - cudaStream_t stream) { - return runMatrixMult(c, transC, a, transA, b, transB, - alpha, beta, useHgemm, handle, stream); -} - -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - bool useHgemm, - cublasHandle_t handle, - cudaStream_t stream) { - return runMatrixMult(c, transC, a, transA, b, transB, - alpha, beta, useHgemm, handle, stream); -} - -void -runIteratedMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - cublasHandle_t handle, - cudaStream_t stream) { - FAISS_ASSERT(c.getSize(0) == a.getSize(0)); - FAISS_ASSERT(a.getSize(0) == b.getSize(0)); - - for (int i = 0; i < a.getSize(0); ++i) { - auto cView = c[i].view(); - auto aView = a[i].view(); - auto bView = b[i].view(); - - runMatrixMult(cView, transC, - aView, transA, - bView, transB, - alpha, beta, false, handle, stream); - } -} - void runBatchMatrixMult(Tensor& c, bool transC, Tensor& a, bool transA, diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh index 1175ac213a..eeb11ccc5c 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/MatrixMult.cuh @@ -10,6 +10,9 @@ #include #include +#include +#include +#include namespace faiss { namespace gpu { @@ -17,32 +20,23 @@ class DeviceMemory; /// C = alpha * A * B + beta * C /// Expects row major layout, not fortran/blas column major! -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - bool useHgemm, // ignored for float32 - cublasHandle_t handle, - cudaStream_t stream); - -/// C = alpha * A * B + beta * C -/// Expects row major layout, not fortran/blas column major! -void runMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, - float alpha, - float beta, - bool useHgemm, - cublasHandle_t handle, - cudaStream_t stream); +template +void +runMatrixMult(Tensor& c, bool transC, + Tensor& a, bool transA, + Tensor& b, bool transB, + float alpha, + float beta, + cublasHandle_t handle, + cudaStream_t stream); /// C_i = alpha * A_i * B_i + beta * C_i /// where `i` is the outermost dimension, via iterated gemm /// Expects row major layout, not fortran/blas column major! 
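// A minimal sketch of calling the templated runMatrixMult declared just
// below, assuming a .cu translation unit that includes
// faiss/gpu/utils/MatrixMult.cuh plus caller-supplied tensors, cuBLAS handle
// and stream. Any float/half input combination now shares one code path
// (rawGemm -> cublasSgemmEx) and always accumulates into a float32 output,
// in place of the removed non-template float and half overloads.
inline void exampleMixedGemm(faiss::gpu::Tensor<float, 2, true>& c,
                             faiss::gpu::Tensor<half, 2, true>& a,
                             faiss::gpu::Tensor<float, 2, true>& b,
                             cublasHandle_t handle,
                             cudaStream_t stream) {
    // c = 1.0f * (a x b) + 0.0f * c; all tensors row-major, none transposed
    faiss::gpu::runMatrixMult(c, false, a, false, b, false,
                              1.0f, 0.0f, handle, stream);
}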
+template void runIteratedMatrixMult(Tensor& c, bool transC, - Tensor& a, bool transA, - Tensor& b, bool transB, + Tensor& a, bool transA, + Tensor& b, bool transB, float alpha, float beta, cublasHandle_t handle, @@ -61,3 +55,5 @@ void runBatchMatrixMult(Tensor& c, bool transC, cudaStream_t stream); } } // namespace + +#include diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh index 0f5aef1315..964fbfb940 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/Tensor-inl.cuh @@ -452,7 +452,7 @@ template class PtrTraits> __host__ __device__ Tensor Tensor::transpose(int dim1, - int dim2) const { + int dim2) const { GPU_FAISS_ASSERT(dim1 >= 0 && dim1 < Dim); GPU_FAISS_ASSERT(dim1 >= 0 && dim2 < Dim); @@ -478,7 +478,36 @@ Tensor::transpose(int dim1, newStride[dim1] = newStride[dim2]; newStride[dim2] = tmp; - return Tensor(data_, newSize, newStride); + return Tensor(data_, newSize, newStride); +} + +template class PtrTraits> +__host__ __device__ Tensor +Tensor::transposeInnermost( + int dim1) const { + GPU_FAISS_ASSERT(dim1 >= 0 && dim1 < Dim); + + // We are exchanging with the innermost dimension + int dim2 = 1; + + IndexT newSize[Dim]; + IndexT newStride[Dim]; + + for (int i = 0; i < Dim; ++i) { + newSize[i] = size_[i]; + newStride[i] = stride_[i]; + } + + IndexT tmp = newSize[dim1]; + newSize[dim1] = newSize[dim2]; + newSize[dim2] = tmp; + + tmp = newStride[dim1]; + newStride[dim1] = newStride[dim2]; + newStride[dim2] = tmp; + + return Tensor(data_, newSize, newStride); } template transpose(int dim1, int dim2) const; + /// Transpose a tensor, exchanging a non-innermost dimension with the + /// innermost dimension, returning a no longer innermost contiguous tensor + __host__ __device__ Tensor + transposeInnermost(int dim1) const; + /// Upcast a tensor of dimension `D` to some tensor of dimension /// D' > D by padding the leading dimensions by 1 /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]` diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh index 4c32b75194..e7a5a03c22 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh @@ -13,7 +13,7 @@ #define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& in, \ - Tensor& bitset, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -23,7 +23,7 @@ extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& inK, \ Tensor& inV, \ - Tensor& bitset, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -33,7 +33,7 @@ #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& in, \ - Tensor& bitset, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -60,14 +60,14 @@ <<>>(in, outK, outV, kInit, vInit, k); \ else \ blockSelect \ - <<>>(in, bitset, outK, outV, kInit, vInit, k); \ + <<>>(in, bitset, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } \ \ void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& inK, \ Tensor& inV, \ - Tensor& bitset, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ diff --git 
a/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp index 2d7a9269d6..7482fb7b3b 100644 --- a/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp +++ b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.cpp @@ -228,6 +228,23 @@ bool IDSelectorRange::is_member (idx_t id) const return id >= imin && id < imax; } +/*********************************************************************** + * IDSelectorArray + ***********************************************************************/ + +IDSelectorArray::IDSelectorArray (size_t n, const idx_t *ids): + n (n), ids(ids) +{ +} + +bool IDSelectorArray::is_member (idx_t id) const +{ + for (idx_t i = 0; i < n; i++) { + if (ids[i] == id) return true; + } + return false; +} + /*********************************************************************** * IDSelectorBatch diff --git a/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h index fee0026a78..c82b9ed560 100644 --- a/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h +++ b/core/src/index/thirdparty/faiss/impl/AuxIndexStructures.h @@ -51,9 +51,7 @@ struct RangeSearchResult { }; -/** - - Encapsulates a set of ids to remove. */ +/** Encapsulates a set of ids to remove. */ struct IDSelector { typedef Index::idx_t idx_t; virtual bool is_member (idx_t id) const = 0; @@ -71,6 +69,19 @@ struct IDSelectorRange: IDSelector { ~IDSelectorRange() override {} }; +/** simple list of elements to remove + * + * this is inefficient in most cases, except for IndexIVF with + * maintain_direct_map + */ +struct IDSelectorArray: IDSelector { + size_t n; + const idx_t *ids; + + IDSelectorArray (size_t n, const idx_t *ids); + bool is_member(idx_t id) const override; + ~IDSelectorArray() override {} +}; /** Remove ids from a set. Repetitions of ids in the indices set * passed to the constructor does not hurt performance. The hash diff --git a/core/src/index/thirdparty/faiss/impl/HNSW.cpp b/core/src/index/thirdparty/faiss/impl/HNSW.cpp index 58d113e3f4..740ab0d136 100644 --- a/core/src/index/thirdparty/faiss/impl/HNSW.cpp +++ b/core/src/index/thirdparty/faiss/impl/HNSW.cpp @@ -15,7 +15,6 @@ namespace faiss { -using idx_t = Index::idx_t; /************************************************************** * HNSW structure implementation diff --git a/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h index cf511a74c5..c27a48c999 100644 --- a/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h +++ b/core/src/index/thirdparty/faiss/impl/PolysemousTraining.h @@ -123,15 +123,15 @@ struct PolysemousTraining: SimulatedAnnealingParameters { enum Optimization_type_t { OT_None, OT_ReproduceDistances_affine, ///< default - OT_Ranking_weighted_diff /// same as _2, but use rank of y+ - rank of y- + OT_Ranking_weighted_diff ///< same as _2, but use rank of y+ - rank of y- }; Optimization_type_t optimization_type; - // use 1/4 of the training points for the optimization, with - // max. ntrain_permutation. If ntrain_permutation == 0: train on - // centroids + /** use 1/4 of the training points for the optimization, with + * max. ntrain_permutation. 
If ntrain_permutation == 0: train on + * centroids */ int ntrain_permutation; - double dis_weight_factor; // decay of exp that weights distance loss + double dis_weight_factor; ///< decay of exp that weights distance loss // filename pattern for the logging of iterations std::string log_pattern; diff --git a/core/src/index/thirdparty/faiss/impl/ProductQuantizer-inl.h b/core/src/index/thirdparty/faiss/impl/ProductQuantizer-inl.h new file mode 100644 index 0000000000..01937dca9f --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ProductQuantizer-inl.h @@ -0,0 +1,138 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +namespace faiss { + +inline +PQEncoderGeneric::PQEncoderGeneric(uint8_t *code, int nbits, + uint8_t offset) + : code(code), offset(offset), nbits(nbits), reg(0) +{ + assert(nbits <= 64); + if (offset > 0) { + reg = (*code & ((1 << offset) - 1)); + } +} + +inline +void PQEncoderGeneric::encode(uint64_t x) +{ + reg |= (uint8_t)(x << offset); + x >>= (8 - offset); + if (offset + nbits >= 8) { + *code++ = reg; + + for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) { + *code++ = (uint8_t)x; + x >>= 8; + } + + offset += nbits; + offset &= 7; + reg = (uint8_t)x; + } else { + offset += nbits; + } +} + +inline +PQEncoderGeneric::~PQEncoderGeneric() +{ + if (offset > 0) { + *code = reg; + } +} + + +inline +PQEncoder8::PQEncoder8(uint8_t *code, int nbits) + : code(code) { + assert(8 == nbits); +} + +inline +void PQEncoder8::encode(uint64_t x) { + *code++ = (uint8_t)x; +} + +inline +PQEncoder16::PQEncoder16(uint8_t *code, int nbits) + : code((uint16_t *)code) { + assert(16 == nbits); +} + +inline +void PQEncoder16::encode(uint64_t x) { + *code++ = (uint16_t)x; +} + + +inline +PQDecoderGeneric::PQDecoderGeneric(const uint8_t *code, + int nbits) + : code(code), + offset(0), + nbits(nbits), + mask((1ull << nbits) - 1), + reg(0) { + assert(nbits <= 64); +} + +inline +uint64_t PQDecoderGeneric::decode() { + if (offset == 0) { + reg = *code; + } + uint64_t c = (reg >> offset); + + if (offset + nbits >= 8) { + uint64_t e = 8 - offset; + ++code; + for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) { + c |= ((uint64_t)(*code++) << e); + e += 8; + } + + offset += nbits; + offset &= 7; + if (offset > 0) { + reg = *code; + c |= ((uint64_t)reg << e); + } + } else { + offset += nbits; + } + + return c & mask; +} + + +inline +PQDecoder8::PQDecoder8(const uint8_t *code, int nbits) + : code(code) { + assert(8 == nbits); +} + +inline +uint64_t PQDecoder8::decode() { + return (uint64_t)(*code++); +} + + +inline +PQDecoder16::PQDecoder16(const uint8_t *code, int nbits) + : code((uint16_t *)code) { + assert(16 == nbits); +} + +inline +uint64_t PQDecoder16::decode() { + return (uint64_t)(*code++); +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp index 379bb78822..a9658af46a 100644 --- a/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp +++ b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.cpp @@ -148,7 +148,7 @@ static inline void pq_estimators_from_tables_generic(const ProductQuantizer& pq, const size_t M = pq.M; const size_t ksub = pq.ksub; for (size_t j = 0; j < ncodes; ++j) { - faiss::ProductQuantizer::PQDecoderGeneric decoder( + PQDecoderGeneric decoder( codes + j * pq.code_size, nbits ); float dis = 0; @@ -755,117 +755,5 @@ 
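// A round-trip sketch of the bit packers that ProductQuantizer-inl.h now
// defines inline (see above), assuming a standalone translation unit that
// includes faiss/impl/ProductQuantizer.h. Three 10-bit codes occupy
// ceil(3 * 10 / 8) = 4 bytes; the encoder's destructor flushes the
// partially filled last byte.
#include <cassert>
#include <cstdint>
#include <faiss/impl/ProductQuantizer.h>

void pq_bitpack_roundtrip() {
    uint8_t buf[4] = {0, 0, 0, 0};
    {
        faiss::PQEncoderGeneric enc(buf, 10);   // 10 bits per code
        enc.encode(0x2A5);
        enc.encode(0x13F);
        enc.encode(0x001);
    }                                           // flush on destruction
    faiss::PQDecoderGeneric dec(buf, 10);
    assert(dec.decode() == 0x2A5);
    assert(dec.decode() == 0x13F);
    assert(dec.decode() == 0x001);
}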
void ProductQuantizer::search_sdc (const uint8_t * qcodes, } -ProductQuantizer::PQEncoderGeneric::PQEncoderGeneric(uint8_t *code, int nbits, - uint8_t offset) - : code(code), offset(offset), nbits(nbits), reg(0) { - assert(nbits <= 64); - if (offset > 0) { - reg = (*code & ((1 << offset) - 1)); - } -} - -void ProductQuantizer::PQEncoderGeneric::encode(uint64_t x) { - reg |= (uint8_t)(x << offset); - x >>= (8 - offset); - if (offset + nbits >= 8) { - *code++ = reg; - - for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) { - *code++ = (uint8_t)x; - x >>= 8; - } - - offset += nbits; - offset &= 7; - reg = (uint8_t)x; - } else { - offset += nbits; - } -} - -ProductQuantizer::PQEncoderGeneric::~PQEncoderGeneric() { - if (offset > 0) { - *code = reg; - } -} - - -ProductQuantizer::PQEncoder8::PQEncoder8(uint8_t *code, int nbits) - : code(code) { - assert(8 == nbits); -} - -void ProductQuantizer::PQEncoder8::encode(uint64_t x) { - *code++ = (uint8_t)x; -} - - -ProductQuantizer::PQEncoder16::PQEncoder16(uint8_t *code, int nbits) - : code((uint16_t *)code) { - assert(16 == nbits); -} - -void ProductQuantizer::PQEncoder16::encode(uint64_t x) { - *code++ = (uint16_t)x; -} - - -ProductQuantizer::PQDecoderGeneric::PQDecoderGeneric(const uint8_t *code, - int nbits) - : code(code), - offset(0), - nbits(nbits), - mask((1ull << nbits) - 1), - reg(0) { - assert(nbits <= 64); -} - -uint64_t ProductQuantizer::PQDecoderGeneric::decode() { - if (offset == 0) { - reg = *code; - } - uint64_t c = (reg >> offset); - - if (offset + nbits >= 8) { - uint64_t e = 8 - offset; - ++code; - for (int i = 0; i < (nbits - (8 - offset)) / 8; ++i) { - c |= ((uint64_t)(*code++) << e); - e += 8; - } - - offset += nbits; - offset &= 7; - if (offset > 0) { - reg = *code; - c |= ((uint64_t)reg << e); - } - } else { - offset += nbits; - } - - return c & mask; -} - - -ProductQuantizer::PQDecoder8::PQDecoder8(const uint8_t *code, int nbits) - : code(code) { - assert(8 == nbits); -} - -uint64_t ProductQuantizer::PQDecoder8::decode() { - return (uint64_t)(*code++); -} - - -ProductQuantizer::PQDecoder16::PQDecoder16(const uint8_t *code, int nbits) - : code((uint16_t *)code) { - assert(16 == nbits); -} - -uint64_t ProductQuantizer::PQDecoder16::decode() { - return (uint64_t)(*code++); -} - } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h index 40066441bd..c900d9c9d4 100644 --- a/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h +++ b/core/src/index/thirdparty/faiss/impl/ProductQuantizer.h @@ -173,70 +173,65 @@ struct ProductQuantizer { float_maxheap_array_t * res, bool init_finalize_heap = true) const; - struct PQEncoderGeneric { - uint8_t *code; ///< code for this vector - uint8_t offset; - const int nbits; ///< number of bits per subquantizer index - - uint8_t reg; - - PQEncoderGeneric(uint8_t *code, int nbits, uint8_t offset = 0); - - void encode(uint64_t x); - - ~PQEncoderGeneric(); - }; - - - struct PQEncoder8 { - uint8_t *code; - - PQEncoder8(uint8_t *code, int nbits); - - void encode(uint64_t x); - }; - - struct PQEncoder16 { - uint16_t *code; - - PQEncoder16(uint8_t *code, int nbits); - - void encode(uint64_t x); - }; - - - struct PQDecoderGeneric { - const uint8_t *code; - uint8_t offset; - const int nbits; - const uint64_t mask; - uint8_t reg; - - PQDecoderGeneric(const uint8_t *code, int nbits); - - uint64_t decode(); - }; - - struct PQDecoder8 { - const uint8_t *code; - - PQDecoder8(const uint8_t *code, int nbits); - - 
uint64_t decode(); - }; - - struct PQDecoder16 { - const uint16_t *code; - - PQDecoder16(const uint8_t *code, int nbits); - - uint64_t decode(); - }; - }; +/************************************************* + * Objects to encode / decode strings of bits + *************************************************/ + +struct PQEncoderGeneric { + uint8_t *code; ///< code for this vector + uint8_t offset; + const int nbits; ///< number of bits per subquantizer index + + uint8_t reg; + + PQEncoderGeneric(uint8_t *code, int nbits, uint8_t offset = 0); + + void encode(uint64_t x); + + ~PQEncoderGeneric(); +}; + + +struct PQEncoder8 { + uint8_t *code; + PQEncoder8(uint8_t *code, int nbits); + void encode(uint64_t x); +}; + +struct PQEncoder16 { + uint16_t *code; + PQEncoder16(uint8_t *code, int nbits); + void encode(uint64_t x); +}; + + +struct PQDecoderGeneric { + const uint8_t *code; + uint8_t offset; + const int nbits; + const uint64_t mask; + uint8_t reg; + PQDecoderGeneric(const uint8_t *code, int nbits); + uint64_t decode(); +}; + +struct PQDecoder8 { + const uint8_t *code; + PQDecoder8(const uint8_t *code, int nbits); + uint64_t decode(); +}; + +struct PQDecoder16 { + const uint16_t *code; + PQDecoder16(const uint8_t *code, int nbits); + uint64_t decode(); +}; + } // namespace faiss +#include #endif diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h index 2df8636bcc..a8f8c46d5c 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h @@ -12,7 +12,6 @@ #include #include - namespace faiss { /** diff --git a/core/src/index/thirdparty/faiss/impl/index_read.cpp b/core/src/index/thirdparty/faiss/impl/index_read.cpp index c092aa89dc..85cec7d39f 100644 --- a/core/src/index/thirdparty/faiss/impl/index_read.cpp +++ b/core/src/index/thirdparty/faiss/impl/index_read.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include @@ -387,6 +389,25 @@ ProductQuantizer * read_ProductQuantizer (IOReader *reader) { return pq; } +static void read_direct_map (DirectMap *dm, IOReader *f) { + char maintain_direct_map; + READ1 (maintain_direct_map); + dm->type = (DirectMap::Type)maintain_direct_map; + READVECTOR (dm->array); + if (dm->type == DirectMap::Hashtable) { + using idx_t = Index::idx_t; + std::vector> v; + READVECTOR (v); + std::unordered_map & map = dm->hashtable; + map.reserve (v.size()); + for (auto it: v) { + map [it.first] = it.second; + } + } + +} + + static void read_ivf_header ( IndexIVF *ivf, IOReader *f, std::vector > *ids = nullptr) @@ -401,8 +422,7 @@ static void read_ivf_header ( for (size_t i = 0; i < ivf->nlist; i++) READVECTOR ((*ids)[i]); } - READ1 (ivf->maintain_direct_map); - READVECTOR (ivf->direct_map); + read_direct_map (&ivf->direct_map, f); } // used for legacy formats @@ -460,10 +480,15 @@ Index *read_index (IOReader *f, int io_flags) { Index * idx = nullptr; uint32_t h; READ1 (h); - if (h == fourcc ("IxFI") || h == fourcc ("IxF2")) { + if (h == fourcc ("IxFI") || h == fourcc ("IxF2") || h == fourcc("IxFl")) { IndexFlat *idxf; - if (h == fourcc ("IxFI")) idxf = new IndexFlatIP (); - else idxf = new IndexFlatL2 (); + if (h == fourcc ("IxFI")) { + idxf = new IndexFlatIP (); + } else if (h == fourcc("IxF2")) { + idxf = new IndexFlatL2 (); + } else { + idxf = new IndexFlat (); + } read_index_header (idxf, f); READVECTOR (idxf->xb); FAISS_THROW_IF_NOT (idxf->xb.size() == idxf->ntotal * 
idxf->d); @@ -757,10 +782,59 @@ static void read_binary_ivf_header ( for (size_t i = 0; i < ivf->nlist; i++) READVECTOR ((*ids)[i]); } - READ1 (ivf->maintain_direct_map); - READVECTOR (ivf->direct_map); + read_direct_map (&ivf->direct_map, f); } +static void read_binary_hash_invlists ( + IndexBinaryHash::InvertedListMap &invlists, + int b, IOReader *f) +{ + size_t sz; + READ1 (sz); + int il_nbit = 0; + READ1 (il_nbit); + // buffer for bitstrings + std::vector buf((b + il_nbit) * sz); + READVECTOR (buf); + BitstringReader rd (buf.data(), buf.size()); + invlists.reserve (sz); + for (size_t i = 0; i < sz; i++) { + uint64_t hash = rd.read(b); + uint64_t ilsz = rd.read(il_nbit); + auto & il = invlists[hash]; + READVECTOR (il.ids); + FAISS_THROW_IF_NOT (il.ids.size() == ilsz); + READVECTOR (il.vecs); + } +} + +static void read_binary_multi_hash_map( + IndexBinaryMultiHash::Map &map, + int b, size_t ntotal, + IOReader *f) +{ + int id_bits; + size_t sz; + READ1 (id_bits); + READ1 (sz); + std::vector buf; + READVECTOR (buf); + size_t nbit = (b + id_bits) * sz + ntotal * id_bits; + FAISS_THROW_IF_NOT (buf.size() == (nbit + 7) / 8); + BitstringReader rd (buf.data(), buf.size()); + map.reserve (sz); + for (size_t i = 0; i < sz; i++) { + uint64_t hash = rd.read(b); + uint64_t ilsz = rd.read(id_bits); + auto & il = map[hash]; + for (size_t j = 0; j < ilsz; j++) { + il.push_back (rd.read (id_bits)); + } + } +} + + + IndexBinary *read_index_binary (IOReader *f, int io_flags) { IndexBinary * idx = nullptr; uint32_t h; @@ -802,6 +876,28 @@ IndexBinary *read_index_binary (IOReader *f, int io_flags) { static_cast(idxmap)->construct_rev_map (); } idx = idxmap; + } else if(h == fourcc("IBHh")) { + IndexBinaryHash *idxh = new IndexBinaryHash (); + read_index_binary_header (idxh, f); + READ1 (idxh->b); + READ1 (idxh->nflip); + read_binary_hash_invlists(idxh->invlists, idxh->b, f); + idx = idxh; + } else if(h == fourcc("IBHm")) { + IndexBinaryMultiHash* idxmh = new IndexBinaryMultiHash (); + read_index_binary_header (idxmh, f); + idxmh->storage = dynamic_cast (read_index_binary (f)); + FAISS_THROW_IF_NOT(idxmh->storage && idxmh->storage->ntotal == idxmh->ntotal); + idxmh->own_fields = true; + READ1 (idxmh->b); + READ1 (idxmh->nhash); + READ1 (idxmh->nflip); + idxmh->maps.resize (idxmh->nhash); + for (int i = 0; i < idxmh->nhash; i++) { + read_binary_multi_hash_map( + idxmh->maps[i], idxmh->b, idxmh->ntotal, f); + } + idx = idxmh; } else { FAISS_THROW_FMT("Index type 0x%08x not supported\n", h); idx = nullptr; diff --git a/core/src/index/thirdparty/faiss/impl/index_write.cpp b/core/src/index/thirdparty/faiss/impl/index_write.cpp index c18f8021e0..54fce2fc46 100644 --- a/core/src/index/thirdparty/faiss/impl/index_write.cpp +++ b/core/src/index/thirdparty/faiss/impl/index_write.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include @@ -305,20 +307,33 @@ static void write_HNSW (const HNSW *hnsw, IOWriter *f) { WRITE1 (hnsw->upper_beam); } +static void write_direct_map (const DirectMap *dm, IOWriter *f) { + char maintain_direct_map = (char)dm->type; // for backwards compatibility with bool + WRITE1 (maintain_direct_map); + WRITEVECTOR (dm->array); + if (dm->type == DirectMap::Hashtable) { + using idx_t = Index::idx_t; + std::vector> v; + const std::unordered_map & map = dm->hashtable; + v.resize (map.size()); + std::copy(map.begin(), map.end(), v.begin()); + WRITEVECTOR (v); + } +} + static void write_ivf_header (const IndexIVF *ivf, IOWriter *f) { 
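    // Field order for an IVF index: the common index header, nlist, nprobe,
    // the coarse quantizer, then the DirectMap trailer emitted by
    // write_direct_map() above -- a one-byte type tag (kept as char so files
    // written when this field was the maintain_direct_map bool still parse),
    // the direct-map array, and, for DirectMap::Hashtable, the map flattened
    // into a vector of (key, id) pairs.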
write_index_header (ivf, f); WRITE1 (ivf->nlist); WRITE1 (ivf->nprobe); write_index (ivf->quantizer, f); - WRITE1 (ivf->maintain_direct_map); - WRITEVECTOR (ivf->direct_map); + write_direct_map (&ivf->direct_map, f); } void write_index (const Index *idx, IOWriter *f) { if (const IndexFlat * idxf = dynamic_cast (idx)) { uint32_t h = fourcc ( idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" : - idxf->metric_type == METRIC_L2 ? "IxF2" : nullptr); + idxf->metric_type == METRIC_L2 ? "IxF2" : "IxFl"); WRITE1 (h); write_index_header (idx, f); WRITEVECTOR (idxf->xb); @@ -527,8 +542,68 @@ static void write_binary_ivf_header (const IndexBinaryIVF *ivf, IOWriter *f) { WRITE1 (ivf->nlist); WRITE1 (ivf->nprobe); write_index_binary (ivf->quantizer, f); - WRITE1 (ivf->maintain_direct_map); - WRITEVECTOR (ivf->direct_map); + write_direct_map (&ivf->direct_map, f); +} + +static void write_binary_hash_invlists ( + const IndexBinaryHash::InvertedListMap &invlists, + int b, IOWriter *f) +{ + size_t sz = invlists.size(); + WRITE1 (sz); + size_t maxil = 0; + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + if(it->second.ids.size() > maxil) { + maxil = it->second.ids.size(); + } + } + int il_nbit = 0; + while(maxil >= ((uint64_t)1 << il_nbit)) { + il_nbit++; + } + WRITE1(il_nbit); + + // first write sizes then data, may be useful if we want to + // memmap it at some point + + // buffer for bitstrings + std::vector buf (((b + il_nbit) * sz + 7) / 8); + BitstringWriter wr (buf.data(), buf.size()); + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + wr.write (it->first, b); + wr.write (it->second.ids.size(), il_nbit); + } + WRITEVECTOR (buf); + + for (auto it = invlists.begin(); it != invlists.end(); ++it) { + WRITEVECTOR (it->second.ids); + WRITEVECTOR (it->second.vecs); + } +} + +static void write_binary_multi_hash_map( + const IndexBinaryMultiHash::Map &map, + int b, size_t ntotal, + IOWriter *f) +{ + int id_bits = 0; + while ((ntotal > ((Index::idx_t)1 << id_bits))) { + id_bits++; + } + WRITE1(id_bits); + size_t sz = map.size(); + WRITE1(sz); + size_t nbit = (b + id_bits) * sz + ntotal * id_bits; + std::vector buf((nbit + 7) / 8); + BitstringWriter wr (buf.data(), buf.size()); + for (auto it = map.begin(); it != map.end(); ++it) { + wr.write(it->first, b); + wr.write(it->second.size(), id_bits); + for (auto id : it->second) { + wr.write(id, id_bits); + } + } + WRITEVECTOR (buf); } void write_index_binary (const IndexBinary *idx, IOWriter *f) { @@ -567,6 +642,27 @@ void write_index_binary (const IndexBinary *idx, IOWriter *f) { write_index_binary_header (idxmap, f); write_index_binary (idxmap->index, f); WRITEVECTOR (idxmap->id_map); + } else if (const IndexBinaryHash *idxh = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBHh"); + WRITE1 (h); + write_index_binary_header (idxh, f); + WRITE1 (idxh->b); + WRITE1 (idxh->nflip); + write_binary_hash_invlists(idxh->invlists, idxh->b, f); + } else if (const IndexBinaryMultiHash *idxmh = + dynamic_cast (idx)) { + uint32_t h = fourcc ("IBHm"); + WRITE1 (h); + write_index_binary_header (idxmh, f); + write_index_binary (idxmh->storage, f); + WRITE1 (idxmh->b); + WRITE1 (idxmh->nhash); + WRITE1 (idxmh->nflip); + for (int i = 0; i < idxmh->nhash; i++) { + write_binary_multi_hash_map( + idxmh->maps[i], idxmh->b, idxmh->ntotal, f); + } } else { FAISS_THROW_MSG ("don't know how to serialize this type of index"); } diff --git a/core/src/index/thirdparty/faiss/impl/io.cpp b/core/src/index/thirdparty/faiss/impl/io.cpp index e8ffca6bc9..0954f3c1fc 
100644 --- a/core/src/index/thirdparty/faiss/impl/io.cpp +++ b/core/src/index/thirdparty/faiss/impl/io.cpp @@ -37,7 +37,6 @@ int IOWriter::fileno () ***********************************************************************/ - size_t VectorIOWriter::operator()( const void *ptr, size_t size, size_t nitems) { @@ -132,6 +131,117 @@ int FileIOWriter::fileno() { return ::fileno (f); } +/*********************************************************************** + * IO buffer + ***********************************************************************/ + +BufferedIOReader::BufferedIOReader(IOReader *reader, size_t bsz, size_t totsz): + reader(reader), bsz(bsz), totsz(totsz), ofs(0), b0(0), b1(0), buffer(bsz) +{ +} + + +size_t BufferedIOReader::operator()(void *ptr, size_t unitsize, size_t nitems) +{ + size_t size = unitsize * nitems; + if (size == 0) return 0; + char * dst = (char*)ptr; + size_t nb; + + { // first copy available bytes + nb = std::min(b1 - b0, size); + memcpy (dst, buffer.data() + b0, nb); + b0 += nb; + dst += nb; + size -= nb; + } + + if (size > totsz - ofs) { + size = totsz - ofs; + } + // while we would like to have more data + while (size > 0) { + assert (b0 == b1); // buffer empty on input + // try to read from main reader + b0 = 0; + b1 = (*reader)(buffer.data(), 1, std::min(bsz, size)); + + if (b1 == 0) { + // no more bytes available + break; + } + ofs += b1; + + // copy remaining bytes + size_t nb2 = std::min(b1, size); + memcpy (dst, buffer.data(), nb2); + b0 = nb2; + nb += nb2; + dst += nb2; + size -= nb2; + } + return nb / unitsize; +} + + +BufferedIOWriter::BufferedIOWriter(IOWriter *writer, size_t bsz): + writer(writer), bsz(bsz), b0(0), buffer(bsz) +{ +} + +size_t BufferedIOWriter::operator()(const void *ptr, size_t unitsize, size_t nitems) +{ + size_t size = unitsize * nitems; + if (size == 0) return 0; + const char * src = (const char*)ptr; + size_t nb; + + { // copy as many bytes as possible to buffer + nb = std::min(bsz - b0, size); + memcpy (buffer.data() + b0, src, nb); + b0 += nb; + src += nb; + size -= nb; + } + while (size > 0) { + assert(b0 == bsz); + // now we need to flush to add more bytes + size_t ofs = 0; + do { + assert (ofs < 10000000); + size_t written = (*writer)(buffer.data() + ofs, 1, bsz - ofs); + FAISS_THROW_IF_NOT(written > 0); + ofs += written; + } while(ofs != bsz); + + // copy src to buffer + size_t nb1 = std::min(bsz, size); + memcpy (buffer.data(), src, nb1); + b0 = nb1; + nb += nb1; + src += nb1; + size -= nb1; + } + + return nb / unitsize; +} + +BufferedIOWriter::~BufferedIOWriter() +{ + size_t ofs = 0; + while(ofs != b0) { + printf("Destructor write %ld \n", b0 - ofs); + size_t written = (*writer)(buffer.data() + ofs, 1, b0 - ofs); + FAISS_THROW_IF_NOT(written > 0); + ofs += written; + } + +} + + + + + uint32_t fourcc (const char sx[4]) { assert(4 == strlen(sx)); const unsigned char *x = (unsigned char*)sx; diff --git a/core/src/index/thirdparty/faiss/impl/io.h b/core/src/index/thirdparty/faiss/impl/io.h index 173d87da63..a3a565af26 100644 --- a/core/src/index/thirdparty/faiss/impl/io.h +++ b/core/src/index/thirdparty/faiss/impl/io.h @@ -9,6 +9,9 @@ /*********************************************************** * Abstract I/O objects + * + * I/O is always sequential, seek does not need to be supported + * (indexes could be read or written to a pipe). 
***********************************************************/ #pragma once @@ -92,6 +95,41 @@ struct FileIOWriter: IOWriter { int fileno() override; }; +/******************************************************* + * Buffered reader + writer + *******************************************************/ + + + +/** wraps an ioreader to make buffered reads to avoid too small reads */ +struct BufferedIOReader: IOReader { + + IOReader *reader; + size_t bsz, totsz, ofs; + size_t b0, b1; ///< range of available bytes in the buffer + std::vector buffer; + + BufferedIOReader(IOReader *reader, size_t bsz, + size_t totsz=(size_t)(-1)); + + size_t operator()(void *ptr, size_t size, size_t nitems) override; +}; + +struct BufferedIOWriter: IOWriter { + + IOWriter *writer; + size_t bsz, ofs; + size_t b0; ///< amount of data in buffer + std::vector buffer; + + BufferedIOWriter(IOWriter *writer, size_t bsz); + + size_t operator()(const void *ptr, size_t size, size_t nitems) override; + + // flushes + ~BufferedIOWriter(); +}; + /// cast a 4-character string to a uint32_t that can be written and read easily uint32_t fourcc (const char sx[4]); diff --git a/core/src/index/thirdparty/faiss/index_factory.cpp b/core/src/index/thirdparty/faiss/index_factory.cpp index 85662cb5ba..456b8e5356 100644 --- a/core/src/index/thirdparty/faiss/index_factory.cpp +++ b/core/src/index/thirdparty/faiss/index_factory.cpp @@ -14,8 +14,6 @@ #include #include -#include /* va_list, va_start, va_arg, va_end */ - #include #include diff --git a/core/src/index/thirdparty/faiss/index_factory.h b/core/src/index/thirdparty/faiss/index_factory.h index 552c63c943..ce62734298 100644 --- a/core/src/index/thirdparty/faiss/index_factory.h +++ b/core/src/index/thirdparty/faiss/index_factory.h @@ -19,7 +19,6 @@ namespace faiss { Index *index_factory (int d, const char *description, MetricType metric = METRIC_L2); -IndexBinary *index_binary_factory (int d, const char *description, MetricType metric = METRIC_L2); - +IndexBinary *index_binary_factory (int d, const char *description, MetricType metric); } diff --git a/core/src/index/thirdparty/faiss/python/faiss.py b/core/src/index/thirdparty/faiss/python/faiss.py index 3adbbf4a87..2d58b7f708 100644 --- a/core/src/index/thirdparty/faiss/python/faiss.py +++ b/core/src/index/thirdparty/faiss/python/faiss.py @@ -8,14 +8,16 @@ # not linting this file because it imports * form swigfaiss, which # causes a ton of useless warnings. 
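# A sketch of round-tripping an index through the buffered I/O wrappers
# declared in io.h above, as used from Python (the SWIG-side reference
# handling for BufferedIOReader/BufferedIOWriter is registered further down
# in this file). The 16384-byte buffer size is an arbitrary illustration;
# deleting the writer flushes whatever is still buffered.
import numpy as np
import faiss

index = faiss.IndexFlatL2(32)
index.add(np.random.rand(1000, 32).astype('float32'))

vw = faiss.VectorIOWriter()
bw = faiss.BufferedIOWriter(vw, 16384)
faiss.write_index(index, bw)
del bw                        # destructor flushes the remaining bytes

vr = faiss.VectorIOReader()
faiss.copy_array_to_vector(faiss.vector_to_array(vw.data), vr.data)
index2 = faiss.read_index(faiss.BufferedIOReader(vr, 16384))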
-from __future__ import print_function - import numpy as np import sys import inspect import pdb import platform import subprocess +import logging + + +logger = logging.getLogger(__name__) def instruction_set(): @@ -26,7 +28,7 @@ def instruction_set(): return "default" elif platform.system() == "Linux": import numpy.distutils.cpuinfo - if "avx2" in numpy.distutils.cpuinfo.cpu.info[0]['flags']: + if "avx2" in numpy.distutils.cpuinfo.cpu.info[0].get('flags', ""): return "AVX2" else: return "default" @@ -35,15 +37,15 @@ def instruction_set(): try: instr_set = instruction_set() if instr_set == "AVX2": - print("Loading faiss with AVX2 support.", file=sys.stderr) + logger.info("Loading faiss with AVX2 support.") from .swigfaiss_avx2 import * else: - print("Loading faiss.", file=sys.stderr) + logger.info("Loading faiss.") from .swigfaiss import * except ImportError: # we import * so that the symbol X can be accessed as faiss.X - print("Loading faiss.", file=sys.stderr) + logger.info("Loading faiss.") from .swigfaiss import * @@ -73,12 +75,25 @@ def replace_method(the_class, name, replacement, ignore_missing=False): def handle_Clustering(): - def replacement_train(self, x, index): - assert x.flags.contiguous + def replacement_train(self, x, index, weights=None): n, d = x.shape assert d == self.d - self.train_c(n, swig_ptr(x), index) + if weights is not None: + assert weights.shape == (n, ) + self.train_c(n, swig_ptr(x), index, swig_ptr(weights)) + else: + self.train_c(n, swig_ptr(x), index) + def replacement_train_encoded(self, x, codec, index, weights=None): + n, d = x.shape + assert d == codec.sa_code_size() + assert codec.d == index.d + if weights is not None: + assert weights.shape == (n, ) + self.train_encoded_c(n, swig_ptr(x), codec, index, swig_ptr(weights)) + else: + self.train_encoded_c(n, swig_ptr(x), codec, index) replace_method(Clustering, 'train', replacement_train) + replace_method(Clustering, 'train_encoded', replacement_train_encoded) handle_Clustering() @@ -168,7 +183,11 @@ def handle_Index(the_class): sel = x else: assert x.ndim == 1 - sel = IDSelectorBatch(x.size, swig_ptr(x)) + index_ivf = try_extract_index_ivf (self) + if index_ivf and index_ivf.direct_map.type == DirectMap.Hashtable: + sel = IDSelectorArray(x.size, swig_ptr(x)) + else: + sel = IDSelectorBatch(x.size, swig_ptr(x)) return self.remove_ids_c(sel) def replacement_reconstruct(self, key): @@ -264,6 +283,18 @@ def handle_IndexBinary(the_class): swig_ptr(labels)) return distances, labels + def replacement_range_search(self, x, thresh): + n, d = x.shape + assert d * 8 == self.d + res = RangeSearchResult(n) + self.range_search_c(n, swig_ptr(x), thresh, res) + # get pointers and copy them + lims = rev_swig_ptr(res.lims, n + 1).copy() + nd = int(lims[-1]) + D = rev_swig_ptr(res.distances, nd).copy() + I = rev_swig_ptr(res.labels, nd).copy() + return lims, D, I + def replacement_remove_ids(self, x): if isinstance(x, IDSelector): sel = x @@ -276,6 +307,7 @@ def handle_IndexBinary(the_class): replace_method(the_class, 'add_with_ids', replacement_add_with_ids) replace_method(the_class, 'train', replacement_train) replace_method(the_class, 'search', replacement_search) + replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'reconstruct', replacement_reconstruct) replace_method(the_class, 'remove_ids', replacement_remove_ids) @@ -442,6 +474,9 @@ add_ref_in_constructor(IndexBinaryIDMap2, 0) add_ref_in_method(IndexReplicas, 'addIndex', 0) add_ref_in_method(IndexBinaryReplicas, 'addIndex', 
0) +add_ref_in_constructor(BufferedIOWriter, 0) +add_ref_in_constructor(BufferedIOReader, 0) + # seems really marginal... # remove_ref_from_method(IndexReplicas, 'removeIndex', 0) @@ -463,25 +498,37 @@ if hasattr(this_module, 'GpuIndexFlat'): ########################################### -def index_cpu_to_gpu_multiple_py(resources, index, co=None): - """builds the C++ vectors for the GPU indices and the - resources. Handles the common case where the resources are assigned to - the first len(resources) GPUs""" +def index_cpu_to_gpu_multiple_py(resources, index, co=None, gpus=None): + """ builds the C++ vectors for the GPU indices and the + resources. Handles the case where the resources are assigned to + the list of GPUs """ + if gpus is None: + gpus = range(len(resources)) vres = GpuResourcesVector() vdev = IntVector() - for i, res in enumerate(resources): + for i, res in zip(gpus, resources): vdev.push_back(i) vres.push_back(res) index = index_cpu_to_gpu_multiple(vres, vdev, index, co) index.referenced_objects = resources return index + def index_cpu_to_all_gpus(index, co=None, ngpu=-1): - if ngpu == -1: - ngpu = get_num_gpus() - res = [StandardGpuResources() for i in range(ngpu)] - index2 = index_cpu_to_gpu_multiple_py(res, index, co) - return index2 + index_gpu = index_cpu_to_gpus_list(index, co=co, gpus=None, ngpu=ngpu) + return index_gpu + + +def index_cpu_to_gpus_list(index, co=None, gpus=None, ngpu=-1): + """ Here we can pass list of GPU ids as a parameter or ngpu to + use first n GPU's. gpus mut be a list or None""" + if (gpus is None) and (ngpu == -1): # All blank + gpus = range(get_num_gpus()) + elif (gpus is None) and (ngpu != -1): # Get number of GPU's only + gpus = range(ngpu) + res = [StandardGpuResources() for _ in gpus] + index_gpu = index_cpu_to_gpu_multiple_py(res, index, co, gpus) + return index_gpu ########################################### @@ -670,7 +717,7 @@ class Kmeans: setattr(self.cp, k, v) self.centroids = None - def train(self, x): + def train(self, x, weights=None): n, d = x.shape assert d == self.d clus = Clustering(d, self.k, self.cp) @@ -684,10 +731,13 @@ class Kmeans: else: ngpu = self.gpu self.index = index_cpu_to_all_gpus(self.index, ngpu=ngpu) - clus.train(x, self.index) + clus.train(x, self.index, weights) centroids = vector_float_to_array(clus.centroids) self.centroids = centroids.reshape(self.k, d) - self.obj = vector_float_to_array(clus.obj) + stats = clus.iteration_stats + self.obj = np.array([ + stats.at(i).obj for i in range(stats.size()) + ]) return self.obj[-1] if self.obj.size > 0 else 0.0 def assign(self, x): @@ -716,3 +766,47 @@ def deserialize_index(data): reader = VectorIOReader() copy_array_to_vector(data, reader.data) return read_index(reader) + +def serialize_index_binary(index): + """ convert an index to a numpy uint8 array """ + writer = VectorIOWriter() + write_index_binary(index, writer) + return vector_to_array(writer.data) + +def deserialize_index_binary(data): + reader = VectorIOReader() + copy_array_to_vector(data, reader.data) + return read_index_binary(reader) + + +########################################### +# ResultHeap +########################################### + +class ResultHeap: + """Accumulate query results from a sliced dataset. 
The final result will + be in self.D, self.I.""" + + def __init__(self, nq, k): + " nq: number of query vectors, k: number of results per query " + self.I = np.zeros((nq, k), dtype='int64') + self.D = np.zeros((nq, k), dtype='float32') + self.nq, self.k = nq, k + heaps = float_maxheap_array_t() + heaps.k = k + heaps.nh = nq + heaps.val = swig_ptr(self.D) + heaps.ids = swig_ptr(self.I) + heaps.heapify() + self.heaps = heaps + + def add_result(self, D, I): + """D, I do not need to be in a particular order (heap or sorted)""" + assert D.shape == (self.nq, self.k) + assert I.shape == (self.nq, self.k) + self.heaps.addn_with_ids( + self.k, faiss.swig_ptr(D), + faiss.swig_ptr(I), self.k) + + def finalize(self): + self.heaps.reorder() diff --git a/core/src/index/thirdparty/faiss/python/setup.py b/core/src/index/thirdparty/faiss/python/setup.py index 592f3730ea..89b6d398cb 100644 --- a/core/src/index/thirdparty/faiss/python/setup.py +++ b/core/src/index/thirdparty/faiss/python/setup.py @@ -32,7 +32,7 @@ are implemented on the GPU. It is developed by Facebook AI Research. """ setup( name='faiss', - version='1.6.0', + version='1.6.3', description='A library for efficient similarity search and clustering of dense vectors', long_description=long_description, url='https://github.com/facebookresearch/faiss', diff --git a/core/src/index/thirdparty/faiss/python/swigfaiss.swig b/core/src/index/thirdparty/faiss/python/swigfaiss.swig index 726823bee4..b0d8b8173d 100644 --- a/core/src/index/thirdparty/faiss/python/swigfaiss.swig +++ b/core/src/index/thirdparty/faiss/python/swigfaiss.swig @@ -25,7 +25,8 @@ #pragma SWIG nowarn=512 %include -typedef int64_t size_t; + +typedef uint64_t size_t; #define __restrict @@ -92,6 +93,7 @@ extern "C" { #include #include #include +#include #include #include @@ -198,6 +200,7 @@ namespace std { // uint64_t as unsigned long long, which SWIG is not aware of. %template(Uint64Vector) std::vector; %template(LongVector) std::vector; +%template(LongLongVector) std::vector; %template(IntVector) std::vector; %template(FloatVectorVector) std::vector >; %template(ByteVectorVector) std::vector >; @@ -206,6 +209,7 @@ namespace std { %template(OperatingPointVector) std::vector; %template(InvertedListsPtrVector) std::vector; %template(RepeatVector) std::vector; +%template(ClusteringIterationStatsVector) std::vector; #ifdef GPU_WRAPPER %template(GpuResourcesVector) std::vector; @@ -309,6 +313,7 @@ void gpu_sync_all_devices() %include %include +%include %include %include @@ -325,12 +330,14 @@ void gpu_sync_all_devices() %include %include %include +%include %ignore InvertedListScanner; %ignore BinaryInvertedListScanner; %include // NOTE(hoss): SWIG (wrongly) believes the overloaded const version shadows the // non-const one. 
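# A usage sketch for the ResultHeap helper defined in faiss.py above:
# searching a database slice by slice while keeping only the global top-k
# per query. The sizes, the slice loop and the random data are illustrative.
import numpy as np
import faiss

d, nq, k, slice_sz = 32, 10, 5, 25000
xq = np.random.rand(nq, d).astype('float32')
rh = faiss.ResultHeap(nq, k)
for i0 in range(0, 4 * slice_sz, slice_sz):   # database visited in 4 slices
    xb = np.random.rand(slice_sz, d).astype('float32')
    index = faiss.IndexFlatL2(d)
    index.add(xb)
    D, I = index.search(xq, k)
    rh.add_result(D, I + i0)                  # shift ids to global numbering
rh.finalize()
# merged results are now in rh.D / rh.I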
%warnfilter(509) extract_index_ivf; +%warnfilter(509) try_extract_index_ivf; %include %include %include @@ -353,6 +360,7 @@ void gpu_sync_all_devices() %include %include %include +%include @@ -973,6 +981,124 @@ struct MapLong2Long { %} +/******************************************************************* + * Support I/O to arbitrary functions + *******************************************************************/ + + +%inline %{ + +#ifdef SWIGPYTHON + + +struct PyCallbackIOWriter: faiss::IOWriter { + + PyObject * callback; + size_t bs; // maximum write size + + PyCallbackIOWriter(PyObject *callback, + size_t bs = 1024 * 1024): + callback(callback), bs(bs) { + Py_INCREF(callback); + name = "PyCallbackIOWriter"; + } + + size_t operator()(const void *ptrv, size_t size, size_t nitems) override { + size_t ws = size * nitems; + const char *ptr = (const char*)ptrv; + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + while(ws > 0) { + size_t wi = ws > bs ? bs : ws; + PyObject* bo = PyBytes_FromStringAndSize(ptr, wi); + PyObject *arglist = Py_BuildValue("(N)", bo); + if(!arglist) { + PyGILState_Release(gstate); + return 0; + } + ptr += wi; + ws -= wi; + PyObject * result = PyObject_CallObject(callback, arglist); + Py_DECREF(arglist); + if (result == NULL) { + PyGILState_Release(gstate); + return 0; + } + Py_DECREF(result); + } + PyGILState_Release(gstate); + return nitems; + } + + ~PyCallbackIOWriter() { + Py_DECREF(callback); + } + +}; + +struct PyCallbackIOReader: faiss::IOReader { + + PyObject * callback; + size_t bs; // maximum buffer size + + PyCallbackIOReader(PyObject *callback, + size_t bs = 1024 * 1024): + callback(callback), bs(bs) { + Py_INCREF(callback); + name = "PyCallbackIOReader"; + } + + size_t operator()(void *ptrv, size_t size, size_t nitems) override { + size_t rs = size * nitems; + char *ptr = (char*)ptrv; + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + while(rs > 0) { + size_t ri = rs > bs ? bs : rs; + PyObject *arglist = Py_BuildValue("(n)", ri); + PyObject * result = PyObject_CallObject(callback, arglist); + Py_DECREF(arglist); + if (result == NULL) { + PyGILState_Release(gstate); + return 0; + } + if(!PyBytes_Check(result)) { + Py_DECREF(result); + PyErr_SetString(PyExc_RuntimeError, + "read callback did not return a bytes object"); + PyGILState_Release(gstate); + throw faiss::FaissException("reader error"); + } + size_t sz = PyBytes_Size(result); + if (sz == 0 || sz > rs) { + Py_DECREF(result); + PyErr_Format(PyExc_RuntimeError, + "read callback returned %ld bytes (asked %ld)", + sz, rs); + PyGILState_Release(gstate); + throw faiss::FaissException("reader error"); + } + memcpy(ptr, PyBytes_AsString(result), sz); + Py_DECREF(result); + ptr += sz; + rs -= sz; + } + PyGILState_Release(gstate); + return nitems; + } + + ~PyCallbackIOReader() { + Py_DECREF(callback); + } + +}; + +#endif + +%} + + + %inline %{ void wait() { // in gdb, use return to get out of this function diff --git a/core/src/index/thirdparty/faiss/tests/common.py b/core/src/index/thirdparty/faiss/tests/common.py index b6bc37ef17..8621dd822a 100644 --- a/core/src/index/thirdparty/faiss/tests/common.py +++ b/core/src/index/thirdparty/faiss/tests/common.py @@ -3,10 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! 
/usr/bin/env python2 - # a few common functions for the tests +from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np import faiss @@ -97,3 +97,32 @@ def get_dataset_2(d, nt, nb, nq): x = np.sin(x) x = x.astype('float32') return x[:nt], x[nt:nt + nb], x[nt + nb:] + + +def make_binary_dataset(d, nt, nb, nq): + assert d % 8 == 0 + rs = np.random.RandomState(123) + x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') + return x[:nt], x[nt:-nq], x[-nq:] + + +def compare_binary_result_lists(D1, I1, D2, I2): + """comparing result lists is difficult because there are many + ties. Here we sort by (distance, index) pairs and ignore the largest + distance of each result. Compatible result lists should pass this.""" + assert D1.shape == I1.shape == D2.shape == I2.shape + n, k = D1.shape + ndiff = (D1 != D2).sum() + assert ndiff == 0, '%d differences in distance matrix %s' % ( + ndiff, D1.shape) + + def normalize_DI(D, I): + norm = I.max() + 1.0 + Dr = D.astype('float64') + I / norm + # ignore -1s and elements on last column + Dr[I1 == -1] = 1e20 + Dr[D == D[:, -1:]] = 1e20 + Dr.sort(axis=1) + return Dr + ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum() + assert ndiff == 0, '%d differences in normalized D matrix' % ndiff diff --git a/core/src/index/thirdparty/faiss/tests/test_binary_factory.py b/core/src/index/thirdparty/faiss/tests/test_binary_factory.py index dfe618cc38..70ddbb6e99 100644 --- a/core/src/index/thirdparty/faiss/tests/test_binary_factory.py +++ b/core/src/index/thirdparty/faiss/tests/test_binary_factory.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 +from __future__ import absolute_import, division, print_function import unittest import faiss diff --git a/core/src/index/thirdparty/faiss/tests/test_binary_hashindex.py b/core/src/index/thirdparty/faiss/tests/test_binary_hashindex.py new file mode 100644 index 0000000000..1ee5a5f7da --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_binary_hashindex.py @@ -0,0 +1,183 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +#!/usr/bin/env python3 + +import unittest +import numpy as np +import faiss + +from common import make_binary_dataset + + +def bitvec_shuffle(a, order): + n, d = a.shape + db, = order.shape + b = np.empty((n, db // 8), dtype='uint8') + faiss.bitvec_shuffle( + n, d * 8, db, + faiss.swig_ptr(order), + faiss.swig_ptr(a), faiss.swig_ptr(b)) + return b + + +class TestSmallFuncs(unittest.TestCase): + + def test_shuffle(self): + d = 256 + n = 1000 + rs = np.random.RandomState(123) + o = rs.permutation(d).astype('int32') + + x = rs.randint(256, size=(n, d // 8)).astype('uint8') + + y1 = bitvec_shuffle(x, o[:128]) + y2 = bitvec_shuffle(x, o[128:]) + y = np.hstack((y1, y2)) + + oinv = np.empty(d, dtype='int32') + oinv[o] = np.arange(d) + z = bitvec_shuffle(y, oinv) + + np.testing.assert_array_equal(x, z) + + +class TestRange(unittest.TestCase): + + def test_hash(self): + d = 128 + nq = 100 + nb = 2000 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + + radius = 55 + + Lref, Dref, Iref = index_ref.range_search(xq, radius) + + print("nb res: ", Lref[-1]) + + index = faiss.IndexBinaryHash(d, 10) + index.add(xb) + # index.display() + nfound = [] + ndis = [] + stats = faiss.cvar.indexBinaryHash_stats + for n_bitflips in range(index.b + 1): + index.nflip = n_bitflips + stats.reset() + Lnew, Dnew, Inew = index.range_search(xq, radius) + for i in range(nq): + ref = Iref[Lref[i]:Lref[i + 1]] + new = Inew[Lnew[i]:Lnew[i + 1]] + snew = set(new) + # no duplicates + self.assertTrue(len(new) == len(snew)) + # subset of real results + self.assertTrue(snew <= set(ref)) + nfound.append(Lnew[-1]) + ndis.append(stats.ndis) + print('nfound=', nfound) + print('ndis=', ndis) + nfound = np.array(nfound) + self.assertTrue(nfound[-1] == Lref[-1]) + self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) + + def test_multihash(self): + d = 128 + nq = 100 + nb = 2000 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + + radius = 55 + + Lref, Dref, Iref = index_ref.range_search(xq, radius) + + print("nb res: ", Lref[-1]) + + nfound = [] + ndis = [] + + for nh in 1, 3, 5: + index = faiss.IndexBinaryMultiHash(d, nh, 10) + index.add(xb) + # index.display() + stats = faiss.cvar.indexBinaryHash_stats + index.nflip = 2 + stats.reset() + Lnew, Dnew, Inew = index.range_search(xq, radius) + for i in range(nq): + ref = Iref[Lref[i]:Lref[i + 1]] + new = Inew[Lnew[i]:Lnew[i + 1]] + snew = set(new) + # no duplicates + self.assertTrue(len(new) == len(snew)) + # subset of real results + self.assertTrue(snew <= set(ref)) + nfound.append(Lnew[-1]) + ndis.append(stats.ndis) + print('nfound=', nfound) + print('ndis=', ndis) + nfound = np.array(nfound) + # self.assertTrue(nfound[-1] == Lref[-1]) + self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) + + +class TestKnn(unittest.TestCase): + + def test_hash_and_multihash(self): + d = 128 + nq = 100 + nb = 2000 + + (_, xb, xq) = make_binary_dataset(d, 0, nb, nq) + + index_ref = faiss.IndexBinaryFlat(d) + index_ref.add(xb) + k = 10 + Dref, Iref = index_ref.search(xq, k) + + nfound = {} + for nh in 0, 1, 3, 5: + + for nbit in 4, 7: + if nh == 0: + index = faiss.IndexBinaryHash(d, nbit) + else: + index = faiss.IndexBinaryMultiHash(d, nh, nbit) + index.add(xb) + index.nflip = 2 + Dnew, Inew = index.search(xq, k) + nf = 0 + for i in range(nq): + ref = Iref[i] + new = Inew[i] + snew = set(new) + # no duplicates + self.assertTrue(len(new) == len(snew)) + nf += len(set(ref) & snew) + 
print('nfound', nh, nbit, nf) + nfound[(nh, nbit)] = nf + self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)]) + + # test serialization + index2 = faiss.deserialize_index_binary( + faiss.serialize_index_binary(index)) + + D2, I2 = index2.search(xq, k) + np.testing.assert_array_equal(Inew, I2) + np.testing.assert_array_equal(Dnew, D2) + + print('nfound=', nfound) + self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)])) + self.assertGreater(nfound[(3, 7)], nfound[(1, 7)]) + self.assertGreater(nfound[(5, 7)], nfound[(3, 7)]) diff --git a/core/src/index/thirdparty/faiss/tests/test_binary_io.py b/core/src/index/thirdparty/faiss/tests/test_binary_io.py index 8cdc91df7a..4af7dab9ca 100644 --- a/core/src/index/thirdparty/faiss/tests/test_binary_io.py +++ b/core/src/index/thirdparty/faiss/tests/test_binary_io.py @@ -3,10 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 - """Binary indexes (de)serialization""" +from __future__ import absolute_import, division, print_function, unicode_literals + import numpy as np import unittest import faiss diff --git a/core/src/index/thirdparty/faiss/tests/test_build_blocks.py b/core/src/index/thirdparty/faiss/tests/test_build_blocks.py index 2c31bf7aeb..d1ce73cd1b 100644 --- a/core/src/index/thirdparty/faiss/tests/test_build_blocks.py +++ b/core/src/index/thirdparty/faiss/tests/test_build_blocks.py @@ -3,13 +3,16 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 +from __future__ import absolute_import, division, print_function import numpy as np import faiss import unittest +from common import get_dataset_2 + + class TestClustering(unittest.TestCase): @@ -62,17 +65,19 @@ class TestClustering(unittest.TestCase): rs = np.random.RandomState(123) x = rs.uniform(size=(n, d)).astype('float32') + # make sure that doing 10 redos yields a better objective than just 1 + clus = faiss.Clustering(d, 20) clus.nredo = 1 clus.train(x, faiss.IndexFlatL2(d)) - obj1 = faiss.vector_to_array(clus.obj) + obj1 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj clus = faiss.Clustering(d, 20) clus.nredo = 10 clus.train(x, faiss.IndexFlatL2(d)) - obj10 = faiss.vector_to_array(clus.obj) + obj10 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj - self.assertGreater(obj1[-1], obj10[-1]) + self.assertGreater(obj1, obj10) def test_1ptpercluster(self): # https://github.com/facebookresearch/faiss/issues/842 @@ -84,6 +89,89 @@ class TestClustering(unittest.TestCase): kmeans.train(X) l2_distances, I = kmeans.index.search(X, 1) + def test_weighted(self): + d = 32 + sigma = 0.1 + + # Data is naturally clustered in 10 clusters. 
+ # 5 clusters have 100 points + # 5 clusters have 10 points + # run k-means with 5 clusters + + ccent = faiss.randn((10, d), 123) + faiss.normalize_L2(ccent) + x = [ccent[i] + sigma * faiss.randn((100, d), 1234 + i) for i in range(5)] + x += [ccent[i] + sigma * faiss.randn((10, d), 1234 + i) for i in range(5, 10)] + x = np.vstack(x) + + clus = faiss.Clustering(d, 5) + index = faiss.IndexFlatL2(d) + clus.train(x, index) + cdis1, perm1 = index.search(ccent, 1) + + # distance^2 of ground-truth centroids to clusters + cdis1_first = cdis1[:5].sum() + cdis1_last = cdis1[5:].sum() + + # now assign weight 0.1 to the 5 first clusters and weight 10 + # to the 5 last ones and re-run k-means + weights = np.ones(100 * 5 + 10 * 5, dtype='float32') + weights[:100 * 5] = 0.1 + weights[100 * 5:] = 10 + + clus = faiss.Clustering(d, 5) + index = faiss.IndexFlatL2(d) + clus.train(x, index, weights=weights) + cdis2, perm2 = index.search(ccent, 1) + + # distance^2 of ground-truth centroids to clusters + cdis2_first = cdis2[:5].sum() + cdis2_last = cdis2[5:].sum() + + print(cdis1_first, cdis1_last) + print(cdis2_first, cdis2_last) + + # with the new clustering, the last ones should be much (*2) closer + # to their centroids + self.assertGreater(cdis1_last, cdis1_first * 2) + self.assertGreater(cdis2_first, cdis2_last * 2) + + def test_encoded(self): + d = 32 + k = 5 + xt, xb, xq = get_dataset_2(d, 1000, 0, 0) + + # make sure that training on a compressed then decompressed + # dataset gives the same result as decompressing on-the-fly + + codec = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_4bit) + codec.train(xt) + codes = codec.sa_encode(xt) + + xt2 = codec.sa_decode(codes) + + clus = faiss.Clustering(d, k) + # clus.verbose = True + clus.niter = 0 + index = faiss.IndexFlatL2(d) + clus.train(xt2, index) + ref_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d) + + _, ref_errs = index.search(xt2, 1) + + clus = faiss.Clustering(d, k) + # clus.verbose = True + clus.niter = 0 + clus.decode_block_size = 120 + index = faiss.IndexFlatL2(d) + clus.train_encoded(codes, codec, index) + new_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d) + + _, new_errs = index.search(xt2, 1) + + # It's the same operation, so the results should be bit-exact + self.assertTrue(np.all(ref_centroids == new_centroids)) + class TestPCA(unittest.TestCase): @@ -120,7 +208,7 @@ class TestProductQuantizer(unittest.TestCase): x2 = pq.decode(codes) diff = ((x - x2)**2).sum() - # print "diff=", diff + # print("diff=", diff) # diff= 4418.0562 self.assertGreater(5000, diff) @@ -191,22 +279,20 @@ class TestException(unittest.TestCase): a = np.zeros((5, 10), dtype='float32') b = np.zeros(5, dtype='int64') - try: - # an unsupported operation for IndexFlat - index.add_with_ids(a, b) - except RuntimeError as e: - assert 'add_with_ids not implemented' in str(e) - else: - assert False, 'exception did not fire???' + # an unsupported operation for IndexFlat + self.assertRaises( + RuntimeError, + index.add_with_ids, a, b + ) + # assert 'add_with_ids not implemented' in str(e) def test_exception_2(self): + self.assertRaises( + RuntimeError, + faiss.index_factory, 12, 'IVF256,Flat,PQ8' + ) + # assert 'could not parse' in str(e) - try: - faiss.index_factory(12, 'IVF256,Flat,PQ8') - except RuntimeError as e: - assert 'could not parse' in str(e) - else: - assert False, 'exception did not fire???'
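The assertRaises rewrites above drop the checks on the exception message (they survive only as comments). If the message still matters, the context-manager form of assertRaises keeps the check without the try/except boilerplate; a sketch reusing the same faiss call:

```python
import unittest
import faiss

class TestExceptionMessage(unittest.TestCase):

    def test_exception_message(self):
        # the context manager captures the exception for inspection
        with self.assertRaises(RuntimeError) as cm:
            faiss.index_factory(12, 'IVF256,Flat,PQ8')
        self.assertIn('could not parse', str(cm.exception))
```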
class TestMapLong2Long(unittest.TestCase): @@ -273,7 +359,7 @@ class TestMAdd(unittest.TestCase): rs = np.random.RandomState(123) swig_ptr = faiss.swig_ptr for dim in 16, 32, 20, 25: - for repeat in 1, 2, 3, 4, 5: + for _repeat in 1, 2, 3, 4, 5: a = rs.rand(dim).astype('float32') b = rs.rand(dim).astype('float32') c = np.zeros(dim, dtype='float32') @@ -325,7 +411,7 @@ class TestMatrixStats(unittest.TestCase): m = rs.rand(40, 20).astype('float32') m[5:10] = 0 comments = faiss.MatrixStats(m).comments - print comments + print(comments) assert 'has 5 copies' in comments assert '5 null vectors' in comments @@ -334,7 +420,7 @@ class TestMatrixStats(unittest.TestCase): m = rs.rand(40, 20).astype('float32') m[::2] = m[1::2] comments = faiss.MatrixStats(m).comments - print comments + print(comments) assert '20 vectors are distinct' in comments def test_dead_dims(self): @@ -342,7 +428,7 @@ class TestMatrixStats(unittest.TestCase): m = rs.rand(40, 20).astype('float32') m[:, 5:10] = 0 comments = faiss.MatrixStats(m).comments - print comments + print(comments) assert '5 dimensions are constant' in comments def test_rogue_means(self): @@ -350,7 +436,7 @@ class TestMatrixStats(unittest.TestCase): m = rs.rand(40, 20).astype('float32') m[:, 5:10] += 12345 comments = faiss.MatrixStats(m).comments - print comments + print(comments) assert '5 dimensions are too large wrt. their variance' in comments def test_normalized(self): @@ -358,7 +444,7 @@ class TestMatrixStats(unittest.TestCase): m = rs.rand(40, 20).astype('float32') faiss.normalize_L2(m) comments = faiss.MatrixStats(m).comments - print comments + print(comments) assert 'vectors are normalized' in comments @@ -366,7 +452,7 @@ class TestScalarQuantizer(unittest.TestCase): def test_8bit_equiv(self): rs = np.random.RandomState(123) - for it in range(20): + for _it in range(20): for d in 13, 16, 24: x = np.floor(rs.rand(5, d) * 256).astype('float32') x[0] = 0 @@ -483,6 +569,31 @@ class TestPairwiseDis(unittest.TestCase): dis[i], np.dot(x[ix[i]], y[iy[i]])) +class TestSWIGWrap(unittest.TestCase): + """ various regressions with the SWIG wrapper """ + + def test_size_t_ptr(self): + # issue 1064 + index = faiss.IndexHNSWFlat(10, 32) + + hnsw = index.hnsw + index.add(np.random.rand(100, 10).astype('float32')) + be = np.empty(2, 'uint64') + hnsw.neighbor_range(23, 0, faiss.swig_ptr(be), faiss.swig_ptr(be[1:])) + + def test_id_map_at(self): + # issue 1020 + n_features = 100 + feature_dims = 10 + + features = np.random.random((n_features, feature_dims)).astype(np.float32) + idx = np.arange(n_features).astype(np.int64) + + index = faiss.IndexFlatL2(feature_dims) + index = faiss.IndexIDMap2(index) + index.add_with_ids(features, idx) + + [index.id_map.at(int(i)) for i in range(index.ntotal)] if __name__ == '__main__': diff --git a/core/src/index/thirdparty/faiss/tests/test_extra_distances.py b/core/src/index/thirdparty/faiss/tests/test_extra_distances.py index 3d87669a2a..3977075879 100644 --- a/core/src/index/thirdparty/faiss/tests/test_extra_distances.py +++ b/core/src/index/thirdparty/faiss/tests/test_extra_distances.py @@ -106,6 +106,12 @@ class TestKNN(unittest.TestCase): for q in range(nq): assert np.all(D[q] == dis[q, I[q]]) + index2 = faiss.deserialize_index(faiss.serialize_index(index)) + + D2, I2 = index2.search(xq, 10) + + self.assertTrue(np.all(I == I2)) + def test_L1(self): self.do_test_knn(faiss.METRIC_L1) diff --git a/core/src/index/thirdparty/faiss/tests/test_factory.py b/core/src/index/thirdparty/faiss/tests/test_factory.py index 
968d52ceaa..e08b0ca850 100644 --- a/core/src/index/thirdparty/faiss/tests/test_factory.py +++ b/core/src/index/thirdparty/faiss/tests/test_factory.py @@ -3,8 +3,9 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 +from __future__ import absolute_import, division, print_function +import numpy as np import unittest import faiss @@ -50,3 +51,14 @@ class TestFactory(unittest.TestCase): def test_factory_4(self): index = faiss.index_factory(12, "IVF10,FlatDedup") assert index.instances is not None + + +class TestCloneSize(unittest.TestCase): + + def test_clone_size(self): + index = faiss.index_factory(20, 'PCA10,Flat') + xb = faiss.rand((100, 20)) + index.train(xb) + index.add(xb) + index2 = faiss.clone_index(index) + assert index2.ntotal == 100 diff --git a/core/src/index/thirdparty/faiss/tests/test_index.py b/core/src/index/thirdparty/faiss/tests/test_index.py index 429ba1fb0d..c41f7f8c0b 100644 --- a/core/src/index/thirdparty/faiss/tests/test_index.py +++ b/core/src/index/thirdparty/faiss/tests/test_index.py @@ -3,9 +3,9 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 - """this is a basic test script for simple indices work""" +from __future__ import absolute_import, division, print_function +# no unicode_literals because it messes up in py2 import numpy as np import unittest @@ -13,7 +13,7 @@ import faiss import tempfile import os import re - +import warnings from common import get_dataset, get_dataset_2 @@ -24,7 +24,6 @@ class TestModuleInterface(unittest.TestCase): assert re.match('^\\d+\\.\\d+\\.\\d+$', faiss.__version__) - class EvalIVFPQAccuracy(unittest.TestCase): def test_IndexIVFPQ(self): @@ -487,38 +486,25 @@ class TestHNSW(unittest.TestCase): # infinite loop index.add(zero_vecs) + def test_hnsw_IP(self): + d = self.xq.shape[1] -class TestIOError(unittest.TestCase): + index_IP = faiss.IndexFlatIP(d) + index_IP.add(self.xb) + Dref, Iref = index_IP.search(self.xq, 1) - def test_io_error(self): - d, n = 32, 1000 - x = np.random.uniform(size=(n, d)).astype('float32') - index = faiss.IndexFlatL2(d) - index.add(x) - _, fname = tempfile.mkstemp() - try: - faiss.write_index(index, fname) + index = faiss.IndexHNSWFlat(d, 16, faiss.METRIC_INNER_PRODUCT) + index.add(self.xb) + Dhnsw, Ihnsw = index.search(self.xq, 1) - # should be fine - faiss.read_index(fname) + print('nb equal: ', (Iref == Ihnsw).sum()) - # now damage file - data = open(fname, 'rb').read() - data = data[:int(len(data) / 2)] - open(fname, 'wb').write(data) + self.assertGreaterEqual((Iref == Ihnsw).sum(), 480) + + mask = Iref[:, 0] == Ihnsw[:, 0] + assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0]) - # should make a nice readable exception that mentions the - try: - faiss.read_index(fname) - except RuntimeError as e: - if fname not in str(e): - raise - else: - raise - finally: - if os.path.exists(fname): - os.unlink(fname) class TestDistancesPositive(unittest.TestCase): @@ -546,7 +532,7 @@ class TestDistancesPositive(unittest.TestCase): class TestReconsException(unittest.TestCase): - def test_recons(self): + def test_recons_exception(self): d = 64 # dimension nb = 1000 @@ -561,14 +547,93 @@ class TestReconsException(unittest.TestCase): index.reconstruct(9) - try: - index.reconstruct(100001) - except RuntimeError: - pass - else: - assert False, "should raise an exception" + self.assertRaises( + RuntimeError, + 
index.reconstruct, 100001 + ) + + def test_reconstruct_after_add(self): + index = faiss.index_factory(10, 'IVF5,SQfp16') + index.train(faiss.randn((100, 10), 123)) + index.add(faiss.randn((100, 10), 345)) + index.make_direct_map() + index.add(faiss.randn((100, 10), 678)) + + # should not raise an exception + index.reconstruct(5) + print(index.ntotal) + index.reconstruct(150) +class TestReconsHash(unittest.TestCase): + + def do_test(self, index_key): + d = 32 + index = faiss.index_factory(d, index_key) + index.train(faiss.randn((100, d), 123)) + + # reference reconstruction + index.add(faiss.randn((100, d), 345)) + index.add(faiss.randn((100, d), 678)) + ref_recons = index.reconstruct_n(0, 200) + + # with lookup + index.reset() + rs = np.random.RandomState(123) + ids = rs.choice(10000, size=200, replace=False) + index.add_with_ids(faiss.randn((100, d), 345), ids[:100]) + index.set_direct_map_type(faiss.DirectMap.Hashtable) + index.add_with_ids(faiss.randn((100, d), 678), ids[100:]) + + # compare + for i in range(0, 200, 13): + recons = index.reconstruct(int(ids[i])) + self.assertTrue(np.all(recons == ref_recons[i])) + + # test I/O + buf = faiss.serialize_index(index) + index2 = faiss.deserialize_index(buf) + + # compare + for i in range(0, 200, 13): + recons = index2.reconstruct(int(ids[i])) + self.assertTrue(np.all(recons == ref_recons[i])) + + # remove + toremove = np.ascontiguousarray(ids[0:200:3]) + + sel = faiss.IDSelectorArray(50, faiss.swig_ptr(toremove[:50])) + + # test both ways of removing elements + nremove = index2.remove_ids(sel) + nremove += index2.remove_ids(toremove[50:]) + + self.assertEqual(nremove, len(toremove)) + + for i in range(0, 200, 13): + if i % 3 == 0: + self.assertRaises( + RuntimeError, + index2.reconstruct, int(ids[i]) + ) + else: + recons = index2.reconstruct(int(ids[i])) + self.assertTrue(np.all(recons == ref_recons[i])) + + # index error should raise + self.assertRaises( + RuntimeError, + index.reconstruct, 20000 + ) + + def test_IVFFlat(self): + self.do_test("IVF5,Flat") + + def test_IVFSQ(self): + self.do_test("IVF5,SQfp16") + + def test_IVFPQ(self): + self.do_test("IVF5,PQ4x4np") if __name__ == '__main__': unittest.main() diff --git a/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py b/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py index 41244da326..d97362f843 100644 --- a/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py +++ b/core/src/index/thirdparty/faiss/tests/test_index_accuracy.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals # noqa E741 # translation of test_knn.lua @@ -445,7 +445,37 @@ class TestPQFlavors(unittest.TestCase): print('ndiff %d / %d' % (ndiff, ntot)) assert ndiff < ntot * 0.02 + def test_IVFPQ_non8bit(self): + d = 16 + xt, xb, xq = get_dataset_2(d, 10000, 2000, 200) + nlist = 64 + gt_index = faiss.IndexFlat(d) + gt_index.add(xb) + gt_D, gt_I = gt_index.search(xq, 10) + + quantizer = faiss.IndexFlat(d) + ninter = {} + for v in '2x8', '8x2': + if v == '8x2': + index = faiss.IndexIVFPQ( + quantizer, d, nlist, 2, 8) + else: + index = faiss.IndexIVFPQ( + quantizer, d, nlist, 8, 2) + index.train(xt) + index.add(xb) + index.nprobe = 16 + + D, I = index.search(xq, 10) + ninter[v] = faiss.eval_intersection(I, gt_I) + print('ninter=', ninter) + # this should be the case but we don't observe + # that...
Probably too few test points + # assert ninter['2x8'] > ninter['8x2'] + # ref numbers on 2019-11-02 + assert abs(ninter['2x8'] - 458) < 4 + assert abs(ninter['8x2'] - 465) < 4 class TestFlat1D(unittest.TestCase): diff --git a/core/src/index/thirdparty/faiss/tests/test_index_binary.py b/core/src/index/thirdparty/faiss/tests/test_index_binary.py index 046e2bb3e9..c61e2fa5df 100644 --- a/core/src/index/thirdparty/faiss/tests/test_index_binary.py +++ b/core/src/index/thirdparty/faiss/tests/test_index_binary.py @@ -3,20 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 - """this is a basic test script for simple indices work""" +from __future__ import absolute_import, division, print_function, unicode_literals import numpy as np import unittest import faiss +from common import compare_binary_result_lists, make_binary_dataset -def make_binary_dataset(d, nt, nb, nq): - assert d % 8 == 0 - rs = np.random.RandomState(123) - x = rs.randint(256, size=(nb + nq + nt, int(d / 8))).astype('uint8') - return x[:nt], x[nt:-nq], x[-nq:] def binary_to_float(x): @@ -125,6 +120,29 @@ class TestBinaryFlat(unittest.TestCase): assert(np.all(Iflat == -1)) assert(np.all(Dflat == 2147483647)) # NOTE(hoss): int32_t max + def test_range_search(self): + d = self.xq.shape[1] * 8 + + index = faiss.IndexBinaryFlat(d) + index.add(self.xb) + D, I = index.search(self.xq, 10) + thresh = int(np.median(D[:, -1])) + + lims, D2, I2 = index.range_search(self.xq, thresh) + nt1 = nt2 = 0 + for i in range(len(self.xq)): + range_res = I2[lims[i]:lims[i + 1]] + if thresh > D[i, -1]: + self.assertTrue(set(I[i]) <= set(range_res)) + nt1 += 1 + elif thresh < D[i, -1]: + self.assertTrue(set(range_res) <= set(I[i])) + nt2 += 1 + # in case of equality we have a problem with ties + print('nb tests', nt1, nt2) + # nb tests is actually low...
+ self.assertTrue(nt1 > 19 and nt2 > 19) + class TestBinaryIVF(unittest.TestCase): @@ -167,6 +185,29 @@ class TestBinaryIVF(unittest.TestCase): self.assertEqual((self.Dref == Divfflat).sum(), 4122) + def test_ivf_range(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(self.xt) + index.add(self.xb) + D, I = index.search(self.xq, 10) + + radius = int(np.median(D[:, -1]) + 1) + Lr, Dr, Ir = index.range_search(self.xq, radius) + + for i in range(len(self.xq)): + res = Ir[Lr[i]:Lr[i + 1]] + if D[i, -1] < radius: + self.assertTrue(set(I[i]) <= set(res)) + else: + subset = I[i, D[i, :] < radius] + self.assertTrue(set(subset) == set(res)) + + def test_ivf_flat_empty(self): d = self.xq.shape[1] * 8 @@ -180,6 +221,37 @@ class TestBinaryIVF(unittest.TestCase): assert(np.all(Iivfflat == -1)) assert(np.all(Divfflat == 2147483647)) # NOTE(hoss): int32_t max + def test_ivf_reconstruction(self): + d = self.xq.shape[1] * 8 + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 8) + index.cp.min_points_per_centroid = 5 # quiet warning + index.nprobe = 4 + index.train(self.xt) + + index.add(self.xb) + index.set_direct_map_type(faiss.DirectMap.Array) + + for i in range(0, len(self.xb), 13): + np.testing.assert_array_equal( + index.reconstruct(i), + self.xb[i] + ) + + # try w/ hashtable + index = faiss.IndexBinaryIVF(quantizer, d, 8) + rs = np.random.RandomState(123) + ids = rs.choice(10000, size=len(self.xb), replace=False) + index.add_with_ids(self.xb, ids) + index.set_direct_map_type(faiss.DirectMap.Hashtable) + + for i in range(0, len(self.xb), 13): + np.testing.assert_array_equal( + index.reconstruct(int(ids[i])), + self.xb[i] + ) + + class TestHNSW(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -227,27 +299,6 @@ class TestHNSW(unittest.TestCase): self.assertTrue((Dref == Dbin).all()) -def compare_binary_result_lists(D1, I1, D2, I2): - """comparing result lists is difficult because there are many - ties. Here we sort by (distance, index) pairs and ignore the largest - distance of each result. 
Compatible result lists should pass this.""" - assert D1.shape == I1.shape == D2.shape == I2.shape - n, k = D1.shape - ndiff = (D1 != D2).sum() - assert ndiff == 0, '%d differences in distance matrix %s' % ( - ndiff, D1.shape) - - def normalize_DI(D, I): - norm = I.max() + 1.0 - Dr = D.astype('float64') + I / norm - # ignore -1s and elements on last column - Dr[I1 == -1] = 1e20 - Dr[D == D[:, -1:]] = 1e20 - Dr.sort(axis=1) - return Dr - ndiff = (normalize_DI(D1, I1) != normalize_DI(D2, I2)).sum() - assert ndiff == 0, '%d differences in normalized D matrix' % ndiff - class TestReplicasAndShards(unittest.TestCase): @@ -265,7 +316,7 @@ class TestReplicasAndShards(unittest.TestCase): nrep = 5 index = faiss.IndexBinaryReplicas() - for i in range(nrep): + for _i in range(nrep): sub_idx = faiss.IndexBinaryFlat(d) sub_idx.add(xb) index.addIndex(sub_idx) @@ -276,7 +327,7 @@ class TestReplicasAndShards(unittest.TestCase): self.assertTrue((Iref == I).all()) index2 = faiss.IndexBinaryReplicas() - for i in range(nrep): + for _i in range(nrep): sub_idx = faiss.IndexBinaryFlat(d) index2.addIndex(sub_idx) @@ -310,7 +361,7 @@ class TestReplicasAndShards(unittest.TestCase): compare_binary_result_lists(Dref, Iref, D, I) index2 = faiss.IndexBinaryShards(d) - for i in range(nrep): + for _i in range(nrep): sub_idx = faiss.IndexBinaryFlat(d) index2.add_shard(sub_idx) diff --git a/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py b/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py index 1293381b17..73d6c726d4 100644 --- a/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py +++ b/core/src/index/thirdparty/faiss/tests/test_index_binary_from_float.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. from __future__ import absolute_import, division, print_function + import numpy as np import unittest import faiss diff --git a/core/src/index/thirdparty/faiss/tests/test_index_composite.py b/core/src/index/thirdparty/faiss/tests/test_index_composite.py index 40b5daac8d..55230f9d9b 100644 --- a/core/src/index/thirdparty/faiss/tests/test_index_composite.py +++ b/core/src/index/thirdparty/faiss/tests/test_index_composite.py @@ -3,9 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 - """ more elaborate that test_index.py """ +from __future__ import absolute_import, division, print_function import numpy as np import unittest @@ -89,7 +88,7 @@ class TestRemove(unittest.TestCase): assert index.reconstruct(104)[0] == 1004 try: index.reconstruct(103) - except: + except RuntimeError: pass else: assert False, 'should have raised an exception' @@ -125,7 +124,7 @@ class TestRemove(unittest.TestCase): assert index.reconstruct(1004)[0] == 104 try: index.reconstruct(1003) - except: + except RuntimeError: pass else: assert False, 'should have raised an exception' @@ -141,7 +140,7 @@ class TestRemove(unittest.TestCase): assert index.reconstruct(1004)[0] == 104 try: index.reconstruct(1003) - except: + except RuntimeError: pass else: assert False, 'should have raised an exception' diff --git a/core/src/index/thirdparty/faiss/tests/test_io.py b/core/src/index/thirdparty/faiss/tests/test_io.py new file mode 100644 index 0000000000..7e3d6edf59 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tests/test_io.py @@ -0,0 +1,220 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +#!/usr/bin/env python3 + +import numpy as np +import unittest +import faiss +import tempfile +import os +import io +import sys +import warnings +from multiprocessing.dummy import Pool as ThreadPool + +from common import get_dataset, get_dataset_2 + + +class TestIOVariants(unittest.TestCase): + + def test_io_error(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + # should be fine + faiss.read_index(fname) + + # now damage file + data = open(fname, 'rb').read() + data = data[:int(len(data) / 2)] + open(fname, 'wb').write(data) + + # should make a nice readable exception that mentions the filename + try: + faiss.read_index(fname) + except RuntimeError as e: + if fname not in str(e): + raise + else: + raise + + finally: + if os.path.exists(fname): + os.unlink(fname) + + +class TestCallbacks(unittest.TestCase): + + def do_write_callback(self, bsz): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + + f = io.BytesIO() + # test with small block size + writer = faiss.PyCallbackIOWriter(f.write, 1234) + + if bsz > 0: + writer = faiss.BufferedIOWriter(writer, bsz) + + faiss.write_index(index, writer) + del writer # make sure all writes committed + + if sys.version_info[0] < 3: + buf = f.getvalue() + else: + buf = f.getbuffer() + + index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8')) + + self.assertEqual(index.d, index2.d) + self.assertTrue(np.all( + faiss.vector_to_array(index.xb) == faiss.vector_to_array(index2.xb) + )) + + # This is not a callable function: should raise an exception + writer = faiss.PyCallbackIOWriter("blabla") + self.assertRaises( + Exception, + faiss.write_index, index, writer + ) + + def test_buf_read(self): + x = np.random.uniform(size=20) + + _, fname = tempfile.mkstemp() + try: + x.tofile(fname) + + f = open(fname, 'rb') + reader = faiss.PyCallbackIOReader(f.read, 1234) + + bsz = 123 + reader = faiss.BufferedIOReader(reader, bsz) + + y = np.zeros_like(x) + print('nbytes=', y.nbytes) + reader(faiss.swig_ptr(y), y.nbytes, 1) + + np.testing.assert_array_equal(x, y) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def do_read_callback(self, bsz): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + f = open(fname, 'rb') + + reader = faiss.PyCallbackIOReader(f.read, 1234) + + if bsz > 0: + reader = faiss.BufferedIOReader(reader, bsz) + + index2 = faiss.read_index(reader) + + self.assertEqual(index.d, index2.d) + np.testing.assert_array_equal( + faiss.vector_to_array(index.xb), + faiss.vector_to_array(index2.xb) + ) + + # This is not a callable function: should raise an exception + reader = faiss.PyCallbackIOReader("blabla") + self.assertRaises( + Exception, + faiss.read_index, reader + ) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_write_callback(self): + self.do_write_callback(0) + + def test_write_buffer(self): + self.do_write_callback(123) + self.do_write_callback(2345) + + def test_read_callback(self): + self.do_read_callback(0) + + def test_read_callback_buffered(self): + self.do_read_callback(123) + self.do_read_callback(12345) +
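The tests above exercise the new PyCallbackIOWriter/PyCallbackIOReader wrappers added in swigfaiss.swig. Outside the test suite they let an index be (de)serialized through any Python callable; a minimal in-memory round trip mirroring the test pattern (sizes are illustrative):

```python
import io
import numpy as np
import faiss

index = faiss.IndexFlatL2(8)
index.add(np.random.rand(100, 8).astype('float32'))

buf = io.BytesIO()
writer = faiss.PyCallbackIOWriter(buf.write)
faiss.write_index(index, writer)
del writer  # make sure all writes are committed

buf.seek(0)
index2 = faiss.read_index(faiss.PyCallbackIOReader(buf.read))
assert index2.ntotal == index.ntotal
```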
+ def test_read_buffer(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + + _, fname = tempfile.mkstemp() + try: + faiss.write_index(index, fname) + + reader = faiss.BufferedIOReader( + faiss.FileIOReader(fname), 1234) + + index2 = faiss.read_index(reader) + + self.assertEqual(index.d, index2.d) + np.testing.assert_array_equal( + faiss.vector_to_array(index.xb), + faiss.vector_to_array(index2.xb) + ) + + finally: + if os.path.exists(fname): + os.unlink(fname) + + + def test_transfer_pipe(self): + """ transfer an index through a Unix pipe """ + + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + index = faiss.IndexFlatL2(d) + index.add(x) + Dref, Iref = index.search(x, 10) + + rf, wf = os.pipe() + + # start thread that will read the index back from the pipe + + def index_from_pipe(): + reader = faiss.PyCallbackIOReader(lambda size: os.read(rf, size)) + return faiss.read_index(reader) + + fut = ThreadPool(1).apply_async(index_from_pipe, ()) + + # write to pipe + writer = faiss.PyCallbackIOWriter(lambda b: os.write(wf, b)) + faiss.write_index(index, writer) + + index2 = fut.get() + + # closing is not really useful but it does not hurt + os.close(wf) + os.close(rf) + + Dnew, Inew = index2.search(x, 10) + + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_array_equal(Dref, Dnew) diff --git a/core/src/index/thirdparty/faiss/tests/test_ivflib.py b/core/src/index/thirdparty/faiss/tests/test_ivflib.py index f28ffc5318..0166013c08 100644 --- a/core/src/index/thirdparty/faiss/tests/test_ivflib.py +++ b/core/src/index/thirdparty/faiss/tests/test_ivflib.py @@ -3,11 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! 
/usr/bin/env python2 +from __future__ import absolute_import, division, print_function import unittest import faiss - +import numpy as np class TestIVFlib(unittest.TestCase): @@ -19,3 +19,60 @@ class TestIVFlib(unittest.TestCase): for method in methods: assert callable(getattr(faiss, method, None)) + + +def search_single_scan(index, xq, k, bs=128): + """performs a search so that the inverted lists are accessed + sequentially by blocks of size bs""" + + # handle pretransform + if isinstance(index, faiss.IndexPreTransform): + xq = index.apply_py(xq) + index = faiss.downcast_index(index.index) + + # coarse assignment + coarse_dis, assign = index.quantizer.search(xq, index.nprobe) + nlist = index.nlist + assign_buckets = assign // bs + nq = len(xq) + + rh = faiss.ResultHeap(nq, k) + index.parallel_mode |= index.PARALLEL_MODE_NO_HEAP_INIT + + for l0 in range(0, nlist, bs): + bucket_no = l0 // bs + skip_rows, skip_cols = np.where(assign_buckets != bucket_no) + sub_assign = assign.copy() + sub_assign[skip_rows, skip_cols] = -1 + + index.search_preassigned( + nq, faiss.swig_ptr(xq), k, + faiss.swig_ptr(sub_assign), faiss.swig_ptr(coarse_dis), + faiss.swig_ptr(rh.D), faiss.swig_ptr(rh.I), + False, None + ) + + rh.finalize() + + return rh.D, rh.I + + +class TestSequentialScan(unittest.TestCase): + + def test_sequential_scan(self): + d = 20 + index = faiss.index_factory(d, 'IVF100,SQ8') + + rs = np.random.RandomState(123) + xt = rs.rand(5000, d).astype('float32') + xb = rs.rand(10000, d).astype('float32') + index.train(xt) + index.add(xb) + k = 15 + xq = rs.rand(200, d).astype('float32') + + ref_D, ref_I = index.search(xq, k) + D, I = search_single_scan(index, xq, k, bs=10) + + assert np.all(D == ref_D) + assert np.all(I == ref_I) diff --git a/core/src/index/thirdparty/faiss/tests/test_merge.cpp b/core/src/index/thirdparty/faiss/tests/test_merge.cpp index b32e7e68e4..47af106149 100644 --- a/core/src/index/thirdparty/faiss/tests/test_merge.cpp +++ b/core/src/index/thirdparty/faiss/tests/test_merge.cpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/core/src/index/thirdparty/faiss/tests/test_meta_index.py b/core/src/index/thirdparty/faiss/tests/test_meta_index.py index d072516e00..137efc2aeb 100644 --- a/core/src/index/thirdparty/faiss/tests/test_meta_index.py +++ b/core/src/index/thirdparty/faiss/tests/test_meta_index.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 +from __future__ import absolute_import, division, print_function, unicode_literals # translation of test_meta_index.lua diff --git a/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py b/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py index 1aa5da0ba4..c96494dc1f 100644 --- a/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py +++ b/core/src/index/thirdparty/faiss/tests/test_omp_threads_py.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. 
from __future__ import absolute_import, division, print_function, unicode_literals + import faiss import unittest diff --git a/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp b/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp index 6d11a69b6c..214e925d15 100644 --- a/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp +++ b/core/src/index/thirdparty/faiss/tests/test_pq_encoding.cpp @@ -45,13 +45,13 @@ TEST(PQEncoderGeneric, encode) { // NOTE(hoss): Necessary scope to ensure trailing bits are flushed to mem. { - faiss::ProductQuantizer::PQEncoderGeneric encoder(codes.get(), nbits); + faiss::PQEncoderGeneric encoder(codes.get(), nbits); for (const auto& v : values) { encoder.encode(v & mask); } } - faiss::ProductQuantizer::PQDecoderGeneric decoder(codes.get(), nbits); + faiss::PQDecoderGeneric decoder(codes.get(), nbits); for (int i = 0; i < nsubcodes; ++i) { uint64_t v = decoder.decode(); EXPECT_EQ(values[i] & mask, v); @@ -66,12 +66,12 @@ TEST(PQEncoder8, encode) { const uint64_t mask = 0xFF; std::unique_ptr codes(new uint8_t[nsubcodes]); - faiss::ProductQuantizer::PQEncoder8 encoder(codes.get(), 8); + faiss::PQEncoder8 encoder(codes.get(), 8); for (const auto& v : values) { encoder.encode(v & mask); } - faiss::ProductQuantizer::PQDecoder8 decoder(codes.get(), 8); + faiss::PQDecoder8 decoder(codes.get(), 8); for (int i = 0; i < nsubcodes; ++i) { uint64_t v = decoder.decode(); EXPECT_EQ(values[i] & mask, v); @@ -85,12 +85,12 @@ TEST(PQEncoder16, encode) { const uint64_t mask = 0xFFFF; std::unique_ptr codes(new uint8_t[2 * nsubcodes]); - faiss::ProductQuantizer::PQEncoder16 encoder(codes.get(), 16); + faiss::PQEncoder16 encoder(codes.get(), 16); for (const auto& v : values) { encoder.encode(v & mask); } - faiss::ProductQuantizer::PQDecoder16 decoder(codes.get(), 16); + faiss::PQDecoder16 decoder(codes.get(), 16); for (int i = 0; i < nsubcodes; ++i) { uint64_t v = decoder.decode(); EXPECT_EQ(values[i] & mask, v); diff --git a/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py b/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py index 64cac3a1ef..35bf0f8eaa 100644 --- a/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py +++ b/core/src/index/thirdparty/faiss/tests/test_referenced_objects.py @@ -3,9 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -#! /usr/bin/env python2 - """make sure that the referenced objects are kept""" +from __future__ import absolute_import, division, print_function, unicode_literals import numpy as np import unittest @@ -78,7 +77,7 @@ class TestReferenced(unittest.TestCase): def test_shards(self): index = faiss.IndexShards(d) - for i in range(3): + for _i in range(3): sub_index = faiss.IndexFlatL2(d) sub_index.add(xb) index.add_shard(sub_index) diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/5-Multiple-GPUs.cpp b/core/src/index/thirdparty/faiss/tutorial/cpp/5-Multiple-GPUs.cpp new file mode 100644 index 0000000000..3152b731a1 --- /dev/null +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/5-Multiple-GPUs.cpp @@ -0,0 +1,100 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include + + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + float *xb = new float[d * nb]; + float *xq = new float[d * nq]; + + for(int i = 0; i < nb; i++) { + for(int j = 0; j < d; j++) + xb[d * i + j] = drand48(); + xb[d * i] += i / 1000.; + } + + for(int i = 0; i < nq; i++) { + for(int j = 0; j < d; j++) + xq[d * i + j] = drand48(); + xq[d * i] += i / 1000.; + } + + int ngpus = faiss::gpu::getNumDevices(); + + printf("Number of GPUs: %d\n", ngpus); + + std::vector res; + std::vector devs; + for(int i = 0; i < ngpus; i++) { + res.push_back(new faiss::gpu::StandardGpuResources); + devs.push_back(i); + } + + faiss::IndexFlatL2 cpu_index(d); + + faiss::Index *gpu_index = + faiss::gpu::index_cpu_to_gpu_multiple( + res, + devs, + &cpu_index + ); + + printf("is_trained = %s\n", gpu_index->is_trained ? "true" : "false"); + gpu_index->add(nb, xb); // add vectors to the index + printf("ntotal = %ld\n", gpu_index->ntotal); + + int k = 4; + + { // search xq + long *I = new long[k * nq]; + float *D = new float[k * nq]; + + gpu_index->search(nq, xq, k, D, I); + + // print results + printf("I (5 first results)=\n"); + for(int i = 0; i < 5; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + printf("I (5 last results)=\n"); + for(int i = nq - 5; i < nq; i++) { + for(int j = 0; j < k; j++) + printf("%5ld ", I[i * k + j]); + printf("\n"); + } + + delete [] I; + delete [] D; + } + + delete gpu_index; + + for(int i = 0; i < ngpus; i++) { + delete res[i]; + } + + delete [] xb; + delete [] xq; + + return 0; +} diff --git a/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile b/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile index 9746bf0458..472975f1d9 100644 --- a/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile +++ b/core/src/index/thirdparty/faiss/tutorial/cpp/Makefile @@ -6,7 +6,7 @@ -include ../../makefile.inc CPU_TARGETS = 1-Flat 2-IVFFlat 3-IVFPQ 9-BinaryFlat -GPU_TARGETS = 6-RUN 7-GPU 8-GPU +GPU_TARGETS = 4-GPU 5-Multiple-GPUs default: cpu diff --git a/core/src/index/thirdparty/faiss/utils/distances.cpp b/core/src/index/thirdparty/faiss/utils/distances.cpp index 50f16b53b0..419f679766 100644 --- a/core/src/index/thirdparty/faiss/utils/distances.cpp +++ b/core/src/index/thirdparty/faiss/utils/distances.cpp @@ -585,7 +585,7 @@ void knn_inner_product (const float * x, float_minheap_array_t * res, ConcurrentBitsetPtr bitset) { - if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + if (nx < distance_compute_blas_threshold) { knn_inner_product_sse (x, y, d, nx, ny, res, bitset); } else { knn_inner_product_blas (x, y, d, nx, ny, res, bitset); @@ -606,7 +606,7 @@ void knn_L2sqr (const float * x, float_maxheap_array_t * res, ConcurrentBitsetPtr bitset) { - if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + if (nx < distance_compute_blas_threshold) { knn_L2sqr_sse (x, y, d, nx, ny, res, bitset); } else { NopDistanceCorrection nop; @@ -883,7 +883,6 @@ static void range_search_sse (const float * x, float radius, RangeSearchResult *res) { - FAISS_THROW_IF_NOT (d % 4 == 0); #pragma omp parallel { @@ -933,7 +932,7 @@ void range_search_L2sqr ( RangeSearchResult *res) { - if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + if (nx < distance_compute_blas_threshold) { range_search_sse (x, y, d, nx, ny, radius, res); } else { range_search_blas (x, y, d, nx, ny, radius, res); @@ -948,7 +947,7 @@ 
void range_search_inner_product ( RangeSearchResult *res) { - if (d % 4 == 0 && nx < distance_compute_blas_threshold) { + if (nx < distance_compute_blas_threshold) { range_search_sse (x, y, d, nx, ny, radius, res); } else { range_search_blas (x, y, d, nx, ny, radius, res); diff --git a/core/src/index/thirdparty/faiss/utils/extra_distances.cpp b/core/src/index/thirdparty/faiss/utils/extra_distances.cpp index 091f5d0793..de03b013ac 100644 --- a/core/src/index/thirdparty/faiss/utils/extra_distances.cpp +++ b/core/src/index/thirdparty/faiss/utils/extra_distances.cpp @@ -149,11 +149,6 @@ struct VectorDistanceTanimoto { - - - - - namespace { template @@ -206,9 +201,8 @@ void knn_extra_metrics_template ( maxheap_heapify (k, simi, idxi); for (j = 0; j < ny; j++) { - if(!bitset || !bitset->test(j)){ + if (!bitset || !bitset->test(j)) { float disij = vd (x_i, y_j); - if (disij < simi[0]) { maxheap_pop (k, simi, idxi); maxheap_push (k, simi, idxi, disij, j); @@ -248,21 +242,6 @@ struct ExtraDistanceComputer : DistanceComputer { } }; - - - - - - - - - - - - - - - } // anonymous namespace void pairwise_extra_distances ( @@ -355,7 +334,6 @@ void knn_extra_metrics ( knn_extra_metrics_template(vd, x, y, nx, ny, res, bitset); break; } - default: FAISS_THROW_MSG ("metric type not implemented"); } diff --git a/core/src/index/thirdparty/faiss/utils/hamming.cpp b/core/src/index/thirdparty/faiss/utils/hamming.cpp index e6dca07950..2e714c34b4 100644 --- a/core/src/index/thirdparty/faiss/utils/hamming.cpp +++ b/core/src/index/thirdparty/faiss/utils/hamming.cpp @@ -28,16 +28,14 @@ #include #include -#include #include #include -#include -#include #include #include #include #include +#include static const size_t BLOCKSIZE_QUERY = 8192; static const size_t size_1M = 1 * 1024 * 1024; @@ -363,7 +361,7 @@ void hammings_knn_hc ( } } if (order) ha->reorder (); - } +} /* Return closest neighbors w.r.t Hamming distance, using max count. 
*/ template @@ -400,9 +398,9 @@ void hammings_knn_mc ( #pragma omp parallel for for (size_t i = 0; i < na; ++i) { for (size_t j = j0; j < j1; ++j) { - if(!bitset || !bitset->test(j)){ - cs[i].update_counter(b + j * bytes_per_code, j); - } + if (!bitset || !bitset->test(j)) { + cs[i].update_counter(b + j * bytes_per_code, j); + } } } } @@ -593,6 +591,30 @@ void bitvec_print (const uint8_t * b, size_t d) } +void bitvec_shuffle (size_t n, size_t da, size_t db, + const int *order, + const uint8_t *a, + uint8_t *b) +{ + for(size_t i = 0; i < db; i++) { + FAISS_THROW_IF_NOT (order[i] >= 0 && order[i] < da); + } + size_t lda = (da + 7) / 8; + size_t ldb = (db + 7) / 8; + +#pragma omp parallel for if(n > 10000) + for (size_t i = 0; i < n; i++) { + const uint8_t *ai = a + i * lda; + uint8_t *bi = b + i * ldb; + memset (bi, 0, ldb); + for(size_t i = 0; i < db; i++) { + int o = order[i]; + uint8_t the_bit = (ai[o >> 3] >> (o & 7)) & 1; + bi[i >> 3] |= the_bit << (i & 7); + } + } + +} @@ -636,6 +658,7 @@ void hammings_knn( { hammings_knn_hc(ha, a, b, nb, ncodes, order); } + void hammings_knn_hc ( int_maxheap_array_t * ha, const uint8_t * a, @@ -694,7 +717,7 @@ void hammings_knn_mc( break; case 8: // TODO(hoss): Write analog to hammings_knn_hc_1 - // hammings_knn_mc_1 (ha, C64(a), C64(b), nb, order, true, bitset); + // hammings_knn_hc_1 (ha, C64(a), C64(b), nb, order, true); hammings_knn_mc( 8, a, b, na, nb, k, distances, labels, bitset ); @@ -721,7 +744,66 @@ void hammings_knn_mc( } } } +template +static +void hamming_range_search_template ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult *res) +{ +#pragma omp parallel + { + RangeSearchPartialResult pres (res); + +#pragma omp for + for (size_t i = 0; i < na; i++) { + HammingComputer hc (a + i * code_size, code_size); + const uint8_t * yi = b; + RangeQueryResult & qres = pres.new_result (i); + + for (size_t j = 0; j < nb; j++) { + int dis = hc.hamming (yi); + if (dis < radius) { + qres.add(dis, j); + } + yi += code_size; + } + } + pres.finalize (); + } +} + +void hamming_range_search ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult *result) +{ + +#define HC(name) hamming_range_search_template (a, b, na, nb, radius, code_size, result) + + switch(code_size) { + case 4: HC(HammingComputer4); break; + case 8: HC(HammingComputer8); break; + case 16: HC(HammingComputer16); break; + case 32: HC(HammingComputer32); break; + default: + if (code_size % 8 == 0) { + HC(HammingComputerM8); + } else { + HC(HammingComputerDefault); + } + } +#undef HC +} diff --git a/core/src/index/thirdparty/faiss/utils/hamming.h b/core/src/index/thirdparty/faiss/utils/hamming.h index a607bdcf03..38bdd651f2 100644 --- a/core/src/index/thirdparty/faiss/utils/hamming.h +++ b/core/src/index/thirdparty/faiss/utils/hamming.h @@ -30,7 +30,6 @@ #include #include - /* The Hamming distance type */ typedef int32_t hamdis_t; @@ -40,6 +39,7 @@ namespace faiss { * General bit vector functions **************************************************/ +struct RangeSearchResult; void bitvec_print (const uint8_t * b, size_t d); @@ -66,6 +66,14 @@ void bitvecs2fvecs ( void fvec2bitvec (const float * x, uint8_t * b, size_t d); +/** Shuffle the bits from b(i, j) := a(i, order[j]) + */ +void bitvec_shuffle (size_t n, size_t da, size_t db, + const int *order, + const uint8_t *a, + uint8_t *b); + + /*********************************************** * Generic 
reader/writer for bit strings ***********************************************/ @@ -149,7 +157,8 @@ void hammings_knn ( const uint8_t * b, size_t nb, size_t ncodes, - int ordered); + int ordered, + ConcurrentBitsetPtr bitset = nullptr); /** Return the k smallest Hamming distances for a set of binary query vectors, * using counting max. @@ -174,6 +183,17 @@ void hammings_knn_mc ( int64_t *labels, ConcurrentBitsetPtr bitset = nullptr); +/** same as hammings_knn except we are doing a range search with radius */ +void hamming_range_search ( + const uint8_t * a, + const uint8_t * b, + size_t na, + size_t nb, + int radius, + size_t ncodes, + RangeSearchResult *result); + + /* Counting the number of matches or of cross-matches (without returning them) For use with function that assume pre-allocated memory */ void hamming_count_thres ( diff --git a/core/src/index/thirdparty/faiss/utils/jaccard-inl.h b/core/src/index/thirdparty/faiss/utils/jaccard-inl.h index fb41b6aca1..9aa7fbd924 100644 --- a/core/src/index/thirdparty/faiss/utils/jaccard-inl.h +++ b/core/src/index/thirdparty/faiss/utils/jaccard-inl.h @@ -1,7 +1,3 @@ -// -// Created by czr on 2019/12/19. -// - namespace faiss { struct JaccardComputer8 { diff --git a/core/src/index/thirdparty/faiss/utils/utils.cpp b/core/src/index/thirdparty/faiss/utils/utils.cpp index ad9791c6aa..8b973df9fd 100644 --- a/core/src/index/thirdparty/faiss/utils/utils.cpp +++ b/core/src/index/thirdparty/faiss/utils/utils.cpp @@ -214,102 +214,6 @@ void matrix_qr (int m, int n, float *a) } -/*************************************************************************** - * Kmeans subroutine - ***************************************************************************/ - -// a bit above machine epsilon for float16 - -#define EPS (1 / 1024.) - -/* For k-means, compute centroids given assignment of vectors to centroids */ -int km_update_centroids (const float * x, - float * centroids, - int64_t * assign, - size_t d, size_t k, size_t n, - size_t k_frozen) -{ - k -= k_frozen; - centroids += k_frozen * d; - - std::vector hassign(k); - memset (centroids, 0, sizeof(*centroids) * d * k); - -#pragma omp parallel - { - int nt = omp_get_num_threads(); - int rank = omp_get_thread_num(); - // this thread is taking care of centroids c0:c1 - size_t c0 = (k * rank) / nt; - size_t c1 = (k * (rank + 1)) / nt; - const float *xi = x; - size_t nacc = 0; - - for (size_t i = 0; i < n; i++) { - int64_t ci = assign[i]; - assert (ci >= 0 && ci < k + k_frozen); - ci -= k_frozen; - if (ci >= c0 && ci < c1) { - float * c = centroids + ci * d; - hassign[ci]++; - for (size_t j = 0; j < d; j++) - c[j] += xi[j]; - nacc++; - } - xi += d; - } - - } - -#pragma omp parallel for - for (size_t ci = 0; ci < k; ci++) { - float * c = centroids + ci * d; - float ni = (float) hassign[ci]; - if (ni != 0) { - for (size_t j = 0; j < d; j++) - c[j] /= ni; - } - } - - /* Take care of void clusters */ - size_t nsplit = 0; - RandomGenerator rng (1234); - for (size_t ci = 0; ci < k; ci++) { - if (hassign[ci] == 0) { /* need to redefine a centroid */ - size_t cj; - for (cj = 0; 1; cj = (cj + 1) % k) { - /* probability to pick this cluster for split */ - float p = (hassign[cj] - 1.0) / (float) (n - k); - float r = rng.rand_float (); - if (r < p) { - break; /* found our cluster to be split */ - } - } - memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d); - - /* small symmetric pertubation. 
Much better than */ - for (size_t j = 0; j < d; j++) { - if (j % 2 == 0) { - centroids[ci * d + j] *= 1 + EPS; - centroids[cj * d + j] *= 1 - EPS; - } else { - centroids[ci * d + j] *= 1 - EPS; - centroids[cj * d + j] *= 1 + EPS; - } - } - - /* assume even split of the cluster */ - hassign[ci] = hassign[cj] / 2; - hassign[cj] -= hassign[ci]; - nsplit++; - } - } - - return nsplit; -} - -#undef EPS - /*************************************************************************** diff --git a/core/src/index/thirdparty/faiss/utils/utils.h b/core/src/index/thirdparty/faiss/utils/utils.h index bba0fce000..c71cf95b5d 100644 --- a/core/src/index/thirdparty/faiss/utils/utils.h +++ b/core/src/index/thirdparty/faiss/utils/utils.h @@ -65,21 +65,6 @@ int fvec_madd_and_argmin (size_t n, const float *a, void reflection (const float * u, float * x, size_t n, size_t d, size_t nu); -/** For k-means: update stage. - * - * @param x training vectors, size n * d - * @param centroids centroid vectors, size k * d - * @param assign nearest centroid for each training vector, size n - * @param k_frozen do not update the k_frozen first centroids - * @return nb of spliting operations to fight empty clusters - */ -int km_update_centroids ( - const float * x, - float * centroids, - int64_t * assign, - size_t d, size_t k, size_t n, - size_t k_frozen); - /** compute the Q of the QR decomposition for m > n * @param a size n * m: input matrix and output Q */
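As a closing note on the Clustering API change this patch applies in test_build_blocks.py: with 1.6.3 the per-iteration objective moved from the obj vector to iteration_stats. A sketch of reading the final objective under the new API (mirrors the updated test):

```python
import numpy as np
import faiss

d, n, k = 32, 1000, 20
x = np.random.rand(n, d).astype('float32')

clus = faiss.Clustering(d, k)
index = faiss.IndexFlatL2(d)
clus.train(x, index)

# 1.6.0: obj = faiss.vector_to_array(clus.obj); final objective was obj[-1]
# 1.6.3: one ClusteringIterationStats per iteration, .obj holds the objective
stats = clus.iteration_stats
final_obj = stats.at(stats.size() - 1).obj
print('final k-means objective:', final_obj)
```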