upgrade faiss 1.6.3 (#2400)

* roll back to original faiss 1.6.0 Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * update to faiss_1.6.3 Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * patch all change to faiss 1.6.3 Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * faiss CPU version build pass Signed-off-by: yudong.cai <yudong.cai@zilliz.com> * faiss GPU version build pass Signed-off-by: yudong.cai <yudong.cai@zilliz.com>
2026-01-07 19:31:51 +08:00 · 2020-05-22 09:27:16 +08:00 · 2020-05-22 09:27:16 +08:00 · 386e58ce0d
commit 386e58ce0d
parent 5dcd68bb73
197 changed files with 8168 additions and 2206 deletions
--- a/core/src/index/thirdparty/faiss/Clustering.cpp
+++ b/core/src/index/thirdparty/faiss/Clustering.cpp
@ -10,11 +10,12 @@
 #include <faiss/Clustering.h>
 #include <faiss/impl/AuxIndexStructures.h>

-
 #include <cmath>
 #include <cstdio>
 #include <cstring>

+#include <omp.h>
+
 #include <faiss/utils/utils.h>
 #include <faiss/utils/random.h>
 #include <faiss/utils/distances.h>
@ -33,7 +34,8 @@ ClusteringParameters::ClusteringParameters ():
    frozen_centroids(false),
    min_points_per_centroid(39),
    max_points_per_centroid(256),
-    seed(1234)
+    seed(1234),
+    decode_block_size(32768)
 {}
 // 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k

@ -76,35 +78,233 @@ void Clustering::post_process_centroids ()
 }


-void Clustering::train (idx_t nx, const float *x_in, Index & index) {
+/// Float-vector entry point of k-means training: forwards to train_encoded()
+/// with a null codec, in which case the input bytes are read back as floats.
+void Clustering::train (idx_t nx, const float *x_in, Index & index,
+                        const float *weights) {
+    const uint8_t *raw_bytes = reinterpret_cast<const uint8_t *>(x_in);
+    const Index *no_codec = nullptr;
+    train_encoded (nx, raw_bytes, no_codec, index, weights);
+}
+
+
+namespace {
+
+using idx_t = Clustering::idx_t;
+
+/** Draw a random subset of k * max_points_per_centroid training vectors.
+ *
+ * @param clus         clustering object supplying k, max_points_per_centroid,
+ *                     seed and verbose
+ * @param nx           number of input vectors
+ * @param x            input vectors, nx rows of line_size bytes each
+ * @param line_size    size of one (possibly encoded) vector in bytes
+ * @param weights      per-vector weights, size nx (or NULL)
+ * @param x_out        receives a new[]-allocated copy of the sampled vectors
+ * @param weights_out  receives a new[]-allocated copy of the sampled weights,
+ *                     or nullptr when weights is NULL
+ * @return             number of sampled vectors
+ *
+ * The caller takes ownership of both output arrays (delete[]). The outputs
+ * are only written once every allocation and copy has succeeded, so nothing
+ * leaks if an allocation throws half-way through.
+ */
+idx_t subsample_training_set(
+          const Clustering &clus, idx_t nx, const uint8_t *x,
+          size_t line_size, const float * weights,
+          uint8_t **x_out,
+          float **weights_out
+)
+{
+    if (clus.verbose) {
+        printf("Sampling a subset of %ld / %ld for training\n",
+               clus.k * clus.max_points_per_centroid, nx);
+    }
+    std::vector<int> perm (nx);
+    rand_perm (perm.data (), nx, clus.seed);
+    nx = clus.k * clus.max_points_per_centroid;
+
+    // keep the buffers in owning pointers until both are fully built
+    std::unique_ptr<uint8_t []> x_new (new uint8_t [nx * line_size]);
+    for (idx_t i = 0; i < nx; i++) {
+        memcpy (x_new.get() + i * line_size, x + perm[i] * line_size,
+                line_size);
+    }
+
+    std::unique_ptr<float []> weights_new;
+    if (weights) {
+        weights_new.reset (new float[nx]);
+        for (idx_t i = 0; i < nx; i++) {
+            weights_new[i] = weights[perm[i]];
+        }
+    }
+
+    // hand ownership over to the caller
+    *x_out = x_new.release ();
+    *weights_out = weights_new.release ();
+    return nx;
+}
+
+/** compute centroids as (weighted) sum of training points
+ *
+ * @param d            dimension of the decoded vectors
+ * @param k            total number of centroids (frozen ones included)
+ * @param n            number of training vectors
+ * @param x            training vectors, size n * code_size (from codec)
+ * @param codec        how to decode the vectors (if NULL then cast to float*)
+ * @param weights      per-training vector weight, size n (or NULL)
+ * @param assign       nearest centroid for each training vector, size n
+ * @param k_frozen     do not update the k_frozen first centroids
+ * @param centroids    centroid vectors (output only), size k * d
+ * @param hassign      histogram of assignments per centroid (size k),
+ *                     should be 0 on input. Entry 0 corresponds to the
+ *                     first non-frozen centroid (it is indexed after
+ *                     k_frozen has been subtracted from the assignment).
+ *
+ * Centroids that receive no point are left at all zeros.
+ */
+
+void compute_centroids (size_t d, size_t k, size_t n,
+                       size_t k_frozen,
+                       const uint8_t * x, const Index *codec,
+                       const int64_t * assign,
+                       const float * weights,
+                       float * hassign,
+                       float * centroids)
+{
+    // from here on k and centroids only cover the non-frozen centroids
+    k -= k_frozen;
+    centroids += k_frozen * d;
+
+    memset (centroids, 0, sizeof(*centroids) * d * k);
+
+    // bytes per input vector: encoded size, or raw floats without codec
+    size_t line_size = codec ? codec->sa_code_size() : d * sizeof (float);
+
+    // Every thread scans all n points but only accumulates those assigned
+    // to its own slice of centroids, so no two threads write to the same
+    // centroid or hassign entry.
+#pragma omp parallel
+    {
+        int nt = omp_get_num_threads();
+        int rank = omp_get_thread_num();
+
+        // this thread is taking care of centroids c0:c1
+        size_t c0 = (k * rank) / nt;
+        size_t c1 = (k * (rank + 1)) / nt;
+        // per-thread scratch space for one decoded vector
+        std::vector<float> decode_buffer (d);
+
+        for (size_t i = 0; i < n; i++) {
+            int64_t ci = assign[i];
+            assert (ci >= 0 && ci < k + k_frozen);
+            ci -= k_frozen;
+            // NOTE(review): ci is negative for points assigned to a frozen
+            // centroid; the comparison against the unsigned c0/c1 appears
+            // to be what excludes them -- confirm this is intentional.
+            if (ci >= c0 && ci < c1)  {
+                float * c = centroids + ci * d;
+                const float * xi;
+                if (!codec) {
+                    xi = reinterpret_cast<const float*>(x + i * line_size);
+                } else {
+                    float *xif = decode_buffer.data();
+                    codec->sa_decode (1, x + i * line_size, xif);
+                    xi = xif;
+                }
+                if (weights) {
+                    float w = weights[i];
+                    hassign[ci] += w;
+                    for (size_t j = 0; j < d; j++) {
+                        c[j] += xi[j] * w;
+                    }
+                } else {
+                    hassign[ci] += 1.0;
+                    for (size_t j = 0; j < d; j++) {
+                        c[j] += xi[j];
+                    }
+                }
+            }
+        }
+
+    }
+
+    // turn the accumulated sums into means; empty clusters are skipped
+#pragma omp parallel for
+    for (size_t ci = 0; ci < k; ci++) {
+        if (hassign[ci] == 0) {
+            continue;
+        }
+        float norm = 1 / hassign[ci];
+        float * c = centroids + ci * d;
+        for (size_t j = 0; j < d; j++) {
+            c[j] *= norm;
+        }
+    }
+
+}
+
+// a bit above machine epsilon for float16
+#define EPS (1 / 1024.)
+
+/** Re-seed empty clusters by splitting populated ones.
+ *
+ * Each empty centroid is replaced by a copy of a randomly drawn donor
+ * centroid; both copies are then nudged in opposite directions so that
+ * they become two distinct clusters.
+ *
+ * @param d          dimension of the centroids
+ * @param k          total number of centroids (frozen ones included)
+ * @param n          number of training vectors
+ * @param k_frozen   the k_frozen first centroids are never touched
+ * @param hassign    per-centroid assignment mass, indexed from the first
+ *                   non-frozen centroid; updated to reflect the splits
+ * @param centroids  centroid vectors, size k * d, modified in place
+ * @return           nb of splitting operations (larger is worse)
+ */
+int split_clusters (size_t d, size_t k, size_t n,
+                    size_t k_frozen,
+                    float * hassign,
+                    float * centroids)
+{
+    // restrict the view to the non-frozen centroids
+    k -= k_frozen;
+    centroids += k_frozen * d;
+
+    size_t nsplit = 0;
+    RandomGenerator rng (1234);
+
+    for (size_t ci = 0; ci < k; ci++) {
+        if (hassign[ci] != 0) {
+            continue; /* populated cluster, nothing to redefine */
+        }
+
+        /* cycle over the clusters until one is drawn for the split */
+        size_t cj = 0;
+        while (true) {
+            /* probability to pick this cluster for split */
+            float p = (hassign[cj] - 1.0) / (float) (n - k);
+            float r = rng.rand_float ();
+            if (r < p) {
+                break;
+            }
+            cj = (cj + 1) % k;
+        }
+
+        float * empty_c = centroids + ci * d;
+        float * donor_c = centroids + cj * d;
+        memcpy (empty_c, donor_c, sizeof(*centroids) * d);
+
+        /* small symmetric perturbation, alternating sign per coordinate */
+        for (size_t j = 0; j < d; j++) {
+            if (j % 2 == 0) {
+                empty_c[j] *= 1 + EPS;
+                donor_c[j] *= 1 - EPS;
+            } else {
+                empty_c[j] *= 1 - EPS;
+                donor_c[j] *= 1 + EPS;
+            }
+        }
+
+        /* assume the donor's mass is shared evenly between both halves */
+        hassign[ci] = hassign[cj] / 2;
+        hassign[cj] -= hassign[ci];
+        nsplit++;
+    }
+
+    return nsplit;
+
+}
+
+
+
+};
+
+
+void Clustering::train_encoded (idx_t nx, const uint8_t *x_in,
+                                const Index * codec, Index & index,
+                                const float *weights) {
+
    FAISS_THROW_IF_NOT_FMT (nx >= k,
             "Number of training points (%ld) should be at least "
             "as large as number of clusters (%ld)", nx, k);

+    FAISS_THROW_IF_NOT_FMT ((!codec || codec->d == d),
+             "Codec dimension %d not the same as data dimension %d",
+             int(codec->d), int(d));
+
+    FAISS_THROW_IF_NOT_FMT (index.d == d,
+            "Index dimension %d not the same as data dimension %d",
+            int(index.d), int(d));
+
    double t0 = getmillisecs();

-    // yes it is the user's responsibility, but it may spare us some
-    // hard-to-debug reports.
-    for (size_t i = 0; i < nx * d; i++) {
-      FAISS_THROW_IF_NOT_MSG (finite (x_in[i]),
-                        "input contains NaN's or Inf's");
+    if (!codec) {
+        // Check for NaNs in input data. Normally it is the user's
+        // responsibility, but it may spare us some hard-to-debug
+        // reports.
+        const float *x = reinterpret_cast<const float *>(x_in);
+        for (size_t i = 0; i < nx * d; i++) {
+            FAISS_THROW_IF_NOT_MSG (finite (x[i]),
+                                    "input contains NaN's or Inf's");
+        }
    }

-    const float *x = x_in;
-    ScopeDeleter<float> del1;
+    const uint8_t *x = x_in;
+    std::unique_ptr<uint8_t []> del1;
+    std::unique_ptr<float []> del3;
+    size_t line_size = codec ? codec->sa_code_size() : sizeof(float) * d;

    if (nx > k * max_points_per_centroid) {
-        if (verbose)
-            printf("Sampling a subset of %ld / %ld for training\n",
-                   k * max_points_per_centroid, nx);
-        std::vector<int> perm (nx);
-        rand_perm (perm.data (), nx, seed);
-        nx = k * max_points_per_centroid;
-        float * x_new = new float [nx * d];
-        for (idx_t i = 0; i < nx; i++)
-            memcpy (x_new + i * d, x + perm[i] * d, sizeof(x_new[0]) * d);
-        x = x_new;
-        del1.set (x);
+        uint8_t *x_new;
+        float *weights_new;
+        nx = subsample_training_set (*this, nx, x, line_size, weights,
+                                &x_new, &weights_new);
+        del1.reset (x_new); x = x_new;
+        del3.reset (weights_new); weights = weights_new;
    } else if (nx < k * min_points_per_centroid) {
        fprintf (stderr,
                 "WARNING clustering %ld points to %ld centroids: "
@ -112,41 +312,53 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
                 nx, k, idx_t(k) * min_points_per_centroid);
    }

-
    if (nx == k) {
+        // this is a corner case, just copy training set to clusters
        if (verbose) {
            printf("Number of training points (%ld) same as number of "
                   "clusters, just copying\n", nx);
        }
-        // this is a corner case, just copy training set to clusters
        centroids.resize (d * k);
-        memcpy (centroids.data(), x_in, sizeof (*x_in) * d * k);
+        if (!codec) {
+            memcpy (centroids.data(), x_in, sizeof (float) * d * k);
+        } else {
+            codec->sa_decode (nx, x_in, centroids.data());
+        }
+
+        // one fake iteration...
+        ClusteringIterationStats stats = { 0.0, 0.0, 0.0, 1.0, 0 };
+        iteration_stats.push_back (stats);
+
        index.reset();
-        index.add(k, x_in);
+        index.add(k, centroids.data());
        return;
    }


-    if (verbose)
+    if (verbose) {
        printf("Clustering %d points in %ldD to %ld clusters, "
               "redo %d times, %d iterations\n",
               int(nx), d, k, nredo, niter);
+        if (codec) {
+            printf("Input data encoded in %ld bytes per vector\n",
+                   codec->sa_code_size ());
+        }
+    }

-    idx_t * assign = new idx_t[nx];
-    ScopeDeleter<idx_t> del (assign);
-    float * dis = new float[nx];
-    ScopeDeleter<float> del2(dis);
+    std::unique_ptr<idx_t []> assign(new idx_t[nx]);
+    std::unique_ptr<float []> dis(new float[nx]);

-    // for redo
+    // remember best iteration for redo
    float best_err = HUGE_VALF;
-    std::vector<float> best_obj;
+    std::vector<ClusteringIterationStats> best_obj;
    std::vector<float> best_centroids;

    // support input centroids

    FAISS_THROW_IF_NOT_MSG (
       centroids.size() % d == 0,
-       "size of provided input centroids not a multiple of dimension");
+       "size of provided input centroids not a multiple of dimension"
+    );

    size_t n_input_centroids = centroids.size() / d;

@ -162,23 +374,36 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    }
    t0 = getmillisecs();

+    // temporary buffer to decode vectors during the optimization
+    std::vector<float> decode_buffer
+        (codec ? d * decode_block_size : 0);
+
    for (int redo = 0; redo < nredo; redo++) {

        if (verbose && nredo > 1) {
            printf("Outer iteration %d / %d\n", redo, nredo);
        }

-        // initialize remaining centroids with random points from the dataset
+        // initialize (remaining) centroids with random points from the dataset
        centroids.resize (d * k);
        std::vector<int> perm (nx);

        rand_perm (perm.data(), nx, seed + 1 + redo * 15486557L);
-        for (int i = n_input_centroids; i < k ; i++)
-            memcpy (&centroids[i * d], x + perm[i] * d,
-                    d * sizeof (float));
+
+        if (!codec) {
+            for (int i = n_input_centroids; i < k ; i++) {
+                memcpy (&centroids[i * d], x + perm[i] * line_size, line_size);
+            }
+        } else {
+            for (int i = n_input_centroids; i < k ; i++) {
+                codec->sa_decode (1, x + perm[i] * line_size, &centroids[i * d]);
+            }
+        }

        post_process_centroids ();

+        // prepare the index
+
        if (index.ntotal != 0) {
            index.reset();
        }
@ -188,49 +413,89 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
        }

        index.add (k, centroids.data());
+
+        // k-means iterations
+
        float err = 0;
        for (int i = 0; i < niter; i++) {
            double t0s = getmillisecs();
-            index.assign(nx, x, assign, dis);
+
+            if (!codec) {
+                index.assign (nx, reinterpret_cast<const float *>(x),
+                              assign.get(), dis.get());
+            } else {
+                // search by blocks of decode_block_size vectors
+                size_t code_size = codec->sa_code_size ();
+                for (size_t i0 = 0; i0 < nx; i0 += decode_block_size) {
+                    size_t i1 = i0 + decode_block_size;
+                    if (i1 > nx) { i1 = nx; }
+                    codec->sa_decode (i1 - i0, x + code_size * i0,
+                                      decode_buffer.data ());
+                    index.search (i1 - i0, decode_buffer.data (), 1,
+                                  dis.get() + i0, assign.get() + i0);
+                }
+            }
+
            InterruptCallback::check();
            t_search_tot += getmillisecs() - t0s;

+            // accumulate error
            err = 0;
-            for (int j = 0; j < nx; j++)
+            for (int j = 0; j < nx; j++) {
                err += dis[j];
-            obj.push_back (err);
+            }

-            int nsplit = km_update_centroids (
-                  x, centroids.data(),
-                  assign, d, k, nx, frozen_centroids ? n_input_centroids : 0);
+            // update the centroids
+            std::vector<float> hassign (k);
+
+            size_t k_frozen = frozen_centroids ? n_input_centroids : 0;
+            compute_centroids (
+                  d, k, nx, k_frozen,
+                  x, codec, assign.get(), weights,
+                  hassign.data(), centroids.data()
+            );
+
+            int nsplit = split_clusters (
+                  d, k, nx, k_frozen,
+                  hassign.data(), centroids.data()
+            );
+
+            // collect statistics
+            ClusteringIterationStats stats =
+                { err, (getmillisecs() - t0) / 1000.0,
+                  t_search_tot / 1000, imbalance_factor (nx, k, assign.get()),
+                  nsplit };
+            iteration_stats.push_back(stats);

            if (verbose) {
                printf ("  Iteration %d (%.2f s, search %.2f s): "
                        "objective=%g imbalance=%.3f nsplit=%d       \r",
-                        i, (getmillisecs() - t0) / 1000.0,
-                        t_search_tot / 1000,
-                        err, imbalance_factor (nx, k, assign),
-                        nsplit);
+                        i, stats.time, stats.time_search, stats.obj,
+                        stats.imbalance_factor, nsplit);
                fflush (stdout);
            }

            post_process_centroids ();

-            index.reset ();
-            if (update_index)
-                index.train (k, centroids.data());
+            // add centroids to index for the next iteration (or for output)
+
+            index.reset ();
+            if (update_index) {
+                index.train (k, centroids.data());
+            }

-            assert (index.ntotal == 0);
            index.add (k, centroids.data());
            InterruptCallback::check ();
        }
+
        if (verbose) printf("\n");
        if (nredo > 1) {
            if (err < best_err) {
-                if (verbose)
+                if (verbose) {
                    printf ("Objective improved: keep new clusters\n");
+                }
                best_centroids = centroids;
-                best_obj = obj;
+                best_obj = iteration_stats;
                best_err = err;
            }
            index.reset ();
@ -238,7 +503,7 @@ void Clustering::train (idx_t nx, const float *x_in, Index & index) {
    }
    if (nredo > 1) {
        centroids = best_centroids;
-        obj = best_obj;
+        iteration_stats = best_obj;
        index.reset();
        index.add(k, best_centroids.data());
    }
@ -255,7 +520,7 @@ float kmeans_clustering (size_t d, size_t n, size_t k,
    IndexFlatL2 index (d);
    clus.train (n, x, index);
    memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
-    return clus.obj.back();
+    return clus.iteration_stats.back().obj;
 }

 } // namespace faiss
--- a/core/src/index/thirdparty/faiss/Clustering.h
+++ b/core/src/index/thirdparty/faiss/Clustering.h
@ -26,7 +26,7 @@ struct ClusteringParameters {
    bool verbose;
    bool spherical;     ///< do we want normalized centroids?
    bool int_centroids; ///< round centroids coordinates to integer
-    bool update_index;  ///< update index after each iteration?
+    bool update_index;  ///< re-train index after each iteration?
    bool frozen_centroids;  ///< use the centroids provided as input and do not change them during iterations

    int min_points_per_centroid; ///< otherwise you get a warning
@ -34,12 +34,23 @@ struct ClusteringParameters {

    int seed; ///< seed for the random number generator

+    size_t decode_block_size;  ///< how many vectors at a time to decode
+
    /// sets reasonable defaults
    ClusteringParameters ();
 };


-/** clustering based on assignment - centroid update iterations
+struct ClusteringIterationStats {
+    float obj;               ///< objective value (sum of distances reported by index)
+    double time;             ///< seconds elapsed since the iterations started (cumulative)
+    double time_search;      ///< seconds spent in search so far (accumulated total)
+    double imbalance_factor; ///< imbalance factor of the assignment at this iteration
+    int nsplit;              ///< number of cluster splits at this iteration
+};
+
+
+/** K-means clustering based on assignment - centroid update iterations
 *
 * The clustering is based on an Index object that assigns training
 * points to the centroids. Therefore, at each iteration the centroids
@ -50,27 +61,44 @@ struct ClusteringParameters {
 * centroids table it is not empty on input, it is also used for
 * initialization.
 *
- * To do several clusterings, just call train() several times on
- * different training sets, clearing the centroid table in between.
 */
 struct Clustering: ClusteringParameters {
    typedef Index::idx_t idx_t;
    size_t d;              ///< dimension of the vectors
    size_t k;              ///< nb of centroids

-    /// centroids (k * d)
+    /** centroids (k * d)
+     * if centroids are set on input to train, they will be used as initialization
+     */
    std::vector<float> centroids;

-    /// objective values (sum of distances reported by index) over
-    /// iterations
-    std::vector<float> obj;
+    /// stats at every iteration of clustering
+    std::vector<ClusteringIterationStats> iteration_stats;

-    /// the only mandatory parameters are k and d
    Clustering (int d, int k);
    Clustering (int d, int k, const ClusteringParameters &cp);

-    /// Index is used during the assignment stage
-    virtual void train (idx_t n, const float * x, faiss::Index & index);
+    /** run k-means training
+     *
+     * @param x          training vectors, size n * d
+     * @param index      index used for assignment
+     * @param x_weights  weight associated to each vector: NULL or size n
+     */
+    virtual void train (idx_t n, const float * x, faiss::Index & index,
+                        const float *x_weights = nullptr);
+
+
+    /** run with encoded vectors
+     *
+     * In addition to train()'s parameters, takes a codec as parameter
+     * to decode the input vectors.
+     *
+     * @param codec      codec used to decode the vectors (nullptr =
+     *                   vectors are in fact floats)
+     */
+    void train_encoded (idx_t nx, const uint8_t *x_in,
+                        const Index * codec, Index & index,
+                        const float *weights = nullptr);

    /// Post-process the centroids after each centroid update.
    /// includes optional L2 normalization and nearest integer rounding
--- a/core/src/index/thirdparty/faiss/DirectMap.cpp
+++ b/core/src/index/thirdparty/faiss/DirectMap.cpp
@ -0,0 +1,267 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#include <faiss/DirectMap.h>
+
+#include <cstdio>
+#include <cassert>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/AuxIndexStructures.h>
+
+namespace faiss {
+
+/// A new direct map starts in NoMap mode, with both containers empty.
+DirectMap::DirectMap(): type(NoMap)
+{}
+
+/// Switch to representation new_type and rebuild the map from invlists.
+/// Nothing happens when new_type is already the current type. In Array mode
+/// every stored id must lie in [0, ntotal), otherwise an exception is thrown.
+void DirectMap::set_type (Type new_type, const InvertedLists *invlists, size_t ntotal) {
+
+    FAISS_THROW_IF_NOT (new_type == NoMap || new_type == Array ||
+                        new_type == Hashtable);
+
+    if (type == new_type) {
+        return; // already in the requested representation
+    }
+
+    // forget whatever the previous representation stored
+    array.clear ();
+    hashtable.clear ();
+    type = new_type;
+
+    if (new_type == NoMap) {
+        return;
+    }
+
+    // only Array and Hashtable remain at this point
+    const bool as_array = (new_type == Array);
+    if (as_array) {
+        array.resize (ntotal, -1);
+    } else {
+        hashtable.reserve (ntotal);
+    }
+
+    // walk every inverted list and record where each id is stored
+    for (size_t key = 0; key < invlists->nlist; key++) {
+        size_t list_size = invlists->list_size (key);
+        InvertedLists::ScopedIds idlist (invlists, key);
+
+        for (long ofs = 0; ofs < list_size; ofs++) {
+            if (as_array) {
+                FAISS_THROW_IF_NOT_MSG (
+                       0 <= idlist [ofs] && idlist[ofs] < ntotal,
+                       "direct map supported only for seuquential ids");
+                array [idlist [ofs]] = lo_build(key, ofs);
+            } else {
+                hashtable [idlist [ofs]] = lo_build(key, ofs);
+            }
+        }
+    }
+}
+
+/// Drop all entries from both containers; the map type itself is unchanged.
+void DirectMap::clear()
+{
+    array.clear ();
+    hashtable.clear ();
+}
+
+
+/// Look up the LO-encoded (list number, offset) entry stored for id key.
+/// Throws when the key is out of range or unknown, when the Array slot
+/// holds a negative entry, or when the map is in NoMap mode.
+DirectMap::idx_t DirectMap::get (idx_t key) const
+{
+    switch (type) {
+    case Array: {
+        FAISS_THROW_IF_NOT_MSG (
+             key >= 0 && key < array.size(), "invalid key"
+        );
+        const idx_t lo = array[key];
+        FAISS_THROW_IF_NOT_MSG(lo >= 0, "-1 entry in direct_map");
+        return lo;
+    }
+    case Hashtable: {
+        const auto res = hashtable.find (key);
+        FAISS_THROW_IF_NOT_MSG (res != hashtable.end(), "key not found");
+        return res->second;
+    }
+    default:
+        FAISS_THROW_MSG ("direct map not initialized");
+    }
+}
+
+
+
+/// Record that vector id is stored at (list_no, offset).
+/// Array mode appends one slot per call (-1 when list_no is negative) and
+/// expects id to be the next sequential id; Hashtable mode only stores
+/// entries with a non-negative list_no; NoMap mode does nothing.
+void DirectMap::add_single_id (idx_t id, idx_t list_no, size_t offset)
+{
+    if (type == Array) {
+        assert (id == array.size());
+        idx_t entry = -1;
+        if (list_no >= 0) {
+            entry = lo_build (list_no, offset);
+        }
+        array.push_back (entry);
+    } else if (type == Hashtable && list_no >= 0) {
+        hashtable[id] = lo_build (list_no, offset);
+    }
+
+}
+
+/// Throws when explicit ids are supplied while the map is in Array mode.
+void DirectMap::check_can_add (const idx_t *ids) {
+    const bool array_with_ids = (type == Array) && ids;
+    if (!array_with_ids) {
+        return;
+    }
+    FAISS_THROW_MSG ("cannot have array direct map and add with ids");
+}
+
+/********************* DirectMapAdd implementation */
+
+
+/// Prepare the insertion of a batch of n entries into direct_map.
+///  - Array mode: explicit ids are rejected; the array grows by n slots
+///    pre-filled with -1 and ntotal remembers its previous size.
+///  - Hashtable mode: entries are staged in all_ofs (pre-filled with -1)
+///    and copied into the hashtable by the destructor.
+/// NOTE(review): ntotal is only assigned in the Array branch, yet the
+/// destructor reads it in Hashtable mode when xids is NULL -- confirm
+/// callers always pass xids with a Hashtable map.
+DirectMapAdd::DirectMapAdd (DirectMap &direct_map, size_t n, const idx_t *xids):
+    direct_map(direct_map), type(direct_map.type), n(n), xids(xids)
+{
+    if (type == DirectMap::Array)  {
+        FAISS_THROW_IF_NOT (xids == nullptr);
+        ntotal = direct_map.array.size();
+        direct_map.array.resize (ntotal + n, -1);
+    } else if (type == DirectMap::Hashtable) {
+        // can't parallel update hashtable so use temp array
+        all_ofs.resize (n, -1);
+    }
+}
+
+
+/// Register that the i-th vector of the batch landed at (list_no, ofs).
+/// Array mode writes straight into the pre-sized slot; Hashtable mode only
+/// stages the entry in all_ofs; NoMap mode does nothing.
+void DirectMapAdd::add (size_t i, idx_t list_no, size_t ofs)
+{
+    switch (type) {
+    case DirectMap::Array:
+        direct_map.array [ntotal + i] = lo_build (list_no, ofs);
+        break;
+    case DirectMap::Hashtable:
+        all_ofs [i] = lo_build (list_no, ofs);
+        break;
+    default:
+        break;
+    }
+}
+
+/// Flush the staged entries into the hashtable (Hashtable mode only; the
+/// Array map was already updated in place by add()).
+/// NOTE(review): when xids is NULL the id is derived from ntotal, which the
+/// constructor only sets in Array mode -- confirm Hashtable callers always
+/// provide xids.
+DirectMapAdd::~DirectMapAdd ()
+{
+    if (type != DirectMap::Hashtable) {
+        return;
+    }
+    // size_t index: n is a size_t, so an int counter would mix signedness
+    // and overflow for batches larger than INT_MAX
+    for (size_t i = 0; i < n; i++) {
+        idx_t id = xids ? xids[i] : ntotal + i;
+        direct_map.hashtable [id] = all_ofs [i];
+    }
+}
+
+/********************************************************/
+
+using ScopedCodes = InvertedLists::ScopedCodes;
+using ScopedIds = InvertedLists::ScopedIds;
+
+
+/** Remove from invlists every entry whose id is selected by sel.
+ *
+ * - NoMap: every inverted list is scanned; a selected entry is overwritten
+ *   with the current last entry of its list and the list is shrunk once
+ *   the scan is over.
+ * - Hashtable: only IDSelectorArray selectors are accepted; each listed id
+ *   is looked up in the hashtable, removed from it, and its slot in the
+ *   inverted list is filled with the last entry of that list.
+ * - any other map type: throws.
+ *
+ * @param sel       selector of the ids to remove
+ * @param invlists  inverted lists to update in place
+ * @return          number of entries removed
+ */
+size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists *invlists)
+{
+    size_t nlist = invlists->nlist;
+    size_t nremove = 0;
+
+    if (type == NoMap) {
+        // number of entries dropped from each list, filled by the scan
+        std::vector<idx_t> toremove(nlist);
+
+        // exhaustive scan of IVF
+#pragma omp parallel for
+        for (idx_t i = 0; i < nlist; i++) {
+            idx_t l0 = invlists->list_size (i), l = l0, j = 0;
+            ScopedIds idsi (invlists, i);
+            while (j < l) {
+                if (sel.is_member (idsi[j])) {
+                    // overwrite slot j with the last live entry; j is not
+                    // advanced so the moved entry gets tested as well
+                    l--;
+                    invlists->update_entry (
+                        i, j,
+                        invlists->get_single_id (i, l),
+                        ScopedCodes (invlists, i, l).get()
+                    );
+                } else {
+                    j++;
+                }
+            }
+            toremove[i] = l0 - l;
+        }
+        // this will not run well in parallel on ondisk because of
+        // possible shrinks
+        for (idx_t i = 0; i < nlist; i++) {
+            if (toremove[i] > 0) {
+                nremove += toremove[i];
+                invlists->resize(i, invlists->list_size(i) - toremove[i]);
+            }
+        }
+    } else if (type == Hashtable) {
+        const IDSelectorArray *sela =
+            dynamic_cast<const IDSelectorArray*>(&sel);
+        FAISS_THROW_IF_NOT_MSG (
+             sela,
+             "remove with hashtable works only with IDSelectorArray"
+        );
+
+        for (idx_t i = 0; i < sela->n; i++) {
+            idx_t id = sela->ids[i];
+            auto res = hashtable.find (id);
+            if (res != hashtable.end()) {
+                size_t list_no = lo_listno (res->second);
+                size_t offset = lo_offset (res->second);
+                idx_t last = invlists->list_size (list_no) - 1;
+                hashtable.erase (res);
+                if (offset < last) {
+                    idx_t last_id = invlists->get_single_id (list_no, last);
+                    invlists->update_entry (
+                        list_no, offset,
+                        last_id,
+                        ScopedCodes (invlists, list_no, last).get()
+                    );
+                    // update hash entry for last element, using the same
+                    // LO encoder as the rest of the direct map code
+                    hashtable [last_id] = lo_build (list_no, offset);
+                }
+                invlists->resize(list_no, last);
+                nremove++;
+            }
+        }
+
+    } else {
+        FAISS_THROW_MSG("remove not supported with this direct_map format");
+    }
+    return nremove;
+}
+
+/** Move n existing vectors to new inverted lists with new codes.
+ *
+ * Only supported in Array mode. For each vector the old entry is removed
+ * from its current list (the last entry of that list is moved into the
+ * freed slot and its direct-map entry fixed up), then the vector is
+ * appended to list assign[i] with code codes + i * code_size.
+ *
+ * @param invlists  inverted lists to update in place
+ * @param n         number of vectors to update (n <= 0 does nothing)
+ * @param ids       ids of the vectors to update, size n
+ * @param assign    destination list of each vector, size n
+ * @param codes     new codes, size n * invlists->code_size
+ */
+void DirectMap::update_codes (InvertedLists *invlists,
+                              int n, const idx_t *ids,
+                              const idx_t *assign,
+                              const uint8_t *codes)
+{
+    FAISS_THROW_IF_NOT (type == Array);
+
+    size_t code_size = invlists->code_size;
+
+    // signed counter: comparing a size_t against a negative n would turn
+    // it into a huge unsigned bound
+    for (int i = 0; i < n; i++) {
+        idx_t id = ids[i];
+        FAISS_THROW_IF_NOT_MSG (0 <= id && id < array.size(),
+                                "id to update out of range");
+        { // remove old one
+            idx_t dm = array [id];
+            int64_t ofs = lo_offset (dm);
+            int64_t il = lo_listno (dm);
+            size_t l = invlists->list_size (il);
+            if (ofs != l - 1) { // move l - 1 to ofs
+                int64_t id2 = invlists->get_single_id (il, l - 1);
+                array[id2] = lo_build (il, ofs);
+                // ScopedCodes releases the code once update_entry returns
+                invlists->update_entry (il, ofs, id2,
+                                        ScopedCodes (invlists, il, l - 1).get());
+            }
+            invlists->resize (il, l - 1);
+        }
+        { // insert new one
+            int64_t il = assign[i];
+            size_t l = invlists->list_size (il);
+            idx_t dm = lo_build (il, l);
+            array [id] = dm;
+            invlists->add_entry (il, id, codes + i * code_size);
+        }
+    }
+}
+
+
+}
--- a/core/src/index/thirdparty/faiss/DirectMap.h
+++ b/core/src/index/thirdparty/faiss/DirectMap.h
@ -0,0 +1,120 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#ifndef FAISS_DIRECT_MAP_H
+#define FAISS_DIRECT_MAP_H
+
+#include <faiss/InvertedLists.h>
+#include <unordered_map>
+
+
+namespace faiss {
+
+// When a list id and an offset within that list are packed together into a
+// single uint64, we call the result an LO (list-offset).
+
+/// Pack list_id into the upper 32 bits and offset into the lower bits.
+inline uint64_t lo_build (uint64_t list_id, uint64_t offset) {
+    const uint64_t upper = list_id << 32;
+    return upper | offset;
+}
+
+/// Extract the list number, i.e. the upper 32 bits of an LO value.
+inline uint64_t lo_listno (uint64_t lo) {
+    const uint64_t list_no = lo >> 32;
+    return list_no;
+}
+
+/// Extract the offset, i.e. the lower 32 bits of an LO value.
+inline uint64_t lo_offset (uint64_t lo) {
+    // truncating to 32 bits keeps exactly the low half
+    return static_cast<uint32_t> (lo);
+}
+
+/**
+ * Direct map: a way to map back from ids to inverted lists.
+ *
+ * Each entry is an LO value (list number and offset packed in one integer)
+ * telling where the vector with a given id is stored.
+ */
+struct DirectMap {
+    typedef Index::idx_t idx_t;
+
+    /// storage strategy of the map
+    enum Type {
+       NoMap = 0,     // default
+       Array = 1,     // sequential ids (only for add, no add_with_ids)
+       Hashtable = 2  // arbitrary ids
+    };
+    Type type;
+
+    /// map for direct access to the elements. Map ids to LO-encoded entries.
+    /// array is used in Array mode (indexed by id, -1 = no entry),
+    /// hashtable in Hashtable mode.
+    std::vector <idx_t> array;
+    std::unordered_map <idx_t, idx_t> hashtable;
+
+    /// starts in NoMap mode
+    DirectMap();
+
+    /// set type and initialize (rebuilds the map from invlists unless
+    /// new_type is already the current type)
+    void set_type (Type new_type, const InvertedLists *invlists, size_t ntotal);
+
+    /// get an entry (LO-encoded); throws if the id has no entry or if the
+    /// map is in NoMap mode
+    idx_t get (idx_t id) const;
+
+    /// for quick checks: true when no direct map is maintained
+    bool no () const {return type == NoMap; }
+
+    /**
+     * update the direct_map
+     */
+
+    /// throw if Array and ids is not NULL
+    void check_can_add (const idx_t *ids);
+
+    /// non thread-safe version
+    void add_single_id (idx_t id, idx_t list_no, size_t offset);
+
+    /// remove all entries (the type is left unchanged)
+    void clear();
+
+    /**
+     * operations on inverted lists that require translation with a DirectMap
+     */
+
+    /// remove ids from the InvertedLists, possibly using the direct map;
+    /// returns the number of removed entries
+    size_t remove_ids(const IDSelector& sel, InvertedLists *invlists);
+
+    /// update entries, using the direct map (Array mode only)
+    void update_codes (InvertedLists *invlists,
+                       int n, const idx_t *ids,
+                       const idx_t *list_nos,
+                       const uint8_t *codes);
+
+
+
+};
+
+/// Thread-safe way of updating the direct_map: construct it before a batch
+/// add, call add() for each vector of the batch, and let the destructor
+/// flush the staged entries (Hashtable mode).
+struct DirectMapAdd {
+
+    typedef Index::idx_t idx_t;
+
+    using Type = DirectMap::Type;
+
+    DirectMap &direct_map;   ///< map being updated
+    DirectMap::Type type;    ///< copy of direct_map.type taken at construction
+    size_t ntotal;           ///< size of direct_map.array before the batch
+                             ///< (only assigned in Array mode)
+    size_t n;                ///< number of vectors in the batch
+    const idx_t *xids;       ///< explicit ids of the batch, or NULL
+
+    /// staged LO entries, one per vector of the batch (Hashtable mode)
+    std::vector<idx_t> all_ofs;
+
+    DirectMapAdd (DirectMap &direct_map, size_t n, const idx_t *xids);
+
+    /// add vector i (with id xids[i]) at list_no and offset
+    void add (size_t i, idx_t list_no, size_t offset);
+
+    /// in Hashtable mode, copies the staged entries into the hashtable
+    ~DirectMapAdd ();
+};
+
+
+
+}
+
+
+#endif
--- a/core/src/index/thirdparty/faiss/IVFlib.cpp
+++ b/core/src/index/thirdparty/faiss/IVFlib.cpp
@ -13,6 +13,7 @@

 #include <faiss/IndexPreTransform.h>
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/MetaIndexes.h>
 #include <faiss/utils/utils.h>


@ -56,17 +57,35 @@ void check_compatible_for_merge (const Index * index0,

 }

-const IndexIVF * extract_index_ivf (const Index * index)
+const IndexIVF * try_extract_index_ivf (const Index * index)
 {
    if (auto *pt =
        dynamic_cast<const IndexPreTransform *>(index)) {
        index = pt->index;
    }

+    if (auto *idmap =
+        dynamic_cast<const IndexIDMap *>(index)) {
+        index = idmap->index;
+    }
+    if (auto *idmap =
+        dynamic_cast<const IndexIDMap2 *>(index)) {
+        index = idmap->index;
+    }
+
    auto *ivf = dynamic_cast<const IndexIVF *>(index);

-    FAISS_THROW_IF_NOT (ivf);
+    return ivf;
+}

+/// Non-const overload: forwards to the const version and strips the
+/// constness from the result (which is nullptr when no IndexIVF is found).
+IndexIVF * try_extract_index_ivf (Index * index) {
+    const Index *const_index = index;
+    return const_cast<IndexIVF*> (try_extract_index_ivf (const_index));
+}
+
+const IndexIVF * extract_index_ivf (const Index * index)
+{
+    const IndexIVF *ivf = try_extract_index_ivf (index);
+    FAISS_THROW_IF_NOT (ivf);
    return ivf;
 }

@ -74,6 +93,7 @@ IndexIVF * extract_index_ivf (Index * index) {
    return const_cast<IndexIVF*> (extract_index_ivf ((const Index*)(index)));
 }

+
 void merge_into(faiss::Index *index0, faiss::Index *index1, bool shift_ids) {

    check_compatible_for_merge (index0, index1);
@ -146,8 +166,8 @@ void search_and_return_centroids(faiss::Index *index,
            if (result_centroid_ids)
                result_centroid_ids[i] = -1;
        } else {
-            long list_no = label >> 32;
-            long list_index = label & 0xffffffff;
+            long list_no = lo_listno (label);
+            long list_index = lo_offset (label);
            if (result_centroid_ids)
                result_centroid_ids[i] = list_no;
            labels[i] = index_ivf->invlists->get_single_id(list_no, list_index);
--- a/core/src/index/thirdparty/faiss/IVFlib.h
+++ b/core/src/index/thirdparty/faiss/IVFlib.h
@ -35,6 +35,10 @@ void check_compatible_for_merge (const Index * index1,
 const IndexIVF * extract_index_ivf (const Index * index);
 IndexIVF * extract_index_ivf (Index * index);

+/// same as above but returns nullptr instead of throwing on failure
+const IndexIVF * try_extract_index_ivf (const Index * index);
+IndexIVF * try_extract_index_ivf (Index * index);
+
 /** Merge index1 into index0. Works on IndexIVF's and IndexIVF's
 *  embedded in a IndexPreTransform. On output, the index1 is empty.
 *
--- a/core/src/index/thirdparty/faiss/Index.cpp
+++ b/core/src/index/thirdparty/faiss/Index.cpp
@ -36,7 +36,7 @@ void Index::range_search (idx_t , const float *, float,
  FAISS_THROW_MSG ("range search not implemented");
 }

-void Index::assign (idx_t n, const float *x, idx_t *labels, float *distance)
+void Index::assign (idx_t n, const float* x, idx_t* labels, float* distance)
 {
  float *dis_inner = (distance == nullptr) ? new float[n] : distance;
  search (n, x, 1, dis_inner, labels);
@ -45,7 +45,10 @@ void Index::assign (idx_t n, const float *x, idx_t *labels, float *distance)
  }
 }

-void Index::add_with_ids(idx_t n, const float* x, const idx_t* xids) {
+void Index::add_with_ids(
+    idx_t /*n*/,
+    const float* /*x*/,
+    const idx_t* /*xids*/) {
  FAISS_THROW_MSG ("add_with_ids not implemented for this type of index");
 }

--- a/core/src/index/thirdparty/faiss/Index.h
+++ b/core/src/index/thirdparty/faiss/Index.h
@ -10,17 +10,16 @@
 #ifndef FAISS_INDEX_H
 #define FAISS_INDEX_H

-
+#include <faiss/MetricType.h>
+#include <faiss/utils/ConcurrentBitset.h>
 #include <cstdio>
 #include <typeinfo>
 #include <string>
 #include <sstream>

-#include <faiss/utils/ConcurrentBitset.h>
-
 #define FAISS_VERSION_MAJOR 1
 #define FAISS_VERSION_MINOR 6
-#define FAISS_VERSION_PATCH 0
+#define FAISS_VERSION_PATCH 3

 /**
 * @namespace faiss
@ -41,39 +40,15 @@

 namespace faiss {

-
-/// Some algorithms support both an inner product version and a L2 search version.
-enum MetricType {
-    METRIC_INNER_PRODUCT = 0,  ///< maximum inner product search
-    METRIC_L2 = 1,             ///< squared L2 search
-    METRIC_L1,                 ///< L1 (aka cityblock)
-    METRIC_Linf,               ///< infinity distance
-    METRIC_Lp,                 ///< L_p distance, p is given by metric_arg
-    METRIC_Jaccard,
-    METRIC_Tanimoto,
-    METRIC_Hamming,
-    METRIC_Substructure,       ///< Tversky case alpha = 0, beta = 1
-    METRIC_Superstructure,     ///< Tversky case alpha = 1, beta = 0
-
-    /// some additional metrics defined in scipy.spatial.distance
-    METRIC_Canberra = 20,
-    METRIC_BrayCurtis,
-    METRIC_JensenShannon,
-
-};
-
-
 /// Forward declarations see AuxIndexStructures.h
 struct IDSelector;
 struct RangeSearchResult;
 struct DistanceComputer;

-/** Abstract structure for an index
+/** Abstract structure for an index, supports adding vectors and searching them.
 *
- * Supports adding vertices and searching them.
- *
- * Currently only asymmetric queries are supported:
- * database-to-database queries are not implemented.
+ * All vectors provided at add or search time are 32-bit float arrays,
+ * although the internal representation may vary.
 */
 struct Index {
    using idx_t = int64_t;  ///< all indices are this type
@ -138,7 +113,8 @@ struct Index {
     * @param distances   output pairwise distances, size n*k
     * @param bitset      flags to check the validity of vectors
     */
-    virtual void search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels,
+    virtual void search (idx_t n, const float *x, idx_t k,
+                         float *distances, idx_t *labels,
                         ConcurrentBitsetPtr bitset = nullptr) const = 0;

    /** query n raw vectors from the index by ids.
@ -162,8 +138,8 @@ struct Index {
     * @param distances   output pairwise distances, size n*k
     * @param bitset      flags to check the validity of vectors
     */
-    virtual void search_by_id (idx_t n, const idx_t *xid, idx_t k, float *distances, idx_t *labels,
-                               ConcurrentBitsetPtr bitset = nullptr);
+     virtual void search_by_id (idx_t n, const idx_t *xid, idx_t k, float *distances, idx_t *labels,
+                                ConcurrentBitsetPtr bitset = nullptr);

    /** query n vectors of dimension d to the index.
     *
@ -185,7 +161,7 @@ struct Index {
     * @param x           input vectors to search, size n * d
     * @param labels      output labels of the NNs, size n
     */
-    virtual void assign (idx_t n, const float *x, idx_t *labels, float *distance = nullptr);
+    virtual void assign (idx_t n, const float* x, idx_t* labels, float* distance = nullptr);

    /// removes all elements from the database.
    virtual void reset() = 0;
--- a/core/src/index/thirdparty/faiss/Index2Layer.cpp
+++ b/core/src/index/thirdparty/faiss/Index2Layer.cpp
@ -42,7 +42,6 @@

 namespace faiss {

-using idx_t = Index::idx_t;

 /*************************************
 * Index2Layer implementation
@ -167,7 +166,7 @@ void Index2Layer::search(
    idx_t /*k*/,
    float* /*distances*/,
    idx_t* /*labels*/,
-    ConcurrentBitsetPtr bitset) const {
+    ConcurrentBitsetPtr) const {
  FAISS_THROW_MSG("not implemented");
 }

--- a/core/src/index/thirdparty/faiss/IndexBinary.h
+++ b/core/src/index/thirdparty/faiss/IndexBinary.h
@ -95,10 +95,11 @@ struct IndexBinary {
   * @param distances   output pairwise distances, size n*k
   * @param bitset      flags to check the validity of vectors
   */
-  virtual void search (idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels,
-                       ConcurrentBitsetPtr bitset = nullptr) const = 0;
+  virtual void search(idx_t n, const uint8_t *x, idx_t k,
+                      int32_t *distances, idx_t *labels,
+                      ConcurrentBitsetPtr bitset = nullptr) const = 0;

-  /** query n raw vectors from the index by ids.
+  /** Query n raw vectors from the index by ids.
   *
   * return n raw vectors.
   *
@ -122,12 +123,15 @@ struct IndexBinary {
  virtual void search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels,
                             ConcurrentBitsetPtr bitset = nullptr);

-
-    /** Query n vectors of dimension d to the index.
+  /** Query n vectors of dimension d to the index.
   *
-   * return all vectors with distance < radius. Note that many
-   * indexes do not implement the range_search (only the k-NN search
-   * is mandatory).
+   * return all vectors with distance < radius. Note that many indexes
+   * do not implement the range_search (only the k-NN search is
+   * mandatory). The distances are converted to float to reuse the
+   * RangeSearchResult structure, but they are integer. By convention,
+   * only distances < radius (strict comparison) are returned,
+   * ie. radius = 0 does not return any result and 1 returns only
+   * exact same vectors.
   *
   * @param x           input vectors to search, size n * d / 8
   * @param radius      search radius
--- a/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp
+++ b/core/src/index/thirdparty/faiss/IndexBinaryFlat.cpp
@ -39,7 +39,8 @@ void IndexBinaryFlat::reset() {
 }

 void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k,
-                             int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const {
+                             int32_t *distances, idx_t *labels,
+                             ConcurrentBitsetPtr bitset) const {
    const idx_t block_size = query_batch_size;
    if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) {
        float *D = reinterpret_cast<float*>(distances);
@ -63,7 +64,6 @@ void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k,
                D[i] = -log2(1-D[i]);
            }
        }
-
    } else if (metric_type == METRIC_Substructure || metric_type == METRIC_Superstructure) {
        float *D = reinterpret_cast<float*>(distances);
        for (idx_t s = 0; s < n; s += block_size) {
@ -76,7 +76,6 @@ void IndexBinaryFlat::search(idx_t n, const uint8_t *x, idx_t k,
            binary_distence_knn_mc(metric_type, x + s * code_size, xb.data(), nn, ntotal, k, code_size,
                    D + s * k, labels + s * k, bitset);
        }
-
    } else {
        for (idx_t s = 0; s < n; s += block_size) {
            idx_t nn = block_size;
@ -123,5 +122,11 @@ void IndexBinaryFlat::reconstruct(idx_t key, uint8_t *recons) const {
  memcpy(recons, &(xb[code_size * key]), sizeof(*recons) * code_size);
 }

+/** Range search over the flat code array.
+ *
+ * Forwards the n queries in x, the ntotal stored codes in xb, the radius
+ * and the code size to hamming_range_search, which fills `result`.
+ *
+ * NOTE: the bitset argument is accepted for interface compatibility only.
+ * It is not passed to hamming_range_search, so no stored vector is
+ * filtered out by it.
+ */
+void IndexBinaryFlat::range_search(idx_t n, const uint8_t *x, int radius,
+                                   RangeSearchResult *result,
+                                   ConcurrentBitsetPtr /*bitset*/) const
+{
+    hamming_range_search (x, xb.data(), n, ntotal, radius, code_size, result);
+}

 }  // namespace faiss
--- a/core/src/index/thirdparty/faiss/IndexBinaryFlat.h
+++ b/core/src/index/thirdparty/faiss/IndexBinaryFlat.h
@ -37,8 +37,13 @@ struct IndexBinaryFlat : IndexBinary {

  void reset() override;

-  void search (idx_t n, const uint8_t *x, idx_t k,
-               int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+  void search(idx_t n, const uint8_t *x, idx_t k,
+              int32_t *distances, idx_t *labels,
+              ConcurrentBitsetPtr bitset = nullptr) const override;
+
+  void range_search(idx_t n, const uint8_t *x, int radius,
+                   RangeSearchResult *result,
+                   ConcurrentBitsetPtr bitset = nullptr) const override;

  void reconstruct(idx_t key, uint8_t *recons) const override;

--- a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp
+++ b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.cpp
@ -50,7 +50,8 @@ void IndexBinaryFromFloat::reset() {
 }

 void IndexBinaryFromFloat::search(idx_t n, const uint8_t *x, idx_t k,
-                                  int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const {
+                                  int32_t *distances, idx_t *labels,
+                                  ConcurrentBitsetPtr bitset) const {
  constexpr idx_t bs = 32768;
  std::unique_ptr<float[]> xf(new float[bs * d]);
  std::unique_ptr<float[]> df(new float[bs * k]);
--- a/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h
+++ b/core/src/index/thirdparty/faiss/IndexBinaryFromFloat.h
@ -41,7 +41,8 @@ struct IndexBinaryFromFloat : IndexBinary {
  void reset() override;

  void search(idx_t n, const uint8_t *x, idx_t k,
-              int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+              int32_t *distances, idx_t *labels,
+              ConcurrentBitsetPtr bitset = nullptr) const override;

  void train(idx_t n, const uint8_t *x) override;
 };
--- a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp
+++ b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.cpp
@ -196,7 +196,8 @@ void IndexBinaryHNSW::train(idx_t n, const uint8_t *x)
 }

 void IndexBinaryHNSW::search(idx_t n, const uint8_t *x, idx_t k,
-                             int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const
+                             int32_t *distances, idx_t *labels,
+                             ConcurrentBitsetPtr bitset) const
 {
 #pragma omp parallel
  {
--- a/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h
+++ b/core/src/index/thirdparty/faiss/IndexBinaryHNSW.h
@ -45,7 +45,8 @@ struct IndexBinaryHNSW : IndexBinary {

  /// entry point for search
  void search(idx_t n, const uint8_t *x, idx_t k,
-              int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+              int32_t *distances, idx_t *labels,
+              ConcurrentBitsetPtr bitset = nullptr) const override;

  void reconstruct(idx_t key, uint8_t* recons) const override;

--- a/core/src/index/thirdparty/faiss/IndexBinaryHash.cpp
+++ b/core/src/index/thirdparty/faiss/IndexBinaryHash.cpp
@ -0,0 +1,496 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved
+// -*- c++ -*-
+
+#include <faiss/IndexBinaryHash.h>
+
+#include <cstdio>
+#include <cstring>
+#include <memory>
+
+#include <faiss/utils/hamming.h>
+#include <faiss/utils/utils.h>
+
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
+
+
+namespace faiss {
+
+/// Append one entry to the list: the id goes to `ids` and the code_size
+/// bytes starting at `code` are appended to `vecs`.
+void IndexBinaryHash::InvertedList::add (
+        idx_t id, size_t code_size, const uint8_t *code)
+{
+    const uint8_t *code_end = code + code_size;
+
+    ids.push_back(id);
+    vecs.insert(vecs.end(), code, code_end);
+}
+
+/// Build an empty index for vectors of d bits whose hash value is made of
+/// b bits. nflip starts at 0 and the index is flagged as trained.
+IndexBinaryHash::IndexBinaryHash(int d, int b)
+    : IndexBinary(d),
+      b(b),
+      nflip(0)
+{
+    is_trained = true;
+}
+
+/// Default constructor: b and nflip are 0, the index is flagged as trained.
+IndexBinaryHash::IndexBinaryHash()
+    : b(0),
+      nflip(0)
+{
+    is_trained = true;
+}
+
+/// Remove every stored vector: zero the vector count and drop all the
+/// inverted lists.
+void IndexBinaryHash::reset()
+{
+    ntotal = 0;
+    invlists.clear();
+}
+
+
+/// Add n vectors without explicit ids; add_with_ids then numbers them
+/// ntotal, ntotal + 1, ...
+void IndexBinaryHash::add(idx_t n, const uint8_t *x)
+{
+    const idx_t *no_ids = nullptr;
+    add_with_ids(n, x, no_ids);
+}
+
+/** Add n vectors to the hash table.
+ *
+ * @param n     nb of vectors to add
+ * @param x     codes of the vectors, n * code_size bytes
+ * @param xids  ids to store, or nullptr to use ntotal + i for vector i
+ *
+ * Each vector goes to the inverted list selected by the b lowest bits of
+ * the first 64-bit word of its code.
+ */
+void IndexBinaryHash::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids)
+{
+    uint64_t mask = ((uint64_t)1 << b) - 1;
+    // The hash word is loaded with memcpy, bounded by code_size: a direct
+    // uint64_t dereference is an unaligned access and reads past the end of
+    // the last vector when code_size < 8.
+    const size_t cs = code_size;
+    const size_t nread = cs < sizeof(uint64_t) ? cs : sizeof(uint64_t);
+    // simplistic add function. Cannot really be parallelized.
+
+    for (idx_t i = 0; i < n; i++) {
+        idx_t id = xids ? xids[i] : ntotal + i;
+        const uint8_t * xi = x + i * code_size;
+        uint64_t word = 0;
+        memcpy (&word, xi, nread);
+        idx_t hash = word & mask;
+        invlists[hash].add(id, code_size, xi);
+    }
+    ntotal += n;
+}
+
+namespace {
+
+
+/** Enumerate all bit vectors of size nbit with up to maxflip 1s
+ * test in P127257851 P127258235
+ *
+ * Usage (see the do { ... } while (fe.next()) loops in this file): the
+ * current pattern is read from x, which is 0 right after construction;
+ * next() advances x to the following pattern and returns false once the
+ * patterns with maxflip bits set are exhausted.
+ *
+ * Patterns come by increasing number of set bits (nflip). For a given
+ * nflip, x starts with the set bits at positions nbit - nflip .. nbit - 1
+ * and the round ends when x equals mask (the nflip lowest bits set).
+ */
+struct FlipEnumerator {
+    int nbit, nflip, maxflip;
+    uint64_t mask, x;
+
+    FlipEnumerator (int nbit, int maxflip): nbit(nbit), maxflip(maxflip) {
+        nflip = 0;
+        mask = 0;
+        x = 0;
+    }
+
+    bool next() {
+        if (x == mask) {  // last pattern of the current nflip (also the initial state)
+            if (nflip == maxflip) {
+                return false;
+            }
+            // increase Hamming radius
+            nflip++;
+            mask = (((uint64_t)1 << nflip) - 1);
+            x = mask << (nbit - nflip);
+            return true;
+        }
+
+        int i = __builtin_ctzll(x);  // index of the lowest set bit
+
+        if (i > 0) {
+            x ^= (uint64_t)3 << (i - 1);  // move the lowest set bit one position down
+        } else {
+            // nb of LSB 1s
+            int n1 = __builtin_ctzll(~x);
+            // clear them
+            x &= ((uint64_t)(-1) << n1);
+            int n2 = __builtin_ctzll(x);  // lowest set bit that remains
+            x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1);  // clear bit n2, set the n1 + 1 bits just below it
+        }
+        return true;
+    }
+
+};
+
+using idx_t = Index::idx_t;
+
+
+/// Result collector for range search: forwards to the per-query result
+/// only the candidates whose distance is strictly below the radius.
+struct RangeSearchResults {
+    int radius;
+    RangeQueryResult &qres;
+
+    inline void add (float dis, idx_t id) {
+        if (!(dis < radius)) {
+            return;
+        }
+        qres.add (dis, id);
+    }
+
+};
+
+/// Result collector for k-NN search: keeps the k best candidates in a
+/// max-heap stored in the caller-provided arrays.
+struct KnnSearchResults {
+    // heap params
+    idx_t k;
+    int32_t * heap_sim;
+    idx_t * heap_ids;
+
+    using C = CMax<int, idx_t>;
+
+    /// replace the heap top by (dis, id) when dis is strictly smaller
+    inline void add (float dis, idx_t id) {
+        if (!(dis < heap_sim[0])) {
+            return;
+        }
+        heap_pop<C> (k, heap_sim, heap_ids);
+        heap_push<C> (k, heap_sim, heap_ids, dis, id);
+    }
+
+};
+
+/** Run one query against an IndexBinaryHash.
+ *
+ * The query hash is the b lowest bits of the first 64-bit word of q. Every
+ * hash value that differs from it by at most index.nflip bits is looked up
+ * and the vectors of the matching inverted lists are passed to res.add.
+ *
+ * @param n0     incremented for each visited list that exists but is empty
+ *               (hash values without any list are not counted)
+ * @param nlist  incremented for each non-empty list scanned
+ * @param ndis   incremented by the nb of distances computed
+ */
+template<class HammingComputer, class SearchResults>
+void
+search_single_query_template(const IndexBinaryHash & index, const uint8_t *q,
+                    SearchResults &res,
+                    size_t &n0, size_t &nlist, size_t &ndis)
+{
+    size_t code_size = index.code_size;
+    uint64_t mask = ((uint64_t)1 << index.b) - 1;
+    // Bounded memcpy instead of dereferencing q as uint64_t*: q may be
+    // unaligned, and shorter than 8 bytes when code_size < 8.
+    uint64_t qword = 0;
+    memcpy (&qword, q,
+            code_size < sizeof(qword) ? code_size : sizeof(qword));
+    uint64_t qhash = qword & mask;
+    HammingComputer hc (q, code_size);
+    FlipEnumerator fe(index.b, index.nflip);
+
+    // loop over neighbors that are at most at nflip bits
+    do {
+        uint64_t hash = qhash ^ fe.x;
+        auto it = index.invlists.find (hash);
+
+        if (it == index.invlists.end()) {
+            continue;
+        }
+
+        const IndexBinaryHash::InvertedList &il = it->second;
+
+        size_t nv = il.ids.size();
+
+        if (nv == 0) {
+            n0++;
+        } else {
+            const uint8_t *codes = il.vecs.data();
+            for (size_t i = 0; i < nv; i++) {
+                int dis = hc.hamming (codes);
+                res.add(dis, il.ids[i]);
+                codes += code_size;
+            }
+            ndis += nv;
+            nlist++;
+        }
+    } while(fe.next());
+}
+
+/// Pick the HammingComputer specialization matching index.code_size and
+/// run search_single_query_template with it.
+template<class SearchResults>
+void
+search_single_query(const IndexBinaryHash & index, const uint8_t *q,
+                    SearchResults &res,
+                    size_t &n0, size_t &nlist, size_t &ndis)
+{
+    switch(index.code_size) {
+    case 4:
+        search_single_query_template<HammingComputer4>(
+            index, q, res, n0, nlist, ndis);
+        break;
+    case 8:
+        search_single_query_template<HammingComputer8>(
+            index, q, res, n0, nlist, ndis);
+        break;
+    case 16:
+        search_single_query_template<HammingComputer16>(
+            index, q, res, n0, nlist, ndis);
+        break;
+    case 20:
+        search_single_query_template<HammingComputer20>(
+            index, q, res, n0, nlist, ndis);
+        break;
+    case 32:
+        search_single_query_template<HammingComputer32>(
+            index, q, res, n0, nlist, ndis);
+        break;
+    default:
+        // generic fallbacks: multiple-of-8 sizes first, anything else last
+        if (index.code_size % 8 == 0) {
+            search_single_query_template<HammingComputerM8>(
+                index, q, res, n0, nlist, ndis);
+        } else {
+            search_single_query_template<HammingComputerDefault>(
+                index, q, res, n0, nlist, ndis);
+        }
+    }
+}
+
+
+} // anonymous namespace
+
+
+
+/** Range search: for each of the n queries in x, report the stored vectors
+ * found in the visited hash buckets whose distance is < radius.
+ *
+ * The bitset argument is not used by this implementation. The global
+ * indexBinaryHash_stats counters are updated after the parallel section.
+ */
+void IndexBinaryHash::range_search(idx_t n, const uint8_t *x, int radius,
+                                   RangeSearchResult *result,
+                                   ConcurrentBitsetPtr bitset) const
+{
+
+    // per-call counters, summed over the threads by the OpenMP reduction
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
+    {
+        RangeSearchPartialResult partial (result);
+
+#pragma omp for
+        for (size_t qno = 0; qno < n; qno++) { // loop queries
+            RangeQueryResult & query_result = partial.new_result (qno);
+            RangeSearchResults collector = {radius, query_result};
+            const uint8_t *query = x + qno * code_size;
+
+            search_single_query (*this, query, collector, n0, nlist, ndis);
+
+        }
+        partial.finalize ();
+    }
+
+    IndexBinaryHashStats & stats = indexBinaryHash_stats;
+    stats.nq += n;
+    stats.n0 += n0;
+    stats.nlist += nlist;
+    stats.ndis += ndis;
+}
+
+/** k-NN search: for each of the n queries in x, keep the k closest vectors
+ * found in the visited hash buckets.
+ *
+ * Row i of distances / labels (k entries each) is used as the heap of
+ * query i. The bitset argument is not used by this implementation. The
+ * global indexBinaryHash_stats counters are updated after the loop.
+ */
+void IndexBinaryHash::search(idx_t n, const uint8_t *x, idx_t k,
+                             int32_t *distances, idx_t *labels,
+                             ConcurrentBitsetPtr bitset) const
+{
+
+    using HeapForL2 = CMax<int32_t, idx_t>;
+    // per-call counters, summed over the threads by the OpenMP reduction
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
+    for (size_t qno = 0; qno < n; qno++) {
+        int32_t * heap_dis = distances + k * qno;
+        idx_t * heap_ids = labels + k * qno;
+        const uint8_t *query = x + qno * code_size;
+
+        heap_heapify<HeapForL2> (k, heap_dis, heap_ids);
+        KnnSearchResults collector = {k, heap_dis, heap_ids};
+
+        search_single_query (*this, query, collector, n0, nlist, ndis);
+
+    }
+
+    IndexBinaryHashStats & stats = indexBinaryHash_stats;
+    stats.nq += n;
+    stats.n0 += n0;
+    stats.nlist += nlist;
+    stats.ndis += ndis;
+}
+
+/// Number of entries (distinct hash values) currently in the hash table.
+size_t IndexBinaryHash::hashtable_size() const
+{
+    const size_t n_entries = invlists.size();
+    return n_entries;
+}
+
+
+/** Print the hash table to stdout, one line per entry:
+ *  "<hash>: [<id> <id> ... ]".
+ *
+ * Values are printed through long long / %lld because idx_t is int64_t,
+ * which is not `long` on every platform, so %ld is a format mismatch there.
+ */
+void IndexBinaryHash::display() const
+{
+    for (const auto & entry : invlists) {
+        printf("%lld: [", static_cast<long long>(entry.first));
+        for (idx_t id : entry.second.ids) {
+            printf("%lld ", static_cast<long long>(id));
+        }
+        printf("]\n");
+
+    }
+}
+
+
+/// Set all four counters (nq, n0, nlist, ndis) back to zero.
+void IndexBinaryHashStats::reset()
+{
+    nq = 0;
+    n0 = 0;
+    nlist = 0;
+    ndis = 0;
+}
+
+IndexBinaryHashStats indexBinaryHash_stats;
+
+/*******************************************************
+ * IndexBinaryMultiHash implementation
+ ******************************************************/
+
+
+/** Build an empty multi-hash index.
+ *
+ * @param d      nb of bits per vector
+ * @param nhash  nb of hash tables
+ * @param b      nb of bits per hash value; nhash * b must not exceed d
+ *
+ * The flat storage is allocated only after the arguments are validated.
+ * If it were allocated in the initializer list it would leak whenever the
+ * check (or the construction of `maps`) throws, because the destructor
+ * does not run for a partially constructed object.
+ */
+IndexBinaryMultiHash::IndexBinaryMultiHash(int d, int nhash, int b):
+    IndexBinary(d),
+    storage(nullptr), own_fields(true),
+    maps(nhash), nhash(nhash), b(b), nflip(0)
+{
+    FAISS_THROW_IF_NOT(nhash * b <= d);
+    storage = new IndexBinaryFlat(d);
+}
+
+/// Default constructor: no storage, no hash tables, all parameters at 0.
+/// own_fields is true, so a storage assigned later is deleted by the
+/// destructor.
+IndexBinaryMultiHash::IndexBinaryMultiHash()
+    : storage(nullptr),
+      own_fields(true),
+      nhash(0),
+      b(0),
+      nflip(0)
+{
+}
+
+/// Release the storage index, but only when this object owns it.
+IndexBinaryMultiHash::~IndexBinaryMultiHash()
+{
+    if (!own_fields) {
+        return;
+    }
+    delete storage;
+}
+
+
+/** Remove all vectors: reset the storage index, zero ntotal and empty
+ * every hash table.
+ *
+ * The tables are iterated by reference. Iterating by value would clear
+ * temporary copies and leave stale ids in the real tables.
+ */
+void IndexBinaryMultiHash::reset()
+{
+    storage->reset();
+    ntotal = 0;
+    for(auto &map: maps) {
+        map.clear();
+    }
+}
+
+/** Add n vectors: store them in `storage` and register their ids
+ * (ntotal, ntotal + 1, ...) in each of the nhash hash tables.
+ *
+ * Hash h of a vector is made of the b bits starting at bit offset h * b of
+ * its code.
+ *
+ * @param n  nb of vectors to add
+ * @param x  codes of the vectors, n * code_size bytes
+ */
+void IndexBinaryMultiHash::add(idx_t n, const uint8_t *x)
+{
+    storage->add(n, x);
+    // populate maps
+    uint64_t mask = ((uint64_t)1 << b) - 1;
+    const size_t cs = code_size;
+
+    for(idx_t i = 0; i < n; i++) {
+        const uint8_t *xi = x + i * code_size;
+        int ho = 0;
+        for(int h = 0; h < nhash; h++) {
+            // Load at most 8 bytes and never past the end of this code: a
+            // plain uint64_t dereference is unaligned and reads beyond the
+            // vector (beyond x for the last one) near the end of the code.
+            const size_t byte_ofs = ho >> 3;
+            size_t nread = byte_ofs < cs ? cs - byte_ofs : 0;
+            if (nread > sizeof(uint64_t)) {
+                nread = sizeof(uint64_t);
+            }
+            uint64_t word = 0;
+            if (nread > 0) {
+                memcpy (&word, xi + byte_ofs, nread);
+            }
+            uint64_t hash = word >> (ho & 7);
+            hash &= mask;
+            maps[h][hash].push_back(i + ntotal);
+            ho += b;
+        }
+    }
+    ntotal += n;
+}
+
+
+namespace {
+
+/** Compute the Hamming distance between the query q and every stored
+ * vector whose id is in the shortlist, and hand each (distance, id) pair
+ * to res.add.
+ *
+ * @param index      flat index holding the codes (id i is at xb[i * code_size])
+ * @param q          query code, code_size bytes
+ * @param shortlist  ids of the candidates to verify
+ * @param res        result collector
+ */
+template <class HammingComputer, class SearchResults>
+void verify_shortlist(
+        const IndexBinaryFlat & index,
+        const uint8_t * q,
+        const std::unordered_set<Index::idx_t> & shortlist,
+        SearchResults &res)
+{
+    size_t code_size = index.code_size;
+
+    HammingComputer hc (q, code_size);
+    const uint8_t *codes = index.xb.data();
+
+    for (auto i: shortlist) {
+        int dis = hc.hamming (codes + i * code_size);
+        res.add(dis, i);
+    }
+}
+
+/** Run one query against an IndexBinaryMultiHash.
+ *
+ * For each of the nhash tables, the b-bit query hash and all hash values
+ * at most index.nflip bit flips away from it are looked up. The ids found
+ * are merged into a shortlist (duplicates removed), which is then verified
+ * against the stored codes with the HammingComputer matching code_size.
+ *
+ * @param n0     incremented for each looked-up hash value without entry
+ * @param nlist  incremented for each looked-up hash value with an entry
+ * @param ndis   incremented by the shortlist size
+ */
+template<class SearchResults>
+void
+search_1_query_multihash(const IndexBinaryMultiHash & index, const uint8_t *xi,
+                         SearchResults &res,
+                         size_t &n0, size_t &nlist, size_t &ndis)
+{
+
+    std::unordered_set<idx_t> shortlist;
+    int b = index.b;
+    uint64_t mask = ((uint64_t)1 << b) - 1;
+    const size_t cs = index.code_size;
+
+    int ho = 0;
+    for(int h = 0; h < index.nhash; h++) {
+        // Load at most 8 bytes and never past the end of the query code: a
+        // plain uint64_t dereference is unaligned and over-reads near the
+        // end of the code.
+        const size_t byte_ofs = ho >> 3;
+        size_t nread = byte_ofs < cs ? cs - byte_ofs : 0;
+        if (nread > sizeof(uint64_t)) {
+            nread = sizeof(uint64_t);
+        }
+        uint64_t word = 0;
+        if (nread > 0) {
+            memcpy (&word, xi + byte_ofs, nread);
+        }
+        uint64_t qhash = word >> (ho & 7);
+        qhash &= mask;
+        const IndexBinaryMultiHash::Map & map = index.maps[h];
+
+        FlipEnumerator fe(index.b, index.nflip);
+        // loop over neighbors that are at most at nflip bits
+        do {
+            uint64_t hash = qhash ^ fe.x;
+            auto it = map.find (hash);
+
+            if (it != map.end()) {
+                const std::vector<idx_t> & v = it->second;
+                for (auto i: v) {
+                    shortlist.insert(i);
+                }
+                nlist++;
+            } else {
+                n0++;
+            }
+        } while(fe.next());
+
+        ho += b;
+    }
+    ndis += shortlist.size();
+
+    // verify shortlist
+
+#define HC(name) verify_shortlist<name> (*index.storage, xi, shortlist, res)
+    switch(index.code_size) {
+    case 4: HC(HammingComputer4); break;
+    case 8: HC(HammingComputer8); break;
+    case 16: HC(HammingComputer16); break;
+    case 20: HC(HammingComputer20); break;
+    case 32: HC(HammingComputer32); break;
+    default:
+        if (index.code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else {
+            HC(HammingComputerDefault);
+        }
+    }
+#undef HC
+}
+
+} // anonymous namespace
+
+/** Range search: for each of the n queries in x, report the shortlisted
+ * vectors whose distance is < radius.
+ *
+ * The bitset argument is not used by this implementation. The global
+ * indexBinaryHash_stats counters are updated after the parallel section.
+ */
+void IndexBinaryMultiHash::range_search(idx_t n, const uint8_t *x, int radius,
+                                   RangeSearchResult *result,
+                                   ConcurrentBitsetPtr bitset) const
+{
+
+    // per-call counters, summed over the threads by the OpenMP reduction
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel if(n > 100) reduction(+: ndis, n0, nlist)
+    {
+        RangeSearchPartialResult partial (result);
+
+#pragma omp for
+        for (size_t qno = 0; qno < n; qno++) { // loop queries
+            RangeQueryResult & query_result = partial.new_result (qno);
+            RangeSearchResults collector = {radius, query_result};
+            const uint8_t *query = x + qno * code_size;
+
+            search_1_query_multihash (*this, query, collector,
+                                      n0, nlist, ndis);
+
+        }
+        partial.finalize ();
+    }
+
+    IndexBinaryHashStats & stats = indexBinaryHash_stats;
+    stats.nq += n;
+    stats.n0 += n0;
+    stats.nlist += nlist;
+    stats.ndis += ndis;
+}
+
+/** k-NN search: for each of the n queries in x, keep the k closest
+ * vectors of its shortlist.
+ *
+ * Row i of distances / labels (k entries each) is used as the heap of
+ * query i. The bitset argument is not used by this implementation. The
+ * global indexBinaryHash_stats counters are updated after the loop.
+ */
+void IndexBinaryMultiHash::search(idx_t n, const uint8_t *x, idx_t k,
+                             int32_t *distances, idx_t *labels,
+                             ConcurrentBitsetPtr bitset) const
+{
+
+    using HeapForL2 = CMax<int32_t, idx_t>;
+    // per-call counters, summed over the threads by the OpenMP reduction
+    size_t nlist = 0, ndis = 0, n0 = 0;
+
+#pragma omp parallel for if(n > 100) reduction(+: nlist, ndis, n0)
+    for (size_t qno = 0; qno < n; qno++) {
+        int32_t * heap_dis = distances + k * qno;
+        idx_t * heap_ids = labels + k * qno;
+        const uint8_t *query = x + qno * code_size;
+
+        heap_heapify<HeapForL2> (k, heap_dis, heap_ids);
+        KnnSearchResults collector = {k, heap_dis, heap_ids};
+
+        search_1_query_multihash (*this, query, collector, n0, nlist, ndis);
+
+    }
+
+    IndexBinaryHashStats & stats = indexBinaryHash_stats;
+    stats.nq += n;
+    stats.n0 += n0;
+    stats.nlist += nlist;
+    stats.ndis += ndis;
+}
+
+/// Total number of entries over all the hash tables. The tables are
+/// visited by const reference so that none of them is copied just to read
+/// its size.
+size_t IndexBinaryMultiHash::hashtable_size() const
+{
+    size_t tot = 0;
+    for (const auto &map: maps) {
+        tot += map.size();
+    }
+
+    return tot;
+}
+
+
+}
--- a/core/src/index/thirdparty/faiss/IndexBinaryHash.h
+++ b/core/src/index/thirdparty/faiss/IndexBinaryHash.h
@ -0,0 +1,120 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#ifndef FAISS_BINARY_HASH_H
+#define FAISS_BINARY_HASH_H
+
+
+
+#include <vector>
+#include <unordered_map>
+
+#include <faiss/IndexBinary.h>
+#include <faiss/IndexBinaryFlat.h>
+#include <faiss/utils/Heap.h>
+
+
+namespace faiss {
+
+struct RangeSearchResult;
+
+
+/** Binary index backed by a single hash table.
+ *
+ * just uses the b first bits as a hash value: vectors with the same hash
+ * value share one inverted list, and a search visits the lists whose hash
+ * differs from the query hash by at most nflip bits.
+ */
+struct IndexBinaryHash : IndexBinary {
+
+    /// one bucket of the hash table
+    struct InvertedList {
+        std::vector<idx_t> ids;     ///< ids of the vectors in the bucket
+        std::vector<uint8_t> vecs;  ///< their codes, appended one after another
+
+        void add (idx_t id, size_t code_size, const uint8_t *code);
+    };
+
+    using InvertedListMap = std::unordered_map<idx_t, InvertedList>;
+    InvertedListMap invlists;   ///< hash value -> bucket
+
+    int b;       ///< nb of bits of the hash value
+    int nflip;   ///< nb of bit flips to use at search time
+
+    IndexBinaryHash(int d, int b);
+
+    IndexBinaryHash();
+
+    void reset() override;
+
+    void add(idx_t n, const uint8_t *x) override;
+
+    void add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result,
+                      ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels,
+                ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    /// print the content of the hash table to stdout
+    void display() const;
+
+    /// nb of entries in the hash table
+    size_t hashtable_size() const;
+
+};
+
+/// Search counters accumulated by IndexBinaryHash and IndexBinaryMultiHash.
+struct IndexBinaryHashStats {
+    size_t nq;       ///< nb of queries run
+    size_t n0;       ///< nb of empty lists
+    size_t nlist;    ///< nb of non-empty inverted lists scanned
+    size_t ndis;     ///< nb of distances computed
+
+    /// all counters start at zero
+    IndexBinaryHashStats () {
+        reset ();
+    }
+
+    void reset ();
+};
+
+extern IndexBinaryHashStats indexBinaryHash_stats;
+
+
+/** Binary index backed by nhash hash tables.
+ *
+ * Table h is keyed on the b bits starting at bit h * b of the code. The
+ * vectors themselves are kept in a flat index. A search collects the ids
+ * found in all tables and verifies them against the stored codes.
+ */
+struct IndexBinaryMultiHash: IndexBinary {
+
+    // where the vectors are actually stored
+    IndexBinaryFlat *storage;
+    bool own_fields;   ///< whether the destructor deletes storage
+
+    // maps hash values to the ids that hash to them
+    using Map = std::unordered_map<idx_t, std::vector<idx_t> >;
+
+    // the different hashes, size nhash
+    std::vector<Map> maps;
+
+    int nhash; ///< nb of hash maps
+    int b; ///< nb bits per hash map
+    int nflip; ///< nb bit flips to use at search time
+
+    IndexBinaryMultiHash(int d, int nhash, int b);
+
+    IndexBinaryMultiHash();
+
+    ~IndexBinaryMultiHash();
+
+    void reset() override;
+
+    void add(idx_t n, const uint8_t *x) override;
+
+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result,
+                      ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels,
+                ConcurrentBitsetPtr bitset = nullptr) const override;
+
+    /// total nb of entries over all the hash tables
+    size_t hashtable_size() const;
+
+};
+
+}
+
+#endif
--- a/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp
+++ b/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp
@ -12,17 +12,20 @@
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexBinaryIVF.h>

-#include <cstdio>
-#include <memory>
 #include <cmath>
+#include <cstdio>
+#include <omp.h>
+
+#include <memory>

 #include <faiss/utils/BinaryDistance.h>
 #include <faiss/utils/hamming.h>
 #include <faiss/utils/utils.h>
 #include <faiss/utils/Heap.h>
-
 #include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexLSH.h>


 namespace faiss {
@ -33,7 +36,6 @@ IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist)
      own_invlists(true),
      nprobe(1),
      max_codes(0),
-      maintain_direct_map(false),
      quantizer(quantizer),
      nlist(nlist),
      own_fields(false),
@ -51,7 +53,6 @@ IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist, M
      own_invlists(true),
      nprobe(1),
      max_codes(0),
-      maintain_direct_map(false),
      quantizer(quantizer),
      nlist(nlist),
      own_fields(false),
@ -68,7 +69,6 @@ IndexBinaryIVF::IndexBinaryIVF()
      own_invlists(false),
      nprobe(1),
      max_codes(0),
-      maintain_direct_map(false),
      quantizer(nullptr),
      nlist(0),
      own_fields(false),
@ -87,8 +87,7 @@ void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids,
                              const idx_t *precomputed_idx) {
  FAISS_THROW_IF_NOT(is_trained);
  assert(invlists);
-  FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids),
-                         "cannot have direct map and add with ids");
+  direct_map.check_can_add (xids);

  const idx_t * idx;

@ -107,13 +106,15 @@ void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids,
    idx_t id = xids ? xids[i] : ntotal + i;
    idx_t list_no = idx[i];

-    if (list_no < 0)
-      continue;
-    const uint8_t *xi = x + i * code_size;
-    size_t offset = invlists->add_entry(list_no, id, xi);
+    if (list_no < 0) {
+        direct_map.add_single_id (id, -1, 0);
+    } else {
+        const uint8_t *xi = x + i * code_size;
+        size_t offset = invlists->add_entry(list_no, id, xi);
+
+        direct_map.add_single_id (id, list_no, offset);
+    }

-    if (maintain_direct_map)
-      direct_map.push_back(list_no << 32 | offset);
    n_add++;
  }
  if (verbose) {
@ -123,30 +124,23 @@ void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids,
  ntotal += n_add;
 }

-void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) {
-  // nothing to do
-  if (new_maintain_direct_map == maintain_direct_map)
-    return;
-
-  if (new_maintain_direct_map) {
-    direct_map.resize(ntotal, -1);
-    for (size_t key = 0; key < nlist; key++) {
-      size_t list_size = invlists->list_size(key);
-      const idx_t *idlist = invlists->get_ids(key);
-
-      for (size_t ofs = 0; ofs < list_size; ofs++) {
-        FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal,
-                               "direct map supported only for seuquential ids");
-        direct_map[idlist[ofs]] = key << 32 | ofs;
-      }
+void IndexBinaryIVF::make_direct_map (bool b)
+{
+    if (b) {
+        direct_map.set_type (DirectMap::Array, invlists, ntotal);
+    } else {
+        direct_map.set_type (DirectMap::NoMap, invlists, ntotal);
    }
-  } else {
-    direct_map.clear();
-  }
-  maintain_direct_map = new_maintain_direct_map;
 }

-void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels,
+/// Switch direct_map to `type` via DirectMap::set_type (invlists, ntotal).
+void IndexBinaryIVF::set_direct_map_type (DirectMap::Type type)
+{
+    direct_map.set_type (type, invlists, ntotal);
+}
+
+void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k,
+                            int32_t *distances, idx_t *labels,
                            ConcurrentBitsetPtr bitset) const {
  std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
  std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
@ -164,10 +158,7 @@ void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, int32_t *distanc
 }

 void IndexBinaryIVF::get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, ConcurrentBitsetPtr bitset) {
-
-    if (!maintain_direct_map) {
-        make_direct_map(true);
-    }
+    make_direct_map(true);

    /* only get vector by 1 id */
    FAISS_ASSERT(n == 1);
@ -180,9 +171,7 @@ void IndexBinaryIVF::get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, Con

 void IndexBinaryIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels,
                                   ConcurrentBitsetPtr bitset) {
-    if (!maintain_direct_map) {
-        make_direct_map(true);
-    }
+    make_direct_map(true);

    auto x = new uint8_t[n * d];
    for (idx_t i = 0; i < n; ++i) {
@ -194,11 +183,8 @@ void IndexBinaryIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *
 }

 void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const {
-  FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal,
-                         "direct map is not initialized");
-  idx_t list_no = direct_map[key] >> 32;
-  idx_t offset = direct_map[key] & 0xffffffff;
-  reconstruct_from_offset(list_no, offset, recons);
+    idx_t lo = direct_map.get (key);
+    reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons);
 }

 void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const {
@ -267,39 +253,9 @@ void IndexBinaryIVF::reset() {
 }

 size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) {
-  FAISS_THROW_IF_NOT_MSG(!maintain_direct_map,
-                         "direct map remove not implemented");
-
-  std::vector<idx_t> toremove(nlist);
-
-#pragma omp parallel for
-  for (idx_t i = 0; i < nlist; i++) {
-    idx_t l0 = invlists->list_size (i), l = l0, j = 0;
-    const idx_t *idsi = invlists->get_ids(i);
-    while (j < l) {
-      if (sel.is_member(idsi[j])) {
-        l--;
-        invlists->update_entry(
-          i, j,
-          invlists->get_single_id(i, l),
-          invlists->get_single_code(i, l));
-      } else {
-        j++;
-      }
-    }
-    toremove[i] = l0 - l;
-  }
-  // this will not run well in parallel on ondisk because of possible shrinks
-  size_t nremove = 0;
-  for (idx_t i = 0; i < nlist; i++) {
-    if (toremove[i] > 0) {
-      nremove += toremove[i];
-      invlists->resize(
-        i, invlists->list_size(i) - toremove[i]);
-    }
-  }
-  ntotal -= nremove;
-  return nremove;
+    size_t nremove = direct_map.remove_ids (sel, invlists);
+    ntotal -= nremove;
+    return nremove;
 }

 void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
@ -319,9 +275,6 @@ void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
    Clustering clus(d, nlist, cp);
    quantizer->reset();

-    std::unique_ptr<float[]> x_f(new float[n * d]);
-    binary_to_real(n * d, x, x_f.get());
-
    IndexFlat index_tmp;

    if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) {
@ -338,8 +291,12 @@ void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
             clustering_index->d);
    }

-    clus.train(n, x_f.get(), clustering_index ? *clustering_index : index_tmp);
+    // LSH codec that is able to convert the binary vectors to floats.
+    IndexLSH codec(d, d, false, false);

+    clus.train_encoded (n, x, &codec, clustering_index ? *clustering_index : index_tmp);
+
+    // convert clusters to binary
    std::unique_ptr<uint8_t[]> x_b(new uint8_t[clus.k * code_size]);
    real_to_binary(d * clus.k, clus.centroids.data(), x_b.get());

@ -355,8 +312,7 @@ void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
  FAISS_THROW_IF_NOT(other.d == d);
  FAISS_THROW_IF_NOT(other.nlist == nlist);
  FAISS_THROW_IF_NOT(other.code_size == code_size);
-  FAISS_THROW_IF_NOT_MSG((!maintain_direct_map &&
-                          !other.maintain_direct_map),
+  FAISS_THROW_IF_NOT_MSG(direct_map.no() && other.direct_map.no(),
                         "direct map copy not implemented");
  FAISS_THROW_IF_NOT_MSG(typeid (*this) == typeid (other),
                         "can only merge indexes of the same type");
@ -383,13 +339,15 @@ namespace {
 using idx_t = Index::idx_t;


-template<class HammingComputer, bool store_pairs>
+template<class HammingComputer>
 struct IVFBinaryScannerL2: BinaryInvertedListScanner {

    HammingComputer hc;
    size_t code_size;
+    bool store_pairs;

-    IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
+    IVFBinaryScannerL2 (size_t code_size, bool store_pairs):
+        code_size (code_size), store_pairs(store_pairs)
    {}

    void set_query (const uint8_t *query_vector) override {
@ -418,7 +376,6 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
        for (size_t j = 0; j < n; j++) {
            if (!bitset || !bitset->test(ids[j])) {
                uint32_t dis = hc.hamming (codes);
-
                if (dis < simi[0]) {
                    idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
                    heap_swap_top<C> (k, simi, idxi, dis, id);
@ -430,12 +387,26 @@ struct IVFBinaryScannerL2: BinaryInvertedListScanner {
        return nup;
    }

-
+    void scan_codes_range (size_t n,
+                           const uint8_t *codes,
+                           const idx_t *ids,
+                           int radius,
+                           RangeQueryResult &result) const override
+    {
+        // signed compare below: a negative radius matches nothing
+        for (size_t j = 0; j < n; j++) {
+            uint32_t dis = hc.hamming (codes);
+            if (static_cast<int64_t>(dis) < radius) {
+                int64_t id = store_pairs ? lo_build (list_no, j) : ids[j];
+                result.add (dis, id);
+            }
+            codes += code_size;
+        }
+    }
 };

 template<class DistanceComputer, bool store_pairs>
 struct IVFBinaryScannerJaccard: BinaryInvertedListScanner {
-
    DistanceComputer hc;
    size_t code_size;

@ -478,35 +449,11 @@ struct IVFBinaryScannerJaccard: BinaryInvertedListScanner {
        }
        return nup;
    }
-
 };

 template <bool store_pairs>
 BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {

-    switch (code_size) {
-#define HANDLE_CS(cs)                                                  \
-    case cs:                                                            \
-        return new IVFBinaryScannerL2<HammingComputer ## cs, store_pairs> (cs);
-      HANDLE_CS(4);
-      HANDLE_CS(8);
-      HANDLE_CS(16);
-      HANDLE_CS(20);
-      HANDLE_CS(32);
-      HANDLE_CS(64);
-#undef HANDLE_CS
-    default:
-        if (code_size % 8 == 0) {
-            return new IVFBinaryScannerL2<HammingComputerM8,
-                store_pairs> (code_size);
-        } else if (code_size % 4 == 0) {
-            return new IVFBinaryScannerL2<HammingComputerM4,
-                store_pairs> (code_size);
-        } else {
-            return new IVFBinaryScannerL2<HammingComputerDefault,
-                store_pairs> (code_size);
-        }
-    }
 }

 template <bool store_pairs>
@ -703,7 +650,6 @@ void search_knn_binary_dis_heap(const IndexBinaryIVF& ivf,
    indexIVF_stats.nlist += nlistv;
    indexIVF_stats.ndis += ndis;
    indexIVF_stats.nheap_updates += nheap;
-
 }

 template<class HammingComputer, bool store_pairs>
@ -763,12 +709,11 @@ void search_knn_hamming_count(const IndexBinaryIVF& ivf,
        : ivf.invlists->get_ids(key);

      for (size_t j = 0; j < list_size; j++) {
-          if(!bitset || !bitset->test(ids[j])){
-              const uint8_t * yj = list_vecs + ivf.code_size * j;
-
-              idx_t id = store_pairs ? (key << 32 | j) : ids[j];
-              csi.update_counter(yj, id);
-          }
+        if (!bitset || !bitset->test(ids[j])) {
+          const uint8_t *yj = list_vecs + ivf.code_size * j;
+          idx_t id = store_pairs ? (key << 32 | j) : ids[j];
+          csi.update_counter(yj, id);
+        }
      }
      if (ids)
          ivf.invlists->release_ids (key, ids);
@ -816,7 +761,7 @@ void search_knn_hamming_count_1 (
 #define HANDLE_CS(cs)                                                  \
    case cs:                                                            \
       search_knn_hamming_count<HammingComputer ## cs, store_pairs>(    \
-           ivf, nx, x, keys, k, distances, labels, params, bitset);             \
+           ivf, nx, x, keys, k, distances, labels, params, bitset);     \
      break;
      HANDLE_CS(4);
      HANDLE_CS(8);
@ -838,7 +783,6 @@ void search_knn_hamming_count_1 (
        }
        break;
    }
-
 }

 }  // namespace
@ -846,25 +790,26 @@ void search_knn_hamming_count_1 (
 BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
      (bool store_pairs) const
 {
-    switch (metric_type) {
-    case METRIC_Jaccard:
-    case METRIC_Tanimoto:
-        if (store_pairs) {
-            return select_IVFBinaryScannerJaccard<true> (code_size);
-        } else {
-            return select_IVFBinaryScannerJaccard<false> (code_size);
-        }
-    case METRIC_Substructure:
-    case METRIC_Superstructure:
-        // unsupported
-        return nullptr;
+
+#define HC(name) return new IVFBinaryScannerL2<name> (code_size, store_pairs)
+    switch (code_size) {
+    case 4: HC(HammingComputer4);
+    case 8: HC(HammingComputer8);
+    case 16: HC(HammingComputer16);
+    case 20: HC(HammingComputer20);
+    case 32: HC(HammingComputer32);
+    case 64: HC(HammingComputer64);
    default:
-        if (store_pairs) {
-            return select_IVFBinaryScannerL2<true>(code_size);
+        if (code_size % 8 == 0) {
+            HC(HammingComputerM8);
+        } else if (code_size % 4 == 0) {
+            HC(HammingComputerM4);
        } else {
-            return select_IVFBinaryScannerL2<false>(code_size);
+            HC(HammingComputerDefault);
        }
    }
+#undef HC
+
 }

 void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
@ -875,7 +820,6 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
                                        const IVFSearchParameters *params,
                                        ConcurrentBitsetPtr bitset
                                        ) const {
-
    if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) {
        if (use_heap) {
            float *D = new float[k * n];
@ -914,6 +858,83 @@ void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
    }
 }

+void IndexBinaryIVF::range_search(
+        idx_t n, const uint8_t *x, int radius,
+        RangeSearchResult *res,
+        ConcurrentBitsetPtr bitset) const
+{
+    // NOTE(review): bitset is never read here, so no id is filtered out.
+    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
+    std::unique_ptr<int32_t[]> coarse_dis(new int32_t[n * nprobe]);
+
+    // coarse quantization: nprobe candidate lists per query
+    double t0 = getmillisecs();
+    quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get());
+    indexIVF_stats.quantization_time += getmillisecs() - t0;
+
+    t0 = getmillisecs();
+    invlists->prefetch_lists(idx.get(), n * nprobe);
+
+    bool store_pairs = false;
+    size_t nlistv = 0, ndis = 0;
+
+#pragma omp parallel reduction(+: nlistv, ndis)
+    {
+        // per-thread partial result and scanner
+        RangeSearchPartialResult pres(res);
+        std::unique_ptr<BinaryInvertedListScanner> scanner
+            (get_InvertedListScanner(store_pairs));
+        FAISS_THROW_IF_NOT (scanner.get ());
+
+        // scan inverted list number ik of query i into qres
+        auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult &qres)
+        {
+
+            idx_t key = idx[i * nprobe + ik];  /* select the list  */
+            if (key < 0) return;
+            FAISS_THROW_IF_NOT_FMT (
+                    key < (idx_t) nlist,
+                    "Invalid key=%ld  at ik=%ld nlist=%ld\n",
+                    key, ik, nlist);
+            const size_t list_size = invlists->list_size(key);
+
+            if (list_size == 0) return;
+
+            InvertedLists::ScopedCodes scodes (invlists, key);
+            InvertedLists::ScopedIds ids (invlists, key);
+
+            scanner->set_list (key, coarse_dis[i * nprobe + ik]);
+            nlistv++;
+            ndis += list_size;
+            scanner->scan_codes_range (list_size, scodes.get(),
+                                       ids.get(), radius, qres);
+        };
+
+#pragma omp for
+        for (size_t i = 0; i < n; i++) {
+            scanner->set_query (x + i * code_size);
+
+            RangeQueryResult & qres = pres.new_result (i);
+
+            for (size_t ik = 0; ik < nprobe; ik++) {
+                scan_list_func (i, ik, qres);
+            }
+
+        }
+
+        pres.finalize();
+
+    }
+    indexIVF_stats.nq += n;
+    indexIVF_stats.nlist += nlistv;
+    indexIVF_stats.ndis += ndis;
+    indexIVF_stats.search_time += getmillisecs() - t0;
+
+}
+
+
+
+
 IndexBinaryIVF::~IndexBinaryIVF() {
  if (own_invlists) {
    delete invlists;
--- a/core/src/index/thirdparty/faiss/IndexBinaryIVF.h
+++ b/core/src/index/thirdparty/faiss/IndexBinaryIVF.h
@ -46,8 +46,7 @@ struct IndexBinaryIVF : IndexBinary {
    bool use_heap = true;

    /// map for direct access to the elements. Enables reconstruct().
-    bool maintain_direct_map;
-    std::vector<idx_t> direct_map;
+    DirectMap direct_map;

    IndexBinary *quantizer;   ///< quantizer that maps vectors to inverted lists
    size_t nlist;             ///< number of possible key values
@ -113,8 +112,8 @@ struct IndexBinaryIVF : IndexBinary {
                                         bool store_pairs=false) const;

    /** assign the vectors, then call search_preassign */
-    void search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels,
-                ConcurrentBitsetPtr bitset = nullptr) const override;
+    void search(idx_t n, const uint8_t *x, idx_t k,
+                int32_t *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override;

    /** get raw vectors by ids */
    void get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, ConcurrentBitsetPtr bitset = nullptr) override;
@ -122,6 +121,10 @@ struct IndexBinaryIVF : IndexBinary {
    void search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels,
                       ConcurrentBitsetPtr bitset = nullptr) override;

+    void range_search(idx_t n, const uint8_t *x, int radius,
+                      RangeSearchResult *result,
+                      ConcurrentBitsetPtr bitset = nullptr) const override;
+
    void reconstruct(idx_t key, uint8_t *recons) const override;

    /** Reconstruct a subset of the indexed vectors.
@ -177,6 +180,8 @@ struct IndexBinaryIVF : IndexBinary {
     */
    void make_direct_map(bool new_maintain_direct_map=true);

+    void set_direct_map_type (DirectMap::Type type);
+
    void replace_invlists(InvertedLists *il, bool own=false);
 };

@ -211,6 +216,12 @@ struct BinaryInvertedListScanner {
                               size_t k,
                               ConcurrentBitsetPtr bitset = nullptr) const = 0;

+    virtual void scan_codes_range (size_t n,
+                                   const uint8_t *codes,
+                                   const idx_t *ids,
+                                   int radius,
+                                   RangeQueryResult &result) const = 0;
+
    virtual ~BinaryInvertedListScanner () {}

 };
--- a/core/src/index/thirdparty/faiss/IndexFlat.cpp
+++ b/core/src/index/thirdparty/faiss/IndexFlat.cpp
@ -38,26 +38,28 @@ void IndexFlat::reset() {
    ntotal = 0;
 }

-void IndexFlat::search(idx_t n, const float* x, idx_t k, float* distances, idx_t* labels,
-                       ConcurrentBitsetPtr bitset) const
+
+void IndexFlat::search (idx_t n, const float *x, idx_t k,
+                        float *distances, idx_t *labels,
+                        ConcurrentBitsetPtr bitset) const
 {
    // we see the distances and labels as heaps

    if (metric_type == METRIC_INNER_PRODUCT) {
        float_minheap_array_t res = {
-                size_t(n), size_t(k), labels, distances};
+            size_t(n), size_t(k), labels, distances};
        knn_inner_product (x, xb.data(), d, n, ntotal, &res, bitset);
    } else if (metric_type == METRIC_L2) {
        float_maxheap_array_t res = {
-                size_t(n), size_t(k), labels, distances};
+            size_t(n), size_t(k), labels, distances};
        knn_L2sqr (x, xb.data(), d, n, ntotal, &res, bitset);
    } else if (metric_type == METRIC_Jaccard) {
        float_maxheap_array_t res = {
                size_t(n), size_t(k), labels, distances};
-        knn_jaccard (x, xb.data(), d, n, ntotal, &res, bitset);
+        knn_jaccard(x, xb.data(), d, n, ntotal, &res, bitset);
    } else {
        float_maxheap_array_t res = {
-                size_t(n), size_t(k), labels, distances};
+            size_t(n), size_t(k), labels, distances};
        knn_extra_metrics (x, xb.data(), d, n, ntotal,
                           metric_type, metric_arg,
                           &res, bitset);
@ -67,7 +69,6 @@ void IndexFlat::search(idx_t n, const float* x, idx_t k, float* distances, idx_t
 void IndexFlat::assign(idx_t n, const float * x, idx_t * labels, float* distances)
 {
    // usually used in IVF k-means algorithm
-
    float *dis_inner = (distances == nullptr) ? new float[n] : distances;
    switch (metric_type) {
        case METRIC_INNER_PRODUCT:
--- a/core/src/index/thirdparty/faiss/IndexFlat.h
+++ b/core/src/index/thirdparty/faiss/IndexFlat.h
@ -19,6 +19,7 @@ namespace faiss {

 /** Index that stores the full vectors and performs exhaustive search */
 struct IndexFlat: Index {
+
    /// database vectors, size ntotal * d
    std::vector<float> xb;

@ -154,7 +155,7 @@ struct IndexRefineFlat: Index {
 };


-/// optimized version for 1D "vectors"
+/// optimized version for 1D "vectors".
 struct IndexFlat1D:IndexFlatL2 {
    bool continuous_update; ///< is the permutation updated continuously?

--- a/core/src/index/thirdparty/faiss/IndexHNSW.cpp
+++ b/core/src/index/thirdparty/faiss/IndexHNSW.cpp
@ -26,7 +26,6 @@
 #include <stdint.h>

 #ifdef __SSE__
-#include <immintrin.h>
 #endif

 #include <faiss/utils/distances.h>
@ -55,7 +54,6 @@ namespace faiss {
 using idx_t = Index::idx_t;
 using MinimaxHeap = HNSW::MinimaxHeap;
 using storage_idx_t = HNSW::storage_idx_t;
-using NodeDistCloser = HNSW::NodeDistCloser;
 using NodeDistFarther = HNSW::NodeDistFarther;

 HNSWStats hnsw_stats;
@ -67,6 +65,50 @@ HNSWStats hnsw_stats;
 namespace {


+/* Wrap the distance computer into one that negates the
+   distances. This makes supporting INNER_PRODUCT search easier */
+
+struct NegativeDistanceComputer: DistanceComputer {
+
+    /// wrapped computer; owned (deleted) by this object
+    DistanceComputer *basedis;
+
+    explicit NegativeDistanceComputer(DistanceComputer *basedis):
+        basedis(basedis)
+    {}
+
+    // owning raw pointer: forbid copies so basedis is never deleted twice
+    NegativeDistanceComputer(const NegativeDistanceComputer &) = delete;
+    NegativeDistanceComputer &operator=(const NegativeDistanceComputer &) = delete;
+
+    /// forward the query to the wrapped computer
+    void set_query(const float *x) override { basedis->set_query(x); }
+
+    /// negated distance of vector i to current query
+    float operator () (idx_t i) override { return -(*basedis)(i); }
+
+    /// negated distance between two stored vectors
+    float symmetric_dis (idx_t i, idx_t j) override {
+        return -basedis->symmetric_dis(i, j);
+    }
+
+    virtual ~NegativeDistanceComputer ()
+    {
+        delete basedis;
+    }
+};
+
+/// Distance computer for storage; inner-product values come back negated.
+DistanceComputer *storage_distance_computer(const Index *storage)
+{
+    DistanceComputer *dis = storage->get_distance_computer();
+    const bool negate = storage->metric_type == METRIC_INNER_PRODUCT;
+    // when negating, the wrapper takes ownership of dis
+    return negate ? new NegativeDistanceComputer(dis) : dis;
+}
+
+
+
 void hnsw_add_vertices(IndexHNSW &index_hnsw,
                       size_t n0,
                       size_t n, const float *x,
@ -152,7 +194,7 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
                VisitedTable vt (ntotal);

                DistanceComputer *dis =
-                    index_hnsw.storage->get_distance_computer();
+                    storage_distance_computer (index_hnsw.storage);
                ScopeDeleter1<DistanceComputer> del(dis);
                int prev_display = verbose && omp_get_thread_num() == 0 ? 0 : -1;
                size_t counter = 0;
@ -210,8 +252,8 @@ void hnsw_add_vertices(IndexHNSW &index_hnsw,
 * IndexHNSW implementation
 **************************************************************/

-IndexHNSW::IndexHNSW(int d, int M):
-    Index(d, METRIC_L2),
+IndexHNSW::IndexHNSW(int d, int M, MetricType metric):
+    Index(d, metric),
    hnsw(M),
    own_fields(false),
    storage(nullptr),
@ -258,7 +300,8 @@ void IndexHNSW::search (idx_t n, const float *x, idx_t k,
 #pragma omp parallel reduction(+ : nreorder)
        {
            VisitedTable vt (ntotal);
-            DistanceComputer *dis = storage->get_distance_computer();
+
+            DistanceComputer *dis = storage_distance_computer(storage);
            ScopeDeleter1<DistanceComputer> del(dis);

 #pragma omp for
@ -290,6 +333,14 @@ void IndexHNSW::search (idx_t n, const float *x, idx_t k,
        }
        InterruptCallback::check ();
    }
+
+    if (metric_type == METRIC_INNER_PRODUCT) {
+        // we need to revert the negated distances
+        for (size_t i = 0; i < k * n; i++) {
+            distances[i] = -distances[i];
+        }
+    }
+
    hnsw_stats.nreorder += nreorder;
 }

@ -323,7 +374,7 @@ void IndexHNSW::shrink_level_0_neighbors(int new_size)
 {
 #pragma omp parallel
    {
-        DistanceComputer *dis = storage->get_distance_computer();
+        DistanceComputer *dis = storage_distance_computer(storage);
        ScopeDeleter1<DistanceComputer> del(dis);

 #pragma omp for
@ -367,7 +418,7 @@ void IndexHNSW::search_level_0(
    storage_idx_t ntotal = hnsw.levels.size();
 #pragma omp parallel
    {
-        DistanceComputer *qdis = storage->get_distance_computer();
+        DistanceComputer *qdis = storage_distance_computer(storage);
        ScopeDeleter1<DistanceComputer> del(qdis);

        VisitedTable vt (ntotal);
@ -436,7 +487,7 @@ void IndexHNSW::init_level_0_from_knngraph(

 #pragma omp parallel for
    for (idx_t i = 0; i < ntotal; i++) {
-        DistanceComputer *qdis = storage->get_distance_computer();
+        DistanceComputer *qdis = storage_distance_computer(storage);
        float vec[d];
        storage->reconstruct(i, vec);
        qdis->set_query(vec);
@ -480,7 +531,7 @@ void IndexHNSW::init_level_0_from_entry_points(
    {
        VisitedTable vt (ntotal);

-        DistanceComputer *dis = storage->get_distance_computer();
+        DistanceComputer *dis = storage_distance_computer(storage);
        ScopeDeleter1<DistanceComputer> del(dis);
        float vec[storage->d];

@ -518,7 +569,7 @@ void IndexHNSW::reorder_links()
        std::vector<float> distances (M);
        std::vector<size_t> order (M);
        std::vector<storage_idx_t> tmp (M);
-        DistanceComputer *dis = storage->get_distance_computer();
+        DistanceComputer *dis = storage_distance_computer(storage);
        ScopeDeleter1<DistanceComputer> del(dis);

 #pragma omp for
@ -826,8 +877,8 @@ IndexHNSWFlat::IndexHNSWFlat()
    is_trained = true;
 }

-IndexHNSWFlat::IndexHNSWFlat(int d, int M):
-    IndexHNSW(new IndexFlatL2(d), M)
+IndexHNSWFlat::IndexHNSWFlat(int d, int M, MetricType metric):
+    IndexHNSW(new IndexFlat(d, metric), M)
 {
    own_fields = true;
    is_trained = true;
@ -860,8 +911,9 @@ void IndexHNSWPQ::train(idx_t n, const float* x)
 **************************************************************/


-IndexHNSWSQ::IndexHNSWSQ(int d, QuantizerType qtype, int M):
-    IndexHNSW (new IndexScalarQuantizer (d, qtype), M)
+IndexHNSWSQ::IndexHNSWSQ(int d, QuantizerType qtype, int M,
+                         MetricType metric):
+    IndexHNSW (new IndexScalarQuantizer (d, qtype, metric), M)
 {
    is_trained = false;
    own_fields = true;
@ -986,7 +1038,7 @@ void IndexHNSW2Level::search (idx_t n, const float *x, idx_t k,
 #pragma omp parallel
        {
            VisitedTable vt (ntotal);
-            DistanceComputer *dis = storage->get_distance_computer();
+            DistanceComputer *dis = storage_distance_computer(storage);
            ScopeDeleter1<DistanceComputer> del(dis);

            int candidates_size = hnsw.upper_beam;
--- a/core/src/index/thirdparty/faiss/IndexHNSW.h
+++ b/core/src/index/thirdparty/faiss/IndexHNSW.h
@ -79,7 +79,7 @@ struct IndexHNSW : Index {

    ReconstructFromNeighbors *reconstruct_from_neighbors;

-    explicit IndexHNSW (int d = 0, int M = 32);
+    explicit IndexHNSW (int d = 0, int M = 32, MetricType metric = METRIC_L2);
    explicit IndexHNSW (Index *storage, int M = 32);

    ~IndexHNSW() override;
@ -91,7 +91,8 @@ struct IndexHNSW : Index {

    /// entry point for search
    void search (idx_t n, const float *x, idx_t k,
-                 float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+                 float *distances, idx_t *labels,
+                 ConcurrentBitsetPtr bitset = nullptr) const override;

    void reconstruct(idx_t key, float* recons) const override;

@ -132,7 +133,7 @@ struct IndexHNSW : Index {

 struct IndexHNSWFlat : IndexHNSW {
    IndexHNSWFlat();
-    IndexHNSWFlat(int d, int M);
+    IndexHNSWFlat(int d, int M, MetricType metric = METRIC_L2);
 };

 /** PQ index topped with with a HNSW structure to access elements
@ -149,7 +150,7 @@ struct IndexHNSWPQ : IndexHNSW {
 */
 struct IndexHNSWSQ : IndexHNSW {
    IndexHNSWSQ();
-    IndexHNSWSQ(int d, QuantizerType qtype, int M);
+    IndexHNSWSQ(int d, QuantizerType qtype, int M, MetricType metric = METRIC_L2);
 };

 /** 2-level code structure with fast random access
@ -162,8 +163,8 @@ struct IndexHNSW2Level : IndexHNSW {

    /// entry point for search
    void search (idx_t n, const float *x, idx_t k,
-                 float *distances, idx_t *labels, ConcurrentBitsetPtr bitset = nullptr) const override;
-
+                 float *distances, idx_t *labels,
+                 ConcurrentBitsetPtr bitset = nullptr) const override;
 };


--- a/core/src/index/thirdparty/faiss/IndexIVF.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVF.cpp
@ -174,8 +174,7 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d,
    code_size (code_size),
    nprobe (1),
    max_codes (0),
-    parallel_mode (0),
-    maintain_direct_map (false)
+    parallel_mode (0)
 {
    FAISS_THROW_IF_NOT (d == quantizer->d);
    is_trained = quantizer->is_trained && (quantizer->ntotal == nlist);
@ -189,8 +188,7 @@ IndexIVF::IndexIVF (Index * quantizer, size_t d,
 IndexIVF::IndexIVF ():
    invlists (nullptr), own_invlists (false),
    code_size (0),
-    nprobe (1), max_codes (0), parallel_mode (0),
-    maintain_direct_map (false)
+    nprobe (1), max_codes (0), parallel_mode (0)
 {}

 void IndexIVF::add (idx_t n, const float * x)
@ -216,6 +214,8 @@ void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids)
    }

    FAISS_THROW_IF_NOT (is_trained);
+    direct_map.check_can_add (xids);
+
    std::unique_ptr<idx_t []> idx(new idx_t[n]);
    quantizer->assign (n, x, idx.get());
    size_t nadd = 0, nminus1 = 0;
@ -227,6 +227,8 @@ void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids)
    std::unique_ptr<uint8_t []> flat_codes(new uint8_t [n * code_size]);
    encode_vectors (n, x, idx.get(), flat_codes.get());

+    DirectMapAdd dm_adder(direct_map, n, xids);
+
 #pragma omp parallel reduction(+: nadd)
    {
        int nt = omp_get_num_threads();
@ -237,13 +239,21 @@ void IndexIVF::add_with_ids (idx_t n, const float * x, const idx_t *xids)
            idx_t list_no = idx [i];
            if (list_no >= 0 && list_no % nt == rank) {
                idx_t id = xids ? xids[i] : ntotal + i;
-                invlists->add_entry (list_no, id,
-                                     flat_codes.get() + i * code_size);
+                size_t ofs = invlists->add_entry (
+                     list_no, id,
+                     flat_codes.get() + i * code_size
+                );
+
+                dm_adder.add (i, list_no, ofs);
+
                nadd++;
+            } else if (rank == 0 && list_no == -1) {
+                dm_adder.add (i, -1, 0);
            }
        }
    }

+
    if (verbose) {
        printf("    added %ld / %ld vectors (%ld -1s)\n", nadd, n, nminus1);
    }
@ -272,33 +282,25 @@ void IndexIVF::restore_quantizer() {
    }
 }

-void IndexIVF::make_direct_map (bool new_maintain_direct_map)
+void IndexIVF::make_direct_map (bool b)
 {
-    // nothing to do
-    if (new_maintain_direct_map == maintain_direct_map)
-        return;
-
-    if (new_maintain_direct_map) {
-        direct_map.resize (ntotal, -1);
-        for (size_t key = 0; key < nlist; key++) {
-            size_t list_size = invlists->list_size (key);
-            ScopedIds idlist (invlists, key);
-
-            for (long ofs = 0; ofs < list_size; ofs++) {
-                FAISS_THROW_IF_NOT_MSG (
-                       0 <= idlist [ofs] && idlist[ofs] < ntotal,
-                       "direct map supported only for seuquential ids");
-                direct_map [idlist [ofs]] = key << 32 | ofs;
-            }
-        }
+    if (b) {
+        direct_map.set_type (DirectMap::Array, invlists, ntotal);
    } else {
-        direct_map.clear ();
+        direct_map.set_type (DirectMap::NoMap, invlists, ntotal);
    }
-    maintain_direct_map = new_maintain_direct_map;
 }

-void IndexIVF::search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels,
-                       ConcurrentBitsetPtr bitset) const {
+/// Switch direct_map to `type` via DirectMap::set_type (invlists, ntotal).
+void IndexIVF::set_direct_map_type (DirectMap::Type type)
+{
+    direct_map.set_type (type, invlists, ntotal);
+}
+
+void IndexIVF::search (idx_t n, const float *x, idx_t k,
+                       float *distances, idx_t *labels,
+                       ConcurrentBitsetPtr bitset) const
+{
    std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
    std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);

@ -315,10 +317,7 @@ void IndexIVF::search (idx_t n, const float *x, idx_t k, float *distances, idx_t
 }

 void IndexIVF::get_vector_by_id (idx_t n, const idx_t *xid, float *x, ConcurrentBitsetPtr bitset) {
-
-    if (!maintain_direct_map) {
-        make_direct_map(true);
-    }
+    make_direct_map(true);

    /* only get vector by 1 id */
    FAISS_ASSERT(n == 1);
@ -331,9 +330,7 @@ void IndexIVF::get_vector_by_id (idx_t n, const idx_t *xid, float *x, Concurrent

 void IndexIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, float *distances, idx_t *labels,
                             ConcurrentBitsetPtr bitset) {
-    if (!maintain_direct_map) {
-        make_direct_map(true);
-    }
+    make_direct_map(true);

    auto x = new float[n * d];
    for (idx_t i = 0; i < n; ++i) {
@ -362,10 +359,13 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,

    bool interrupt = false;

+    int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT;
+    bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT);
+
    // don't start parallel section if single query
    bool do_parallel =
-        parallel_mode == 0 ? n > 1 :
-        parallel_mode == 1 ? nprobe > 1 :
+        pmode == 0 ? n > 1 :
+        pmode == 1 ? nprobe > 1 :
        nprobe * n > 1;

 #pragma omp parallel if(do_parallel) reduction(+: nlistv, ndis, nheap)
@ -382,6 +382,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
        // initialize + reorder a result heap

        auto init_result = [&](float *simi, idx_t *idxi) {
+            if (!do_heap_init) return;
            if (metric_type == METRIC_INNER_PRODUCT) {
                heap_heapify<HeapForIP> (k, simi, idxi);
            } else {
@ -390,6 +391,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
        };

        auto reorder_result = [&] (float *simi, idx_t *idxi) {
+            if (!do_heap_init) return;
            if (metric_type == METRIC_INNER_PRODUCT) {
                heap_reorder<HeapForIP> (k, simi, idxi);
            } else {
@ -400,7 +402,8 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
        // single list scan using the current scanner (with query
        // set properly) and storing results in simi and idxi
        auto scan_one_list = [&] (idx_t key, float coarse_dis_i,
-                                  float *simi, idx_t *idxi, ConcurrentBitsetPtr bitset) {
+                                  float *simi, idx_t *idxi,
+                                  ConcurrentBitsetPtr bitset) {

            if (key < 0) {
                // not enough centroids for multiprobe
@ -441,7 +444,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
         * Actual loops, depending on parallel_mode
         ****************************************************/

-        if (parallel_mode == 0) {
+        if (pmode == 0) {

 #pragma omp for
            for (size_t i = 0; i < n; i++) {
@ -481,7 +484,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
                }

            } // parallel for
-        } else if (parallel_mode == 1) {
+        } else if (pmode == 1) {
            std::vector <idx_t> local_idx (k);
            std::vector <float> local_dis (k);

@ -524,7 +527,7 @@ void IndexIVF::search_preassigned (idx_t n, const float *x, idx_t k,
            }
        } else {
            FAISS_THROW_FMT ("parallel_mode %d not supported\n",
-                             parallel_mode);
+                             pmode);
        }
    } // parallel section

@ -674,13 +677,8 @@ InvertedListScanner *IndexIVF::get_InvertedListScanner (

 void IndexIVF::reconstruct (idx_t key, float* recons) const
 {
-    FAISS_THROW_IF_NOT_MSG (direct_map.size() == ntotal,
-                            "direct map is not initialized");
-    FAISS_THROW_IF_NOT_MSG (key >= 0 && key < direct_map.size(),
-                            "invalid key");
-    idx_t list_no = direct_map[key] >> 32;
-    idx_t offset = direct_map[key] & 0xffffffff;
-    reconstruct_from_offset (list_no, offset, recons);
+    idx_t lo = direct_map.get (key);  // direct-map entry for id key; the explicit size / range checks formerly done here were dropped
+    reconstruct_from_offset (lo_listno(lo), lo_offset(lo), recons);  // lo_listno / lo_offset stand in for the old shift-by-32 and 0xffffffff mask
 }


@ -748,8 +746,8 @@ void IndexIVF::search_and_reconstruct (idx_t n, const float *x, idx_t k,
                // Fill with NaNs
                memset(reconstructed, -1, sizeof(*reconstructed) * d);
            } else {
-                int list_no = key >> 32;
-                int offset = key & 0xffffffff;
+                int list_no = lo_listno (key);
+                int offset = lo_offset (key);

                // Update label to the actual id
                labels[ij] = invlists->get_single_id (list_no, offset);
@ -777,42 +775,41 @@ void IndexIVF::reset ()

 size_t IndexIVF::remove_ids (const IDSelector & sel)
 {
-    FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
-                    "direct map remove not implemented");
-
-    std::vector<idx_t> toremove(nlist);
-
-#pragma omp parallel for
-    for (idx_t i = 0; i < nlist; i++) {
-        idx_t l0 = invlists->list_size (i), l = l0, j = 0;
-        ScopedIds idsi (invlists, i);
-        while (j < l) {
-            if (sel.is_member (idsi[j])) {
-                l--;
-                invlists->update_entry (
-                     i, j,
-                     invlists->get_single_id (i, l),
-                     ScopedCodes (invlists, i, l).get());
-            } else {
-                j++;
-            }
-        }
-        toremove[i] = l0 - l;
-    }
-    // this will not run well in parallel on ondisk because of possible shrinks
-    size_t nremove = 0;
-    for (idx_t i = 0; i < nlist; i++) {
-        if (toremove[i] > 0) {
-            nremove += toremove[i];
-            invlists->resize(
-                i, invlists->list_size(i) - toremove[i]);
-        }
-    }
+    size_t nremove = direct_map.remove_ids (sel, invlists);  // removal is delegated to DirectMap, which is handed the inverted lists; the former guard on maintain_direct_map is gone
     ntotal -= nremove;  // shrink the vector count by the number of removed entries
     return nremove;
 }


+void IndexIVF::update_vectors (int n, const idx_t *new_ids, const float *x)
+{
+    // Replace the vectors stored under new_ids with the n vectors in x; the strategy depends on direct_map.type.
+    if (direct_map.type == DirectMap::Hashtable) {
+        // hashtable map: drop the old entries, then re-insert the new vectors under the same ids
+        IDSelectorArray sel(n, new_ids);
+        size_t nremove = remove_ids (sel);
+        FAISS_THROW_IF_NOT_MSG (nremove == n,  // every id in new_ids must have been found and removed
+                                "did not find all entries to remove");
+        add_with_ids (n, x, new_ids);
+        return;
+    }
+
+    FAISS_THROW_IF_NOT (direct_map.type == DirectMap::Array);  // any other map type (including NoMap) is rejected
+    // array map: trickier, because we must not introduce holes
+    // in the contiguous range of ids
+
+    FAISS_THROW_IF_NOT (is_trained);
+    std::vector<idx_t> assign (n);
+    quantizer->assign (n, x, assign.data());  // one assignment per new vector, obtained from the quantizer
+
+    std::vector<uint8_t> flat_codes (n * code_size);  // code_size bytes per vector
+    encode_vectors (n, x, assign.data(), flat_codes.data());
+
+    direct_map.update_codes (invlists, n, new_ids, assign.data(), flat_codes.data());  // DirectMap applies the new assignments and codes to the inverted lists
+
+}
+
+


 void IndexIVF::train (idx_t n, const float *x)
@ -845,15 +842,14 @@ void IndexIVF::check_compatible_for_merge (const IndexIVF &other) const
    FAISS_THROW_IF_NOT (other.code_size == code_size);
    FAISS_THROW_IF_NOT_MSG (typeid (*this) == typeid (other),
                  "can only merge indexes of the same type");
+    FAISS_THROW_IF_NOT_MSG (this->direct_map.no() && other.direct_map.no(),
+                            "merge direct_map not implemented");
 }


 void IndexIVF::merge_from (IndexIVF &other, idx_t add_id)
 {
    check_compatible_for_merge (other);
-    FAISS_THROW_IF_NOT_MSG ((!maintain_direct_map &&
-                             !other.maintain_direct_map),
-                  "direct map copy not implemented");

    invlists->merge_from (other.invlists, add_id);

@ -883,7 +879,7 @@ void IndexIVF::copy_subset_to (IndexIVF & other, int subset_type,

    FAISS_THROW_IF_NOT (nlist == other.nlist);
    FAISS_THROW_IF_NOT (code_size == other.code_size);
-    FAISS_THROW_IF_NOT (!other.maintain_direct_map);
+    FAISS_THROW_IF_NOT (other.direct_map.no());
    FAISS_THROW_IF_NOT_FMT (
          subset_type == 0 || subset_type == 1 || subset_type == 2,
          "subset type %d not implemented", subset_type);
@ -950,6 +946,7 @@ IndexIVF::dump() {
        auto codes = invlists->get_codes(i);
        int code_size = invlists->code_size;

+
        std::cout << "Bucket ID: " << i << ", with code size: " << code_size << ", vectors number: " << numVecs << std::endl;
        if(code_size == 8) {
            // int8 types
@ -965,6 +962,7 @@ IndexIVF::dump() {
    }
 }

+
 IndexIVF::~IndexIVF()
 {
    if (own_invlists) {
--- a/core/src/index/thirdparty/faiss/IndexIVF.h
+++ b/core/src/index/thirdparty/faiss/IndexIVF.h
@ -12,15 +12,16 @@


 #include <vector>
+#include <unordered_map>
 #include <stdint.h>

 #include <faiss/Index.h>
 #include <faiss/InvertedLists.h>
+#include <faiss/DirectMap.h>
 #include <faiss/Clustering.h>
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/ConcurrentBitset.h>

-
 namespace faiss {


@ -34,7 +35,6 @@ struct Level1Quantizer {
    Index * quantizer_backup = nullptr; ///< quantizer for backup
    size_t nlist;             ///< number of possible key values

-
    /**
     * = 0: use the quantizer as index in a kmeans training
     * = 1: just pass on the training set to the train() of the quantizer
@ -109,14 +109,18 @@ struct IndexIVF: Index, Level1Quantizer {
    /** Parallel mode determines how queries are parallelized with OpenMP
     *
     * 0 (default): parallelize over queries
-     * 1: parallelize over over inverted lists
+     * 1: parallelize over inverted lists
     * 2: parallelize over both
+     *
+     * PARALLEL_MODE_NO_HEAP_INIT: bitwise-or this flag with one of the
+     * modes above to prevent the heap from being initialized and finalized
     */
    int parallel_mode;
+    const int PARALLEL_MODE_NO_HEAP_INIT = 1024;

-    /// map for direct access to the elements. Enables reconstruct().
-    bool maintain_direct_map;
-    std::vector <idx_t> direct_map;
+    /** optional map that maps back ids to invlist entries. This
+     *  enables reconstruct() */
+    DirectMap direct_map;

    /** The Inverted file takes a quantizer (an Index) on input,
     * which implements the function mapping a vector to a list
@ -179,12 +183,13 @@ struct IndexIVF: Index, Level1Quantizer {
                                     const float *centroid_dis,
                                     float *distances, idx_t *labels,
                                     bool store_pairs,
-                                     const IVFSearchParameters *params = nullptr,
+                                     const IVFSearchParameters *params=nullptr,
                                     ConcurrentBitsetPtr bitset = nullptr
                                     ) const;

    /** assign the vectors, then call search_preassign */
-    void search (idx_t n, const float *x, idx_t k, float *distances, idx_t *labels,
+    void search (idx_t n, const float *x, idx_t k,
+                 float *distances, idx_t *labels,
                 ConcurrentBitsetPtr bitset = nullptr) const override;

    /** get raw vectors by ids */
@ -206,8 +211,19 @@ struct IndexIVF: Index, Level1Quantizer {
    virtual InvertedListScanner *get_InvertedListScanner (
        bool store_pairs=false) const;

+    /** reconstruct a vector. Works only if direct_map is enabled, see make_direct_map() and set_direct_map_type() */
    void reconstruct (idx_t key, float* recons) const override;

+    /** Update a subset of vectors.
+     *
+     * The index must have a direct_map
+     *
+     * @param nv     nb of vectors to update
+     * @param idx    vector indices to update, size nv
+     * @param v      vectors of new values, size nv*d
+     */
+    virtual void update_vectors (int nv, const idx_t *idx, const float *v);
+
    /** Reconstruct a subset of the indexed vectors.
     *
     * Overrides default implementation to bypass reconstruct() which requires
@ -286,6 +302,9 @@ struct IndexIVF: Index, Level1Quantizer {
     */
    void make_direct_map (bool new_maintain_direct_map=true);

+    void set_direct_map_type (DirectMap::Type type);
+
+
    /// replace the inverted lists, old one is deallocated if own_invlists
    void replace_invlists (InvertedLists *il, bool own=false);

--- a/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.cpp
@ -45,8 +45,7 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids,
 {
    FAISS_THROW_IF_NOT (is_trained);
    assert (invlists);
-    FAISS_THROW_IF_NOT_MSG (!(maintain_direct_map && xids),
-                            "cannot have direct map and add with ids");
+    direct_map.check_can_add (xids);
    const int64_t * idx;
    ScopeDeleter<int64_t> del;

@ -60,19 +59,21 @@ void IndexIVFFlat::add_core (idx_t n, const float * x, const int64_t *xids,
    }
    int64_t n_add = 0;
    for (size_t i = 0; i < n; i++) {
-        int64_t id = xids ? xids[i] : ntotal + i;
-        int64_t list_no = idx [i];
+        idx_t id = xids ? xids[i] : ntotal + i;
+        idx_t list_no = idx [i];
+        size_t offset;

-        if (list_no < 0)
-            continue;
-        const float *xi = x + i * d;
-        size_t offset = invlists->add_entry (
-              list_no, id, (const uint8_t*) xi);
-
-        if (maintain_direct_map)
-            direct_map.push_back (list_no << 32 | offset);
-        n_add++;
+        if (list_no >= 0) {
+            const float *xi = x + i * d;
+            offset = invlists->add_entry (
+                     list_no, id, (const uint8_t*) xi);
+            n_add++;
+        } else {
+            offset = 0;
+        }
+        direct_map.add_single_id (id, list_no, offset);
    }
+
    if (verbose) {
        printf("IndexIVFFlat::add_core: added %ld / %ld vectors\n",
               n_add, n);
@ -154,7 +155,7 @@ struct IVFFlatScanner: InvertedListScanner {
        const float *list_vecs = (const float*)codes;
        size_t nup = 0;
        for (size_t j = 0; j < list_size; j++) {
-            if(!bitset || !bitset->test(ids[j])){
+            if (!bitset || !bitset->test(ids[j])) {
                const float * yj = list_vecs + d * j;
                float dis = metric == METRIC_INNER_PRODUCT ?
                            fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
@ -181,7 +182,7 @@ struct IVFFlatScanner: InvertedListScanner {
            float dis = metric == METRIC_INNER_PRODUCT ?
                fvec_inner_product (xi, yj, d) : fvec_L2sqr (xi, yj, d);
            if (C::cmp (radius, dis)) {
-                int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
+                int64_t id = store_pairs ? lo_build (list_no, j) : ids[j];
                res.add (dis, id);
            }
        }
@ -212,41 +213,6 @@ InvertedListScanner* IndexIVFFlat::get_InvertedListScanner



-void IndexIVFFlat::update_vectors (int n, idx_t *new_ids, const float *x)
-{
-
-    FAISS_THROW_IF_NOT (maintain_direct_map);
-    FAISS_THROW_IF_NOT (is_trained);
-    std::vector<idx_t> assign (n);
-    quantizer->assign (n, x, assign.data());
-
-    for (size_t i = 0; i < n; i++) {
-        idx_t id = new_ids[i];
-        FAISS_THROW_IF_NOT_MSG (0 <= id && id < ntotal,
-                                "id to update out of range");
-        { // remove old one
-            int64_t dm = direct_map[id];
-            int64_t ofs = dm & 0xffffffff;
-            int64_t il = dm >> 32;
-            size_t l = invlists->list_size (il);
-            if (ofs != l - 1) { // move l - 1 to ofs
-                int64_t id2 = invlists->get_single_id (il, l - 1);
-                direct_map[id2] = (il << 32) | ofs;
-                invlists->update_entry (il, ofs, id2,
-                                        invlists->get_single_code (il, l - 1));
-            }
-            invlists->resize (il, l - 1);
-        }
-        { // insert new one
-            int64_t il = assign[i];
-            size_t l = invlists->list_size (il);
-            int64_t dm = (il << 32) | l;
-            direct_map[id] = dm;
-            invlists->add_entry (il, id, (const uint8_t*)(x + i * d));
-        }
-    }
-
-}

 void IndexIVFFlat::reconstruct_from_offset (int64_t list_no, int64_t offset,
                                            float* recons) const
@ -298,8 +264,7 @@ void IndexIVFFlatDedup::add_with_ids(

    FAISS_THROW_IF_NOT (is_trained);
    assert (invlists);
-    FAISS_THROW_IF_NOT_MSG (
-           !maintain_direct_map,
+    FAISS_THROW_IF_NOT_MSG (direct_map.no(),
           "IVFFlatDedup not implemented with direct_map");
    int64_t * idx = new int64_t [na];
    ScopeDeleter<int64_t> del (idx);
@ -435,7 +400,7 @@ size_t IndexIVFFlatDedup::remove_ids(const IDSelector& sel)

    // mostly copied from IndexIVF.cpp

-    FAISS_THROW_IF_NOT_MSG (!maintain_direct_map,
+    FAISS_THROW_IF_NOT_MSG (direct_map.no(),
                    "direct map remove not implemented");

    std::vector<int64_t> toremove(nlist);
@ -489,7 +454,7 @@ void IndexIVFFlatDedup::range_search(
    FAISS_THROW_MSG ("not implemented");
 }

-void IndexIVFFlatDedup::update_vectors (int , idx_t *, const float *)
+void IndexIVFFlatDedup::update_vectors (int , const idx_t *, const float *)  // parameters are unnamed because none of them is used
 {
     FAISS_THROW_MSG ("not implemented");  // unconditionally throws
 }
--- a/core/src/index/thirdparty/faiss/IndexIVFFlat.h
+++ b/core/src/index/thirdparty/faiss/IndexIVFFlat.h
@ -44,15 +44,6 @@ struct IndexIVFFlat: IndexIVF {
    InvertedListScanner *get_InvertedListScanner (bool store_pairs)
        const override;

-    /** Update a subset of vectors.
-     *
-     * The index must have a direct_map
-     *
-     * @param nv     nb of vectors to update
-     * @param idx    vector indices to update, size nv
-     * @param v      vectors of new values, size nv*d
-     */
-    virtual void update_vectors (int nv, idx_t *idx, const float *v);

    void reconstruct_from_offset (int64_t list_no, int64_t offset,
                                  float* recons) const override;
@ -101,8 +92,7 @@ struct IndexIVFFlatDedup: IndexIVFFlat {
        ConcurrentBitsetPtr bitset = nullptr) const override;

    /// not implemented
-    void update_vectors (int nv, idx_t *idx, const float *v) override;
-
+    void update_vectors (int nv, const idx_t *idx, const float *v) override;

    /// not implemented
    void reconstruct_from_offset (int64_t list_no, int64_t offset,
--- a/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.cpp
@ -37,8 +37,8 @@ namespace faiss {
 ******************************************/

 IndexIVFPQ::IndexIVFPQ (Index * quantizer, size_t d, size_t nlist,
-                        size_t M, size_t nbits_per_idx):
-    IndexIVF (quantizer, d, nlist, 0, METRIC_L2),
+                        size_t M, size_t nbits_per_idx, MetricType metric):
+    IndexIVF (quantizer, d, nlist, 0, metric),
    pq (d, M, nbits_per_idx)
 {
    FAISS_THROW_IF_NOT (nbits_per_idx <= 8);
@ -279,6 +279,8 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids,

    InterruptCallback::check();

+    direct_map.check_can_add (xids);
+
    FAISS_THROW_IF_NOT (is_trained);
    double t0 = getmillisecs ();
    const idx_t * idx;
@ -313,13 +315,14 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids,
    size_t n_ignore = 0;
    for (size_t i = 0; i < n; i++) {
        idx_t key = idx[i];
+        idx_t id = xids ? xids[i] : ntotal + i;
        if (key < 0) {
+            direct_map.add_single_id (id, -1, 0);
            n_ignore ++;
            if (residuals_2)
                memset (residuals_2, 0, sizeof(*residuals_2) * d);
            continue;
        }
-        idx_t id = xids ? xids[i] : ntotal + i;

        uint8_t *code = xcodes + i * code_size;
        size_t offset = invlists->add_entry (key, id, code);
@ -332,11 +335,9 @@ void IndexIVFPQ::add_core_o (idx_t n, const float * x, const idx_t *xids,
                res2[j] = xi[j] - res2[j];
        }

-        if (maintain_direct_map)
-            direct_map.push_back (key << 32 | offset);
+        direct_map.add_single_id (id, key, offset);
    }

-
    double t3 = getmillisecs ();
    if(verbose) {
        char comment[100] = {0};
@ -800,9 +801,9 @@ struct KnnSearchResults {

    size_t nup;

-    inline void add (idx_t j, float dis, faiss::ConcurrentBitsetPtr bitset = nullptr) {
+    inline void add (idx_t j, float dis, ConcurrentBitsetPtr bitset = nullptr) {
        if (C::cmp (heap_sim[0], dis)) {
-            idx_t id = ids ? ids[j] : (key << 32 | j);
+            idx_t id = ids ? ids[j] : lo_build (key, j);
            if (bitset != nullptr && bitset->test((faiss::ConcurrentBitset::id_type_t)id))
                return;
            heap_swap_top<C> (k, heap_sim, heap_ids, dis, id);
@ -823,7 +824,7 @@ struct RangeSearchResults {

    inline void add (idx_t j, float dis, faiss::ConcurrentBitsetPtr bitset = nullptr) {  // bitset is accepted but never read in this body
        if (C::cmp (radius, dis)) {  // keep only results that pass the comparison against radius
-            idx_t id = ids ? ids[j] : (key << 32 | j);
+            idx_t id = ids ? ids[j] : lo_build (key, j);  // without an ids array, report the (key, j) pair built by lo_build
            rres.add (dis, id);
        }
    }
@ -836,7 +837,7 @@ struct RangeSearchResults {
 * The scanning functions call their favorite precompute_*
 * function to precompute the tables they need.
 *****************************************************/
-template <typename IDType, MetricType METRIC_TYPE>
+template <typename IDType, MetricType METRIC_TYPE, class PQDecoder>
 struct IVFPQScannerT: QueryTables {

    const uint8_t * list_codes;
@ -846,7 +847,6 @@ struct IVFPQScannerT: QueryTables {
    IVFPQScannerT (const IndexIVFPQ & ivfpq, const IVFSearchParameters *params):
        QueryTables (ivfpq, params)
    {
-        FAISS_THROW_IF_NOT (pq.nbits == 8);
        assert(METRIC_TYPE == metric_type);
    }

@ -872,15 +872,16 @@ struct IVFPQScannerT: QueryTables {
    template<class SearchResultType>
    void scan_list_with_table (size_t ncode, const uint8_t *codes,
                               SearchResultType & res,
-                               faiss::ConcurrentBitsetPtr bitset = nullptr) const
+                               ConcurrentBitsetPtr bitset = nullptr) const
    {
        for (size_t j = 0; j < ncode; j++) {
-
+            PQDecoder decoder(codes, pq.nbits);
+            codes += pq.code_size;
            float dis = dis0;
            const float *tab = sim_table;

            for (size_t m = 0; m < pq.M; m++) {
-                dis += tab[*codes++];
+                dis += tab[decoder.decode()];
                tab += pq.ksub;
            }

@ -897,12 +898,14 @@ struct IVFPQScannerT: QueryTables {
                                 faiss::ConcurrentBitsetPtr bitset = nullptr) const
    {
        for (size_t j = 0; j < ncode; j++) {
+            PQDecoder decoder(codes, pq.nbits);
+            codes += pq.code_size;

            float dis = dis0;
            const float *tab = sim_table_2;

            for (size_t m = 0; m < pq.M; m++) {
-                int ci = *codes++;
+                int ci = decoder.decode();
                dis += sim_table_ptrs [m][ci] - 2 * tab [ci];
                tab += pq.ksub;
            }
@ -914,8 +917,8 @@ struct IVFPQScannerT: QueryTables {
    /// nothing is precomputed: access residuals on-the-fly
    template<class SearchResultType>
    void scan_on_the_fly_dist (size_t ncode, const uint8_t *codes,
-                                 SearchResultType &res,
-                                 faiss::ConcurrentBitsetPtr bitset = nullptr) const
+                               SearchResultType &res,
+                               faiss::ConcurrentBitsetPtr bitset = nullptr) const
    {
        const float *dvec;
        float dis0 = 0;
@ -969,12 +972,13 @@ struct IVFPQScannerT: QueryTables {
            int hd = hc.hamming (b_code);
            if (hd < ht) {
                n_hamming_pass ++;
+                PQDecoder decoder(codes, pq.nbits);

                float dis = dis0;
                const float *tab = sim_table;

                for (size_t m = 0; m < pq.M; m++) {
-                    dis += tab[*b_code++];
+                    dis += tab[decoder.decode()];
                    tab += pq.ksub;
                }

@ -999,7 +1003,7 @@ struct IVFPQScannerT: QueryTables {
        case cs:                                                        \
            scan_list_polysemous_hc \
            <HammingComputer ## cs, SearchResultType>   \
-                (ncode, codes, res, bitset);             \
+                (ncode, codes, res, bitset);            \
            break
        HANDLE_CODE_SIZE(4);
        HANDLE_CODE_SIZE(8);
@ -1030,16 +1034,18 @@ struct IVFPQScannerT: QueryTables {
 * much we precompute (2 = precompute distance tables, 1 = precompute
 * pointers to distances, 0 = compute distances one by one).
 * Currently only 2 is supported */
-template<MetricType METRIC_TYPE, class C, int precompute_mode>
+template<MetricType METRIC_TYPE, class C, class PQDecoder>
 struct IVFPQScanner:
-    IVFPQScannerT<Index::idx_t, METRIC_TYPE>,
+    IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>,
    InvertedListScanner
 {
    bool store_pairs;
+    int precompute_mode;

-    IVFPQScanner(const IndexIVFPQ & ivfpq, bool store_pairs):
-        IVFPQScannerT<Index::idx_t, METRIC_TYPE>(ivfpq, nullptr),
-        store_pairs(store_pairs)
+    IVFPQScanner(const IndexIVFPQ & ivfpq, bool store_pairs,
+                 int precompute_mode):  // precompute_mode is now a constructor argument instead of a template parameter
+        IVFPQScannerT<Index::idx_t, METRIC_TYPE, PQDecoder>(ivfpq, nullptr),  // base is built with null search parameters
+        store_pairs(store_pairs), precompute_mode(precompute_mode)
    {
    }

@ -1055,9 +1061,10 @@ struct IVFPQScanner:
        assert(precompute_mode == 2);
        float dis = this->dis0;
        const float *tab = this->sim_table;
+        PQDecoder decoder(code, this->pq.nbits);

        for (size_t m = 0; m < this->pq.M; m++) {
-            dis += tab[*code++];
+            dis += tab[decoder.decode()];
            tab += this->pq.ksub;
        }
        return dis;
@ -1124,7 +1131,22 @@ struct IVFPQScanner:
    }
 };

+template<class PQDecoder>  // PQDecoder: decoder type forwarded to IVFPQScanner
+InvertedListScanner *get_InvertedListScanner1 (const IndexIVFPQ &index,
+                                               bool store_pairs)
+{  // picks the IVFPQScanner instantiation that matches index.metric_type

+   if (index.metric_type == METRIC_INNER_PRODUCT) {
+        return new IVFPQScanner
+            <METRIC_INNER_PRODUCT, CMin<float, idx_t>, PQDecoder>
+            (index, store_pairs, 2);  // 2 is passed as the precompute_mode argument
+    } else if (index.metric_type == METRIC_L2) {
+        return new IVFPQScanner
+            <METRIC_L2, CMax<float, idx_t>, PQDecoder>
+            (index, store_pairs, 2);  // 2 is passed as the precompute_mode argument
+    }
+    return nullptr;  // any other metric type: no scanner is created
+}


 } // anonymous namespace
@ -1132,12 +1154,13 @@ struct IVFPQScanner:
 InvertedListScanner *
 IndexIVFPQ::get_InvertedListScanner (bool store_pairs) const
 {
-    if (metric_type == METRIC_INNER_PRODUCT) {
-        return new IVFPQScanner<METRIC_INNER_PRODUCT, CMin<float, idx_t>, 2>
-            (*this, store_pairs);
-    } else if (metric_type == METRIC_L2) {
-        return new IVFPQScanner<METRIC_L2, CMax<float, idx_t>, 2>
-            (*this, store_pairs);
+
+    if (pq.nbits == 8) {
+        return get_InvertedListScanner1<PQDecoder8> (*this, store_pairs);
+    } else if (pq.nbits == 16) {
+        return get_InvertedListScanner1<PQDecoder16> (*this, store_pairs);
+    } else {
+        return get_InvertedListScanner1<PQDecoderGeneric> (*this, store_pairs);
    }
    return nullptr;

--- a/core/src/index/thirdparty/faiss/IndexIVFPQ.h
+++ b/core/src/index/thirdparty/faiss/IndexIVFPQ.h
@ -42,14 +42,14 @@ struct IndexIVFPQ: IndexIVF {
    int polysemous_ht;             ///< Hamming thresh for polysemous filtering

    /** Precompute table that speed up query preprocessing at some
-     * memory cost
+     * memory cost (used only for by_residual with L2 metric)
     * =-1: force disable
     * =0: decide heuristically (default: use tables only if they are
     *     < precomputed_tables_max_bytes)
     * =1: tables that work for all quantizers (size 256 * nlist * M)
     * =2: specific version for MultiIndexQuantizer (much more compact)
     */
-    int use_precomputed_table;     ///< if by_residual, build precompute tables
+    int use_precomputed_table;
    static size_t precomputed_table_max_bytes;

    /// if use_precompute_table
@ -58,7 +58,7 @@ struct IndexIVFPQ: IndexIVF {

    IndexIVFPQ (
            Index * quantizer, size_t d, size_t nlist,
-            size_t M, size_t nbits_per_idx);
+            size_t M, size_t nbits_per_idx, MetricType metric = METRIC_L2);

    void add_with_ids(idx_t n, const float* x, const idx_t* xids = nullptr)
        override;
@ -93,9 +93,9 @@ struct IndexIVFPQ: IndexIVF {
     * the duplicates are returned in pre-allocated arrays (see the
     * max sizes).
     *
-     * @params lims   limits between groups of duplicates
+     * @param lims   limits between groups of duplicates
     *                (max size ntotal / 2 + 1)
-     * @params ids    ids[lims[i]] : ids[lims[i+1]-1] is a group of
+     * @param ids    ids[lims[i]] : ids[lims[i+1]-1] is a group of
     *                duplicates (max size ntotal)
     * @return n      number of groups found
     */
@ -135,15 +135,14 @@ struct IndexIVFPQ: IndexIVF {
 /// statistics are robust to internal threading, but not if
 /// IndexIVFPQ::search_preassigned is called by multiple threads
 struct IndexIVFPQStats {
-    size_t nrefine;  // nb of refines (IVFPQR)
+    size_t nrefine;  ///< nb of refines (IVFPQR)

    size_t n_hamming_pass;
-    // nb of passed Hamming distance tests (for polysemous)
+    ///< nb of passed Hamming distance tests (for polysemous)

-    // timings measured with the CPU RTC
-    // on all threads
+    // timings measured with the CPU RTC on all threads
    size_t search_cycles;
-    size_t refine_cycles; // only for IVFPQR
+    size_t refine_cycles; ///< only for IVFPQR

    IndexIVFPQStats () {reset (); }
    void reset ();
--- a/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFPQR.cpp
@ -145,8 +145,8 @@ void IndexIVFPQR::search_preassigned (idx_t n, const float *x, idx_t k,

                if (sl == -1) continue;

-                int list_no = sl >> 32;
-                int ofs = sl & 0xffffffff;
+                int list_no = lo_listno(sl);
+                int ofs = lo_offset(sl);

                assert (list_no >= 0 && list_no < nlist);
                assert (ofs >= 0 && ofs < invlists->list_size (list_no));
--- a/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp
+++ b/core/src/index/thirdparty/faiss/IndexIVFSpectralHash.cpp
@ -266,7 +266,7 @@ struct IVFScanner: InvertedListScanner {
    {
        size_t nup = 0;
        for (size_t j = 0; j < list_size; j++) {
-            if(!bitset || !bitset->test(ids[j])){
+            if (!bitset || !bitset->test(ids[j])) {
                float dis = hc.hamming (codes);

                if (dis < simi [0]) {
@ -290,7 +290,7 @@ struct IVFScanner: InvertedListScanner {
        for (size_t j = 0; j < list_size; j++) {
            float dis = hc.hamming (codes);
            if (dis < radius) {
-                int64_t id = store_pairs ? (list_no << 32 | j) : ids[j];
+                int64_t id = store_pairs ? lo_build (list_no, j) : ids[j];
                res.add (dis, id);
            }
            codes += code_size;
--- a/core/src/index/thirdparty/faiss/IndexLSH.h
+++ b/core/src/index/thirdparty/faiss/IndexLSH.h
@ -70,7 +70,10 @@ struct IndexLSH:Index {

    IndexLSH ();

-    /* standalone codec interface */
+    /* standalone codec interface.
+     *
+     * The vectors are decoded to +/- 1 (not 0, 1) */
+
    size_t sa_code_size () const override;

    void sa_encode (idx_t n, const float *x,
--- a/core/src/index/thirdparty/faiss/IndexLattice.cpp
+++ b/core/src/index/thirdparty/faiss/IndexLattice.cpp
@ -128,7 +128,7 @@ void IndexLattice::add(idx_t , const float* )


 void  IndexLattice::search(idx_t , const float* , idx_t ,
-                           float* , idx_t* , ConcurrentBitsetPtr bitset) const
+                           float* , idx_t* , ConcurrentBitsetPtr ) const
 {
    FAISS_THROW_MSG("not implemented");
 }
--- a/core/src/index/thirdparty/faiss/IndexLattice.h
+++ b/core/src/index/thirdparty/faiss/IndexLattice.h
@ -58,7 +58,8 @@ struct IndexLattice: Index {
    /// not implemented
    void add(idx_t n, const float* x) override;
    void search(idx_t n, const float* x, idx_t k,
-                float* distances, idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+                float* distances, idx_t* labels,
+                ConcurrentBitsetPtr bitset = nullptr) const override;
    void reset() override;

 };
--- a/core/src/index/thirdparty/faiss/IndexPQ.cpp
+++ b/core/src/index/thirdparty/faiss/IndexPQ.cpp
@ -204,8 +204,8 @@ DistanceComputer * IndexPQ::get_distance_computer() const {


 void IndexPQ::search (idx_t n, const float *x, idx_t k,
-                           float *distances, idx_t *labels,
-                           ConcurrentBitsetPtr bitset) const
+                      float *distances, idx_t *labels,
+                      ConcurrentBitsetPtr bitset) const
 {
    FAISS_THROW_IF_NOT (is_trained);
    if (search_type == ST_PQ) {  // Simple PQ search
--- a/core/src/index/thirdparty/faiss/IndexPQ.h
+++ b/core/src/index/thirdparty/faiss/IndexPQ.h
@ -156,7 +156,8 @@ struct MultiIndexQuantizer: Index  {

    void search(
        idx_t n, const float* x, idx_t k,
-        float* distances, idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+        float* distances, idx_t* labels,
+        ConcurrentBitsetPtr bitset = nullptr) const override;

    /// add and reset will crash at runtime
    void add(idx_t n, const float* x) override;
--- a/core/src/index/thirdparty/faiss/IndexPreTransform.cpp
+++ b/core/src/index/thirdparty/faiss/IndexPreTransform.cpp
@ -14,7 +14,6 @@
 #include <cstring>
 #include <memory>

-#include <faiss/utils/utils.h>
 #include <faiss/impl/FaissAssert.h>

 namespace faiss {
@ -181,7 +180,8 @@ void IndexPreTransform::add_with_ids (idx_t n, const float * x,


 void IndexPreTransform::search (idx_t n, const float *x, idx_t k,
-                               float *distances, idx_t *labels, ConcurrentBitsetPtr bitset) const
+                               float *distances, idx_t *labels,
+                               ConcurrentBitsetPtr bitset) const
 {
    FAISS_THROW_IF_NOT (is_trained);
    const float *xt = apply_chain (n, x);
--- a/core/src/index/thirdparty/faiss/IndexReplicas.h
+++ b/core/src/index/thirdparty/faiss/IndexReplicas.h
@ -60,7 +60,8 @@ class IndexReplicasTemplate : public ThreadedIndex<IndexT> {
              const component_t* x,
              idx_t k,
              distance_t* distances,
-              idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+              idx_t* labels,
+              ConcurrentBitsetPtr bitset = nullptr) const override;

  /// reconstructs from the first index
  void reconstruct(idx_t, component_t *v) const override;
--- a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp
+++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.cpp
@ -254,6 +254,8 @@ void IndexIVFScalarQuantizer::add_with_ids
    size_t nadd = 0;
    std::unique_ptr<Quantizer> squant(sq.select_quantizer ());

+    DirectMapAdd dm_add (direct_map, n, xids);
+
 #pragma omp parallel reduction(+: nadd)
    {
        std::vector<float> residual (d);
@ -276,13 +278,18 @@ void IndexIVFScalarQuantizer::add_with_ids
                memset (one_code.data(), 0, code_size);
                squant->encode_vector (xi, one_code.data());

-                invlists->add_entry (list_no, id, one_code.data());
+                size_t ofs = invlists->add_entry (list_no, id, one_code.data());

+                dm_add.add (i, list_no, ofs);
                nadd++;

+            } else if (rank == 0 && list_no == -1) {
+                dm_add.add (i, -1, 0);
            }
        }
    }
+
+
    ntotal += n;
 }

--- a/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h
+++ b/core/src/index/thirdparty/faiss/IndexScalarQuantizer.h
@ -17,7 +17,6 @@
 #include <faiss/impl/ScalarQuantizer.h>
 #include <faiss/impl/ScalarQuantizerOp.h>

-
 namespace faiss {

 /**
--- a/core/src/index/thirdparty/faiss/IndexShards.cpp
+++ b/core/src/index/thirdparty/faiss/IndexShards.cpp
@ -264,7 +264,8 @@ IndexShardsTemplate<IndexT>::search(idx_t n,
                                    const component_t *x,
                                    idx_t k,
                                    distance_t *distances,
-                                    idx_t *labels, ConcurrentBitsetPtr bitset) const {
+                                    idx_t *labels,
+                                    ConcurrentBitsetPtr bitset) const {
  long nshard = this->count();

  std::vector<distance_t> all_distances(nshard * k * n);
--- a/core/src/index/thirdparty/faiss/IndexShards.h
+++ b/core/src/index/thirdparty/faiss/IndexShards.h
@ -75,7 +75,8 @@ struct IndexShardsTemplate : public ThreadedIndex<IndexT> {
  void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) override;

  void search(idx_t n, const component_t* x, idx_t k,
-              distance_t* distances, idx_t* labels, ConcurrentBitsetPtr bitset = nullptr) const override;
+              distance_t* distances, idx_t* labels,
+              ConcurrentBitsetPtr bitset = nullptr) const override;

  void train(idx_t n, const component_t* x) override;

--- a/core/src/index/thirdparty/faiss/InvertedLists.cpp
+++ b/core/src/index/thirdparty/faiss/InvertedLists.cpp
@ -64,8 +64,6 @@ PageLockMemory::PageLockMemory(PageLockMemory &&other) {

 namespace faiss {

-using ScopedIds = InvertedLists::ScopedIds;
-using ScopedCodes = InvertedLists::ScopedCodes;


 /*****************************************
--- a/core/src/index/thirdparty/faiss/InvertedLists.h
+++ b/core/src/index/thirdparty/faiss/InvertedLists.h
@ -19,7 +19,6 @@
 #include <vector>
 #include <faiss/Index.h>

-
 #ifndef USE_CPU
 namespace faiss {

@ -276,6 +275,7 @@ struct ReadOnlyArrayInvertedLists: InvertedLists {

    bool is_valid();
 };
+
 /*****************************************************************
 * Meta-inverted lists
 *
--- a/core/src/index/thirdparty/faiss/Makefile
+++ b/core/src/index/thirdparty/faiss/Makefile
@ -12,7 +12,7 @@ AVX512_SRC  = $(wildcard *avx512.cpp impl/*avx512.cpp utils/*avx512.cpp)
 OBJ         = $(SRC:.cpp=.o)
 INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss

-GPU_HEADERS = $(wildcard gpu/*.h gpu/impl/*.h gpu/utils/*.h)
+GPU_HEADERS = $(wildcard gpu/*.h gpu/impl/*.h gpu/impl/*.cuh gpu/utils/*.h gpu/utils/*.cuh)
 GPU_CPPSRC  = $(wildcard gpu/*.cpp gpu/impl/*.cpp gpu/utils/*.cpp)
 GPU_CUSRC   = $(wildcard gpu/*.cu gpu/impl/*.cu gpu/utils/*.cu \
 gpu/utils/nvidia/*.cu gpu/utils/blockselect/*.cu gpu/utils/warpselect/*.cu)
--- a/core/src/index/thirdparty/faiss/MetaIndexes.cpp
+++ b/core/src/index/thirdparty/faiss/MetaIndexes.cpp
@ -22,7 +22,6 @@ namespace faiss {

 namespace {

-typedef Index::idx_t idx_t;

 } // namespace

@ -83,9 +82,10 @@ void IndexIDMapTemplate<IndexT>::add_with_ids
 template <typename IndexT>
 void IndexIDMapTemplate<IndexT>::search
    (idx_t n, const typename IndexT::component_t *x, idx_t k,
-     typename IndexT::distance_t *distances, typename IndexT::idx_t *labels, ConcurrentBitsetPtr bitset) const
+     typename IndexT::distance_t *distances, typename IndexT::idx_t *labels,
+     ConcurrentBitsetPtr bitset) const
 {
-    index->search(n, x, k, distances, labels, bitset);
+    index->search (n, x, k, distances, labels, bitset);
    idx_t *li = labels;
 #pragma omp parallel for
    for (idx_t i = 0; i < n * k; i++) {
@ -121,7 +121,8 @@ void IndexIDMapTemplate<IndexT>::search_by_id (idx_t n, const idx_t *xid, idx_t
 template <typename IndexT>
 void IndexIDMapTemplate<IndexT>::range_search
    (typename IndexT::idx_t n, const typename IndexT::component_t *x,
-     typename IndexT::distance_t radius, RangeSearchResult *result, ConcurrentBitsetPtr bitset) const
+     typename IndexT::distance_t radius, RangeSearchResult *result,
+     ConcurrentBitsetPtr bitset) const
 {
  index->range_search(n, x, radius, result, bitset);
 #pragma omp parallel for
--- a/core/src/index/thirdparty/faiss/MetaIndexes.h
+++ b/core/src/index/thirdparty/faiss/MetaIndexes.h
@ -37,8 +37,10 @@ struct IndexIDMapTemplate : IndexT {
    /// this will fail. Use add_with_ids
    void add(idx_t n, const component_t* x) override;

-    void search (idx_t n, const component_t *x, idx_t k, distance_t *distances, idx_t *labels,
-                 ConcurrentBitsetPtr bitset = nullptr) const override;
+    void search(
+        idx_t n, const component_t* x, idx_t k,
+        distance_t* distances, idx_t* labels,
+        ConcurrentBitsetPtr bitset = nullptr) const override;

    void get_vector_by_id(idx_t n, const idx_t *xid, component_t *x, ConcurrentBitsetPtr bitset = nullptr) override;

--- a/core/src/index/thirdparty/faiss/MetricType.h
+++ b/core/src/index/thirdparty/faiss/MetricType.h
@ -0,0 +1,41 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// -*- c++ -*-
+
+#ifndef FAISS_METRIC_TYPE_H
+#define FAISS_METRIC_TYPE_H
+
+namespace faiss {
+
+/// The metric space for vector comparison for Faiss indices and algorithms.
+///
+/// Most algorithms support both inner product and L2, with the flat
+/// (brute-force) indices supporting additional metric types for vector
+/// comparison.
+enum MetricType {
+    METRIC_INNER_PRODUCT = 0,  ///< maximum inner product search
+    METRIC_L2 = 1,             ///< squared L2 search
+    METRIC_L1,                 ///< L1 (aka cityblock)
+    METRIC_Linf,               ///< infinity distance
+    METRIC_Lp,                 ///< L_p distance, p is given by a faiss::Index
+                               /// metric_arg
+    METRIC_Jaccard,
+    METRIC_Tanimoto,
+    METRIC_Hamming,
+    METRIC_Substructure,       ///< Tversky case alpha = 0, beta = 1
+    METRIC_Superstructure,     ///< Tversky case alpha = 1, beta = 0
+
+    /// some additional metrics defined in scipy.spatial.distance
+    METRIC_Canberra = 20,
+    METRIC_BrayCurtis,
+    METRIC_JensenShannon,
+};
+
+}
+
+#endif
--- a/core/src/index/thirdparty/faiss/README.md
+++ b/core/src/index/thirdparty/faiss/README.md
@ -4,6 +4,10 @@ Faiss is a library for efficient similarity search and clustering of dense vecto

 ## NEWS

+*NEW: version 1.6.1 (2019-11-29) bugfix.*
+
+*NEW: version 1.6.0 (2019-10-15) code structure reorg, support for codec interface.*
+
 *NEW: version 1.5.3 (2019-06-24) fix performance regression in IndexIVF.*

 *NEW: version 1.5.2 (2019-05-27) the license was relaxed to MIT from BSD+Patents. Read LICENSE for details.*
@ -24,7 +28,7 @@ Faiss is a library for efficient similarity search and clustering of dense vecto

 ## Introduction

-Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors.
+Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 (Euclidean) distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors.

 Most of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require to keep the original vectors. This generally comes at the cost of a less precise search but these methods can scale to billions of vectors in main memory on a single server. 

--- a/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4
+++ b/core/src/index/thirdparty/faiss/acinclude/ax_check_cpu.m4
@ -8,7 +8,7 @@ AC_MSG_CHECKING([for cpu arch])

  case $target in
    amd64-* | x86_64-*)
-      ARCH_CPUFLAGS="-mavx2 -mf16c -msse4 -mpopcnt"
+      ARCH_CPUFLAGS="-mpopcnt -msse4"
      ARCH_CXXFLAGS="-m64"
      ;;
    aarch64*-*)
--- a/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py
+++ b/core/src/index/thirdparty/faiss/benchs/bench_all_ivf/datasets.py
@ -64,6 +64,7 @@ def fvecs_write(fname, m):
    ivecs_write(fname, m.view('int32'))


+
 #################################################################
 # Dataset
 #################################################################
--- a/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py
+++ b/core/src/index/thirdparty/faiss/benchs/bench_gpu_1bn.py
@ -170,7 +170,8 @@ def dataset_iterator(x, preproc, bs):
    block_ranges = [(i0, min(nb, i0 + bs))
                    for i0 in range(0, nb, bs)]

-    def prepare_block((i0, i1)):
+    def prepare_block(i01):
+        i0, i1 = i01
        xb = sanitize(x[i0:i1])
        return i0, preproc.apply_py(xb)

@ -575,7 +576,8 @@ def compute_populated_index_2(preproc):
    coarse_quantizer_gpu = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall.quantizer)

-    def quantize((i0, xs)):
+    def quantize(args):
+        (i0, xs) = args
        _, assign = coarse_quantizer_gpu.search(xs, 1)
        return i0, xs, assign.ravel()

--- a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py
+++ b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_1bn.py
@ -160,7 +160,7 @@ def matrix_slice_iterator(x, bs):
                    for i0 in range(0, nb, bs)]

    return rate_limited_imap(
-        lambda (i0, i1): x[i0:i1].astype('float32').copy(),
+        lambda i01: x[i01[0]:i01[1]].astype('float32').copy(),
        block_ranges)


@ -203,6 +203,7 @@ xq = xq.astype('float32').copy()

 # a static C++ object that collects statistics about searches
 ivfpq_stats = faiss.cvar.indexIVFPQ_stats
+ivf_stats = faiss.cvar.indexIVF_stats


 if parametersets == ['autotune'] or parametersets == ['autotuneMT']:
@ -243,10 +244,11 @@ else:
        ps.set_index_parameters(index, param)
        t0 = time.time()
        ivfpq_stats.reset()
+        ivf_stats.reset()
        D, I = index.search(xq, 100)
        t1 = time.time()
        for rank in 1, 10, 100:
            n_ok = (I[:, :rank] == gt[:, :1]).sum()
            print("%.4f" % (n_ok / float(nq)), end=' ')
        print("%8.3f  " % ((t1 - t0) * 1000.0 / nq), end=' ')
-        print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivfpq_stats.ncode))
+        print("%5.2f" % (ivfpq_stats.n_hamming_pass * 100.0 / ivf_stats.ndis))
--- a/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py
+++ b/core/src/index/thirdparty/faiss/benchs/bench_polysemous_sift1m.py
@ -36,7 +36,8 @@ faiss.omp_set_num_threads(1)

 print("PQ baseline", end=' ')
 index.search_type = faiss.IndexPQ.ST_PQ
-evaluate()
+t, r = evaluate(index, xq, gt, 1)
+print("\t %7.3f ms per query, R@1 %.4f" % (t, r[1]))

 for ht in 64, 62, 58, 54, 50, 46, 42, 38, 34, 30:
    print("Polysemous", ht, end=' ')
--- a/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py
+++ b/core/src/index/thirdparty/faiss/benchs/bench_vector_ops.py
@ -38,7 +38,7 @@ for d in 3, 4, 12, 36, 64:
    distances = np.empty((xd, yd), dtype='float32')

    t0 = time.time()
-    for i in xrange(xd):
+    for i in range(xd):
        faiss.fvec_inner_products_ny(swig_ptr(distances[i]),
                                     swig_ptr(x[i]),
                                     swig_ptr(y),
@ -66,7 +66,7 @@ for d in 3, 4, 12, 36, 64:
    distances = np.empty((xd, yd), dtype='float32')

    t0 = time.time()
-    for i in xrange(xd):
+    for i in range(xd):
        faiss.fvec_L2sqr_ny(swig_ptr(distances[i]),
                            swig_ptr(x[i]),
                            swig_ptr(y),
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/README.md
@ -1,3 +1,4 @@
+
 # Distributed on-disk index for 1T-scale datasets 

 This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). 
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/combined_index.py
@ -3,6 +3,8 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

+#!/usr/bin/env python3
+
 import os
 import faiss
 import numpy as np
@ -29,7 +31,7 @@ class CombinedIndex:
                indexes.append(index)
                il = faiss.extract_index_ivf(index).invlists
            else:
-                assert False
+                raise AssertionError
            ilv.push_back(il)
        print()

--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py
@ -2,6 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+
 #! /usr/bin/env python3

 """
@ -356,7 +357,7 @@ def main():
        elif args.indata.endswith('.npy'):
            x = np.load(args.indata, mmap_mode='r')
        else:
-            assert False
+            raise AssertionError

        if args.i1 == -1:
            args.i1 = len(x)
@ -386,7 +387,8 @@ def main():
            True
        )
    else:
-        assert False
+        raise AssertionError
+

    if args.server:
        print('starting server')
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/distributed_query_demo.py
@ -2,6 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+
 import os
 import faiss
 import numpy as np
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py
@ -2,6 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+
 import os
 import time
 import numpy as np
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py
@ -49,7 +49,7 @@ if __name__ == '__main__':

    index0 = None

-    for fname, index in pool.imap(load_index, args.inputs):
+    for _, index in pool.imap(load_index, args.inputs):
        if index is None:
            continue
        index_ivf = faiss.extract_index_ivf(index)
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/rpc.py
@ -2,6 +2,9 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+
+#!/usr/bin/env python3
+
 """
 Simplistic RPC implementation.
 Exposes all functions of a Server object.
@ -163,7 +166,7 @@ class Server:
        except EOFError:
            self.log("EOF during communication")
            traceback.print_exc(50,self.logf)
-        except:
+        except BaseException:
            # unexpected
            traceback.print_exc(50,sys.stderr)
            sys.exit(1)
--- a/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/search_server.py
+++ b/core/src/index/thirdparty/faiss/benchs/distributed_ondisk/search_server.py
@ -2,6 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
+
 import os
 import time
 import rpc
--- a/core/src/index/thirdparty/faiss/build.sh
+++ b/core/src/index/thirdparty/faiss/build.sh
@ -1,2 +1,3 @@
-./configure CPUFLAGS='-mavx -mf16c -msse4 -mpopcnt'   CXXFLAGS='-O0 -g -fPIC -m64 -Wno-sign-compare -Wall -Wextra' --prefix=$PWD --with-cuda-arch=-gencode=arch=compute_75,code=sm_75 --with-cuda=/usr/local/cuda
-make install -j
+#./configure CPUFLAGS='-mavx -mf16c -msse4 -mpopcnt'   CXXFLAGS='-O0 -g -fPIC -m64 -Wno-sign-compare -Wall -Wextra' --prefix=$PWD --with-cuda-arch=-gencode=arch=compute_75,code=sm_75 --with-cuda=/usr/local/cuda
+./configure --prefix=$PWD CFLAGS='-g -fPIC' CXXFLAGS='-O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp -g -fPIC -mf16c -O3' --without-python --with-cuda=/usr/local/cuda --with-cuda-arch='-gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75'
+make install -j8
--- a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.cpp
@ -17,16 +17,6 @@ using faiss::Index;
 using faiss::ParameterRange;
 using faiss::ParameterSpace;

-/** Build and index with the sequence of processing steps described in
- *  the string.
- */
-int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric) {
-    try {
-        *p_index = reinterpret_cast<FaissIndex*>(faiss::index_factory(
-            d, description, static_cast<faiss::MetricType>(metric)));
-    } CATCH_AND_HANDLE
-}
-
 const char* faiss_ParameterRange_name(const FaissParameterRange* range) {
    return reinterpret_cast<const ParameterRange*>(range)->name.c_str();
 }
@ -90,4 +80,4 @@ int faiss_ParameterSpace_add_range(FaissParameterSpace* space, const char* name,
            *p_range = reinterpret_cast<FaissParameterRange*>(&range);
        }
    } CATCH_AND_HANDLE
-}
+}
--- a/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/AutoTune_c.h
@ -18,11 +18,6 @@
 extern "C" {
 #endif

-/** Build and index with the sequence of processing steps described in
- *  the string.
- */
-int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric);
-
 /// possible values of a parameter, sorted from least to most expensive/accurate
 FAISS_DECLARE_CLASS(ParameterRange)

@ -66,4 +61,4 @@ int faiss_ParameterSpace_add_range(FaissParameterSpace*, const char*, FaissParam
 }
 #endif

-#endif
+#endif
--- a/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/Clustering_c.cpp
@ -19,6 +19,7 @@ extern "C" {
 using faiss::Clustering;
 using faiss::ClusteringParameters;
 using faiss::Index;
+using faiss::ClusteringIterationStats;

 DEFINE_GETTER(Clustering, int, niter)
 DEFINE_GETTER(Clustering, int, nredo)
@ -38,6 +39,12 @@ DEFINE_GETTER(Clustering, size_t, d)
 /// getter for k
 DEFINE_GETTER(Clustering, size_t, k)

+DEFINE_GETTER(ClusteringIterationStats, float, obj)
+DEFINE_GETTER(ClusteringIterationStats, double, time)
+DEFINE_GETTER(ClusteringIterationStats, double, time_search)
+DEFINE_GETTER(ClusteringIterationStats, double, imbalance_factor)
+DEFINE_GETTER(ClusteringIterationStats, int, nsplit)
+
 void faiss_ClusteringParameters_init(FaissClusteringParameters* params) {
    ClusteringParameters d;
    params->frozen_centroids = d.frozen_centroids;
@ -78,13 +85,12 @@ void faiss_Clustering_centroids(
    }
 }

-/// getter for objective values (sum of distances reported by index)
-/// over iterations
-void faiss_Clustering_obj(
-    FaissClustering* clustering, float** obj, size_t* size) {
-    std::vector<float>& v = reinterpret_cast<Clustering*>(clustering)->obj;
-    if (obj) {
-        *obj = v.data();
+/// getter for iteration stats
+void faiss_Clustering_iteration_stats(
+    FaissClustering* clustering, FaissClusteringIterationStats** iteration_stats, size_t* size) {
+    std::vector<ClusteringIterationStats>& v = reinterpret_cast<Clustering*>(clustering)->iteration_stats;
+    if (iteration_stats) {
+        *iteration_stats = reinterpret_cast<FaissClusteringIterationStats*>(v.data());
    }
    if (size) {
        *size = v.size();
--- a/core/src/index/thirdparty/faiss/c_api/Clustering_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/Clustering_c.h
@ -47,7 +47,7 @@ void faiss_ClusteringParameters_init(FaissClusteringParameters* params);
 * points to the centroids. Therefore, at each iteration the centroids
 * are added to the index.
 *
- * On output, the centoids table is set to the latest version
+ * On output, the centroids table is set to the latest version
 * of the centroids and they are also added to the index. If the
 * centroids table it is not empty on input, it is also used for
 * initialization.
@ -75,14 +75,20 @@ FAISS_DECLARE_GETTER(Clustering, size_t, d)
 /// getter for k
 FAISS_DECLARE_GETTER(Clustering, size_t, k)

+FAISS_DECLARE_CLASS(ClusteringIterationStats)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, float, obj)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, double, time_search)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, double, imbalance_factor)
+FAISS_DECLARE_GETTER(ClusteringIterationStats, int, nsplit)
+
 /// getter for centroids (size = k * d)
 void faiss_Clustering_centroids(
    FaissClustering* clustering, float** centroids, size_t* size);

-/// getter for objective values (sum of distances reported by index)
-/// over iterations
-void faiss_Clustering_obj(
-    FaissClustering* clustering, float** obj, size_t* size);
+/// getter for iteration stats
+void faiss_Clustering_iteration_stats(
+    FaissClustering* clustering, FaissClusteringIterationStats** iteration_stats, size_t* size);

 /// the only mandatory parameters are k and d
 int faiss_Clustering_new(FaissClustering** p_clustering, int d, int k);
--- a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.cpp
@ -87,6 +87,13 @@ void faiss_IndexIVF_print_stats (const FaissIndexIVF* index) {
    reinterpret_cast<const IndexIVF*>(index)->invlists->print_stats();
 }

+/// get inverted lists ids
+void faiss_IndexIVF_invlists_get_ids (const FaissIndexIVF* index, size_t list_no, idx_t* invlist) {
+    const idx_t* list = reinterpret_cast<const IndexIVF*>(index)->invlists->get_ids(list_no);
+    size_t list_size = reinterpret_cast<const IndexIVF*>(index)->get_list_size(list_no);
+    memcpy(invlist, list, list_size*sizeof(idx_t));
+}
+
 void faiss_IndexIVFStats_reset(FaissIndexIVFStats* stats) {
    reinterpret_cast<IndexIVFStats*>(stats)->reset();    
 }
--- a/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/IndexIVF_c.h
@ -114,6 +114,13 @@ double faiss_IndexIVF_imbalance_factor (const FaissIndexIVF* index);
 /// display some stats about the inverted lists of the index
 void faiss_IndexIVF_print_stats (const FaissIndexIVF* index);

+/// Get the IDs in an inverted list. IDs are written to `invlist`, which must be large enough
+/// to accommodate the full list.
+///
+/// @param list_no the list ID
+/// @param invlist output pointer to a slice of memory, at least as long as the list's size
+/// @see faiss_IndexIVF_get_list_size(size_t) 
+void faiss_IndexIVF_invlists_get_ids (const FaissIndexIVF* index, size_t list_no, idx_t* invlist);

 typedef struct FaissIndexIVFStats {
    size_t nq;       // nb of queries run
--- a/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.cpp
@ -0,0 +1,21 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved.
+// -*- c++ -*-
+
+#include "IndexPreTransform_c.h"
+#include "IndexPreTransform.h"
+#include "macros_impl.h"
+
+using faiss::Index;
+using faiss::IndexPreTransform;
+
+DEFINE_DESTRUCTOR(IndexPreTransform)
+DEFINE_INDEX_DOWNCAST(IndexPreTransform)
+
+DEFINE_GETTER_PERMISSIVE(IndexPreTransform, FaissIndex*, index)
--- a/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/IndexPreTransform_c.h
@ -0,0 +1,32 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved.
+// -*- c -*-
+
+#ifndef FAISS_INDEX_PRETRANSFORM_C_H
+#define FAISS_INDEX_PRETRANSFORM_C_H
+
+#include "faiss_c.h"
+#include "Index_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+FAISS_DECLARE_CLASS(IndexPreTransform)
+FAISS_DECLARE_DESTRUCTOR(IndexPreTransform)
+FAISS_DECLARE_INDEX_DOWNCAST(IndexPreTransform)
+
+FAISS_DECLARE_GETTER(IndexPreTransform, FaissIndex*, index)
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
--- a/core/src/index/thirdparty/faiss/c_api/Index_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/Index_c.cpp
@ -97,10 +97,9 @@ int faiss_Index_compute_residual(const FaissIndex* index, const float* x, float*
    } CATCH_AND_HANDLE
 }

-int faiss_Index_display(const FaissIndex* index) {
+int faiss_Index_compute_residual_n(const FaissIndex* index, idx_t n, const float* x, float* residuals, const idx_t* keys) {
    try {
-        reinterpret_cast<const faiss::Index*>(index)->display();
+        reinterpret_cast<const faiss::Index *>(index)->compute_residual_n(n, x, residuals, keys);
    } CATCH_AND_HANDLE
 }
-
-}
+}
--- a/core/src/index/thirdparty/faiss/c_api/Index_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/Index_c.h
@ -26,8 +26,16 @@ typedef struct FaissIDSelector_H FaissIDSelector;

 /// Some algorithms support both an inner product version and a L2 search version.
 typedef enum FaissMetricType {
-    METRIC_INNER_PRODUCT = 0,
-    METRIC_L2 = 1,
+    METRIC_INNER_PRODUCT = 0,  ///< maximum inner product search
+    METRIC_L2 = 1,             ///< squared L2 search
+    METRIC_L1,                 ///< L1 (aka cityblock)
+    METRIC_Linf,               ///< infinity distance
+    METRIC_Lp,                 ///< L_p distance, p is given by metric_arg
+
+    /// some additional metrics defined in scipy.spatial.distance
+    METRIC_Canberra = 20,
+    METRIC_BrayCurtis,
+    METRIC_JensenShannon,    
 } FaissMetricType;

 /// Opaque type for referencing to an index object
@ -152,13 +160,24 @@ int faiss_Index_reconstruct_n (const FaissIndex* index, idx_t i0, idx_t ni, floa
 */
 int faiss_Index_compute_residual(const FaissIndex* index, const float* x, float* residual, idx_t key);

-/** Display the actual class name and some more info
+/** Computes residual vectors after indexing encoding (batch version).
+ *
+ * The residual vector is the difference between a vector and the
+ * reconstruction that can be decoded from its representation in
+ * the index. The residual can be used for multiple-stage indexing
+ * methods, like IndexIVF's methods.
+ *
 * @param index       opaque pointer to index object
+ * @param n           number of vectors
+ * @param x           input vectors, size (n x d)
+ * @param residuals   output residual vectors, size (n x d)
+ * @param keys        encoded indices (one per vector), as returned by search and assign
 */
-int faiss_Index_display(const FaissIndex* index);
+int faiss_Index_compute_residual_n(const FaissIndex* index, idx_t n, const float* x, float* residuals, const idx_t* keys);
+

 #ifdef __cplusplus
 }
 #endif

-#endif
+#endif
--- a/core/src/index/thirdparty/faiss/c_api/Makefile
+++ b/core/src/index/thirdparty/faiss/c_api/Makefile
@ -13,8 +13,9 @@ DEBUGFLAG=-DNDEBUG # no debugging
 LIBNAME=libfaiss
 CLIBNAME=libfaiss_c
 LIBCOBJ=error_impl.o Index_c.o IndexFlat_c.o Clustering_c.o AutoTune_c.o \
-	AuxIndexStructures_c.o IndexIVF_c.o IndexIVFFlat_c.o IndexLSH_c.o \
-	index_io_c.o MetaIndexes_c.o IndexShards_c.o
+	impl/AuxIndexStructures_c.o IndexIVF_c.o IndexIVFFlat_c.o IndexLSH_c.o \
+	index_io_c.o MetaIndexes_c.o IndexShards_c.o index_factory_c.o \
+	clone_index_c.o IndexPreTransform_c.o
 CFLAGS=-fPIC -m64 -Wno-sign-compare -g -O3 -Wall -Wextra

 # Build static and shared object files by default
@ -42,38 +43,47 @@ clean:

 # Dependencies

-error_impl.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+error_impl.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 error_impl.o: error_impl.cpp error_c.h error_impl.h macros_impl.h

-index_io_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+index_io_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 index_io_c.o: index_io_c.cpp error_impl.cpp ../index_io.h macros_impl.h

-Index_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+index_factory_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
+index_factory_c.o: index_factory_c.cpp index_factory_c.h error_impl.cpp ../index_factory.h macros_impl.h
+
+clone_index_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
+clone_index_c.o: clone_index_c.cpp clone_index_c.h error_impl.cpp ../clone_index.h macros_impl.h
+
+Index_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 Index_c.o: Index_c.cpp Index_c.h ../Index.h macros_impl.h

-IndexFlat_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+IndexFlat_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 IndexFlat_c.o: IndexFlat_c.cpp IndexFlat_c.h ../IndexFlat.h macros_impl.h

-IndexIVF_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+IndexIVF_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 IndexIVF_c.o: IndexIVF_c.cpp IndexIVF_c.h ../IndexIVF.h macros_impl.h

-IndexIVFFlat_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+IndexIVFFlat_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 IndexIVFFlat_c.o: IndexIVFFlat_c.cpp IndexIVFFlat_c.h ../IndexIVFFlat.h macros_impl.h

-IndexLSH_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+IndexLSH_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 IndexLSH_c.o: IndexLSH_c.cpp IndexLSH_c.h ../IndexLSH.h macros_impl.h

-IndexShards_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+IndexShards_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 IndexShards_c.o: IndexShards_c.cpp IndexShards_c.h ../Index.h ../IndexShards.h macros_impl.h

-Clustering_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+Clustering_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 Clustering_c.o: Clustering_c.cpp Clustering_c.h ../Clustering.h macros_impl.h

-AutoTune_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+AutoTune_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 AutoTune_c.o: AutoTune_c.cpp AutoTune_c.h ../AutoTune.h macros_impl.h

-AuxIndexStructures_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
-AuxIndexStructures_c.o: AuxIndexStructures_c.cpp AuxIndexStructures_c.h ../AuxIndexStructures.h macros_impl.h
+impl/AuxIndexStructures_c.o: CXXFLAGS += -I..  -I ../impl $(DEBUGFLAG)
+impl/AuxIndexStructures_c.o: impl/AuxIndexStructures_c.cpp impl/AuxIndexStructures_c.h ../impl/AuxIndexStructures.h macros_impl.h

-MetaIndexes_c.o: CXXFLAGS += -I.. $(DEBUGFLAG)
+MetaIndexes_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
 MetaIndexes_c.o: MetaIndexes_c.cpp MetaIndexes_c.h ../MetaIndexes.h macros_impl.h
+
+IndexPreTransform_c.o: CXXFLAGS += -I.. -I ../impl $(DEBUGFLAG)
+IndexPreTransform_c.o: IndexPreTransform_c.cpp IndexPreTransform_c.h ../IndexPreTransform.h macros_impl.h
--- a/core/src/index/thirdparty/faiss/c_api/clone_index_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/clone_index_c.cpp
@ -0,0 +1,23 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//  Copyright 2004-present Facebook. All Rights Reserved
+// -*- c++ -*-
+// C API wrapper for index cloning
+
+#include "clone_index_c.h"
+#include "clone_index.h"
+#include "macros_impl.h"
+
+using faiss::Index;
+
+/** Clone an index; C wrapper around `faiss::clone_index`.
+ *
+ * @param idx    index to clone
+ * @param p_out  receives the pointer to the newly created clone
+ * @return 0 on success; otherwise the error code produced by CATCH_AND_HANDLE
+ */
+int faiss_clone_index (const FaissIndex *idx, FaissIndex **p_out) {
+    try {
+        auto out = faiss::clone_index(reinterpret_cast<const Index*>(idx));
+        *p_out = reinterpret_cast<FaissIndex*>(out);
+        // report success explicitly, like the other C API wrappers do
+        return 0;
+    } CATCH_AND_HANDLE
+}
--- a/core/src/index/thirdparty/faiss/c_api/clone_index_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/clone_index_c.h
@ -0,0 +1,32 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+//  Copyright 2004-present Facebook. All Rights Reserved
+// -*- c++ -*-
+// C API for index cloning
+
+
+#ifndef FAISS_CLONE_INDEX_C_H
+#define FAISS_CLONE_INDEX_C_H
+
+#include <stdio.h>
+#include "faiss_c.h"
+#include "Index_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* cloning functions */
+
+/** Clone an index. This is equivalent to `faiss::clone_index` */
+int faiss_clone_index (const FaissIndex *, FaissIndex ** p_out);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/core/src/index/thirdparty/faiss/c_api/example_c.c
+++ b/core/src/index/thirdparty/faiss/c_api/example_c.c
@ -17,6 +17,7 @@
 #include "Index_c.h"
 #include "IndexFlat_c.h"
 #include "AutoTune_c.h"
+#include "clone_index_c.h"

 #define FAISS_TRY(C)                                       \
    {                                                      \
--- a/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.cpp
@ -9,8 +9,8 @@
 // -*- c++ -*-

 #include "AuxIndexStructures_c.h"
-#include "AuxIndexStructures.h"
-#include "macros_impl.h"
+#include "../../impl/AuxIndexStructures.h"
+#include "../macros_impl.h"
 #include <iostream>

 using faiss::BufferList;
@ -20,6 +20,7 @@ using faiss::IDSelectorRange;
 using faiss::RangeSearchResult;
 using faiss::RangeSearchPartialResult;
 using faiss::RangeQueryResult;
+using faiss::DistanceComputer;

 DEFINE_GETTER(RangeSearchResult, size_t, nq)

@ -191,3 +192,29 @@ int faiss_RangeSearchPartialResult_new_result(
        return 0;
    } CATCH_AND_HANDLE
 }
+
+DEFINE_DESTRUCTOR(DistanceComputer)
+
+int faiss_DistanceComputer_set_query(FaissDistanceComputer *dc, const float *x) {
+    try {
+        reinterpret_cast<DistanceComputer*>(dc)->set_query(x);
+        return 0;
+    }
+    CATCH_AND_HANDLE
+}
+
+int faiss_DistanceComputer_vector_to_query_dis(FaissDistanceComputer *dc, idx_t i, float *qd) {
+    try {
+        *qd = reinterpret_cast<DistanceComputer*>(dc)->operator()(i);
+        return 0;
+    }
+    CATCH_AND_HANDLE
+}
+
+int faiss_DistanceComputer_symmetric_dis(FaissDistanceComputer *dc, idx_t i, idx_t j, float *vd) {
+    try {
+        *vd = reinterpret_cast<DistanceComputer*>(dc)->symmetric_dis(i, j);
+        return 0;
+    }
+    CATCH_AND_HANDLE
+}
--- a/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/impl/AuxIndexStructures_c.h
@ -11,8 +11,8 @@
 #ifndef FAISS_AUX_INDEX_STRUCTURES_C_H
 #define FAISS_AUX_INDEX_STRUCTURES_C_H

-#include "Index_c.h"
-#include "faiss_c.h"
+#include "../Index_c.h"
+#include "../faiss_c.h"

 #ifdef __cplusplus
 extern "C" {
@ -126,6 +126,22 @@ int faiss_RangeSearchPartialResult_set_lims(
 int faiss_RangeSearchPartialResult_new_result(
    FaissRangeSearchPartialResult* res, idx_t qno, FaissRangeQueryResult** qr);

+
+FAISS_DECLARE_CLASS(DistanceComputer)
+/// called before computing distances
+int faiss_DistanceComputer_set_query(FaissDistanceComputer *dc, const float *x);
+
+/**
+ * Compute distance of vector i to current query.
+ * This function corresponds to the function call operator: DistanceComputer::operator()
+ */
+int faiss_DistanceComputer_vector_to_query_dis( FaissDistanceComputer *dc, idx_t i, float *qd);
+/// compute distance between two stored vectors
+int faiss_DistanceComputer_symmetric_dis(FaissDistanceComputer *dc, idx_t i, idx_t j, float *vd);
+
+FAISS_DECLARE_DESTRUCTOR(DistanceComputer)
+
+
 #ifdef __cplusplus
 }
 #endif
--- a/core/src/index/thirdparty/faiss/c_api/index_factory_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/index_factory_c.cpp
@ -0,0 +1,26 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved.
+// -*- c++ -*-
+
+#include <cstring>
+#include "index_factory.h"
+#include "index_factory_c.h"
+#include "macros_impl.h"
+
+using faiss::Index;
+
+/** Build an index with the sequence of processing steps described in
+ *  the string.
+ *
+ * @param p_index      receives the pointer to the newly built index
+ * @param d            dimensionality forwarded to faiss::index_factory
+ * @param description  factory string forwarded to faiss::index_factory
+ * @param metric       metric, converted to faiss::MetricType
+ * @return 0 on success; otherwise the error code produced by CATCH_AND_HANDLE
+ */
+int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric) {
+    try {
+        *p_index = reinterpret_cast<FaissIndex*>(faiss::index_factory(
+            d, description, static_cast<faiss::MetricType>(metric)));
+        // report success explicitly, like the other C API wrappers do
+        return 0;
+    } CATCH_AND_HANDLE
+}
--- a/core/src/index/thirdparty/faiss/c_api/index_factory_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/index_factory_c.h
@ -0,0 +1,30 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Copyright 2004-present Facebook. All Rights Reserved.
+// -*- c -*-
+
+#ifndef FAISS_INDEX_FACTORY_C_H
+#define FAISS_INDEX_FACTORY_C_H
+
+#include "faiss_c.h"
+#include "Index_c.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Build an index with the sequence of processing steps described in
+ *  the string.
+ */
+int faiss_index_factory(FaissIndex** p_index, int d, const char* description, FaissMetricType metric);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp
+++ b/core/src/index/thirdparty/faiss/c_api/index_io_c.cpp
@ -39,11 +39,4 @@ int faiss_read_index_fname(const char *fname, int io_flags, FaissIndex **p_out)
        auto out = faiss::read_index(fname, io_flags);
        *p_out = reinterpret_cast<FaissIndex*>(out);
    } CATCH_AND_HANDLE
-}
-
-int faiss_clone_index (const FaissIndex *idx, FaissIndex **p_out) {
-    try {
-        auto out = faiss::clone_index(reinterpret_cast<const Index*>(idx));
-        *p_out = reinterpret_cast<FaissIndex*>(out);
-    } CATCH_AND_HANDLE
-}
+}
--- a/core/src/index/thirdparty/faiss/c_api/index_io_c.h
+++ b/core/src/index/thirdparty/faiss/c_api/index_io_c.h
@ -44,12 +44,7 @@ int faiss_read_index(FILE *f, int io_flags, FaissIndex **p_out);
 */
 int faiss_read_index_fname(const char *fname, int io_flags, FaissIndex **p_out);

-/* cloning functions */
-
-/** Clone an index. This is equivalent to `faiss::clone_index` */
-int faiss_clone_index (const FaissIndex *, FaissIndex ** p_out);
-
 #ifdef __cplusplus
 }
 #endif
-#endif
+#endif
--- a/core/src/index/thirdparty/faiss/clone_index.cpp
+++ b/core/src/index/thirdparty/faiss/clone_index.cpp
@ -116,6 +116,12 @@ Index *Cloner::clone_Index (const Index *index)
               dynamic_cast<const IndexPreTransform*> (index)) {
        IndexPreTransform *res = new IndexPreTransform ();
        res->d = ipt->d;
+        res->ntotal = ipt->ntotal;
+        res->is_trained = ipt->is_trained;
+        res->metric_type = ipt->metric_type;
+        res->metric_arg = ipt->metric_arg;
+
+
        res->index = clone_Index (ipt->index);
        for (int i = 0; i < ipt->chain.size(); i++)
            res->chain.push_back (clone_VectorTransform (ipt->chain[i]));
--- a/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py
+++ b/core/src/index/thirdparty/faiss/demos/demo_auto_tune.py
@ -25,14 +25,9 @@ import faiss
 #################################################################

 def ivecs_read(fname):
-    f = open(fname)
-    d, = np.fromfile(f, count = 1, dtype = 'int32')
-    sz = os.stat(fname).st_size
-    assert sz % (4 * (d + 1)) == 0
-    n = sz / (4 * (d + 1))
-    f.seek(0)
-    a = np.fromfile(f, count = n * (d +1), dtype = 'int32').reshape(n, d + 1)
-    return a[:, 1:].copy()
+    a = np.fromfile(fname, dtype="int32")
+    d = a[0]
+    return a.reshape(-1, d + 1)[:, 1:].copy()

 def fvecs_read(fname):
    return ivecs_read(fname).view('float32')
@ -41,8 +36,8 @@ def fvecs_read(fname):
 def plot_OperatingPoints(ops, nq, **kwargs):
    ops = ops.optimal_pts
    n = ops.size() * 2 - 1
-    pyplot.plot([ops.at( i      / 2).perf for i in range(n)],
-                [ops.at((i + 1) / 2).t / nq * 1000 for i in range(n)],
+    pyplot.plot([ops.at( i      // 2).perf for i in range(n)],
+                [ops.at((i + 1) // 2).t / nq * 1000 for i in range(n)],
                **kwargs)


--- a/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp
+++ b/core/src/index/thirdparty/faiss/demos/demo_sift1M.cpp
@ -20,7 +20,7 @@
 #include <sys/time.h>

 #include <faiss/AutoTune.h>
-
+#include <faiss/index_factory.h>

 /**
 * To run this demo, please download the ANN_SIFT1M dataset from
--- a/core/src/index/thirdparty/faiss/demos/demo_weighted_kmeans.cpp
+++ b/core/src/index/thirdparty/faiss/demos/demo_weighted_kmeans.cpp
@ -0,0 +1,185 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+
+#include <faiss/Clustering.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/distances.h>
+#include <faiss/IndexFlat.h>
+#include <faiss/IndexHNSW.h>
+
+
+namespace {
+
+
+enum WeightedKMeansType {
+    WKMT_FlatL2,
+    WKMT_FlatIP,
+    WKMT_FlatIP_spherical,
+    WKMT_HNSW,
+};
+
+
+float weighted_kmeans_clustering (size_t d, size_t n, size_t k,
+                                  const float *input,
+                                  const float *weights,
+                                  float *centroids,
+                                  WeightedKMeansType index_num)
+{
+    using namespace faiss;
+    Clustering clus (d, k);
+    clus.verbose = true;
+
+    std::unique_ptr<Index> index;
+
+    switch (index_num) {
+    case WKMT_FlatL2:
+        index.reset(new IndexFlatL2 (d));
+        break;
+    case WKMT_FlatIP:
+        index.reset(new IndexFlatIP (d));
+        break;
+    case WKMT_FlatIP_spherical:
+        index.reset(new IndexFlatIP (d));
+        clus.spherical = true;
+        break;
+    case WKMT_HNSW:
+        IndexHNSWFlat *ihnsw = new IndexHNSWFlat (d, 32);
+        ihnsw->hnsw.efSearch = 128;
+        index.reset(ihnsw);
+        break;
+    }
+
+    clus.train(n, input, *index.get(), weights);
+    // on output the index contains the centroids.
+    memcpy(centroids, clus.centroids.data(), sizeof(*centroids) * d * k);
+    return clus.iteration_stats.back().obj;
+}
+
+
+int d = 32;
+float sigma = 0.1;
+
+#define BIGTEST
+
+#ifdef BIGTEST
+// the production setup = setting of https://fb.quip.com/CWgnAAYbwtgs
+int nc = 200000;
+int n_big = 4;
+int n_small = 2;
+#else
+int nc = 5;
+int n_big = 100;
+int n_small = 10;
+#endif
+
+int n; // number of training points
+
+void generate_trainset (std::vector<float> & ccent,
+                        std::vector<float> & x,
+                        std::vector<float> & weights)
+{
+    // same sampling as test_build_blocks.py test_weighted
+
+    ccent.resize (d * 2 * nc);
+    faiss::float_randn (ccent.data(), d * 2 * nc, 123);
+    faiss::fvec_renorm_L2 (d, 2 * nc, ccent.data());
+    n = nc * n_big + nc * n_small;
+    x.resize(d * n);
+    weights.resize(n);
+    faiss::float_randn (x.data(), x.size(), 1234);
+
+    float *xi = x.data();
+    float *w = weights.data();
+    for (int ci = 0; ci < nc * 2; ci++) { // loop over centroids
+        int np = ci < nc ? n_big : n_small; // nb of points around this centroid
+        for (int i = 0; i < np; i++) {
+            for (int j = 0; j < d; j++) {
+                xi[j] = xi[j] * sigma + ccent[ci * d + j];
+            }
+            *w++ = ci < nc ? 0.1 : 10;
+            xi += d;
+        }
+    }
+}
+
+}
+
+
+int main(int argc, char **argv) {
+    std::vector<float> ccent;
+    std::vector<float> x;
+    std::vector<float> weights;
+
+    printf("generate training set\n");
+    generate_trainset(ccent, x, weights);
+
+    std::vector<float> centroids;
+    centroids.resize(nc * d);
+
+    int the_index_num = -1;
+    int the_with_weights = -1;
+
+    if (argc == 3) {
+        the_index_num = atoi(argv[1]);
+        the_with_weights = atoi(argv[2]);
+    }
+
+
+    for (int index_num = WKMT_FlatL2;
+         index_num <= WKMT_HNSW;
+         index_num++) {
+
+        if (the_index_num >= 0 && index_num != the_index_num) {
+            continue;
+        }
+
+        for (int with_weights = 0; with_weights <= 1; with_weights++) {
+            if (the_with_weights >= 0 && with_weights != the_with_weights) {
+                continue;
+            }
+
+            printf("=================== index_num=%d Run %s weights\n",
+                   index_num, with_weights ? "with" : "without");
+
+            weighted_kmeans_clustering (
+                 d, n, nc, x.data(),
+                 with_weights ? weights.data() : nullptr,
+                 centroids.data(), (WeightedKMeansType)index_num
+            );
+
+            { // compute distance of points to centroids
+                faiss::IndexFlatL2 cent_index(d);
+                cent_index.add(nc, centroids.data());
+                std::vector<float> dis (n);
+                std::vector<faiss::Index::idx_t> idx (n);
+
+                cent_index.search (nc * 2, ccent.data(), 1,
+                                   dis.data(), idx.data());
+
+                float dis1 = 0, dis2 = 0;
+                for (int i = 0; i < nc ; i++) {
+                    dis1 += dis[i];
+                }
+                printf("average distance of points from big clusters: %g\n",
+                       dis1 / nc);
+
+                for (int i = 0; i < nc ; i++) {
+                    dis2 += dis[i + nc];
+                }
+
+                printf("average distance of points from small clusters: %g\n",
+                       dis2 / nc);
+
+            }
+
+        }
+    }
+    return 0;
+}
--- a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux
+++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Linux
@ -9,7 +9,7 @@

 CXX      = g++ -std=c++11
 CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare
-CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt
+CPUFLAGS = -mavx -msse4 -mpopcnt
 LDFLAGS  = -fPIC -fopenmp

 # common linux flags
--- a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew
+++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.brew
@ -9,7 +9,7 @@
 # brew install llvm
 CXX      = /usr/local/opt/llvm/bin/clang++ -std=c++11
 CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare -I/usr/local/opt/llvm/include
-CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt
+CPUFLAGS = -msse4 -mpopcnt
 LLVM_VERSION_PATH=$(shell ls -rt /usr/local/Cellar/llvm/ | tail -n1)
 LDFLAGS  = -fPIC -fopenmp -L/usr/local/opt/llvm/lib -L/usr/local/Cellar/llvm/${LLVM_VERSION_PATH}/lib

--- a/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port
+++ b/core/src/index/thirdparty/faiss/example_makefiles/makefile.inc.Mac.port
@ -12,7 +12,7 @@
 # port install g++-mp-6
 CXX      = /opt/local/bin/g++-mp-6 -std=c++11
 CXXFLAGS = -fPIC -m64 -Wall -g -O3 -fopenmp -Wno-sign-compare
-CPUFLAGS = -mavx2 -mf16c -msse4 -mpopcnt
+CPUFLAGS = -msse4 -mpopcnt
 LDFLAGS  = -g -fPIC -fopenmp

 # common linux flags
--- a/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp
+++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.cpp
@ -279,7 +279,6 @@ faiss::Index * index_cpu_to_gpu(
    return cl.clone_Index(index_composition);
 }

-
 /**********************************************************
 * Cloning to multiple GPUs
 **********************************************************/
@ -372,6 +371,7 @@ Index * ToGpuClonerMultiple::clone_Index_to_shards (const Index *index)
                       index_ivfflat->quantizer, index->d,
                       index_ivfflat->nlist, index_ivfflat->metric_type);
            idx2.nprobe = index_ivfflat->nprobe;
+            idx2.is_trained = index->is_trained;
            copy_ivf_shard (index_ivfflat, &idx2, n, i);
            shards[i] = sub_cloners[i].clone_Index(&idx2);
        } else if (index_ivfsq) {
@ -380,7 +380,10 @@ Index * ToGpuClonerMultiple::clone_Index_to_shards (const Index *index)
                       index_ivfsq->sq.qtype,
                       index_ivfsq->metric_type,
                       index_ivfsq->by_residual);
+
            idx2.nprobe = index_ivfsq->nprobe;
+            idx2.is_trained = index->is_trained;
+            idx2.sq = index_ivfsq->sq;
            copy_ivf_shard (index_ivfsq, &idx2, n, i);
            shards[i] = sub_cloners[i].clone_Index(&idx2);
        } else if (index_flat) {
--- a/core/src/index/thirdparty/faiss/gpu/GpuCloner.h
+++ b/core/src/index/thirdparty/faiss/gpu/GpuCloner.h
@ -73,9 +73,9 @@ faiss::Index * index_cpu_to_gpu(
       const GpuClonerOptions *options = nullptr);

 faiss::Index * index_cpu_to_gpu(
-        GpuResources* resources, int device,
-        IndexComposition* index_composition,
-        const GpuClonerOptions *options = nullptr);
+       GpuResources* resources, int device,
+       IndexComposition* index_composition,
+       const GpuClonerOptions *options = nullptr);

 faiss::Index * index_cpu_to_gpu_multiple(
       std::vector<GpuResources*> & resources,
--- a/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp
+++ b/core/src/index/thirdparty/faiss/gpu/GpuClonerOptions.cpp
@ -13,7 +13,7 @@ GpuClonerOptions::GpuClonerOptions()
    : indicesOptions(INDICES_64_BIT),
      useFloat16CoarseQuantizer(false),
      useFloat16(false),
-      usePrecomputed(true),
+      usePrecomputed(false),
      reserveVecs(0),
      storeTransposed(false),
      storeInCpu(false),
--- a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu
@ -17,88 +17,75 @@

 namespace faiss { namespace gpu {

-void bruteForceKnn(GpuResources* resources,
-                   faiss::MetricType metric,
-                   // A region of memory size numVectors x dims, with dims
-                   // innermost
-                   const float* vectors,
-                   bool vectorsRowMajor,
-                   int numVectors,
-                   // A region of memory size numQueries x dims, with dims
-                   // innermost
-                   const float* queries,
-                   bool queriesRowMajor,
-                   int numQueries,
-                   int dims,
-                   int k,
-                   // A region of memory size numQueries x k, with k
-                   // innermost
-                   float* outDistances,
-                   // A region of memory size numQueries x k, with k
-                   // innermost
-                   faiss::Index::idx_t* outIndices) {
+template <typename T>
+void bfKnnConvert(GpuResources* resources, const GpuDistanceParams& args) {
  auto device = getCurrentDevice();
  auto stream = resources->getDefaultStreamCurrentDevice();
  auto& mem = resources->getMemoryManagerCurrentDevice();

-  auto tVectors = toDevice<float, 2>(resources,
-                                     device,
-                                     const_cast<float*>(vectors),
-                                     stream,
-                                     {vectorsRowMajor ? numVectors : dims,
-                                      vectorsRowMajor ? dims : numVectors});
-  auto tQueries = toDevice<float, 2>(resources,
-                                     device,
-                                     const_cast<float*>(queries),
-                                     stream,
-                                     {queriesRowMajor ? numQueries : dims,
-                                      queriesRowMajor ? dims : numQueries});
+  auto tVectors =
+    toDevice<T, 2>(resources,
+                   device,
+                   const_cast<T*>(reinterpret_cast<const T*>(args.vectors)),
+                   stream,
+                   {args.vectorsRowMajor ? args.numVectors : args.dims,
+                    args.vectorsRowMajor ? args.dims : args.numVectors});
+  auto tQueries =
+    toDevice<T, 2>(resources,
+                   device,
+                   const_cast<T*>(reinterpret_cast<const T*>(args.queries)),
+                   stream,
+                   {args.queriesRowMajor ? args.numQueries : args.dims,
+                    args.queriesRowMajor ? args.dims : args.numQueries});

-  auto tOutDistances = toDevice<float, 2>(resources,
-                                          device,
-                                          outDistances,
-                                          stream,
-                                          {numQueries, k});
+  DeviceTensor<float, 1, true> tVectorNorms;
+  if (args.vectorNorms) {
+    tVectorNorms = toDevice<float, 1>(resources,
+                                      device,
+                                      const_cast<float*>(args.vectorNorms),
+                                      stream,
+                                      {args.numVectors});
+  }

-  // FlatIndex only supports an interface returning int indices, allocate
-  // temporary memory for it
-  DeviceTensor<int, 2, true> tOutIntIndices(mem, {numQueries, k}, stream);
+  auto tOutDistances =
+    toDevice<float, 2>(resources,
+                       device,
+                       args.outDistances,
+                       stream,
+                       {args.numQueries, args.k});
+
+  // The brute-force API only supports an interface for integer indices
+  DeviceTensor<int, 2, true>
+    tOutIntIndices(mem, {args.numQueries, args.k}, stream);

  // Empty bitset
  auto bitsetDevice = toDevice<uint8_t, 1>(resources, device, nullptr, stream, {0});

-  // Do the work
-  if (metric == faiss::MetricType::METRIC_L2) {
-    runL2Distance(resources,
-                  tVectors,
-                  vectorsRowMajor,
-                  nullptr, // compute norms in temp memory
-                  tQueries,
-                  queriesRowMajor,
-                  bitsetDevice,
-                  k,
-                  tOutDistances,
-                  tOutIntIndices);
-  } else if (metric == faiss::MetricType::METRIC_INNER_PRODUCT) {
-    runIPDistance(resources,
-                  tVectors,
-                  vectorsRowMajor,
-                  tQueries,
-                  queriesRowMajor,
-                  bitsetDevice,
-                  k,
-                  tOutDistances,
-                  tOutIntIndices);
-  } else {
-    FAISS_THROW_MSG("metric should be METRIC_L2 or METRIC_INNER_PRODUCT");
-  }
+  // Since we've guaranteed that all arguments are on device, call the
+  // implementation
+  bfKnnOnDevice<T>(resources,
+                   device,
+                   stream,
+                   tVectors,
+                   args.vectorsRowMajor,
+                   args.vectorNorms ? &tVectorNorms : nullptr,
+                   tQueries,
+                   args.queriesRowMajor,
+                   bitsetDevice,
+                   args.k,
+                   args.metric,
+                   args.metricArg,
+                   tOutDistances,
+                   tOutIntIndices,
+                   args.ignoreOutDistances);

  // Convert and copy int indices out
-  auto tOutIndices = toDevice<faiss::Index::idx_t, 2>(resources,
-                                                      device,
-                                                      outIndices,
-                                                      stream,
-                                                      {numQueries, k});
+  auto tOutIndices =
+    toDevice<faiss::Index::idx_t, 2>(resources,
+                                     device,
+                                     args.outIndices,
+                                     stream,
+                                     {args.numQueries, args.k});

  // Convert int to idx_t
  convertTensor<int, faiss::Index::idx_t, 2>(stream,
@ -106,8 +93,65 @@ void bruteForceKnn(GpuResources* resources,
                                             tOutIndices);

  // Copy back if necessary
-  fromDevice<float, 2>(tOutDistances, outDistances, stream);
-  fromDevice<faiss::Index::idx_t, 2>(tOutIndices, outIndices, stream);
+  fromDevice<float, 2>(tOutDistances, args.outDistances, stream);
+  fromDevice<faiss::Index::idx_t, 2>(tOutIndices, args.outIndices, stream);
+}
+
+// Brute-force k-nearest-neighbor search described by `args`.
+// Requires args.vectorType == args.queryType, then dispatches to
+// bfKnnConvert<float> (F32) or bfKnnConvert<half> (F16).
+// Raises through the FAISS_THROW_* macros when the two types differ or
+// when the type is neither F32 nor F16.
+void
+bfKnn(GpuResources* resources, const GpuDistanceParams& args) {
+  // For now, both vectors and queries must be of the same data type
+  FAISS_THROW_IF_NOT_MSG(
+    args.vectorType == args.queryType,
+    "limitation: both vectorType and queryType must currently "
+    "be the same (F32 or F16)");
+
+  if (args.vectorType == DistanceDataType::F32) {
+    bfKnnConvert<float>(resources, args);
+  } else if (args.vectorType == DistanceDataType::F16) {
+    bfKnnConvert<half>(resources, args);
+  } else {
+    FAISS_THROW_MSG("unknown vectorType");
+  }
+}
+
+// legacy version
+void
+bruteForceKnn(GpuResources* resources,
+              faiss::MetricType metric,
+              // A region of memory size numVectors x dims, with dims
+              // innermost
+              const float* vectors,
+              bool vectorsRowMajor,
+              int numVectors,
+              // A region of memory size numQueries x dims, with dims
+              // innermost
+              const float* queries,
+              bool queriesRowMajor,
+              int numQueries,
+              int dims,
+              int k,
+              // A region of memory size numQueries x k, with k
+              // innermost
+              float* outDistances,
+              // A region of memory size numQueries x k, with k
+              // innermost
+              faiss::Index::idx_t* outIndices) {
+  std::cerr << "bruteForceKnn is deprecated; call bfKnn instead" << std::endl;
+
+  GpuDistanceParams args;
+  args.metric = metric;
+  args.k = k;
+  args.dims = dims;
+  args.vectors = vectors;
+  args.vectorsRowMajor = vectorsRowMajor;
+  args.numVectors = numVectors;
+  args.queries = queries;
+  args.queriesRowMajor = queriesRowMajor;
+  args.numQueries = numQueries;
+  args.outDistances = outDistances;
+  args.outIndices = outIndices;
+
+  bfKnn(resources, args);
 }

 } } // namespace
--- a/core/src/index/thirdparty/faiss/gpu/GpuDistance.h
+++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.h
@ -14,6 +14,96 @@ namespace faiss { namespace gpu {

 class GpuResources;

+// Scalar type of the vector data
+enum class DistanceDataType {
+  F32 = 1,
+  F16,
+};
+
+/// Arguments to brute-force GPU k-nearest neighbor searching
+struct GpuDistanceParams {
+  GpuDistanceParams()
+      : metric(faiss::MetricType::METRIC_L2),
+        metricArg(0),
+        k(0),
+        dims(0),
+        vectors(nullptr),
+        vectorType(DistanceDataType::F32),
+        vectorsRowMajor(true),
+        numVectors(0),
+        vectorNorms(nullptr),
+        queries(nullptr),
+        queryType(DistanceDataType::F32),
+        queriesRowMajor(true),
+        numQueries(0),
+        outDistances(nullptr),
+        ignoreOutDistances(false),
+        outIndices(nullptr) {
+  }
+
+  //
+  // Search parameters
+  //
+
+  // Search parameter: distance metric
+  faiss::MetricType metric;
+
+  // Search parameter: distance metric argument (if applicable)
+  // For metric == METRIC_Lp, this is the p-value
+  float metricArg;
+
+  // Search parameter: return k nearest neighbors
+  int k;
+
+  // Vector dimensionality
+  int dims;
+
+  //
+  // Vectors being queried
+  //
+
+  // If vectorsRowMajor is true, this is
+  // numVectors x dims, with dims innermost; otherwise,
+  // dims x numVectors, with numVectors innermost
+  const void* vectors;
+  DistanceDataType vectorType;
+  bool vectorsRowMajor;
+  int numVectors;
+
+  // Precomputed L2 norms for each vector in `vectors`, which can be optionally
+  // provided in advance to speed computation for METRIC_L2
+  const float* vectorNorms;
+
+  //
+  // The query vectors (i.e., find k-nearest neighbors in `vectors` for each of
+  // the `queries`
+  //
+
+  // If queriesRowMajor is true, this is
+  // numQueries x dims, with dims innermost; otherwise,
+  // dims x numQueries, with numQueries innermost
+  const void* queries;
+  DistanceDataType queryType;
+  bool queriesRowMajor;
+  int numQueries;
+
+  //
+  // Output results
+  //
+
+  // A region of memory size numQueries x k, with k
+  // innermost (row major)
+  float* outDistances;
+
+  // Do we only care about the indices reported, rather than the output
+  // distances?
+  bool ignoreOutDistances;
+
+  // A region of memory size numQueries x k, with k
+  // innermost (row major)
+  faiss::Index::idx_t* outIndices;
+};
+
 /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest
 /// neighbor searches on an externally-provided region of memory (e.g., from a
 /// pytorch tensor).
@ -26,6 +116,9 @@ class GpuResources;
 ///
 /// For each vector in `queries`, searches all of `vectors` to find its k
 /// nearest neighbors with respect to the given metric
+void bfKnn(GpuResources* resources, const GpuDistanceParams& args);
+
+/// Deprecated legacy implementation
 void bruteForceKnn(GpuResources* resources,
                   faiss::MetricType metric,
                   // If vectorsRowMajor is true, this is
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu
@ -9,7 +9,6 @@
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/impl/FaissAssert.h>
 #include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/impl/Metrics.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/utils/StaticUtils.h>
@ -39,6 +38,7 @@ constexpr size_t kSearchVecSize = (size_t) 32 * 1024;
 GpuIndex::GpuIndex(GpuResources* resources,
                   int dims,
                   faiss::MetricType metric,
+                   float metricArg,
                   GpuIndexConfig config) :
    Index(dims, metric),
    resources_(resources),
@ -62,13 +62,30 @@ GpuIndex::GpuIndex(GpuResources* resources,
                     "Must compile with CUDA 8+ for Unified Memory support");
 #endif

-  FAISS_THROW_IF_NOT_MSG(isMetricSupported(metric),
-                         "Unsupported metric type on GPU");
+  metric_arg = metricArg;

  FAISS_ASSERT(resources_);
  resources_->initializeForDevice(device_);
 }

+void
+GpuIndex::copyFrom(const faiss::Index* index) {
+  d = index->d;
+  metric_type = index->metric_type;
+  metric_arg = index->metric_arg;
+  ntotal = index->ntotal;
+  is_trained = index->is_trained;
+}
+
+void
+GpuIndex::copyTo(faiss::Index* index) const {
+  index->d = d;
+  index->metric_type = metric_type;
+  index->metric_arg = metric_arg;
+  index->ntotal = ntotal;
+  index->is_trained = is_trained;
+}
+
 void
 GpuIndex::setMinPagingSize(size_t size) {
  minPagedSize_ = size;
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h
@ -36,6 +36,7 @@ class GpuIndex : public faiss::Index {
  GpuIndex(GpuResources* resources,
           int dims,
           faiss::MetricType metric,
+           float metricArg,
           GpuIndexConfig config);

  inline int getDevice() const {
@ -88,6 +89,12 @@ class GpuIndex : public faiss::Index {
                          const Index::idx_t* keys) const override;

 protected:
+  /// Copy what we need from the CPU equivalent
+  void copyFrom(const faiss::Index* index);
+
+  /// Copy what we have to the CPU equivalent
+  void copyTo(faiss::Index* index) const;
+
  /// Does addImpl_ require IDs? If so, and no IDs are provided, we will
  /// generate them sequentially based on the order in which the IDs are added
  virtual bool addImplRequiresIDs_() const = 0;
--- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu
+++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu
@ -22,11 +22,13 @@ namespace faiss { namespace gpu {
 GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           const faiss::IndexFlat* index,
                           GpuIndexFlatConfig config) :
-    GpuIndex(resources, index->d, index->metric_type, config),
+    GpuIndex(resources,
+             index->d,
+             index->metric_type,
+             index->metric_arg,
+             config),
    config_(std::move(config)),
    data_(nullptr) {
-  verifySettings_();
-
  // Flat index doesn't need training
  this->is_trained = true;

@ -37,11 +39,9 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
                           int dims,
                           faiss::MetricType metric,
                           GpuIndexFlatConfig config) :
-    GpuIndex(resources, dims, metric, config),
+    GpuIndex(resources, dims, metric, 0, config),
    config_(std::move(config)),
    data_(nullptr) {
-  verifySettings_();
-
  // Flat index doesn't need training
  this->is_trained = true;

@ -49,9 +49,7 @@ GpuIndexFlat::GpuIndexFlat(GpuResources* resources,
  DeviceScope scope(device_);
  data_ = new FlatIndex(resources,
                        dims,
-                        metric == faiss::METRIC_L2,
                        config_.useFloat16,
-                        config_.useFloat16Accumulator,
                        config_.storeTransposed,
                        memorySpace_);
 }
@ -64,8 +62,7 @@ void
 GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  DeviceScope scope(device_);

-  this->d = index->d;
-  this->metric_type = index->metric_type;
+  GpuIndex::copyFrom(index);

  // GPU code has 32 bit indices
  FAISS_THROW_IF_NOT_FMT(index->ntotal <=
@ -74,14 +71,11 @@ GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
                         "attempting to copy CPU index with %zu parameters",
                         (size_t) std::numeric_limits<int>::max(),
                         (size_t) index->ntotal);
-  this->ntotal = index->ntotal;

  delete data_;
  data_ = new FlatIndex(resources_,
                        this->d,
-                        index->metric_type == faiss::METRIC_L2,
                        config_.useFloat16,
-                        config_.useFloat16Accumulator,
                        config_.storeTransposed,
                        memorySpace_);

@ -95,7 +89,7 @@ GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) {
  xb_.clear();

  if (config_.storeInCpu) {
-      xb_ = index->xb;
+    xb_ = index->xb;
  }
 }

@ -103,9 +97,7 @@ void
 GpuIndexFlat::copyTo(faiss::IndexFlat* index) const {
  DeviceScope scope(device_);

-  index->d = this->d;
-  index->ntotal = this->ntotal;
-  index->metric_type = this->metric_type;
+  GpuIndex::copyTo(index);

  FAISS_ASSERT(data_);
  FAISS_ASSERT(data_->getSize() == this->ntotal);
@ -219,12 +211,12 @@ GpuIndexFlat::searchImpl_(int n,
  // Copy bitset to GPU
  if (!bitset) {
    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
-    data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true);
+    data_->query(queries, bitsetDevice, k, metric_type, metric_arg, outDistances, outIntLabels, true);
  } else {
    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_,
                                             const_cast<uint8_t*>(bitset->data()), stream,
                                             {(int) bitset->size()});
-    data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true);
+    data_->query(queries, bitsetDevice, k, metric_type, metric_arg, outDistances, outIntLabels, true);
  }

  // Convert int to idx_t
@ -236,9 +228,9 @@ GpuIndexFlat::searchImpl_(int n,
 void
 GpuIndexFlat::reconstruct(faiss::Index::idx_t key,
                          float* out) const {
-  if(config_.storeInCpu && xb_.size() > 0) {
-      memcpy (out, &(this->xb_[key * this->d]), sizeof(*out) * this->d);
-      return;
+  if (config_.storeInCpu && xb_.size() > 0) {
+    memcpy (out, &(this->xb_[key * this->d]), sizeof(*out) * this->d);
+    return;
  }

  DeviceScope scope(device_);
@ -322,21 +314,6 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n,
  fromDevice<float, 2>(residualDevice, residuals, stream);
 }

-void
-GpuIndexFlat::verifySettings_() const {
-  // If we want Hgemm, ensure that it is supported on this device
-  if (config_.useFloat16Accumulator) {
-    FAISS_THROW_IF_NOT_MSG(config_.useFloat16,
-                       "useFloat16Accumulator can only be enabled "
-                       "with useFloat16");
-
-    FAISS_THROW_IF_NOT_FMT(getDeviceSupportsFloat16Math(config_.device),
-                       "Device %d does not support Hgemm "
-                       "(useFloat16Accumulator)",
-                       config_.device);
-  }
-}
-
 //
 // GpuIndexFlatL2
 //
--- a/Show More
+++ b/Show More