From f93b4641726ad851ef75fa97fe10c04909e7f926 Mon Sep 17 00:00:00 2001 From: Xiaohai Xu Date: Thu, 26 Mar 2020 18:36:46 +0800 Subject: [PATCH] #1654 GPU Index Flat Delete (#1736) * Add Flat Index Delete Signed-off-by: sahuang * Fix log Signed-off-by: sahuang * Fix bitset Signed-off-by: sahuang * Fix reference Signed-off-by: sahuang * fix bug Signed-off-by: sahuang * fix bug Signed-off-by: sahuang * fix bug Signed-off-by: sahuang --- .../index/thirdparty/faiss/gpu/GpuDistance.cu | 5 + .../index/thirdparty/faiss/gpu/GpuIndex.cu | 20 +- .../src/index/thirdparty/faiss/gpu/GpuIndex.h | 9 +- .../thirdparty/faiss/gpu/GpuIndexFlat.cu | 15 +- .../index/thirdparty/faiss/gpu/GpuIndexFlat.h | 3 +- .../thirdparty/faiss/gpu/GpuIndexIVFFlat.cu | 15 +- .../thirdparty/faiss/gpu/GpuIndexIVFFlat.h | 3 +- .../thirdparty/faiss/gpu/GpuIndexIVFPQ.cu | 15 +- .../thirdparty/faiss/gpu/GpuIndexIVFPQ.h | 3 +- .../faiss/gpu/GpuIndexIVFSQHybrid.cu | 15 +- .../faiss/gpu/GpuIndexIVFSQHybrid.h | 3 +- .../faiss/gpu/GpuIndexIVFScalarQuantizer.cu | 15 +- .../faiss/gpu/GpuIndexIVFScalarQuantizer.h | 3 +- .../thirdparty/faiss/gpu/impl/Distance.cu | 19 ++ .../thirdparty/faiss/gpu/impl/Distance.cuh | 55 +++++ .../thirdparty/faiss/gpu/impl/FlatIndex.cu | 8 +- .../thirdparty/faiss/gpu/impl/FlatIndex.cuh | 2 + .../thirdparty/faiss/gpu/impl/IVFFlat.cu | 7 +- .../thirdparty/faiss/gpu/impl/IVFFlat.cuh | 5 +- .../index/thirdparty/faiss/gpu/impl/IVFPQ.cu | 8 +- .../index/thirdparty/faiss/gpu/impl/IVFPQ.cuh | 4 +- .../thirdparty/faiss/gpu/impl/L2Select.cu | 230 ++++++++++++++++-- .../thirdparty/faiss/gpu/impl/L2Select.cuh | 16 ++ .../thirdparty/faiss/gpu/perf/PerfSelect.cu | 3 +- .../faiss/gpu/test/TestGpuSelect.cu | 5 +- .../faiss/gpu/utils/BlockSelectFloat.cu | 2 + .../faiss/gpu/utils/BlockSelectHalf.cu | 2 + .../faiss/gpu/utils/BlockSelectKernel.cuh | 114 +++++++++ .../gpu/utils/blockselect/BlockSelectImpl.cuh | 26 +- 29 files changed, 565 insertions(+), 65 deletions(-) diff --git a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu index 6d7e67b89b..9ae5fdad5d 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu @@ -64,6 +64,9 @@ void bruteForceKnn(GpuResources* resources, // temporary memory for it DeviceTensor tOutIntIndices(mem, {numQueries, k}, stream); + // Empty bitset + auto bitsetDevice = toDevice(resources, device, nullptr, stream, {0}); + // Do the work if (metric == faiss::MetricType::METRIC_L2) { runL2Distance(resources, @@ -72,6 +75,7 @@ void bruteForceKnn(GpuResources* resources, nullptr, // compute norms in temp memory tQueries, queriesRowMajor, + bitsetDevice, k, tOutDistances, tOutIntIndices); @@ -81,6 +85,7 @@ void bruteForceKnn(GpuResources* resources, vectorsRowMajor, tQueries, queriesRowMajor, + bitsetDevice, k, tOutDistances, tOutIntIndices); diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu index 6fddbfe1fb..9ae1662055 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu @@ -235,7 +235,8 @@ GpuIndex::search(Index::idx_t n, if (dataSize >= minPagedSize_) { searchFromCpuPaged_(n, x, k, outDistances.data(), - outLabels.data()); + outLabels.data(), + bitset); usePaged = true; } } @@ -243,7 +244,8 @@ GpuIndex::search(Index::idx_t n, if (!usePaged) { searchNonPaged_(n, x, k, outDistances.data(), - outLabels.data()); + outLabels.data(), + bitset); } // Copy back if necessary @@ 
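For reference: the zero-length tensor built above is the patch-wide sentinel for "no deleted vectors". A minimal sketch of the convention, assuming the bitset tensor is a DeviceTensor<uint8_t, 1, true> and faiss's toDevice helper from gpu/utils/CopyUtils.cuh:

    // Sketch only: an empty (size-0) bitset disables filtering; the
    // selection code below branches on bitset.getSize(0) == 0 and keeps
    // the original unfiltered kernels.
    auto bitsetDevice = toDevice<uint8_t, 1>(resources, device,
                                             nullptr,  // no host data to copy
                                             stream,
                                             {0});     // zero bytes long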
-256,7 +258,8 @@ GpuIndex::searchNonPaged_(int n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const { + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset) const { auto stream = resources_->getDefaultStream(device_); // Make sure arguments are on the device we desire; use temporary @@ -267,7 +270,7 @@ GpuIndex::searchNonPaged_(int n, stream, {n, (int) this->d}); - searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData); + searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData, bitset); } void @@ -275,7 +278,8 @@ GpuIndex::searchFromCpuPaged_(int n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const { + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset) const { Tensor outDistances(outDistancesData, {n, k}); Tensor outIndices(outIndicesData, {n, k}); @@ -300,7 +304,8 @@ GpuIndex::searchFromCpuPaged_(int n, x + (size_t) cur * this->d, k, outDistancesSlice.data(), - outIndicesSlice.data()); + outIndicesSlice.data(), + bitset); } return; @@ -411,7 +416,8 @@ GpuIndex::searchFromCpuPaged_(int n, bufGpus[cur3BufIndex]->data(), k, outDistancesSlice.data(), - outIndicesSlice.data()); + outIndicesSlice.data(), + bitset); // Create completion event eventGpuExecuteDone[cur3BufIndex] = diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h index 70aca88b3c..44efd660d3 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h @@ -103,7 +103,8 @@ class GpuIndex : public faiss::Index { const float* x, int k, float* distances, - Index::idx_t* labels) const = 0; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const = 0; private: /// Handles paged adds if the add set is too large, passes to @@ -122,7 +123,8 @@ private: const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const; + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset = nullptr) const; /// Calls searchImpl_ for a single page of GPU-resident data, /// handling paging of the data and copies from the CPU @@ -130,7 +132,8 @@ private: const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const; + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset = nullptr) const; protected: /// Manages streams, cuBLAS handles and scratch memory for devices diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu index 09c23363fc..6084933b5c 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu @@ -203,7 +203,8 @@ GpuIndexFlat::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { auto stream = resources_->getDefaultStream(device_); // Input and output data are already resident on the GPU @@ -215,7 +216,17 @@ GpuIndexFlat::searchImpl_(int n, DeviceTensor outIntLabels( resources_->getMemoryManagerCurrentDevice(), {n, k}, stream); - data_->query(queries, k, outDistances, outIntLabels, true); + // Copy bitset to GPU + if (!bitset) { + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true); + } else { + auto bitsetData = bitset->bitset(); + auto bitsetDevice = toDevice(resources_, device_, + const_cast(bitsetData), stream, + {(int) bitset->size()}); + 
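// Note: when a caller does supply a ConcurrentBitset, its packed byte
// array is copied to the device once per search call. With the template
// arguments spelled out, this is presumably
//   toDevice<uint8_t, 1>(resources_, device_,
//                        const_cast<uint8_t*>(bitset->bitset()),
//                        stream, {(int) bitset->size()});
// i.e. bitset->size() is treated as a byte count, one bit per vector id.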
data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true); + } // Convert int to idx_t convertTensor(stream, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h index ecda39dc6e..03cfdf7191 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h @@ -126,7 +126,8 @@ class GpuIndexFlat : public GpuIndex { const float* x, int k, float* distances, - faiss::Index::idx_t* labels) const override; + faiss::Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; private: /// Checks user settings for consistency diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu index d946f002b8..3030ea53af 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu @@ -207,14 +207,18 @@ GpuIndexIVFFlat::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -226,11 +230,14 @@ GpuIndexIVFFlat::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -238,7 +245,9 @@ GpuIndexIVFFlat::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h index f5d6fba457..876371eda4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h @@ -70,7 +70,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; private: GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu index d75a9bf212..8ad7bd8778 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu @@ -330,14 +330,18 @@ GpuIndexIVFPQ::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) 
this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -349,11 +353,14 @@ GpuIndexIVFPQ::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -361,7 +368,9 @@ GpuIndexIVFPQ::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } int diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h index 0bde2596ae..086c9acac4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h @@ -116,7 +116,8 @@ class GpuIndexIVFPQ : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; private: void verifySettings_() const; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu index c6e4e92615..464688b23e 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu @@ -304,14 +304,18 @@ GpuIndexIVFSQHybrid::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -323,11 +327,14 @@ GpuIndexIVFSQHybrid::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -335,7 +342,9 @@ GpuIndexIVFSQHybrid::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto 
bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h index 551b85f763..049372c10f 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h @@ -79,7 +79,8 @@ class GpuIndexIVFSQHybrid : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; /// Called from train to handle SQ residual training void trainResiduals_(Index::idx_t n, const float* x); diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu index 494090da37..9e7f901bf5 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu @@ -239,14 +239,18 @@ GpuIndexIVFScalarQuantizer::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -258,11 +262,14 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -270,7 +277,9 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h index 24a811f823..47b8de249f 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h @@ -75,7 +75,8 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; /// Called from train to handle SQ residual training void trainResiduals_(Index::idx_t n, const float* x); diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu index 986c2eee3b..67b112434f 100644 --- 
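All four IVF-style indexes above (GpuIndexIVFFlat, GpuIndexIVFPQ, GpuIndexIVFSQHybrid, GpuIndexIVFScalarQuantizer) receive the same mechanical change, and none of their searchImpl_ bodies forwards the caller's bitset yet: only GpuIndexFlat does, which matches the patch title "GPU Index Flat Delete". A condensed sketch of the shared shape, assuming DeviceTensor<uint8_t, 1, true> for the bitset tensors:

    // addImpl_: adds are never filtered, so hand the quantizer an
    // empty bitset
    auto bitset = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
    index_->classifyAndAddVectors(data, labels, bitset);

    // searchImpl_: the ConcurrentBitsetPtr argument is accepted but,
    // for now, an empty device bitset is forwarded in its place
    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);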
a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu @@ -130,6 +130,7 @@ void runDistance(bool computeL2, Tensor* centroidNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -306,6 +307,7 @@ void runDistance(bool computeL2, // Write into the final output runL2SelectMin(distanceBufView, *centroidNorms, + bitset, outDistanceView, outIndexView, k, @@ -326,6 +328,7 @@ void runDistance(bool computeL2, // Write into our intermediate output runL2SelectMin(distanceBufView, centroidNormsView, + bitset, outDistanceBufColView, outIndexBufColView, k, @@ -346,12 +349,14 @@ void runDistance(bool computeL2, if (tileCols == numCentroids) { // Write into the final output runBlockSelect(distanceBufView, + bitset, outDistanceView, outIndexView, true, k, streams[curStream]); } else { // Write into the intermediate output runBlockSelect(distanceBufView, + bitset, outDistanceBufColView, outIndexBufColView, true, k, streams[curStream]); @@ -368,6 +373,7 @@ void runDistance(bool computeL2, runBlockSelectPair(outDistanceBufRowView, outIndexBufRowView, + bitset, outDistanceView, outIndexView, computeL2 ? false : true, k, streams[curStream]); @@ -384,6 +390,7 @@ void runDistance(bool computeL2, } } +// Bitset added template void runL2Distance(GpuResources* resources, Tensor& centroids, @@ -391,6 +398,7 @@ void runL2Distance(GpuResources* resources, Tensor* centroidNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -403,6 +411,7 @@ void runL2Distance(GpuResources* resources, centroidNorms, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -416,6 +425,7 @@ void runIPDistance(GpuResources* resources, bool centroidsRowMajor, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -427,6 +437,7 @@ void runIPDistance(GpuResources* resources, nullptr, // no centroid norms provided queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -444,6 +455,7 @@ runIPDistance(GpuResources* resources, bool vectorsRowMajor, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices) { @@ -452,6 +464,7 @@ runIPDistance(GpuResources* resources, vectorsRowMajor, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -464,6 +477,7 @@ runIPDistance(GpuResources* resources, bool vectorsRowMajor, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -473,6 +487,7 @@ runIPDistance(GpuResources* resources, vectorsRowMajor, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -486,6 +501,7 @@ runL2Distance(GpuResources* resources, Tensor* vectorNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -496,6 +512,7 @@ runL2Distance(GpuResources* resources, vectorNorms, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -510,6 +527,7 @@ runL2Distance(GpuResources* resources, Tensor* vectorNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -521,6 +539,7 @@ runL2Distance(GpuResources* resources, vectorNorms, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh index 
0508eeeed1..7f0242780d 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh @@ -10,6 +10,7 @@ #include #include +#include namespace faiss { namespace gpu { @@ -65,4 +66,58 @@ void runL2Distance(GpuResources* resources, bool useHgemm, bool ignoreOutDistances = false); +// Bitset added +void runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + // can be optionally pre-computed; nullptr if we + // have to compute it upon the call + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices, + // Do we care about `outDistances`? If not, we can + // take shortcuts. + bool ignoreOutDistances = false); + +/// Calculates brute-force inner product distance between `vectors` +/// and `queries`, returning the k closest results seen +void runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices); + +void runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm); + +void runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm, + bool ignoreOutDistances = false); + + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu index 08d4221dfd..510b2182fc 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu @@ -103,6 +103,7 @@ FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) { void FlatIndex::query(Tensor& input, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -119,7 +120,7 @@ FlatIndex::query(Tensor& input, DeviceTensor outDistancesHalf( mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); - query(inputHalf, k, outDistancesHalf, outIndices, exactDistance); + query(inputHalf, bitset, k, outDistancesHalf, outIndices, exactDistance); if (exactDistance) { // Convert outDistances back @@ -135,6 +136,7 @@ FlatIndex::query(Tensor& input, &norms_, input, true, // input is row major + bitset, k, outDistances, outIndices, @@ -145,6 +147,7 @@ FlatIndex::query(Tensor& input, !storeTransposed_, // is vectors row major? input, true, // input is row major + bitset, k, outDistances, outIndices); @@ -154,6 +157,7 @@ FlatIndex::query(Tensor& input, void FlatIndex::query(Tensor& input, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -167,6 +171,7 @@ FlatIndex::query(Tensor& input, &normsHalf_, input, true, // input is row major + bitset, k, outDistances, outIndices, @@ -179,6 +184,7 @@ FlatIndex::query(Tensor& input, !storeTransposed_, // is vectors row major? 
input, true, // input is row major + bitset, k, outDistances, outIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh index da7b640d69..03be3a2d4a 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh @@ -53,12 +53,14 @@ class FlatIndex { cudaStream_t stream); void query(Tensor& vecs, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, bool exactDistance); void query(Tensor& vecs, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu index ffd7f9523b..742df3fcf2 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu @@ -157,7 +157,8 @@ IVFFlat::addCodeVectorsFromCpu(int listId, int IVFFlat::classifyAndAddVectors(Tensor& vecs, - Tensor& indices) { + Tensor& indices, + Tensor& bitset) { FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); @@ -174,7 +175,7 @@ IVFFlat::classifyAndAddVectors(Tensor& vecs, listIds2d(mem, {vecs.getSize(0), 1}, stream); auto listIds = listIds2d.view<1>({vecs.getSize(0)}); - quantizer_->query(vecs, 1, listDistance2d, listIds2d, false); + quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false); // Calculate residuals for these vectors, if needed DeviceTensor @@ -326,6 +327,7 @@ IVFFlat::classifyAndAddVectors(Tensor& vecs, void IVFFlat::query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, @@ -352,6 +354,7 @@ IVFFlat::query(Tensor& queries, // Find the `nprobe` closest lists; we can use int indices both // internally and externally quantizer_->query(queries, + bitset, nprobe, coarseDistances, coarseIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh index 66c05a7d61..4ab62a489a 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh @@ -44,11 +44,14 @@ class IVFFlat : public IVFBase { /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
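// Reading guide for FlatIndex::query above: both overloads forward the
// bitset tensor unchanged into runL2Distance / runIPDistance, and the
// float path that converts queries to half reuses the same bitset,
// since it is keyed on vector ids rather than on the scalar type.
// A hypothetical call, with noFilter meaning "no deletions" (sketch only):
//
//   DeviceTensor<uint8_t, 1, true> noFilter(nullptr, {0});
//   flatIndex->query(queries, noFilter, k, outDistances, outIndices,
//                    /*exactDistance=*/true);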
int classifyAndAddVectors(Tensor& vecs, - Tensor& indices); + Tensor& indices, + Tensor& bitset); + /// Find the approximate k nearest neigbors for `queries` against /// our database void query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu index aa843fed1e..d665d34566 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu @@ -110,7 +110,8 @@ IVFPQ::setPrecomputedCodes(bool enable) { int IVFPQ::classifyAndAddVectors(Tensor& vecs, - Tensor& indices) { + Tensor& indices, + Tensor& bitset) { FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); @@ -128,7 +129,7 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, DeviceTensor listIds2d(mem, {vecs.getSize(0), 1}, stream); auto listIds = listIds2d.view<1>({vecs.getSize(0)}); - quantizer_->query(vecs, 1, listDistance, listIds2d, false); + quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false); // Copy the lists that we wish to append to back to the CPU // FIXME: really this can be into pinned memory and a true async @@ -184,6 +185,7 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, nullptr, // no precomputed norms residualsTransposeView, true, // residualsTransposeView is row major + bitset, 1, closestSubQDistanceView, closestSubQIndexView, @@ -506,6 +508,7 @@ IVFPQ::precomputeCodes_() { void IVFPQ::query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, @@ -531,6 +534,7 @@ IVFPQ::query(Tensor& queries, // Find the `nprobe` closest coarse centroids; we can use int // indices both internally and externally quantizer_->query(queries, + bitset, nprobe, coarseDistances, coarseIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh index 781104d77b..cc231459b0 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh @@ -52,11 +52,13 @@ class IVFPQ : public IVFBase { /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
int classifyAndAddVectors(Tensor& vecs, - Tensor& indices); + Tensor& indices, + Tensor& bitset); /// Find the approximate k nearest neigbors for `queries` against /// our database void query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu index 1480ec07df..ab4556a584 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu @@ -159,9 +159,160 @@ __global__ void l2SelectMinK(Tensor productDistances, } } +// With bitset included +// L2 + select kernel for k == 1, implements re-use of ||c||^2 +template +__global__ void l2SelectMin1(Tensor productDistances, + Tensor centroidDistances, + Tensor bitset, + Tensor outDistances, + Tensor outIndices) { + // Each block handles kRowsPerBlock rows of the distances (results) + Pair threadMin[kRowsPerBlock]; + __shared__ Pair blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)]; + + T distance[kRowsPerBlock]; + +#pragma unroll + for (int i = 0; i < kRowsPerBlock; ++i) { + threadMin[i].k = Limits::getMax(); + threadMin[i].v = -1; + } + + // blockIdx.x: which chunk of rows we are responsible for updating + int rowStart = blockIdx.x * kRowsPerBlock; + + // FIXME: if we have exact multiples, don't need this + bool endRow = (blockIdx.x == gridDim.x - 1); + + if (endRow) { + if (productDistances.getSize(0) % kRowsPerBlock == 0) { + endRow = false; + } + } + + if (endRow) { + for (int row = rowStart; row < productDistances.getSize(0); ++row) { + for (int col = threadIdx.x; col < productDistances.getSize(1); + col += blockDim.x) { + if (!(bitset[col >> 3] & (0x1 << (col & 0x7)))) { + distance[0] = Math::add(centroidDistances[col], + productDistances[row][col]); + + if (Math::lt(distance[0], threadMin[0].k)) { + threadMin[0].k = distance[0]; + threadMin[0].v = col; + } + } + } + + // Reduce within the block + threadMin[0] = + blockReduceAll, Min >, false, false>( + threadMin[0], Min >(), blockMin); + + if (threadIdx.x == 0) { + outDistances[row][0] = threadMin[0].k; + outIndices[row][0] = threadMin[0].v; + } + + // so we can use the shared memory again + __syncthreads(); + + threadMin[0].k = Limits::getMax(); + threadMin[0].v = -1; + } + } else { + for (int col = threadIdx.x; col < productDistances.getSize(1); + col += blockDim.x) { + T centroidDistance = centroidDistances[col]; + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + distance[row] = productDistances[rowStart + row][col]; + } + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + distance[row] = Math::add(distance[row], centroidDistance); + } + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + if (Math::lt(distance[row], threadMin[row].k)) { + threadMin[row].k = distance[row]; + threadMin[row].v = col; + } + } + } + + // Reduce within the block + blockReduceAll, Min >, false, false>( + threadMin, Min >(), blockMin); + + if (threadIdx.x == 0) { +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + outDistances[rowStart + row][0] = threadMin[row].k; + outIndices[rowStart + row][0] = threadMin[row].v; + } + } + } +} + +// With bitset included +// L2 + select kernel for k > 1, no re-use of ||c||^2 +template +__global__ void l2SelectMinK(Tensor productDistances, + Tensor centroidDistances, + Tensor bitset, + Tensor outDistances, + Tensor outIndices, + int k, T initK) { + // Each block handles a single row of the distances (results) 
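// (Filter convention, shared by l2SelectMin1 above and l2SelectMinK
//  below: a candidate column i is skipped when its bit is set, i.e.
//  when bitset[i >> 3] & (0x1 << (i & 0x7)) is non-zero. i >> 3 selects
//  the byte and i & 0x7 the bit within it, so the bitset is a packed
//  array of one bit per database vector, and a set bit marks a deleted
//  vector.)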
+ constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ T smemK[kNumWarps * NumWarpQ]; + __shared__ int smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, -1, smemK, smemV, k); + + int row = blockIdx.x; + + // Whole warps must participate in the selection + int limit = utils::roundDown(productDistances.getSize(1), kWarpSize); + int i = threadIdx.x; + + for (; i < limit; i += blockDim.x) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + T v = Math::add(centroidDistances[i], + productDistances[row][i]); + heap.add(v, i); + } + } + + if (i < productDistances.getSize(1)) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + T v = Math::add(centroidDistances[i], + productDistances[row][i]); + heap.addThreadQ(v, i); + } + } + + heap.reduce(); + for (int i = threadIdx.x; i < k; i += blockDim.x) { + outDistances[row][i] = smemK[i]; + outIndices[row][i] = smemV[i]; + } +} + + template void runL2SelectMin(Tensor& productDistances, Tensor& centroidDistances, + Tensor& bitset, Tensor& outDistances, Tensor& outIndices, int k, @@ -181,7 +332,7 @@ void runL2SelectMin(Tensor& productDistances, auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock)); l2SelectMin1 - <<>>(productDistances, centroidDistances, + <<>>(productDistances, centroidDistances, bitset, outDistances, outIndices); } else { auto grid = dim3(outDistances.getSize(0)); @@ -194,28 +345,63 @@ void runL2SelectMin(Tensor& productDistances, k, Limits::getMax()); \ } while (0) - // block size 128 for everything <= 1024 - if (k <= 32) { - RUN_L2_SELECT(128, 32, 2); - } else if (k <= 64) { - RUN_L2_SELECT(128, 64, 3); - } else if (k <= 128) { - RUN_L2_SELECT(128, 128, 3); - } else if (k <= 256) { - RUN_L2_SELECT(128, 256, 4); - } else if (k <= 512) { - RUN_L2_SELECT(128, 512, 8); - } else if (k <= 1024) { - RUN_L2_SELECT(128, 1024, 8); +#define RUN_L2_SELECT_BITSET(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ + do { \ + l2SelectMinK \ + <<>>(productDistances, centroidDistances, \ + bitset, outDistances, outIndices, \ + k, Limits::getMax()); \ + } while (0) -#if GPU_MAX_SELECTION_K >= 2048 - } else if (k <= 2048) { - // smaller block for less shared memory - RUN_L2_SELECT(64, 2048, 8); -#endif + if (bitset.getSize(0) == 0) { + // block size 128 for everything <= 1024 + if (k <= 32) { + RUN_L2_SELECT(128, 32, 2); + } else if (k <= 64) { + RUN_L2_SELECT(128, 64, 3); + } else if (k <= 128) { + RUN_L2_SELECT(128, 128, 3); + } else if (k <= 256) { + RUN_L2_SELECT(128, 256, 4); + } else if (k <= 512) { + RUN_L2_SELECT(128, 512, 8); + } else if (k <= 1024) { + RUN_L2_SELECT(128, 1024, 8); + + #if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + // smaller block for less shared memory + RUN_L2_SELECT(64, 2048, 8); + #endif + + } else { + FAISS_ASSERT(false); + } } else { - FAISS_ASSERT(false); + // With bitset + if (k <= 32) { + RUN_L2_SELECT_BITSET(128, 32, 2); + } else if (k <= 64) { + RUN_L2_SELECT_BITSET(128, 64, 3); + } else if (k <= 128) { + RUN_L2_SELECT_BITSET(128, 128, 3); + } else if (k <= 256) { + RUN_L2_SELECT_BITSET(128, 256, 4); + } else if (k <= 512) { + RUN_L2_SELECT_BITSET(128, 512, 8); + } else if (k <= 1024) { + RUN_L2_SELECT_BITSET(128, 1024, 8); + + #if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + // smaller block for less shared memory + RUN_L2_SELECT_BITSET(64, 2048, 8); + #endif + + } else { + FAISS_ASSERT(false); + } } } @@ -224,12 +410,14 @@ void runL2SelectMin(Tensor& productDistances, void runL2SelectMin(Tensor& productDistances, Tensor& 
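// (Dispatch note for runL2SelectMin above: the unfiltered and bitset
//  ladders are kept separate so that bitset.getSize(0) == 0 runs the
//  pre-existing kernels completely unchanged. Both ladders use the same
//  k -> (block size, NumWarpQ, NumThreadQ) table, including the
//  GPU_MAX_SELECTION_K >= 2048 branch, and both fall through to
//  FAISS_ASSERT(false) for unsupported k.)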
centroidDistances, + Tensor& bitset, Tensor& outDistances, Tensor& outIndices, int k, cudaStream_t stream) { runL2SelectMin(productDistances, centroidDistances, + bitset, outDistances, outIndices, k, @@ -238,12 +426,14 @@ void runL2SelectMin(Tensor& productDistances, void runL2SelectMin(Tensor& productDistances, Tensor& centroidDistances, + Tensor& bitset, Tensor& outDistances, Tensor& outIndices, int k, cudaStream_t stream) { runL2SelectMin(productDistances, centroidDistances, + bitset, outDistances, outIndices, k, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh index 95c35ca571..31b0ddfe23 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh @@ -26,4 +26,20 @@ void runL2SelectMin(Tensor& productDistances, int k, cudaStream_t stream); +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& bitset, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream); + +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& bitset, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream); + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu index 890fe5fb1e..5e2eb49f13 100644 --- a/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu @@ -51,6 +51,7 @@ int main(int argc, char** argv) { limitK = GPU_MAX_SELECTION_K; } + faiss::gpu::DeviceTensor bitset(nullptr, {0}); for (int k = startK; k <= limitK; k *= 2) { faiss::gpu::DeviceTensor gpuOutVal({FLAGS_rows, k}); faiss::gpu::DeviceTensor gpuOutInd({FLAGS_rows, k}); @@ -60,7 +61,7 @@ int main(int argc, char** argv) { faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, FLAGS_dir, k, 0); } else { - faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, + faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd, FLAGS_dir, k, 0); } } diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu b/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu index 35d5b95505..eec621bd5c 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu @@ -29,6 +29,8 @@ void testForSize(int rows, int cols, int k, bool dir, bool warp) { } } + faiss::gpu::DeviceTensor bitset(nullptr, {0}); + // row -> (val -> idx) std::unordered_map>> hostOutValAndInd; for (int r = 0; r < rows; ++r) { @@ -59,7 +61,8 @@ void testForSize(int rows, int cols, int k, bool dir, bool warp) { if (warp) { faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0); } else { - faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0); + + faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd, dir, k, 0); } // Copy back to CPU diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu index 47617fbe85..7f1febed3e 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu @@ -43,6 +43,7 @@ BLOCK_SELECT_DECL(float, false, 2048); #endif void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { @@ -93,6 +94,7 @@ void runBlockSelect(Tensor& in, void 
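// (In PerfSelect.cu and TestGpuSelect.cu above, the new mandatory
//  bitset parameter is satisfied with a zero-length tensor, so the
//  unfiltered behaviour is still what gets measured and tested. With
//  template arguments spelled out, the declaration is presumably:
//
//    faiss::gpu::DeviceTensor<uint8_t, 1, true> bitset(nullptr, {0});
//    faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd,
//                               FLAGS_dir, k, 0);
//  )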
runBlockSelectPair(Tensor& inK, Tensor& inV, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu index bc05e1485f..4f642a0ca8 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu @@ -43,6 +43,7 @@ BLOCK_SELECT_DECL(half, false, 2048); #endif void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { @@ -93,6 +94,7 @@ void runBlockSelect(Tensor& in, void runBlockSelectPair(Tensor& inK, Tensor& inV, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh index 04e76541de..e7d804c50f 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh @@ -110,24 +110,138 @@ __global__ void blockSelectPair(Tensor inK, } } +// Bitset included +template +__global__ void blockSelect(Tensor in, + Tensor bitset, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ K smemK[kNumWarps * NumWarpQ]; + __shared__ IndexType smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + + int i = threadIdx.x; + K* inStart = in[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(in.getSize(1), kWarpSize); + + for (; i < limit; i += ThreadsPerBlock) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.add(*inStart, (IndexType) i); + inStart += ThreadsPerBlock; + } + } + + // Handle last remainder fraction of a warp of elements + if (i < in.getSize(1)) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.addThreadQ(*inStart, (IndexType) i); + } + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { + outK[row][i] = smemK[i]; + outV[row][i] = smemV[i]; + } +} + +template +__global__ void blockSelectPair(Tensor inK, + Tensor inV, + Tensor bitset, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ K smemK[kNumWarps * NumWarpQ]; + __shared__ IndexType smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + + int i = threadIdx.x; + K* inKStart = inK[row][i].data(); + IndexType* inVStart = inV[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(inK.getSize(1), kWarpSize); + + for (; i < limit; i += ThreadsPerBlock) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.add(*inKStart, *inVStart); + inKStart += ThreadsPerBlock; + inVStart += ThreadsPerBlock; + } + } + + // Handle last remainder fraction of a warp of elements + if (i < inK.getSize(1)) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.addThreadQ(*inKStart, *inVStart); + } + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { + outK[row][i] = smemK[i]; + outV[row][i] = 
smemV[i]; + } +} + void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelectPair(Tensor& inKeys, Tensor& inIndices, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelectPair(Tensor& inKeys, Tensor& inIndices, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh index fe50488e5f..4c32b75194 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh @@ -13,6 +13,7 @@ #define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& in, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -22,15 +23,17 @@ extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& inK, \ Tensor& inV, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ int k, \ - cudaStream_t stream) + cudaStream_t stream); #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& in, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -52,14 +55,19 @@ auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ auto vInit = -1; \ \ - blockSelect \ - <<>>(in, outK, outV, kInit, vInit, k); \ + if (bitset.getSize(0) == 0) \ + blockSelect \ + <<>>(in, outK, outV, kInit, vInit, k); \ + else \ + blockSelect \ + <<>>(in, bitset, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } \ \ void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& inK, \ Tensor& inV, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -79,16 +87,20 @@ auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ auto vInit = -1; \ \ - blockSelectPair \ - <<>>(inK, inV, outK, outV, kInit, vInit, k); \ + if (bitset.getSize(0) == 0) \ + blockSelectPair \ + <<>>(inK, inV, outK, outV, kInit, vInit, k); \ + else \ + blockSelectPair \ + <<>>(inK, inV, bitset, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ - in, outK, outV, dir, k, stream) + in, bitset, outK, outV, dir, k, stream) #define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \ runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ - inK, inV, outK, outV, dir, k, stream) + inK, inV, bitset, outK, outV, dir, k, stream)