From f93b4641726ad851ef75fa97fe10c04909e7f926 Mon Sep 17 00:00:00 2001 From: Xiaohai Xu Date: Thu, 26 Mar 2020 18:36:46 +0800 Subject: [PATCH] #1654 GPU Index Flat Delete (#1736) * Add Flat Index Delete Signed-off-by: sahuang * Fix log Signed-off-by: sahuang * Fix bitset Signed-off-by: sahuang * Fix reference Signed-off-by: sahuang * fix bug Signed-off-by: sahuang * fix bug Signed-off-by: sahuang * fix bug Signed-off-by: sahuang --- .../index/thirdparty/faiss/gpu/GpuDistance.cu | 5 + .../index/thirdparty/faiss/gpu/GpuIndex.cu | 20 +- .../src/index/thirdparty/faiss/gpu/GpuIndex.h | 9 +- .../thirdparty/faiss/gpu/GpuIndexFlat.cu | 15 +- .../index/thirdparty/faiss/gpu/GpuIndexFlat.h | 3 +- .../thirdparty/faiss/gpu/GpuIndexIVFFlat.cu | 15 +- .../thirdparty/faiss/gpu/GpuIndexIVFFlat.h | 3 +- .../thirdparty/faiss/gpu/GpuIndexIVFPQ.cu | 15 +- .../thirdparty/faiss/gpu/GpuIndexIVFPQ.h | 3 +- .../faiss/gpu/GpuIndexIVFSQHybrid.cu | 15 +- .../faiss/gpu/GpuIndexIVFSQHybrid.h | 3 +- .../faiss/gpu/GpuIndexIVFScalarQuantizer.cu | 15 +- .../faiss/gpu/GpuIndexIVFScalarQuantizer.h | 3 +- .../thirdparty/faiss/gpu/impl/Distance.cu | 19 ++ .../thirdparty/faiss/gpu/impl/Distance.cuh | 55 +++++ .../thirdparty/faiss/gpu/impl/FlatIndex.cu | 8 +- .../thirdparty/faiss/gpu/impl/FlatIndex.cuh | 2 + .../thirdparty/faiss/gpu/impl/IVFFlat.cu | 7 +- .../thirdparty/faiss/gpu/impl/IVFFlat.cuh | 5 +- .../index/thirdparty/faiss/gpu/impl/IVFPQ.cu | 8 +- .../index/thirdparty/faiss/gpu/impl/IVFPQ.cuh | 4 +- .../thirdparty/faiss/gpu/impl/L2Select.cu | 230 ++++++++++++++++-- .../thirdparty/faiss/gpu/impl/L2Select.cuh | 16 ++ .../thirdparty/faiss/gpu/perf/PerfSelect.cu | 3 +- .../faiss/gpu/test/TestGpuSelect.cu | 5 +- .../faiss/gpu/utils/BlockSelectFloat.cu | 2 + .../faiss/gpu/utils/BlockSelectHalf.cu | 2 + .../faiss/gpu/utils/BlockSelectKernel.cuh | 114 +++++++++ .../gpu/utils/blockselect/BlockSelectImpl.cuh | 26 +- 29 files changed, 565 insertions(+), 65 deletions(-) diff --git a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu index 6d7e67b89b..9ae5fdad5d 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuDistance.cu @@ -64,6 +64,9 @@ void bruteForceKnn(GpuResources* resources, // temporary memory for it DeviceTensor tOutIntIndices(mem, {numQueries, k}, stream); + // Empty bitset + auto bitsetDevice = toDevice(resources, device, nullptr, stream, {0}); + // Do the work if (metric == faiss::MetricType::METRIC_L2) { runL2Distance(resources, @@ -72,6 +75,7 @@ void bruteForceKnn(GpuResources* resources, nullptr, // compute norms in temp memory tQueries, queriesRowMajor, + bitsetDevice, k, tOutDistances, tOutIntIndices); @@ -81,6 +85,7 @@ void bruteForceKnn(GpuResources* resources, vectorsRowMajor, tQueries, queriesRowMajor, + bitsetDevice, k, tOutDistances, tOutIntIndices); diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu index 6fddbfe1fb..9ae1662055 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.cu @@ -235,7 +235,8 @@ GpuIndex::search(Index::idx_t n, if (dataSize >= minPagedSize_) { searchFromCpuPaged_(n, x, k, outDistances.data(), - outLabels.data()); + outLabels.data(), + bitset); usePaged = true; } } @@ -243,7 +244,8 @@ GpuIndex::search(Index::idx_t n, if (!usePaged) { searchNonPaged_(n, x, k, outDistances.data(), - outLabels.data()); + outLabels.data(), + bitset); } // Copy back if necessary @@ 
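For reference: the zero-length tensor built above is the patch-wide sentinel for "no deleted vectors". A minimal sketch of the convention, assuming the bitset tensor is a DeviceTensor<uint8_t, 1, true> and faiss's toDevice helper from gpu/utils/CopyUtils.cuh:

    // Sketch only: an empty (size-0) bitset disables filtering; the
    // selection code below branches on bitset.getSize(0) == 0 and keeps
    // the original unfiltered kernels.
    auto bitsetDevice = toDevice<uint8_t, 1>(resources, device,
                                             nullptr,  // no host data to copy
                                             stream,
                                             {0});     // zero bytes long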
-256,7 +258,8 @@ GpuIndex::searchNonPaged_(int n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const { + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset) const { auto stream = resources_->getDefaultStream(device_); // Make sure arguments are on the device we desire; use temporary @@ -267,7 +270,7 @@ GpuIndex::searchNonPaged_(int n, stream, {n, (int) this->d}); - searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData); + searchImpl_(n, vecs.data(), k, outDistancesData, outIndicesData, bitset); } void @@ -275,7 +278,8 @@ GpuIndex::searchFromCpuPaged_(int n, const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const { + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset) const { Tensor outDistances(outDistancesData, {n, k}); Tensor outIndices(outIndicesData, {n, k}); @@ -300,7 +304,8 @@ GpuIndex::searchFromCpuPaged_(int n, x + (size_t) cur * this->d, k, outDistancesSlice.data(), - outIndicesSlice.data()); + outIndicesSlice.data(), + bitset); } return; @@ -411,7 +416,8 @@ GpuIndex::searchFromCpuPaged_(int n, bufGpus[cur3BufIndex]->data(), k, outDistancesSlice.data(), - outIndicesSlice.data()); + outIndicesSlice.data(), + bitset); // Create completion event eventGpuExecuteDone[cur3BufIndex] = diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h index 70aca88b3c..44efd660d3 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndex.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndex.h @@ -103,7 +103,8 @@ class GpuIndex : public faiss::Index { const float* x, int k, float* distances, - Index::idx_t* labels) const = 0; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const = 0; private: /// Handles paged adds if the add set is too large, passes to @@ -122,7 +123,8 @@ private: const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const; + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset = nullptr) const; /// Calls searchImpl_ for a single page of GPU-resident data, /// handling paging of the data and copies from the CPU @@ -130,7 +132,8 @@ private: const float* x, int k, float* outDistancesData, - Index::idx_t* outIndicesData) const; + Index::idx_t* outIndicesData, + ConcurrentBitsetPtr bitset = nullptr) const; protected: /// Manages streams, cuBLAS handles and scratch memory for devices diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu index 09c23363fc..6084933b5c 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.cu @@ -203,7 +203,8 @@ GpuIndexFlat::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { auto stream = resources_->getDefaultStream(device_); // Input and output data are already resident on the GPU @@ -215,7 +216,17 @@ GpuIndexFlat::searchImpl_(int n, DeviceTensor outIntLabels( resources_->getMemoryManagerCurrentDevice(), {n, k}, stream); - data_->query(queries, k, outDistances, outIntLabels, true); + // Copy bitset to GPU + if (!bitset) { + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true); + } else { + auto bitsetData = bitset->bitset(); + auto bitsetDevice = toDevice(resources_, device_, + const_cast(bitsetData), stream, + {(int) bitset->size()}); + 
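// Note: when a caller does supply a ConcurrentBitset, its packed byte
// array is copied to the device once per search call. With the template
// arguments spelled out, this is presumably
//   toDevice<uint8_t, 1>(resources_, device_,
//                        const_cast<uint8_t*>(bitset->bitset()),
//                        stream, {(int) bitset->size()});
// i.e. bitset->size() is treated as a byte count, one bit per vector id.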
data_->query(queries, bitsetDevice, k, outDistances, outIntLabels, true); + } // Convert int to idx_t convertTensor(stream, diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h index ecda39dc6e..03cfdf7191 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexFlat.h @@ -126,7 +126,8 @@ class GpuIndexFlat : public GpuIndex { const float* x, int k, float* distances, - faiss::Index::idx_t* labels) const override; + faiss::Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; private: /// Checks user settings for consistency diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu index d946f002b8..3030ea53af 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu @@ -207,14 +207,18 @@ GpuIndexIVFFlat::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -226,11 +230,14 @@ GpuIndexIVFFlat::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -238,7 +245,9 @@ GpuIndexIVFFlat::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h index f5d6fba457..876371eda4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.h @@ -70,7 +70,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; private: GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu index d75a9bf212..8ad7bd8778 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu @@ -330,14 +330,18 @@ GpuIndexIVFPQ::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) 
this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -349,11 +353,14 @@ GpuIndexIVFPQ::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -361,7 +368,9 @@ GpuIndexIVFPQ::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } int diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h index 0bde2596ae..086c9acac4 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.h @@ -116,7 +116,8 @@ class GpuIndexIVFPQ : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; private: void verifySettings_() const; diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu index c6e4e92615..464688b23e 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.cu @@ -304,14 +304,18 @@ GpuIndexIVFSQHybrid::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -323,11 +327,14 @@ GpuIndexIVFSQHybrid::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -335,7 +342,9 @@ GpuIndexIVFSQHybrid::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto 
bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h index 551b85f763..049372c10f 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFSQHybrid.h @@ -79,7 +79,8 @@ class GpuIndexIVFSQHybrid : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; /// Called from train to handle SQ residual training void trainResiduals_(Index::idx_t n, const float* x); diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu index 494090da37..9e7f901bf5 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.cu @@ -239,14 +239,18 @@ GpuIndexIVFScalarQuantizer::addImpl_(int n, FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor data(const_cast(x), {n, (int) this->d}); + auto bitset = toDevice(resources_, device_, nullptr, stream, {0}); + static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor labels(const_cast(xids), {n}); // Not all vectors may be able to be added (some may contain NaNs etc) - index_->classifyAndAddVectors(data, labels); + index_->classifyAndAddVectors(data, labels, bitset); // but keep the ntotal based on the total number of vectors that we attempted // to add @@ -258,11 +262,14 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n, const float* x, int k, float* distances, - Index::idx_t* labels) const { + Index::idx_t* labels, + ConcurrentBitsetPtr bitset) const { // Device is already set in GpuIndex::search FAISS_ASSERT(index_); FAISS_ASSERT(n > 0); + auto stream = resources_->getDefaultStream(device_); + // Data is already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); Tensor outDistances(distances, {n, k}); @@ -270,7 +277,9 @@ GpuIndexIVFScalarQuantizer::searchImpl_(int n, static_assert(sizeof(long) == sizeof(Index::idx_t), "size mismatch"); Tensor outLabels(const_cast(labels), {n, k}); - index_->query(queries, nprobe, k, outDistances, outLabels); + auto bitsetDevice = toDevice(resources_, device_, nullptr, stream, {0}); + + index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels); } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h index 24a811f823..47b8de249f 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFScalarQuantizer.h @@ -75,7 +75,8 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF { const float* x, int k, float* distances, - Index::idx_t* labels) const override; + Index::idx_t* labels, + ConcurrentBitsetPtr bitset = nullptr) const override; /// Called from train to handle SQ residual training void trainResiduals_(Index::idx_t n, const float* x); diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu index 986c2eee3b..67b112434f 100644 --- 
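All four IVF-style indexes above (GpuIndexIVFFlat, GpuIndexIVFPQ, GpuIndexIVFSQHybrid, GpuIndexIVFScalarQuantizer) receive the same mechanical change, and none of their searchImpl_ bodies forwards the caller's bitset yet: only GpuIndexFlat does, which matches the patch title "GPU Index Flat Delete". A condensed sketch of the shared shape, assuming DeviceTensor<uint8_t, 1, true> for the bitset tensors:

    // addImpl_: adds are never filtered, so hand the quantizer an
    // empty bitset
    auto bitset = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
    index_->classifyAndAddVectors(data, labels, bitset);

    // searchImpl_: the ConcurrentBitsetPtr argument is accepted but,
    // for now, an empty device bitset is forwarded in its place
    auto bitsetDevice = toDevice<uint8_t, 1>(resources_, device_, nullptr, stream, {0});
    index_->query(queries, bitsetDevice, nprobe, k, outDistances, outLabels);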
a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu @@ -130,6 +130,7 @@ void runDistance(bool computeL2, Tensor* centroidNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -306,6 +307,7 @@ void runDistance(bool computeL2, // Write into the final output runL2SelectMin(distanceBufView, *centroidNorms, + bitset, outDistanceView, outIndexView, k, @@ -326,6 +328,7 @@ void runDistance(bool computeL2, // Write into our intermediate output runL2SelectMin(distanceBufView, centroidNormsView, + bitset, outDistanceBufColView, outIndexBufColView, k, @@ -346,12 +349,14 @@ void runDistance(bool computeL2, if (tileCols == numCentroids) { // Write into the final output runBlockSelect(distanceBufView, + bitset, outDistanceView, outIndexView, true, k, streams[curStream]); } else { // Write into the intermediate output runBlockSelect(distanceBufView, + bitset, outDistanceBufColView, outIndexBufColView, true, k, streams[curStream]); @@ -368,6 +373,7 @@ void runDistance(bool computeL2, runBlockSelectPair(outDistanceBufRowView, outIndexBufRowView, + bitset, outDistanceView, outIndexView, computeL2 ? false : true, k, streams[curStream]); @@ -384,6 +390,7 @@ void runDistance(bool computeL2, } } +// Bitset added template void runL2Distance(GpuResources* resources, Tensor& centroids, @@ -391,6 +398,7 @@ void runL2Distance(GpuResources* resources, Tensor* centroidNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -403,6 +411,7 @@ void runL2Distance(GpuResources* resources, centroidNorms, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -416,6 +425,7 @@ void runIPDistance(GpuResources* resources, bool centroidsRowMajor, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -427,6 +437,7 @@ void runIPDistance(GpuResources* resources, nullptr, // no centroid norms provided queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -444,6 +455,7 @@ runIPDistance(GpuResources* resources, bool vectorsRowMajor, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices) { @@ -452,6 +464,7 @@ runIPDistance(GpuResources* resources, vectorsRowMajor, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -464,6 +477,7 @@ runIPDistance(GpuResources* resources, bool vectorsRowMajor, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -473,6 +487,7 @@ runIPDistance(GpuResources* resources, vectorsRowMajor, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -486,6 +501,7 @@ runL2Distance(GpuResources* resources, Tensor* vectorNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -496,6 +512,7 @@ runL2Distance(GpuResources* resources, vectorNorms, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, @@ -510,6 +527,7 @@ runL2Distance(GpuResources* resources, Tensor* vectorNorms, Tensor& queries, bool queriesRowMajor, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -521,6 +539,7 @@ runL2Distance(GpuResources* resources, vectorNorms, queries, queriesRowMajor, + bitset, k, outDistances, outIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh index 
0508eeeed1..7f0242780d 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh @@ -10,6 +10,7 @@ #include #include +#include namespace faiss { namespace gpu { @@ -65,4 +66,58 @@ void runL2Distance(GpuResources* resources, bool useHgemm, bool ignoreOutDistances = false); +// Bitset added +void runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + // can be optionally pre-computed; nullptr if we + // have to compute it upon the call + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices, + // Do we care about `outDistances`? If not, we can + // take shortcuts. + bool ignoreOutDistances = false); + +/// Calculates brute-force inner product distance between `vectors` +/// and `queries`, returning the k closest results seen +void runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices); + +void runIPDistance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm); + +void runL2Distance(GpuResources* resources, + Tensor& vectors, + bool vectorsRowMajor, + Tensor* vectorNorms, + Tensor& queries, + bool queriesRowMajor, + Tensor& bitset, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool useHgemm, + bool ignoreOutDistances = false); + + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu index 08d4221dfd..510b2182fc 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu @@ -103,6 +103,7 @@ FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) { void FlatIndex::query(Tensor& input, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -119,7 +120,7 @@ FlatIndex::query(Tensor& input, DeviceTensor outDistancesHalf( mem, {outDistances.getSize(0), outDistances.getSize(1)}, stream); - query(inputHalf, k, outDistancesHalf, outIndices, exactDistance); + query(inputHalf, bitset, k, outDistancesHalf, outIndices, exactDistance); if (exactDistance) { // Convert outDistances back @@ -135,6 +136,7 @@ FlatIndex::query(Tensor& input, &norms_, input, true, // input is row major + bitset, k, outDistances, outIndices, @@ -145,6 +147,7 @@ FlatIndex::query(Tensor& input, !storeTransposed_, // is vectors row major? input, true, // input is row major + bitset, k, outDistances, outIndices); @@ -154,6 +157,7 @@ FlatIndex::query(Tensor& input, void FlatIndex::query(Tensor& input, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, @@ -167,6 +171,7 @@ FlatIndex::query(Tensor& input, &normsHalf_, input, true, // input is row major + bitset, k, outDistances, outIndices, @@ -179,6 +184,7 @@ FlatIndex::query(Tensor& input, !storeTransposed_, // is vectors row major? 
input, true, // input is row major + bitset, k, outDistances, outIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh index da7b640d69..03be3a2d4a 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh @@ -53,12 +53,14 @@ class FlatIndex { cudaStream_t stream); void query(Tensor& vecs, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, bool exactDistance); void query(Tensor& vecs, + Tensor& bitset, int k, Tensor& outDistances, Tensor& outIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu index ffd7f9523b..742df3fcf2 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cu @@ -157,7 +157,8 @@ IVFFlat::addCodeVectorsFromCpu(int listId, int IVFFlat::classifyAndAddVectors(Tensor& vecs, - Tensor& indices) { + Tensor& indices, + Tensor& bitset) { FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); @@ -174,7 +175,7 @@ IVFFlat::classifyAndAddVectors(Tensor& vecs, listIds2d(mem, {vecs.getSize(0), 1}, stream); auto listIds = listIds2d.view<1>({vecs.getSize(0)}); - quantizer_->query(vecs, 1, listDistance2d, listIds2d, false); + quantizer_->query(vecs, bitset, 1, listDistance2d, listIds2d, false); // Calculate residuals for these vectors, if needed DeviceTensor @@ -326,6 +327,7 @@ IVFFlat::classifyAndAddVectors(Tensor& vecs, void IVFFlat::query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, @@ -352,6 +354,7 @@ IVFFlat::query(Tensor& queries, // Find the `nprobe` closest lists; we can use int indices both // internally and externally quantizer_->query(queries, + bitset, nprobe, coarseDistances, coarseIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh index 66c05a7d61..4ab62a489a 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFFlat.cuh @@ -44,11 +44,14 @@ class IVFFlat : public IVFBase { /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
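// Reading guide for FlatIndex::query above: both overloads forward the
// bitset tensor unchanged into runL2Distance / runIPDistance, and the
// float path that converts queries to half reuses the same bitset,
// since it is keyed on vector ids rather than on the scalar type.
// A hypothetical call, with noFilter meaning "no deletions" (sketch only):
//
//   DeviceTensor<uint8_t, 1, true> noFilter(nullptr, {0});
//   flatIndex->query(queries, noFilter, k, outDistances, outIndices,
//                    /*exactDistance=*/true);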
int classifyAndAddVectors(Tensor& vecs, - Tensor& indices); + Tensor& indices, + Tensor& bitset); + /// Find the approximate k nearest neigbors for `queries` against /// our database void query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu index aa843fed1e..d665d34566 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu @@ -110,7 +110,8 @@ IVFPQ::setPrecomputedCodes(bool enable) { int IVFPQ::classifyAndAddVectors(Tensor& vecs, - Tensor& indices) { + Tensor& indices, + Tensor& bitset) { FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0)); FAISS_ASSERT(vecs.getSize(1) == dim_); @@ -128,7 +129,7 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, DeviceTensor listIds2d(mem, {vecs.getSize(0), 1}, stream); auto listIds = listIds2d.view<1>({vecs.getSize(0)}); - quantizer_->query(vecs, 1, listDistance, listIds2d, false); + quantizer_->query(vecs, bitset, 1, listDistance, listIds2d, false); // Copy the lists that we wish to append to back to the CPU // FIXME: really this can be into pinned memory and a true async @@ -184,6 +185,7 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, nullptr, // no precomputed norms residualsTransposeView, true, // residualsTransposeView is row major + bitset, 1, closestSubQDistanceView, closestSubQIndexView, @@ -506,6 +508,7 @@ IVFPQ::precomputeCodes_() { void IVFPQ::query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, @@ -531,6 +534,7 @@ IVFPQ::query(Tensor& queries, // Find the `nprobe` closest coarse centroids; we can use int // indices both internally and externally quantizer_->query(queries, + bitset, nprobe, coarseDistances, coarseIndices, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh index 781104d77b..cc231459b0 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh @@ -52,11 +52,13 @@ class IVFPQ : public IVFBase { /// Returns the number of vectors successfully added. Vectors may /// not be able to be added because they contain NaNs. 
int classifyAndAddVectors(Tensor& vecs, - Tensor& indices); + Tensor& indices, + Tensor& bitset); /// Find the approximate k nearest neigbors for `queries` against /// our database void query(Tensor& queries, + Tensor& bitset, int nprobe, int k, Tensor& outDistances, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu index 1480ec07df..ab4556a584 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu @@ -159,9 +159,160 @@ __global__ void l2SelectMinK(Tensor productDistances, } } +// With bitset included +// L2 + select kernel for k == 1, implements re-use of ||c||^2 +template +__global__ void l2SelectMin1(Tensor productDistances, + Tensor centroidDistances, + Tensor bitset, + Tensor outDistances, + Tensor outIndices) { + // Each block handles kRowsPerBlock rows of the distances (results) + Pair threadMin[kRowsPerBlock]; + __shared__ Pair blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)]; + + T distance[kRowsPerBlock]; + +#pragma unroll + for (int i = 0; i < kRowsPerBlock; ++i) { + threadMin[i].k = Limits::getMax(); + threadMin[i].v = -1; + } + + // blockIdx.x: which chunk of rows we are responsible for updating + int rowStart = blockIdx.x * kRowsPerBlock; + + // FIXME: if we have exact multiples, don't need this + bool endRow = (blockIdx.x == gridDim.x - 1); + + if (endRow) { + if (productDistances.getSize(0) % kRowsPerBlock == 0) { + endRow = false; + } + } + + if (endRow) { + for (int row = rowStart; row < productDistances.getSize(0); ++row) { + for (int col = threadIdx.x; col < productDistances.getSize(1); + col += blockDim.x) { + if (!(bitset[col >> 3] & (0x1 << (col & 0x7)))) { + distance[0] = Math::add(centroidDistances[col], + productDistances[row][col]); + + if (Math::lt(distance[0], threadMin[0].k)) { + threadMin[0].k = distance[0]; + threadMin[0].v = col; + } + } + } + + // Reduce within the block + threadMin[0] = + blockReduceAll, Min >, false, false>( + threadMin[0], Min >(), blockMin); + + if (threadIdx.x == 0) { + outDistances[row][0] = threadMin[0].k; + outIndices[row][0] = threadMin[0].v; + } + + // so we can use the shared memory again + __syncthreads(); + + threadMin[0].k = Limits::getMax(); + threadMin[0].v = -1; + } + } else { + for (int col = threadIdx.x; col < productDistances.getSize(1); + col += blockDim.x) { + T centroidDistance = centroidDistances[col]; + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + distance[row] = productDistances[rowStart + row][col]; + } + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + distance[row] = Math::add(distance[row], centroidDistance); + } + +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + if (Math::lt(distance[row], threadMin[row].k)) { + threadMin[row].k = distance[row]; + threadMin[row].v = col; + } + } + } + + // Reduce within the block + blockReduceAll, Min >, false, false>( + threadMin, Min >(), blockMin); + + if (threadIdx.x == 0) { +#pragma unroll + for (int row = 0; row < kRowsPerBlock; ++row) { + outDistances[rowStart + row][0] = threadMin[row].k; + outIndices[rowStart + row][0] = threadMin[row].v; + } + } + } +} + +// With bitset included +// L2 + select kernel for k > 1, no re-use of ||c||^2 +template +__global__ void l2SelectMinK(Tensor productDistances, + Tensor centroidDistances, + Tensor bitset, + Tensor outDistances, + Tensor outIndices, + int k, T initK) { + // Each block handles a single row of the distances (results) 
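// (Filter convention, shared by l2SelectMin1 above and l2SelectMinK
//  below: a candidate column i is skipped when its bit is set, i.e.
//  when bitset[i >> 3] & (0x1 << (i & 0x7)) is non-zero. i >> 3 selects
//  the byte and i & 0x7 the bit within it, so the bitset is a packed
//  array of one bit per database vector, and a set bit marks a deleted
//  vector.)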
+ constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ T smemK[kNumWarps * NumWarpQ]; + __shared__ int smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, -1, smemK, smemV, k); + + int row = blockIdx.x; + + // Whole warps must participate in the selection + int limit = utils::roundDown(productDistances.getSize(1), kWarpSize); + int i = threadIdx.x; + + for (; i < limit; i += blockDim.x) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + T v = Math::add(centroidDistances[i], + productDistances[row][i]); + heap.add(v, i); + } + } + + if (i < productDistances.getSize(1)) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + T v = Math::add(centroidDistances[i], + productDistances[row][i]); + heap.addThreadQ(v, i); + } + } + + heap.reduce(); + for (int i = threadIdx.x; i < k; i += blockDim.x) { + outDistances[row][i] = smemK[i]; + outIndices[row][i] = smemV[i]; + } +} + + template void runL2SelectMin(Tensor& productDistances, Tensor& centroidDistances, + Tensor& bitset, Tensor& outDistances, Tensor& outIndices, int k, @@ -181,7 +332,7 @@ void runL2SelectMin(Tensor& productDistances, auto grid = dim3(utils::divUp(outDistances.getSize(0), kRowsPerBlock)); l2SelectMin1 - <<>>(productDistances, centroidDistances, + <<>>(productDistances, centroidDistances, bitset, outDistances, outIndices); } else { auto grid = dim3(outDistances.getSize(0)); @@ -194,28 +345,63 @@ void runL2SelectMin(Tensor& productDistances, k, Limits::getMax()); \ } while (0) - // block size 128 for everything <= 1024 - if (k <= 32) { - RUN_L2_SELECT(128, 32, 2); - } else if (k <= 64) { - RUN_L2_SELECT(128, 64, 3); - } else if (k <= 128) { - RUN_L2_SELECT(128, 128, 3); - } else if (k <= 256) { - RUN_L2_SELECT(128, 256, 4); - } else if (k <= 512) { - RUN_L2_SELECT(128, 512, 8); - } else if (k <= 1024) { - RUN_L2_SELECT(128, 1024, 8); +#define RUN_L2_SELECT_BITSET(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ + do { \ + l2SelectMinK \ + <<>>(productDistances, centroidDistances, \ + bitset, outDistances, outIndices, \ + k, Limits::getMax()); \ + } while (0) -#if GPU_MAX_SELECTION_K >= 2048 - } else if (k <= 2048) { - // smaller block for less shared memory - RUN_L2_SELECT(64, 2048, 8); -#endif + if (bitset.getSize(0) == 0) { + // block size 128 for everything <= 1024 + if (k <= 32) { + RUN_L2_SELECT(128, 32, 2); + } else if (k <= 64) { + RUN_L2_SELECT(128, 64, 3); + } else if (k <= 128) { + RUN_L2_SELECT(128, 128, 3); + } else if (k <= 256) { + RUN_L2_SELECT(128, 256, 4); + } else if (k <= 512) { + RUN_L2_SELECT(128, 512, 8); + } else if (k <= 1024) { + RUN_L2_SELECT(128, 1024, 8); + + #if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + // smaller block for less shared memory + RUN_L2_SELECT(64, 2048, 8); + #endif + + } else { + FAISS_ASSERT(false); + } } else { - FAISS_ASSERT(false); + // With bitset + if (k <= 32) { + RUN_L2_SELECT_BITSET(128, 32, 2); + } else if (k <= 64) { + RUN_L2_SELECT_BITSET(128, 64, 3); + } else if (k <= 128) { + RUN_L2_SELECT_BITSET(128, 128, 3); + } else if (k <= 256) { + RUN_L2_SELECT_BITSET(128, 256, 4); + } else if (k <= 512) { + RUN_L2_SELECT_BITSET(128, 512, 8); + } else if (k <= 1024) { + RUN_L2_SELECT_BITSET(128, 1024, 8); + + #if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + // smaller block for less shared memory + RUN_L2_SELECT_BITSET(64, 2048, 8); + #endif + + } else { + FAISS_ASSERT(false); + } } } @@ -224,12 +410,14 @@ void runL2SelectMin(Tensor& productDistances, void runL2SelectMin(Tensor& productDistances, Tensor& 
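// (Dispatch note for runL2SelectMin above: the unfiltered and bitset
//  ladders are kept separate so that bitset.getSize(0) == 0 runs the
//  pre-existing kernels completely unchanged. Both ladders use the same
//  k -> (block size, NumWarpQ, NumThreadQ) table, including the
//  GPU_MAX_SELECTION_K >= 2048 branch, and both fall through to
//  FAISS_ASSERT(false) for unsupported k.)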
centroidDistances, + Tensor& bitset, Tensor& outDistances, Tensor& outIndices, int k, cudaStream_t stream) { runL2SelectMin(productDistances, centroidDistances, + bitset, outDistances, outIndices, k, @@ -238,12 +426,14 @@ void runL2SelectMin(Tensor& productDistances, void runL2SelectMin(Tensor& productDistances, Tensor& centroidDistances, + Tensor& bitset, Tensor& outDistances, Tensor& outIndices, int k, cudaStream_t stream) { runL2SelectMin(productDistances, centroidDistances, + bitset, outDistances, outIndices, k, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh index 95c35ca571..31b0ddfe23 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh @@ -26,4 +26,20 @@ void runL2SelectMin(Tensor& productDistances, int k, cudaStream_t stream); +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& bitset, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream); + +void runL2SelectMin(Tensor& productDistances, + Tensor& centroidDistances, + Tensor& bitset, + Tensor& outDistances, + Tensor& outIndices, + int k, + cudaStream_t stream); + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu b/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu index 890fe5fb1e..5e2eb49f13 100644 --- a/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu +++ b/core/src/index/thirdparty/faiss/gpu/perf/PerfSelect.cu @@ -51,6 +51,7 @@ int main(int argc, char** argv) { limitK = GPU_MAX_SELECTION_K; } + faiss::gpu::DeviceTensor bitset(nullptr, {0}); for (int k = startK; k <= limitK; k *= 2) { faiss::gpu::DeviceTensor gpuOutVal({FLAGS_rows, k}); faiss::gpu::DeviceTensor gpuOutInd({FLAGS_rows, k}); @@ -60,7 +61,7 @@ int main(int argc, char** argv) { faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, FLAGS_dir, k, 0); } else { - faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, + faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd, FLAGS_dir, k, 0); } } diff --git a/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu b/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu index 35d5b95505..eec621bd5c 100644 --- a/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu +++ b/core/src/index/thirdparty/faiss/gpu/test/TestGpuSelect.cu @@ -29,6 +29,8 @@ void testForSize(int rows, int cols, int k, bool dir, bool warp) { } } + faiss::gpu::DeviceTensor bitset(nullptr, {0}); + // row -> (val -> idx) std::unordered_map>> hostOutValAndInd; for (int r = 0; r < rows; ++r) { @@ -59,7 +61,8 @@ void testForSize(int rows, int cols, int k, bool dir, bool warp) { if (warp) { faiss::gpu::runWarpSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0); } else { - faiss::gpu::runBlockSelect(gpuVal, gpuOutVal, gpuOutInd, dir, k, 0); + + faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd, dir, k, 0); } // Copy back to CPU diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu index 47617fbe85..7f1febed3e 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectFloat.cu @@ -43,6 +43,7 @@ BLOCK_SELECT_DECL(float, false, 2048); #endif void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { @@ -93,6 +94,7 @@ void runBlockSelect(Tensor& in, void 
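// (In PerfSelect.cu and TestGpuSelect.cu above, the new mandatory
//  bitset parameter is satisfied with a zero-length tensor, so the
//  unfiltered behaviour is still what gets measured and tested. With
//  template arguments spelled out, the declaration is presumably:
//
//    faiss::gpu::DeviceTensor<uint8_t, 1, true> bitset(nullptr, {0});
//    faiss::gpu::runBlockSelect(gpuVal, bitset, gpuOutVal, gpuOutInd,
//                               FLAGS_dir, k, 0);
//  )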
runBlockSelectPair(Tensor& inK, Tensor& inV, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu index bc05e1485f..4f642a0ca8 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu @@ -43,6 +43,7 @@ BLOCK_SELECT_DECL(half, false, 2048); #endif void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { @@ -93,6 +94,7 @@ void runBlockSelect(Tensor& in, void runBlockSelectPair(Tensor& inK, Tensor& inV, + Tensor& bitset, Tensor& outK, Tensor& outV, bool dir, int k, cudaStream_t stream) { diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh index 04e76541de..e7d804c50f 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh @@ -110,24 +110,138 @@ __global__ void blockSelectPair(Tensor inK, } } +// Bitset included +template +__global__ void blockSelect(Tensor in, + Tensor bitset, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ K smemK[kNumWarps * NumWarpQ]; + __shared__ IndexType smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + + int i = threadIdx.x; + K* inStart = in[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(in.getSize(1), kWarpSize); + + for (; i < limit; i += ThreadsPerBlock) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.add(*inStart, (IndexType) i); + inStart += ThreadsPerBlock; + } + } + + // Handle last remainder fraction of a warp of elements + if (i < in.getSize(1)) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.addThreadQ(*inStart, (IndexType) i); + } + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { + outK[row][i] = smemK[i]; + outV[row][i] = smemV[i]; + } +} + +template +__global__ void blockSelectPair(Tensor inK, + Tensor inV, + Tensor bitset, + Tensor outK, + Tensor outV, + K initK, + IndexType initV, + int k) { + constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; + + __shared__ K smemK[kNumWarps * NumWarpQ]; + __shared__ IndexType smemV[kNumWarps * NumWarpQ]; + + BlockSelect, + NumWarpQ, NumThreadQ, ThreadsPerBlock> + heap(initK, initV, smemK, smemV, k); + + // Grid is exactly sized to rows available + int row = blockIdx.x; + + int i = threadIdx.x; + K* inKStart = inK[row][i].data(); + IndexType* inVStart = inV[row][i].data(); + + // Whole warps must participate in the selection + int limit = utils::roundDown(inK.getSize(1), kWarpSize); + + for (; i < limit; i += ThreadsPerBlock) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.add(*inKStart, *inVStart); + inKStart += ThreadsPerBlock; + inVStart += ThreadsPerBlock; + } + } + + // Handle last remainder fraction of a warp of elements + if (i < inK.getSize(1)) { + if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + heap.addThreadQ(*inKStart, *inVStart); + } + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += ThreadsPerBlock) { + outK[row][i] = smemK[i]; + outV[row][i] = 
smemV[i]; + } +} + void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelectPair(Tensor& inKeys, Tensor& inIndices, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelect(Tensor& in, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); void runBlockSelectPair(Tensor& inKeys, Tensor& inIndices, + Tensor& bitset, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh index fe50488e5f..4c32b75194 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectImpl.cuh @@ -13,6 +13,7 @@ #define BLOCK_SELECT_DECL(TYPE, DIR, WARP_Q) \ extern void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& in, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -22,15 +23,17 @@ extern void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& inK, \ Tensor& inV, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ int k, \ - cudaStream_t stream) + cudaStream_t stream); #define BLOCK_SELECT_IMPL(TYPE, DIR, WARP_Q, THREAD_Q) \ void runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& in, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -52,14 +55,19 @@ auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ auto vInit = -1; \ \ - blockSelect \ - <<>>(in, outK, outV, kInit, vInit, k); \ + if (bitset.getSize(0) == 0) \ + blockSelect \ + <<>>(in, outK, outV, kInit, vInit, k); \ + else \ + blockSelect \ + <<>>(in, bitset, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } \ \ void runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ Tensor& inK, \ Tensor& inV, \ + Tensor& bitset, \ Tensor& outK, \ Tensor& outV, \ bool dir, \ @@ -79,16 +87,20 @@ auto kInit = dir ? Limits::getMin() : Limits::getMax(); \ auto vInit = -1; \ \ - blockSelectPair \ - <<>>(inK, inV, outK, outV, kInit, vInit, k); \ + if (bitset.getSize(0) == 0) \ + blockSelectPair \ + <<>>(inK, inV, outK, outV, kInit, vInit, k); \ + else \ + blockSelectPair \ + <<>>(inK, inV, bitset, outK, outV, kInit, vInit, k); \ CUDA_TEST_ERROR(); \ } #define BLOCK_SELECT_CALL(TYPE, DIR, WARP_Q) \ runBlockSelect_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ - in, outK, outV, dir, k, stream) + in, bitset, outK, outV, dir, k, stream) #define BLOCK_SELECT_PAIR_CALL(TYPE, DIR, WARP_Q) \ runBlockSelectPair_ ## TYPE ## _ ## DIR ## _ ## WARP_Q ## _( \ - inK, inV, outK, outV, dir, k, stream) + inK, inV, bitset, outK, outV, dir, k, stream)