diff --git a/CHANGELOG.md b/CHANGELOG.md index 808d072936..d306e1a7d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,13 @@ Please mark all change in change log and use the issue from GitHub ## Bug - \#2890 Fix the index size caculation in cache - \#2952 Fix the result merging of IVF_PQ IP -- \#2975 Fix config ut failed +- \#2975 Fix config UT failed ## Feature ## Improvement - \#2653 Improve IVF search performance when NQ and nProbe are both large +- \#2828 Let Faiss not to compile half float by default ## Task diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu index 8e873c1914..130e95f866 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVF.cu @@ -27,6 +27,7 @@ GpuIndexIVF::GpuIndexIVF(GpuResources* resources, nlist(nlistIn), nprobe(1), quantizer(nullptr) { + init_(); // Only IP and L2 are supported for now diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu index 6ca7c70ffb..e806ea49fa 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFFlat.cu @@ -261,3 +261,4 @@ GpuIndexIVFFlat::searchImpl_(int n, } } // namespace + diff --git a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu index 254c0c4104..d6095a58e8 100644 --- a/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/GpuIndexIVFPQ.cu @@ -34,6 +34,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, bitsPerCode_(0), reserveMemoryVecs_(0), index_(nullptr) { +#ifndef FAISS_USE_FLOAT16 + FAISS_ASSERT(!ivfpqConfig_.useFloat16LookupTables); +#endif + copyFrom(index); } @@ -55,6 +59,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResources* resources, bitsPerCode_(bitsPerCode), reserveMemoryVecs_(0), index_(nullptr) { +#ifndef 
FAISS_USE_FLOAT16 + FAISS_ASSERT(!config.useFloat16LookupTables); +#endif + verifySettings_(); // We haven't trained ourselves, so don't construct the PQ index yet @@ -424,9 +432,11 @@ GpuIndexIVFPQ::verifySettings_() const { // We must have enough shared memory on the current device to store // our lookup distances int lookupTableSize = sizeof(float); +#ifdef FAISS_USE_FLOAT16 if (ivfpqConfig_.useFloat16LookupTables) { lookupTableSize = sizeof(half); } +#endif // 64 bytes per code is only supported with usage of float16, at 2^8 // codes per subquantizer diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu index 364200c3e4..e9f7548e25 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cu @@ -262,11 +262,13 @@ void runSumAlongColumns(Tensor& input, runSumAlongColumns(input, output, stream); } +#ifdef FAISS_USE_FLOAT16 void runSumAlongColumns(Tensor& input, Tensor& output, cudaStream_t stream) { runSumAlongColumns(input, output, stream); } +#endif template void runAssignAlongColumns(Tensor& input, @@ -310,11 +312,13 @@ void runAssignAlongColumns(Tensor& input, runAssignAlongColumns(input, output, stream); } +#ifdef FAISS_USE_FLOAT16 void runAssignAlongColumns(Tensor& input, Tensor& output, cudaStream_t stream) { runAssignAlongColumns(input, output, stream); } +#endif template void runSumAlongRows(Tensor& input, @@ -344,11 +348,13 @@ void runSumAlongRows(Tensor& input, runSumAlongRows(input, output, zeroClamp, stream); } +#ifdef FAISS_USE_FLOAT16 void runSumAlongRows(Tensor& input, Tensor& output, bool zeroClamp, cudaStream_t stream) { runSumAlongRows(input, output, zeroClamp, stream); } +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh index 8c4b27452c..6641aadd40 100644 --- 
a/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/BroadcastSum.cuh @@ -17,18 +17,22 @@ void runSumAlongColumns(Tensor& input, Tensor& output, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runSumAlongColumns(Tensor& input, Tensor& output, cudaStream_t stream); +#endif // output[x][i] = input[i] for all x void runAssignAlongColumns(Tensor& input, Tensor& output, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runAssignAlongColumns(Tensor& input, Tensor& output, cudaStream_t stream); +#endif // output[i][x] += input[i] for all x // If zeroClamp, output[i][x] = max(output[i][x] + input[i], 0) for all x @@ -37,9 +41,11 @@ void runSumAlongRows(Tensor& input, bool zeroClamp, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runSumAlongRows(Tensor& input, Tensor& output, bool zeroClamp, cudaStream_t stream); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu index e4aa3af1fc..0856396cc1 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cu @@ -370,6 +370,7 @@ runIPDistance(GpuResources* resources, outIndices); } +#ifdef FAISS_USE_FLOAT16 void runIPDistance(GpuResources* resources, Tensor& vectors, @@ -390,6 +391,7 @@ runIPDistance(GpuResources* resources, outDistances, outIndices); } +#endif void runL2Distance(GpuResources* resources, @@ -416,6 +418,7 @@ runL2Distance(GpuResources* resources, ignoreOutDistances); } +#ifdef FAISS_USE_FLOAT16 void runL2Distance(GpuResources* resources, Tensor& vectors, @@ -440,5 +443,6 @@ runL2Distance(GpuResources* resources, outIndices, ignoreOutDistances); } +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh index 844d420aea..3430ddf87f 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh +++ 
b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh @@ -57,6 +57,7 @@ void runIPDistance(GpuResources* resources, Tensor& outDistances, Tensor& outIndices); + void runL2Distance(GpuResources* resources, Tensor& vectors, bool vectorsRowMajor, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu index e7545df767..29480fa84f 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cu @@ -29,6 +29,9 @@ FlatIndex::FlatIndex(GpuResources* res, space_(space), num_(0), rawData_(space) { +#ifndef FAISS_USE_FLOAT16 + FAISS_ASSERT(!useFloat16_); +#endif } bool @@ -38,28 +41,40 @@ FlatIndex::getUseFloat16() const { /// Returns the number of vectors we contain int FlatIndex::getSize() const { +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { return vectorsHalf_.getSize(0); } else { return vectors_.getSize(0); } +#else + return vectors_.getSize(0); +#endif } int FlatIndex::getDim() const { +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { return vectorsHalf_.getSize(1); } else { return vectors_.getSize(1); } +#else + return vectors_.getSize(1); +#endif } void FlatIndex::reserve(size_t numVecs, cudaStream_t stream) { +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { rawData_.reserve(numVecs * dim_ * sizeof(half), stream); } else { rawData_.reserve(numVecs * dim_ * sizeof(float), stream); } +#else + rawData_.reserve(numVecs * dim_ * sizeof(float), stream); +#endif } template <> @@ -70,6 +85,7 @@ FlatIndex::getVectorsRef() { return getVectorsFloat32Ref(); } +#ifdef FAISS_USE_FLOAT16 template <> Tensor& FlatIndex::getVectorsRef() { @@ -77,6 +93,7 @@ FlatIndex::getVectorsRef() { FAISS_ASSERT(useFloat16_); return getVectorsFloat16Ref(); } +#endif Tensor& FlatIndex::getVectorsFloat32Ref() { @@ -86,6 +103,7 @@ FlatIndex::getVectorsFloat32Ref() { return vectors_; } +#ifdef FAISS_USE_FLOAT16 Tensor& FlatIndex::getVectorsFloat16Ref() { // Should not call this unless 
we are in float16 mode @@ -93,6 +111,7 @@ FlatIndex::getVectorsFloat16Ref() { return vectorsHalf_; } +#endif DeviceTensor FlatIndex::getVectorsFloat32Copy(cudaStream_t stream) { @@ -103,12 +122,16 @@ DeviceTensor FlatIndex::getVectorsFloat32Copy(int from, int num, cudaStream_t stream) { DeviceTensor vecFloat32({num, dim_}, space_); +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { auto halfNarrow = vectorsHalf_.narrowOutermost(from, num); convertTensor(stream, halfNarrow, vecFloat32); } else { vectors_.copyTo(vecFloat32, stream); } +#else + vectors_.copyTo(vecFloat32, stream); +#endif return vecFloat32; } @@ -125,13 +148,16 @@ FlatIndex::query(Tensor& input, auto stream = resources_->getDefaultStreamCurrentDevice(); auto& mem = resources_->getMemoryManagerCurrentDevice(); +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { // We need to convert the input to float16 for comparison to ourselves + auto inputHalf = convertTensor(resources_, stream, input); query(inputHalf, bitset, k, metric, metricArg, outDistances, outIndices, exactDistance); + } else { bfKnnOnDevice(resources_, getCurrentDevice(), @@ -149,8 +175,26 @@ FlatIndex::query(Tensor& input, outIndices, !exactDistance); } +#else + bfKnnOnDevice(resources_, + getCurrentDevice(), + stream, + storeTransposed_ ? vectorsTransposed_ : vectors_, + !storeTransposed_, // is vectors row major? 
+ &norms_, + input, + true, // input is row major + bitset, + k, + metric, + metricArg, + outDistances, + outIndices, + !exactDistance); +#endif } +#ifdef FAISS_USE_FLOAT16 void FlatIndex::query(Tensor& input, Tensor& bitset, @@ -178,11 +222,13 @@ FlatIndex::query(Tensor& input, outIndices, !exactDistance); } +#endif void FlatIndex::computeResidual(Tensor& vecs, Tensor& listIds, Tensor& residuals) { +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { runCalcResidual(vecs, getVectorsFloat16Ref(), @@ -196,11 +242,19 @@ FlatIndex::computeResidual(Tensor& vecs, residuals, resources_->getDefaultStreamCurrentDevice()); } +#else + runCalcResidual(vecs, + getVectorsFloat32Ref(), + listIds, + residuals, + resources_->getDefaultStreamCurrentDevice()); +#endif } void FlatIndex::reconstruct(Tensor& listIds, Tensor& vecs) { +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { runReconstruct(listIds, getVectorsFloat16Ref(), @@ -212,8 +266,13 @@ FlatIndex::reconstruct(Tensor& listIds, vecs, resources_->getDefaultStreamCurrentDevice()); } +#else + runReconstruct(listIds, + getVectorsFloat32Ref(), + vecs, + resources_->getDefaultStreamCurrentDevice()); +#endif } - void FlatIndex::reconstruct(Tensor& listIds, Tensor& vecs) { @@ -229,6 +288,7 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { return; } +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { // Make sure that `data` is on our device; we'll run the // conversion on our device @@ -252,8 +312,15 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { true /* reserve exactly */); } +#else + rawData_.append((char*) data, + (size_t) dim_ * numVecs * sizeof(float), + stream, + true /* reserve exactly */); +#endif num_ += numVecs; +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { DeviceTensor vectorsHalf( (half*) rawData_.data(), {(int) num_, dim_}, space_); @@ -263,8 +330,14 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { (float*) rawData_.data(), {(int) num_, dim_}, space_); vectors_ = 
std::move(vectors); } +#else + DeviceTensor vectors( + (float*) rawData_.data(), {(int) num_, dim_}, space_); + vectors_ = std::move(vectors); +#endif if (storeTransposed_) { +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { vectorsHalfTransposed_ = std::move(DeviceTensor({dim_, (int) num_}, space_)); @@ -274,9 +347,15 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { std::move(DeviceTensor({dim_, (int) num_}, space_)); runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream); } +#else + vectorsTransposed_ = + std::move(DeviceTensor({dim_, (int) num_}, space_)); + runTransposeAny(vectors_, 0, 1, vectorsTransposed_, stream); +#endif } // Precompute L2 norms of our database +#ifdef FAISS_USE_FLOAT16 if (useFloat16_) { DeviceTensor norms({(int) num_}, space_); runL2Norm(vectorsHalf_, true, norms, true, stream); @@ -286,6 +365,11 @@ FlatIndex::add(const float* data, int numVecs, cudaStream_t stream) { runL2Norm(vectors_, true, norms, true, stream); norms_ = std::move(norms); } +#else + DeviceTensor norms({(int) num_}, space_); + runL2Norm(vectors_, true, norms, true, stream); + norms_ = std::move(norms); +#endif } void @@ -293,8 +377,10 @@ FlatIndex::reset() { rawData_.clear(); vectors_ = std::move(DeviceTensor()); vectorsTransposed_ = std::move(DeviceTensor()); +#ifdef FAISS_USE_FLOAT16 vectorsHalf_ = std::move(DeviceTensor()); vectorsHalfTransposed_ = std::move(DeviceTensor()); +#endif norms_ = std::move(DeviceTensor()); num_ = 0; } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh index 5bc97441c4..eef07df24c 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/FlatIndex.cuh @@ -47,7 +47,9 @@ class FlatIndex { Tensor& getVectorsFloat32Ref(); /// Returns a reference to our vectors currently in use (useFloat16 mode) +#ifdef FAISS_USE_FLOAT16 Tensor& getVectorsFloat16Ref(); +#endif /// Performs a copy of the 
vectors on the given device, converting /// as needed from float16 @@ -67,6 +69,7 @@ class FlatIndex { Tensor& outIndices, bool exactDistance); +#ifdef FAISS_USE_FLOAT16 void query(Tensor& vecs, Tensor& bitset, int k, @@ -75,6 +78,7 @@ class FlatIndex { Tensor& outDistances, Tensor& outIndices, bool exactDistance); +#endif /// Compute residual for set of vectors void computeResidual(Tensor& vecs, @@ -123,8 +127,10 @@ class FlatIndex { DeviceTensor vectorsTransposed_; /// Vectors currently in rawData_, float16 form +#ifdef FAISS_USE_FLOAT16 DeviceTensor vectorsHalf_; DeviceTensor vectorsHalfTransposed_; +#endif /// Precomputed L2 norms DeviceTensor norms_; diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu index 01c91f9c3f..48254c1f5b 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cu @@ -60,6 +60,10 @@ IVFPQ::IVFPQ(GpuResources* resources, FAISS_ASSERT(dim_ % numSubQuantizers_ == 0); FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_)); +#ifndef FAISS_USE_FLOAT16 + FAISS_ASSERT(!useFloat16LookupTables_); +#endif + setPQCentroids_(pqCentroidData); } @@ -112,7 +116,9 @@ IVFPQ::setPrecomputedCodes(bool enable) { } else { // Clear out old precomputed code data precomputedCode_ = std::move(DeviceTensor()); +#ifdef FAISS_USE_FLOAT16 precomputedCodeHalf_ = std::move(DeviceTensor()); +#endif } } } @@ -156,6 +162,7 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, DeviceTensor residuals( mem, {vecs.getSize(0), vecs.getSize(1)}, stream); +#ifdef FAISS_USE_FLOAT16 if (quantizer_->getUseFloat16()) { auto& coarseCentroids = quantizer_->getVectorsFloat16Ref(); runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); @@ -163,6 +170,10 @@ IVFPQ::classifyAndAddVectors(Tensor& vecs, auto& coarseCentroids = quantizer_->getVectorsFloat32Ref(); runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); } +#else + auto& coarseCentroids = 
quantizer_->getVectorsFloat32Ref(); + runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream); +#endif // Residuals are in the form // (vec x numSubQuantizer x dimPerSubQuantizer) @@ -519,6 +530,7 @@ IVFPQ::precomputeCodesT_() { // We added into the view, so `coarsePQProductTransposed` is now our // precomputed term 2. +#ifdef FAISS_USE_FLOAT16 if (useFloat16LookupTables_) { precomputedCodeHalf_ = convertTensor(resources_, @@ -527,15 +539,23 @@ IVFPQ::precomputeCodesT_() { } else { precomputedCode_ = std::move(coarsePQProductTransposed); } +#else + precomputedCode_ = std::move(coarsePQProductTransposed); +#endif + } void IVFPQ::precomputeCodes_() { +#ifdef FAISS_USE_FLOAT16 if (quantizer_->getUseFloat16()) { precomputeCodesT_(); } else { precomputeCodesT_(); } +#else + precomputeCodesT_(); +#endif } void @@ -678,6 +698,7 @@ IVFPQ::runPQPrecomputedCodes_( NoTypeTensor<3, true> term2; NoTypeTensor<3, true> term3; +#ifdef FAISS_USE_FLOAT16 DeviceTensor term3Half; if (useFloat16LookupTables_) { @@ -686,7 +707,10 @@ IVFPQ::runPQPrecomputedCodes_( term2 = NoTypeTensor<3, true>(precomputedCodeHalf_); term3 = NoTypeTensor<3, true>(term3Half); - } else { + } +#endif + + if (!useFloat16LookupTables_) { term2 = NoTypeTensor<3, true>(precomputedCode_); term3 = NoTypeTensor<3, true>(term3Transposed); } @@ -754,6 +778,7 @@ IVFPQ::runPQNoPrecomputedCodes_( int k, Tensor& outDistances, Tensor& outIndices) { +#ifdef FAISS_USE_FLOAT16 if (quantizer_->getUseFloat16()) { runPQNoPrecomputedCodesT_(queries, bitset, @@ -770,7 +795,17 @@ IVFPQ::runPQNoPrecomputedCodes_( k, outDistances, outIndices); - } + } +#else + runPQNoPrecomputedCodesT_(queries, + bitset, + coarseDistances, + coarseIndices, + k, + outDistances, + outIndices); +#endif + } } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh index db8cbb68aa..ad03fb4f89 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh +++ 
b/core/src/index/thirdparty/faiss/gpu/impl/IVFPQ.cuh @@ -153,7 +153,9 @@ class IVFPQ : public IVFBase { DeviceTensor precomputedCode_; /// Precomputed term 2 in half form +#ifdef FAISS_USE_FLOAT16 DeviceTensor precomputedCodeHalf_; +#endif }; } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu index 96bcd8e95b..bdf812524e 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cu @@ -309,6 +309,7 @@ void runL2Norm(Tensor& input, } } +#ifdef FAISS_USE_FLOAT16 void runL2Norm(Tensor& input, bool inputRowMajor, Tensor& output, @@ -325,5 +326,6 @@ void runL2Norm(Tensor& input, inputCast, inputRowMajor, outputCast, normSquared, stream); } } +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh index c4d5850802..6df3dcea58 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Norm.cuh @@ -18,10 +18,12 @@ void runL2Norm(Tensor& input, bool normSquared, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runL2Norm(Tensor& input, bool inputRowMajor, Tensor& output, bool normSquared, cudaStream_t stream); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh index c3ef87f2e7..520a8bcafb 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances-inl.cuh @@ -438,6 +438,7 @@ runPQCodeDistancesMM(Tensor& pqCentroids, runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); +#ifdef FAISS_USE_FLOAT16 if (useFloat16Lookup) { // Need to convert back auto outCodeDistancesH = outCodeDistances.toTensor(); @@ -445,6 +446,7 @@ runPQCodeDistancesMM(Tensor& pqCentroids, outCodeDistancesF, 
outCodeDistancesH); } +#endif } template @@ -477,6 +479,7 @@ runPQCodeDistances(Tensor& pqCentroids, auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) + topQueryToCentroid.getSize(1) * sizeof(int); +#ifdef FAISS_USE_FLOAT16 #define RUN_CODE(DIMS, L2) \ do { \ if (useFloat16Lookup) { \ @@ -495,6 +498,16 @@ runPQCodeDistances(Tensor& pqCentroids, topQueryToCentroid, outCodeDistancesT); \ } \ } while (0) +#else +#define RUN_CODE(DIMS, L2) \ + do { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } while (0) +#endif #define CODE_L2(DIMS) \ do { \ diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu index 817990b4a6..eec8852310 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQCodeDistances.cu @@ -26,10 +26,12 @@ template struct Converter { }; +#ifdef FAISS_USE_FLOAT16 template <> struct Converter { inline static __device__ half to(float v) { return __float2half(v); } }; +#endif template <> struct Converter { @@ -394,6 +396,7 @@ runPQCodeDistancesMM(Tensor& pqCentroids, Tensor outCodeDistancesF; DeviceTensor outCodeDistancesFloatMem; +#ifdef FAISS_USE_FLOAT16 if (useFloat16Lookup) { outCodeDistancesFloatMem = DeviceTensor( mem, {outCodeDistances.getSize(0), @@ -406,6 +409,9 @@ runPQCodeDistancesMM(Tensor& pqCentroids, } else { outCodeDistancesF = outCodeDistances.toTensor(); } +#else + outCodeDistancesF = outCodeDistances.toTensor(); +#endif // Transpose -2(sub q)(q * c)(code) to -2(q * c)(sub q)(code) (which // is where we build our output distances) @@ -445,6 +451,7 @@ runPQCodeDistancesMM(Tensor& pqCentroids, runSumAlongColumns(pqCentroidsNorm, outDistancesCodeViewCols, stream); +#ifdef FAISS_USE_FLOAT16 if (useFloat16Lookup) { // Need to convert back auto 
outCodeDistancesH = outCodeDistances.toTensor(); @@ -452,6 +459,7 @@ runPQCodeDistances(Tensor& pqCentroids, outCodeDistancesF, outCodeDistancesH); } +#endif } void @@ -483,6 +491,7 @@ runPQCodeDistances(Tensor& pqCentroids, auto smem = (3 * dimsPerSubQuantizer) * sizeof(float) + topQueryToCentroid.getSize(1) * sizeof(int); +#ifdef FAISS_USE_FLOAT16 #define RUN_CODE(DIMS, L2) \ do { \ if (useFloat16Lookup) { \ @@ -492,7 +501,19 @@ runPQCodeDistances(Tensor& pqCentroids, queries, kQueriesPerBlock, \ coarseCentroids, pqCentroids, \ topQueryToCentroid, outCodeDistancesT); \ - } else { \ + } else { \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } \ + } while (0) +#else +#define RUN_CODE(DIMS, L2) \ + do { \ + if(!useFloat16Lookup){ \ + auto outCodeDistancesT = outCodeDistances.toTensor(); \ + \ + pqCodeDistances<<>>( \ + queries, kQueriesPerBlock, \ + coarseCentroids, pqCentroids, \ + topQueryToCentroid, outCodeDistancesT); \ + } \ + } while (0) +#endif #define CODE_L2(DIMS) \ do { \ diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh index ffc81b1f8c..a77e783d09 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh @@ -275,7 +275,12 @@ runMultiPassTile(Tensor& queries, auto block = dim3(kThreadsPerBlock); // pq centroid distances - auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float); + +#ifdef FAISS_USE_FLOAT16 + auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); +#else + auto smem = sizeof(float); +#endif smem *= numSubQuantizers * numSubQuantizerCodes; FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); @@ -296,6 +301,7 @@ runMultiPassTile(Tensor& queries, allDistances); \ } while (0) +#ifdef FAISS_USE_FLOAT16 #define RUN_PQ(NUM_SUB_Q) \ do { \ if (useFloat16Lookup) { \ @@ -304,6 +310,12 @@ runMultiPassTile(Tensor& queries, RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ } \ } while (0) +#else +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } while (0) +#endif switch (bytesPerCode) { case 1: @@ -499,7 +511,12 @@ runPQScanMultiPassNoPrecomputed(Tensor& queries, sizeof(int), stream)); - int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float); + int codeDistanceTypeSize = sizeof(float); +#ifdef FAISS_USE_FLOAT16 + if (useFloat16Lookup) { + codeDistanceTypeSize = sizeof(half); + } +#endif int totalCodeDistancesSize = queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu index ecf35fffdb..b4934382cb 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cu @@ -248,8 +248,11 @@ runMultiPassTile(Tensor& queries, metric == MetricType::METRIC_L2); bool l2Distance = metric == MetricType::METRIC_L2; - // Calculate offset lengths, so we know where to write out +#ifndef FAISS_USE_FLOAT16 + FAISS_ASSERT(!useFloat16Lookup); +#endif + // Calculate offset lengths, so we know where to write out // intermediate results runCalcListOffsets(topQueryToCentroid, listLengths, prefixSumOffsets, thrustMem, stream); @@ -275,7 +278,13 @@ runMultiPassTile(Tensor& queries, auto block = dim3(kThreadsPerBlock); // pq centroid distances - auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + //auto smem = useFloat16Lookup ? sizeof(half) : sizeof(float); + auto smem = sizeof(float); +#ifdef FAISS_USE_FLOAT16 + if (useFloat16Lookup) { + smem = sizeof(half); + } +#endif smem *= numSubQuantizers * numSubQuantizerCodes; FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); @@ -296,6 +305,7 @@ runMultiPassTile(Tensor& queries, allDistances); \ } while (0) +#ifdef FAISS_USE_FLOAT16 #define RUN_PQ(NUM_SUB_Q) \ do { \ if (useFloat16Lookup) { \ @@ -304,6 +314,12 @@ runMultiPassTile(Tensor& queries, RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ } \ } while (0) +#else +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } while (0) +#endif // FAISS_USE_FLOAT16 switch (bytesPerCode) { case 1: @@ -497,7 +513,14 @@ void runPQScanMultiPassNoPrecomputed(Tensor& queries, sizeof(int), stream)); - int codeDistanceTypeSize = useFloat16Lookup ? sizeof(half) : sizeof(float); + int codeDistanceTypeSize = sizeof(float); +#ifdef FAISS_USE_FLOAT16 + if (useFloat16Lookup) { + codeDistanceTypeSize = sizeof(half); + } +#else + FAISS_ASSERT(!useFloat16Lookup); +#endif int totalCodeDistancesSize = queryTileSize * nprobe * numSubQuantizers * numSubQuantizerCodes * diff --git a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu index 583ee477dc..02e65ff32a 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu @@ -252,7 +252,12 @@ runMultiPassTile(Tensor& queries, auto block = dim3(kThreadsPerBlock); // pq precomputed terms (2 + 3) - auto smem = useFloat16Lookup ? 
sizeof(half) : sizeof(float); + auto smem = sizeof(float); +#ifdef FAISS_USE_FLOAT16 + if (useFloat16Lookup) { + smem = sizeof(half); + } +#endif smem *= numSubQuantizers * numSubQuantizerCodes; FAISS_ASSERT(smem <= getMaxSharedMemPerBlockCurrentDevice()); @@ -275,6 +280,7 @@ runMultiPassTile(Tensor& queries, allDistances); \ } while (0) +#ifdef FAISS_USE_FLOAT16 #define RUN_PQ(NUM_SUB_Q) \ do { \ if (useFloat16Lookup) { \ @@ -283,6 +289,12 @@ runMultiPassTile(Tensor& queries, RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ } \ } while (0) +#else +#define RUN_PQ(NUM_SUB_Q) \ + do { \ + RUN_PQ_OPT(NUM_SUB_Q, float, float4); \ + } while (0) +#endif switch (bytesPerCode) { case 1: diff --git a/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu index 078e660417..980b3c3979 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cu @@ -119,6 +119,7 @@ void runCalcResidual(Tensor& vecs, calcResidual(vecs, centroids, vecToCentroid, residuals, stream); } +#ifdef FAISS_USE_FLOAT16 void runCalcResidual(Tensor& vecs, Tensor& centroids, Tensor& vecToCentroid, @@ -126,6 +127,7 @@ void runCalcResidual(Tensor& vecs, cudaStream_t stream) { calcResidual(vecs, centroids, vecToCentroid, residuals, stream); } +#endif void runReconstruct(Tensor& listIds, Tensor& vecs, @@ -134,11 +136,13 @@ void runReconstruct(Tensor& listIds, gatherReconstruct(listIds, vecs, out, stream); } +#ifdef FAISS_USE_FLOAT16 void runReconstruct(Tensor& listIds, Tensor& vecs, Tensor& out, cudaStream_t stream) { gatherReconstruct(listIds, vecs, out, stream); } +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh index ca7bcaa0b6..8e8cd2e756 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh +++ 
b/core/src/index/thirdparty/faiss/gpu/impl/VectorResidual.cuh @@ -31,9 +31,11 @@ void runReconstruct(Tensor& listIds, Tensor& out, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runReconstruct(Tensor& listIds, Tensor& vecs, Tensor& out, cudaStream_t stream); +# endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu index 4f642a0ca8..f6989fc084 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectHalf.cu @@ -10,6 +10,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 + // warp Q to thread Q: // 1, 1 // 32, 2 @@ -143,4 +145,6 @@ void runBlockSelectPair(Tensor& inK, } } +#endif // FAISS_USE_FLOAT16 + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh index 238909d4b0..f787335cdf 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh @@ -241,6 +241,7 @@ void runBlockSelectPair(Tensor& inKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runBlockSelect(Tensor& in, Tensor& bitset, Tensor& outKeys, @@ -253,5 +254,6 @@ void runBlockSelectPair(Tensor& inKeys, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh index ddc30af173..cf9b74c971 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/ConversionOperators.cuh @@ -29,6 +29,7 @@ struct Convert { } }; +#ifdef FAISS_USE_FLOAT16 template <> struct Convert { inline __device__ half operator()(float v) const { @@ -42,6 
+43,7 @@ struct Convert { return __half2float(v); } }; +#endif template struct ConvertTo { @@ -50,38 +52,50 @@ struct ConvertTo { template <> struct ConvertTo { static inline __device__ float to(float v) { return v; } +#ifdef FAISS_USE_FLOAT16 static inline __device__ float to(half v) { return __half2float(v); } +#endif }; template <> struct ConvertTo { static inline __device__ float2 to(float2 v) { return v; } +#ifdef FAISS_USE_FLOAT16 static inline __device__ float2 to(half2 v) { return __half22float2(v); } +#endif }; template <> struct ConvertTo { static inline __device__ float4 to(float4 v) { return v; } +#ifdef FAISS_USE_FLOAT16 static inline __device__ float4 to(Half4 v) { return half4ToFloat4(v); } +#endif }; +#ifdef FAISS_USE_FLOAT16 template <> struct ConvertTo { static inline __device__ half to(float v) { return __float2half(v); } static inline __device__ half to(half v) { return v; } }; +#endif +#ifdef FAISS_USE_FLOAT16 template <> struct ConvertTo { static inline __device__ half2 to(float2 v) { return __float22half2_rn(v); } static inline __device__ half2 to(half2 v) { return v; } }; +#endif +#ifdef FAISS_USE_FLOAT16 template <> struct ConvertTo { static inline __device__ Half4 to(float4 v) { return float4ToHalf4(v); } static inline __device__ Half4 to(Half4 v) { return v; } }; +#endif // Tensor conversion template diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu index 52d54df309..e1f5c09b9f 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cu @@ -12,6 +12,8 @@ #include #include +#ifdef FAISS_USE_FLOAT16 + namespace faiss { namespace gpu { bool getDeviceSupportsFloat16Math(int device) { @@ -36,3 +38,5 @@ __half hostFloat2Half(float a) { } } } // namespace + +#endif // FAISS_USE_FLOAT16 diff --git a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh index 
09566eaa94..0af798ba80 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/Float16.cuh @@ -22,10 +22,14 @@ #define FAISS_USE_FULL_FLOAT16 1 #endif // __CUDA_ARCH__ types +#ifdef FAISS_USE_FLOAT16 #include +#endif namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 + // 64 bytes containing 4 half (float16) values struct Half4 { half2 a; @@ -72,4 +76,6 @@ bool getDeviceSupportsFloat16Math(int device); __half hostFloat2Half(float v); +#endif // FAISS_USE_FLOAT16 + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh index b0bb8b5330..b49d634461 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/LoadStoreOperators.cuh @@ -35,6 +35,8 @@ struct LoadStore { } }; +#ifdef FAISS_USE_FLOAT16 + template <> struct LoadStore { static inline __device__ Half4 load(void* p) { @@ -87,4 +89,6 @@ struct LoadStore { } }; +#endif + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh b/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh index 68ccbd5686..7e9f25a2a0 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/MathOperators.cuh @@ -217,6 +217,7 @@ struct Math { } }; +#ifdef FAISS_USE_FLOAT16 template <> struct Math { typedef half ScalarType; @@ -555,5 +556,6 @@ struct Math { return h; } }; +#endif // FAISS_USE_FLOAT16 } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu index 54e10be1e5..d700ecaee7 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectHalf.cu @@ -10,6 +10,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 + // warp 
Q to thread Q: // 1, 1 // 32, 2 @@ -91,4 +93,6 @@ void runWarpSelect(Tensor& in, } } +#endif // FAISS_USE_FLOAT16 + } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh index 3c122e8861..1b690b0306 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpSelectKernel.cuh @@ -62,9 +62,11 @@ void runWarpSelect(Tensor& in, Tensor& outIndices, bool dir, int k, cudaStream_t stream); +#ifdef FAISS_USE_FLOAT16 void runWarpSelect(Tensor& in, Tensor& outKeys, Tensor& outIndices, bool dir, int k, cudaStream_t stream); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh b/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh index 504c73f79a..ec2e5b618c 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/WarpShuffles.cuh @@ -91,6 +91,7 @@ inline __device__ T* shfl_xor(T* const val, return (T*) shfl_xor(v, laneMask, width); } +#ifdef FAISS_USE_FLOAT16 // CUDA 9.0+ has half shuffle #if CUDA_VERSION < 9000 inline __device__ half shfl(half v, @@ -113,5 +114,6 @@ inline __device__ half shfl_xor(half v, return h; } #endif // CUDA_VERSION +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu index 88f1d21b57..d2525935c2 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf1.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 1, 1); BLOCK_SELECT_IMPL(half, false, 1, 1); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu 
b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu index b38c00b83e..3759af9342 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf128.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 128, 3); BLOCK_SELECT_IMPL(half, false, 128, 3); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu index 2cea11ace2..a8a5cf13e9 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf256.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 256, 4); BLOCK_SELECT_IMPL(half, false, 256, 4); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu index 6045a52fea..18907c5119 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf32.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 32, 2); BLOCK_SELECT_IMPL(half, false, 32, 2); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu index ea4b0bf64b..81a9a84a9f 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalf64.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 
BLOCK_SELECT_IMPL(half, true, 64, 3); BLOCK_SELECT_IMPL(half, false, 64, 3); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu index 710e8c8460..e83b615193 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF1024.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, false, 1024, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu index 5f7f4d4f6b..e06c334481 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF2048.cu @@ -11,7 +11,9 @@ namespace faiss { namespace gpu { #if GPU_MAX_SELECTION_K >= 2048 +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, false, 2048, 8); #endif +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu index 07ea1f9f6b..c1b67bd3de 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfF512.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, false, 512, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu index 6dc37accf7..2fd0dffa37 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu +++ 
b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT1024.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 1024, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu index dd38b8d6a5..f91b6787e2 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT2048.cu @@ -11,7 +11,9 @@ namespace faiss { namespace gpu { #if GPU_MAX_SELECTION_K >= 2048 +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 2048, 8); #endif +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu index ff2a9903fa..a2877db6ed 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/blockselect/BlockSelectHalfT512.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 BLOCK_SELECT_IMPL(half, true, 512, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu index 79876207f7..da3206d454 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf1.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 1, 1); WARP_SELECT_IMPL(half, false, 1, 1); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu index 
150c9507da..8705e593c5 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf128.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 128, 3); WARP_SELECT_IMPL(half, false, 128, 3); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu index cd8b49b18f..a7af219582 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf256.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 256, 4); WARP_SELECT_IMPL(half, false, 256, 4); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu index ce1b7e4c74..d7ed389aec 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf32.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 32, 2); WARP_SELECT_IMPL(half, false, 32, 2); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu index 9d4311ec01..fea6c40b9c 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalf64.cu @@ -9,7 +9,9 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 64, 3); WARP_SELECT_IMPL(half, false, 64, 3); +#endif } } // namespace diff --git 
a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu index 0241300141..d99eea9c7c 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF1024.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, false, 1024, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu index 1a16ee45c9..030d28e17f 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF2048.cu @@ -11,7 +11,9 @@ namespace faiss { namespace gpu { #if GPU_MAX_SELECTION_K >= 2048 +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, false, 2048, 8); #endif +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu index 4cb138837b..651d727580 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfF512.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, false, 512, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu index 6a95007ff8..5a576d7c48 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT1024.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 
WARP_SELECT_IMPL(half, true, 1024, 8); +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu index 94586d0100..b5bd1f9e53 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT2048.cu @@ -11,7 +11,9 @@ namespace faiss { namespace gpu { #if GPU_MAX_SELECTION_K >= 2048 +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 2048, 8); #endif +#endif } } // namespace diff --git a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu index 6ca08a16ab..21b8660273 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu +++ b/core/src/index/thirdparty/faiss/gpu/utils/warpselect/WarpSelectHalfT512.cu @@ -9,6 +9,8 @@ namespace faiss { namespace gpu { +#ifdef FAISS_USE_FLOAT16 WARP_SELECT_IMPL(half, true, 512, 8); +#endif } } // namespace