diff --git a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIDMAP.cpp b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIDMAP.cpp index c2778c28b1..5bcc288c32 100644 --- a/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIDMAP.cpp +++ b/core/src/index/knowhere/knowhere/index/vector_index/gpu/IndexGPUIDMAP.cpp @@ -105,7 +105,7 @@ GPUIDMAP::GetRawIds() { void GPUIDMAP::QueryImpl(int64_t n, const float* data, int64_t k, float* distances, int64_t* labels, const Config& config) { ResScope rs(res_, gpu_id_); - index_->search(n, (float*)data, k, distances, labels); + index_->search(n, (float*)data, k, distances, labels, bitset_); } void diff --git a/core/src/index/thirdparty/faiss/IndexIVF.cpp b/core/src/index/thirdparty/faiss/IndexIVF.cpp index e7b0c94b0b..6f8871034f 100644 --- a/core/src/index/thirdparty/faiss/IndexIVF.cpp +++ b/core/src/index/thirdparty/faiss/IndexIVF.cpp @@ -310,7 +310,7 @@ void IndexIVF::search (idx_t n, const float *x, idx_t k, float *distances, idx_t invlists->prefetch_lists (idx.get(), n * nprobe); search_preassigned (n, x, k, idx.get(), coarse_dis.get(), - distances, labels, false, nullptr,bitset); + distances, labels, false, nullptr, bitset); indexIVF_stats.search_time += getmillisecs() - t0; } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh index 7f0242780d..097dfbc9f6 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/Distance.cuh @@ -18,55 +18,6 @@ class GpuResources; /// Calculates brute-force L2 distance between `vectors` and /// `queries`, returning the k closest results seen -void runL2Distance(GpuResources* resources, - Tensor& vectors, - bool vectorsRowMajor, - // can be optionally pre-computed; nullptr if we - // have to compute it upon the call - Tensor* vectorNorms, - Tensor& queries, - bool queriesRowMajor, - int k, - Tensor& outDistances, - Tensor& outIndices, - // Do we care about `outDistances`? If not, we can - // take shortcuts. - bool ignoreOutDistances = false); - -/// Calculates brute-force inner product distance between `vectors` -/// and `queries`, returning the k closest results seen -void runIPDistance(GpuResources* resources, - Tensor& vectors, - bool vectorsRowMajor, - Tensor& queries, - bool queriesRowMajor, - int k, - Tensor& outDistances, - Tensor& outIndices); - -void runIPDistance(GpuResources* resources, - Tensor& vectors, - bool vectorsRowMajor, - Tensor& queries, - bool queriesRowMajor, - int k, - Tensor& outDistances, - Tensor& outIndices, - bool useHgemm); - -void runL2Distance(GpuResources* resources, - Tensor& vectors, - bool vectorsRowMajor, - Tensor* vectorNorms, - Tensor& queries, - bool queriesRowMajor, - int k, - Tensor& outDistances, - Tensor& outIndices, - bool useHgemm, - bool ignoreOutDistances = false); - -// Bitset added void runL2Distance(GpuResources* resources, Tensor& vectors, bool vectorsRowMajor, diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu index ab4556a584..367bf8465d 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cu @@ -24,6 +24,7 @@ namespace faiss { namespace gpu { template __global__ void l2SelectMin1(Tensor productDistances, Tensor centroidDistances, + Tensor bitset, Tensor outDistances, Tensor outIndices) { // Each block handles kRowsPerBlock rows of the distances (results) @@ -44,6 +45,8 @@ __global__ void l2SelectMin1(Tensor productDistances, // FIXME: if we have exact multiples, don't need this bool endRow = (blockIdx.x == gridDim.x - 1); + bool bitsetIsEmpty = (bitset.getSize(0) == 0); + if (endRow) { if (productDistances.getSize(0) % kRowsPerBlock == 0) { endRow = false; @@ -54,8 +57,12 @@ __global__ void l2SelectMin1(Tensor productDistances, for (int row = rowStart; row < productDistances.getSize(0); ++row) { for (int col = threadIdx.x; col < productDistances.getSize(1); col += blockDim.x) { - distance[0] = Math::add(centroidDistances[col], - productDistances[row][col]); + if (bitsetIsEmpty || (!(bitset[col >> 3] & (0x1 << (col & 0x7))))) { + distance[0] = Math::add(centroidDistances[col], + productDistances[row][col]); + } else { + distance[0] = (T)(1.0 / 0.0); + } if (Math::lt(distance[0], threadMin[0].k)) { threadMin[0].k = distance[0]; @@ -117,10 +124,12 @@ __global__ void l2SelectMin1(Tensor productDistances, } } +// With bitset included // L2 + select kernel for k > 1, no re-use of ||c||^2 template __global__ void l2SelectMinK(Tensor productDistances, Tensor centroidDistances, + Tensor bitset, Tensor outDistances, Tensor outIndices, int k, T initK) { @@ -140,15 +149,28 @@ __global__ void l2SelectMinK(Tensor productDistances, int limit = utils::roundDown(productDistances.getSize(1), kWarpSize); int i = threadIdx.x; + bool bitsetIsEmpty = (bitset.getSize(0) == 0); + T v; + for (; i < limit; i += blockDim.x) { - T v = Math::add(centroidDistances[i], - productDistances[row][i]); + if (bitsetIsEmpty || (!(bitset[i >> 3] & (0x1 << (i & 0x7))))) { + v = Math::add(centroidDistances[i], + productDistances[row][i]); + } else { + v = (T)(1.0 / 0.0); + } + heap.add(v, i); } if (i < productDistances.getSize(1)) { - T v = Math::add(centroidDistances[i], - productDistances[row][i]); + if (bitsetIsEmpty || (!(bitset[i >> 3] & (0x1 << (i & 0x7))))) { + v = Math::add(centroidDistances[i], + productDistances[row][i]); + } else { + v = (T)(1.0 / 0.0); + } + heap.addThreadQ(v, i); } @@ -159,155 +181,6 @@ __global__ void l2SelectMinK(Tensor productDistances, } } -// With bitset included -// L2 + select kernel for k == 1, implements re-use of ||c||^2 -template -__global__ void l2SelectMin1(Tensor productDistances, - Tensor centroidDistances, - Tensor bitset, - Tensor outDistances, - Tensor outIndices) { - // Each block handles kRowsPerBlock rows of the distances (results) - Pair threadMin[kRowsPerBlock]; - __shared__ Pair blockMin[kRowsPerBlock * (kBlockSize / kWarpSize)]; - - T distance[kRowsPerBlock]; - -#pragma unroll - for (int i = 0; i < kRowsPerBlock; ++i) { - threadMin[i].k = Limits::getMax(); - threadMin[i].v = -1; - } - - // blockIdx.x: which chunk of rows we are responsible for updating - int rowStart = blockIdx.x * kRowsPerBlock; - - // FIXME: if we have exact multiples, don't need this - bool endRow = (blockIdx.x == gridDim.x - 1); - - if (endRow) { - if (productDistances.getSize(0) % kRowsPerBlock == 0) { - endRow = false; - } - } - - if (endRow) { - for (int row = rowStart; row < productDistances.getSize(0); ++row) { - for (int col = threadIdx.x; col < productDistances.getSize(1); - col += blockDim.x) { - if (!(bitset[col >> 3] & (0x1 << (col & 0x7)))) { - distance[0] = Math::add(centroidDistances[col], - productDistances[row][col]); - - if (Math::lt(distance[0], threadMin[0].k)) { - threadMin[0].k = distance[0]; - threadMin[0].v = col; - } - } - } - - // Reduce within the block - threadMin[0] = - blockReduceAll, Min >, false, false>( - threadMin[0], Min >(), blockMin); - - if (threadIdx.x == 0) { - outDistances[row][0] = threadMin[0].k; - outIndices[row][0] = threadMin[0].v; - } - - // so we can use the shared memory again - __syncthreads(); - - threadMin[0].k = Limits::getMax(); - threadMin[0].v = -1; - } - } else { - for (int col = threadIdx.x; col < productDistances.getSize(1); - col += blockDim.x) { - T centroidDistance = centroidDistances[col]; - -#pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { - distance[row] = productDistances[rowStart + row][col]; - } - -#pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { - distance[row] = Math::add(distance[row], centroidDistance); - } - -#pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { - if (Math::lt(distance[row], threadMin[row].k)) { - threadMin[row].k = distance[row]; - threadMin[row].v = col; - } - } - } - - // Reduce within the block - blockReduceAll, Min >, false, false>( - threadMin, Min >(), blockMin); - - if (threadIdx.x == 0) { -#pragma unroll - for (int row = 0; row < kRowsPerBlock; ++row) { - outDistances[rowStart + row][0] = threadMin[row].k; - outIndices[rowStart + row][0] = threadMin[row].v; - } - } - } -} - -// With bitset included -// L2 + select kernel for k > 1, no re-use of ||c||^2 -template -__global__ void l2SelectMinK(Tensor productDistances, - Tensor centroidDistances, - Tensor bitset, - Tensor outDistances, - Tensor outIndices, - int k, T initK) { - // Each block handles a single row of the distances (results) - constexpr int kNumWarps = ThreadsPerBlock / kWarpSize; - - __shared__ T smemK[kNumWarps * NumWarpQ]; - __shared__ int smemV[kNumWarps * NumWarpQ]; - - BlockSelect, - NumWarpQ, NumThreadQ, ThreadsPerBlock> - heap(initK, -1, smemK, smemV, k); - - int row = blockIdx.x; - - // Whole warps must participate in the selection - int limit = utils::roundDown(productDistances.getSize(1), kWarpSize); - int i = threadIdx.x; - - for (; i < limit; i += blockDim.x) { - if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { - T v = Math::add(centroidDistances[i], - productDistances[row][i]); - heap.add(v, i); - } - } - - if (i < productDistances.getSize(1)) { - if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { - T v = Math::add(centroidDistances[i], - productDistances[row][i]); - heap.addThreadQ(v, i); - } - } - - heap.reduce(); - for (int i = threadIdx.x; i < k; i += blockDim.x) { - outDistances[row][i] = smemK[i]; - outIndices[row][i] = smemV[i]; - } -} - template void runL2SelectMin(Tensor& productDistances, @@ -338,14 +211,6 @@ void runL2SelectMin(Tensor& productDistances, auto grid = dim3(outDistances.getSize(0)); #define RUN_L2_SELECT(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ - do { \ - l2SelectMinK \ - <<>>(productDistances, centroidDistances, \ - outDistances, outIndices, \ - k, Limits::getMax()); \ - } while (0) - -#define RUN_L2_SELECT_BITSET(BLOCK, NUM_WARP_Q, NUM_THREAD_Q) \ do { \ l2SelectMinK \ <<>>(productDistances, centroidDistances, \ @@ -353,55 +218,27 @@ void runL2SelectMin(Tensor& productDistances, k, Limits::getMax()); \ } while (0) - if (bitset.getSize(0) == 0) { - // block size 128 for everything <= 1024 - if (k <= 32) { - RUN_L2_SELECT(128, 32, 2); - } else if (k <= 64) { - RUN_L2_SELECT(128, 64, 3); - } else if (k <= 128) { - RUN_L2_SELECT(128, 128, 3); - } else if (k <= 256) { - RUN_L2_SELECT(128, 256, 4); - } else if (k <= 512) { - RUN_L2_SELECT(128, 512, 8); - } else if (k <= 1024) { - RUN_L2_SELECT(128, 1024, 8); - - #if GPU_MAX_SELECTION_K >= 2048 - } else if (k <= 2048) { - // smaller block for less shared memory - RUN_L2_SELECT(64, 2048, 8); - #endif - - } else { - FAISS_ASSERT(false); - } + // block size 128 for everything <= 1024 + if (k <= 32) { + RUN_L2_SELECT(128, 32, 2); + } else if (k <= 64) { + RUN_L2_SELECT(128, 64, 3); + } else if (k <= 128) { + RUN_L2_SELECT(128, 128, 3); + } else if (k <= 256) { + RUN_L2_SELECT(128, 256, 4); + } else if (k <= 512) { + RUN_L2_SELECT(128, 512, 8); + } else if (k <= 1024) { + RUN_L2_SELECT(128, 1024, 8); +#if GPU_MAX_SELECTION_K >= 2048 + } else if (k <= 2048) { + // smaller block for less shared memory + RUN_L2_SELECT(64, 2048, 8); +#endif } else { - // With bitset - if (k <= 32) { - RUN_L2_SELECT_BITSET(128, 32, 2); - } else if (k <= 64) { - RUN_L2_SELECT_BITSET(128, 64, 3); - } else if (k <= 128) { - RUN_L2_SELECT_BITSET(128, 128, 3); - } else if (k <= 256) { - RUN_L2_SELECT_BITSET(128, 256, 4); - } else if (k <= 512) { - RUN_L2_SELECT_BITSET(128, 512, 8); - } else if (k <= 1024) { - RUN_L2_SELECT_BITSET(128, 1024, 8); - - #if GPU_MAX_SELECTION_K >= 2048 - } else if (k <= 2048) { - // smaller block for less shared memory - RUN_L2_SELECT_BITSET(64, 2048, 8); - #endif - - } else { - FAISS_ASSERT(false); - } + FAISS_ASSERT(false); } } diff --git a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh index 31b0ddfe23..705b7c3355 100644 --- a/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh +++ b/core/src/index/thirdparty/faiss/gpu/impl/L2Select.cuh @@ -12,20 +12,6 @@ namespace faiss { namespace gpu { -void runL2SelectMin(Tensor& productDistances, - Tensor& centroidDistances, - Tensor& outDistances, - Tensor& outIndices, - int k, - cudaStream_t stream); - -void runL2SelectMin(Tensor& productDistances, - Tensor& centroidDistances, - Tensor& outDistances, - Tensor& outIndices, - int k, - cudaStream_t stream); - void runL2SelectMin(Tensor& productDistances, Tensor& centroidDistances, Tensor& bitset, diff --git a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh index e7d804c50f..d8eb2075c1 100644 --- a/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh +++ b/core/src/index/thirdparty/faiss/gpu/utils/BlockSelectKernel.cuh @@ -142,17 +142,24 @@ __global__ void blockSelect(Tensor in, // Whole warps must participate in the selection int limit = utils::roundDown(in.getSize(1), kWarpSize); + bool bitsetIsEmpty = (bitset.getSize(0) == 0); + for (; i < limit; i += ThreadsPerBlock) { - if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + if (bitsetIsEmpty || (!(bitset[i >> 3] & (0x1 << (i & 0x7))))) { heap.add(*inStart, (IndexType) i); - inStart += ThreadsPerBlock; + } else { + heap.add(-1.0, (IndexType) i); } + + inStart += ThreadsPerBlock; } // Handle last remainder fraction of a warp of elements if (i < in.getSize(1)) { - if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + if (bitsetIsEmpty || (!(bitset[i >> 3] & (0x1 << (i & 0x7))))) { heap.addThreadQ(*inStart, (IndexType) i); + } else { + heap.addThreadQ(-1.0, (IndexType) i); } } @@ -197,18 +204,25 @@ __global__ void blockSelectPair(Tensor inK, // Whole warps must participate in the selection int limit = utils::roundDown(inK.getSize(1), kWarpSize); + bool bitsetIsEmpty = (bitset.getSize(0) == 0); + for (; i < limit; i += ThreadsPerBlock) { - if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + if (bitsetIsEmpty || (!(bitset[i >> 3] & (0x1 << (i & 0x7))))) { heap.add(*inKStart, *inVStart); - inKStart += ThreadsPerBlock; - inVStart += ThreadsPerBlock; + } else { + heap.add(-1.0, *inVStart); } + + inKStart += ThreadsPerBlock; + inVStart += ThreadsPerBlock; } // Handle last remainder fraction of a warp of elements if (i < inK.getSize(1)) { - if (!(bitset[i >> 3] & (0x1 << (i & 0x7)))) { + if (bitsetIsEmpty || (!(bitset[i >> 3] & (0x1 << (i & 0x7))))) { heap.addThreadQ(*inKStart, *inVStart); + } else { + heap.addThreadQ(-1.0, *inVStart); } } diff --git a/core/src/index/unittest/test_idmap.cpp b/core/src/index/unittest/test_idmap.cpp index c884b979e3..bf06a1eb64 100644 --- a/core/src/index/unittest/test_idmap.cpp +++ b/core/src/index/unittest/test_idmap.cpp @@ -9,40 +9,61 @@ // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express // or implied. See the License for the specific language governing permissions and limitations under the License. +#include + #include #include -#include #include +#include #include "knowhere/common/Exception.h" #include "knowhere/index/vector_index/IndexIDMAP.h" +#include "knowhere/index/vector_index/IndexType.h" #ifdef MILVUS_GPU_VERSION +#include #include "knowhere/index/vector_index/gpu/IndexGPUIDMAP.h" #include "knowhere/index/vector_index/helpers/Cloner.h" +#include "knowhere/index/vector_index/helpers/FaissGpuResourceMgr.h" #endif #include "Helper.h" #include "unittest/utils.h" -class IDMAPTest : public DataGen, public TestGpuIndexBase { +using ::testing::Combine; +using ::testing::TestWithParam; +using ::testing::Values; + +class IDMAPTest : public DataGen, public TestWithParam { protected: void SetUp() override { - TestGpuIndexBase::SetUp(); - +#ifdef MILVUS_GPU_VERSION + milvus::knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(DEVICEID, PINMEM, TEMPMEM, RESNUM); +#endif + index_mode_ = GetParam(); Init_with_default(); index_ = std::make_shared(); } void TearDown() override { - TestGpuIndexBase::TearDown(); +#ifdef MILVUS_GPU_VERSION + milvus::knowhere::FaissGpuResourceMgr::GetInstance().Free(); +#endif } protected: milvus::knowhere::IDMAPPtr index_ = nullptr; + milvus::knowhere::IndexMode index_mode_; }; -TEST_F(IDMAPTest, idmap_basic) { +INSTANTIATE_TEST_CASE_P(IDMAPParameters, IDMAPTest, + Values( +#ifdef MILVUS_GPU_VERSION + milvus::knowhere::IndexMode::MODE_GPU, +#endif + milvus::knowhere::IndexMode::MODE_CPU)); + +TEST_P(IDMAPTest, idmap_basic) { ASSERT_TRUE(!xb.empty()); milvus::knowhere::Config conf{{milvus::knowhere::meta::DIM, dim}, @@ -67,6 +88,13 @@ TEST_F(IDMAPTest, idmap_basic) { AssertAnns(result, nq, k); // PrintResult(result, nq, k); + if (index_mode_ == milvus::knowhere::IndexMode::MODE_GPU) { +#ifdef MILVUS_GPU_VERSION + // cpu to gpu + index_ = std::dynamic_pointer_cast(index_->CopyCpuToGpu(DEVICEID, conf)); +#endif + } + auto binaryset = index_->Serialize(); auto new_index = std::make_shared(); new_index->Load(binaryset); @@ -96,7 +124,7 @@ TEST_F(IDMAPTest, idmap_basic) { AssertVec(result_bs_3, base_dataset, xid_dataset, 1, dim, CheckMode::CHECK_NOT_EQUAL); } -TEST_F(IDMAPTest, idmap_serialize) { +TEST_P(IDMAPTest, idmap_serialize) { auto serialize = [](const std::string& filename, milvus::knowhere::BinaryPtr& bin, uint8_t* ret) { FileIOWriter writer(filename); writer(static_cast(bin->data.get()), bin->size); @@ -113,6 +141,14 @@ TEST_F(IDMAPTest, idmap_serialize) { // serialize index index_->Train(base_dataset, conf); index_->Add(base_dataset, milvus::knowhere::Config()); + + if (index_mode_ == milvus::knowhere::IndexMode::MODE_GPU) { +#ifdef MILVUS_GPU_VERSION + // cpu to gpu + index_ = std::dynamic_pointer_cast(index_->CopyCpuToGpu(DEVICEID, conf)); +#endif + } + auto re_result = index_->Query(query_dataset, conf); AssertAnns(re_result, nq, k); // PrintResult(re_result, nq, k); @@ -139,7 +175,7 @@ TEST_F(IDMAPTest, idmap_serialize) { } #ifdef MILVUS_GPU_VERSION -TEST_F(IDMAPTest, copy_test) { +TEST_P(IDMAPTest, copy_test) { ASSERT_TRUE(!xb.empty()); milvus::knowhere::Config conf{{milvus::knowhere::meta::DIM, dim},