diff --git a/CHANGELOG.md b/CHANGELOG.md
index 39ac26c370..4dab9a88cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ Please mark all change in change log and use the issue from GitHub
 
 ## Improvement
 - \#221 Refactor LOG macro
+- \#2039 Support running Milvus on SSE-only CPUs
 
 ## Task
 
diff --git a/core/src/index/CMakeLists.txt b/core/src/index/CMakeLists.txt
index 84f61c9aaa..9dc3b3bb41 100644
--- a/core/src/index/CMakeLists.txt
+++ b/core/src/index/CMakeLists.txt
@@ -79,12 +79,12 @@ endif ()
 include(ThirdPartyPackagesCore)
 
 if (CMAKE_BUILD_TYPE STREQUAL "Release")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -DELPP_THREAD_SAFE -fopenmp -mavx -mf16c -msse4 -mpopcnt")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -DELPP_THREAD_SAFE -fopenmp")
     if (KNOWHERE_GPU_VERSION)
         set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3")
     endif ()
 else ()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp -mavx -mf16c -msse4 -mpopcnt")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fPIC -DELPP_THREAD_SAFE -fopenmp")
     if (KNOWHERE_GPU_VERSION)
         set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O0 -g")
     endif ()
diff --git a/core/src/index/cmake/ThirdPartyPackagesCore.cmake b/core/src/index/cmake/ThirdPartyPackagesCore.cmake
index 96de5b8702..6148a62924 100644
--- a/core/src/index/cmake/ThirdPartyPackagesCore.cmake
+++ b/core/src/index/cmake/ThirdPartyPackagesCore.cmake
@@ -436,7 +436,7 @@ macro(build_faiss)
     set(FAISS_CONFIGURE_ARGS
             "--prefix=${FAISS_PREFIX}"
             "CFLAGS=${EP_C_FLAGS}"
-            "CXXFLAGS=${EP_CXX_FLAGS} -mavx2 -mf16c -O3"
+            "CXXFLAGS=${EP_CXX_FLAGS} -mf16c -O3"
             --without-python)
 
     if (FAISS_WITH_MKL)
diff --git a/core/src/index/thirdparty/annoy/src/annoylib.h b/core/src/index/thirdparty/annoy/src/annoylib.h
index 9f21bbcf2d..ed83629462 100644
--- a/core/src/index/thirdparty/annoy/src/annoylib.h
+++ b/core/src/index/thirdparty/annoy/src/annoylib.h
@@ -187,12 +187,14 @@ inline T euclidean_distance(const T* x, const T* y, int f) {
 //#ifdef USE_AVX
 // Horizontal single sum of 256bit vector.
+#if 0 /* use FAISS distance calculation algorithm instead */ inline float hsum256_ps_avx(__m256 v) { const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v)); const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); return _mm_cvtss_f32(x32); } +#endif template<> inline float dot(const float* x, const float *y, int f) { diff --git a/core/src/index/thirdparty/faiss/FaissHook.cpp b/core/src/index/thirdparty/faiss/FaissHook.cpp index 8f9c91a79e..3da223e9ff 100644 --- a/core/src/index/thirdparty/faiss/FaissHook.cpp +++ b/core/src/index/thirdparty/faiss/FaissHook.cpp @@ -7,8 +7,10 @@ #include #include #include +#include #include #include +#include #include #include diff --git a/core/src/index/thirdparty/faiss/Makefile b/core/src/index/thirdparty/faiss/Makefile index e04fb87da0..520e9527ef 100644 --- a/core/src/index/thirdparty/faiss/Makefile +++ b/core/src/index/thirdparty/faiss/Makefile @@ -7,6 +7,7 @@ HEADERS = $(wildcard *.h impl/*.h utils/*.h) SRC = $(wildcard *.cpp impl/*.cpp utils/*.cpp) +AVX_SRC = $(wildcard *avx.cpp impl/*avx.cpp utils/*avx.cpp) AVX512_SRC = $(wildcard *avx512.cpp impl/*avx512.cpp utils/*avx512.cpp) OBJ = $(SRC:.cpp=.o) INSTALLDIRS = $(DESTDIR)$(libdir) $(DESTDIR)$(includedir)/faiss @@ -42,6 +43,10 @@ libfaiss.$(SHAREDEXT): $(OBJ) %.o: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -c $< -o $@ +# support avx +%avx.o: %avx.cpp + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx2 -c $< -o $@ + # support avx512 %avx512.o: %avx512.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CPUFLAGS) -mavx512f -mavx512dq -mavx512bw -c $< -o $@ diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp index 53c279bc89..767bb8485f 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp @@ -11,13 +11,8 @@ #include #include - #include -#ifdef __SSE__ -#include -#endif - #include #include #include @@ -40,9 +35,6 @@ namespace faiss { * that hides the template mess. ********************************************************************/ -#ifdef __AVX__ -#define USE_AVX -#endif /******************************************************************* @@ -444,14 +436,10 @@ InvertedListScanner* ScalarQuantizer::select_InvertedListScanner if (d % 16 == 0 && support_avx512()) { return sel0_InvertedListScanner<16> (mt, this, quantizer, store_pairs, by_residual); - } -#ifdef USE_AVX - if (d % 8 == 0) { + } if (d % 8 == 0) { return sel0_InvertedListScanner<8> (mt, this, quantizer, store_pairs, by_residual); - } else -#endif - { + } else { return sel0_InvertedListScanner<1> (mt, this, quantizer, store_pairs, by_residual); } diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h index 35bb702337..a437db982d 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h @@ -11,13 +11,8 @@ #include #include - #include -#ifdef __SSE__ -#include -#endif - #include #include #include @@ -25,11 +20,6 @@ namespace faiss { -#ifdef __AVX__ -#define USE_AVX -#endif - - /******************************************************************* * Codec: converts between values in [0, 1] and an index in a code * array. 
The "i" parameter is the vector component index (not byte @@ -44,22 +34,6 @@ struct Codec8bit { static float decode_component (const uint8_t *code, int i) { return (code[i] + 0.5f) / 255.0f; } - -#ifdef USE_AVX - static __m256 decode_8_components (const uint8_t *code, int i) { - uint64_t c8 = *(uint64_t*)(code + i); - __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8)); - __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32)); - // __m256i i8 = _mm256_set_m128i(c4lo, c4hi); - __m256i i8 = _mm256_castsi128_si256 (c4lo); - i8 = _mm256_insertf128_si256 (i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps (i8); - __m256 half = _mm256_set1_ps (0.5f); - f8 += half; - __m256 one_255 = _mm256_set1_ps (1.f / 255.f); - return f8 * one_255; - } -#endif }; @@ -71,28 +45,6 @@ struct Codec4bit { static float decode_component (const uint8_t *code, int i) { return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; } - -#ifdef USE_AVX - static __m256 decode_8_components (const uint8_t *code, int i) { - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; - uint32_t c4od = (c4 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev), - _mm_set1_epi32(c4od)); - __m128i c4lo = _mm_cvtepu8_epi32 (c8); - __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4)); - __m256i i8 = _mm256_castsi128_si256 (c4lo); - i8 = _mm256_insertf128_si256 (i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps (i8); - __m256 half = _mm256_set1_ps (0.5f); - f8 += half; - __m256 one_255 = _mm256_set1_ps (1.f / 15.f); - return f8 * one_255; - } -#endif }; struct Codec6bit { @@ -138,20 +90,6 @@ struct Codec6bit { } return (bits + 0.5f) / 63.0f; } - -#ifdef USE_AVX - static __m256 decode_8_components (const uint8_t *code, int i) { - return _mm256_set_ps - (decode_component(code, i + 7), - decode_component(code, i + 6), - decode_component(code, i + 5), - decode_component(code, i + 4), - decode_component(code, i + 3), - decode_component(code, i + 2), - decode_component(code, i + 1), - decode_component(code, i + 0)); - } -#endif }; @@ -204,25 +142,6 @@ struct QuantizerTemplate: Quantizer { }; - -#ifdef USE_AVX - -template -struct QuantizerTemplate: QuantizerTemplate { - QuantizerTemplate (size_t d, const std::vector &trained): - QuantizerTemplate (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m256 xi = Codec::decode_8_components (code, i); - return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff); - } -}; - -#endif - - - template struct QuantizerTemplate: Quantizer { const size_t d; @@ -257,22 +176,6 @@ struct QuantizerTemplate: Quantizer { }; -#ifdef USE_AVX - -template -struct QuantizerTemplate: QuantizerTemplate { - QuantizerTemplate (size_t d, const std::vector &trained): - QuantizerTemplate (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m256 xi = Codec::decode_8_components (code, i); - return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i); - } -}; - -#endif - /******************************************************************* * FP16 quantizer *******************************************************************/ @@ -305,21 +208,6 @@ struct QuantizerFP16<1>: Quantizer { } }; -#ifdef USE_AVX - -template<> -struct QuantizerFP16<8>: QuantizerFP16<1> { - QuantizerFP16 (size_t d, const std::vector &trained): - QuantizerFP16<1> (d, trained) {} - - __m256 reconstruct_8_components (const 
uint8_t * code, int i) const - { - __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i)); - return _mm256_cvtph_ps (codei); - } -}; - -#endif /******************************************************************* * 8bit_direct quantizer @@ -354,23 +242,6 @@ struct Quantizer8bitDirect<1>: Quantizer { } }; -#ifdef USE_AVX - -template<> -struct Quantizer8bitDirect<8>: Quantizer8bitDirect<1> { - Quantizer8bitDirect (size_t d, const std::vector &trained): - Quantizer8bitDirect<1> (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32 - return _mm256_cvtepi32_ps (y8); // 8 * float32 - } -}; - -#endif - template Quantizer *select_quantizer_1 ( @@ -407,7 +278,6 @@ Quantizer *select_quantizer_1 ( template struct SimilarityL2 {}; - template<> struct SimilarityL2<1> { static constexpr int simdwidth = 1; @@ -441,89 +311,78 @@ struct SimilarityL2<1> { } }; - -#ifdef USE_AVX +/* as same as SimilarityL2<1>, let build pass */ template<> struct SimilarityL2<8> { - static constexpr int simdwidth = 8; + static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_L2; const float *y, *yi; explicit SimilarityL2 (const float * y): y(y) {} - __m256 accu8; - void begin_8 () { - accu8 = _mm256_setzero_ps(); + /******* scalar accumulator *******/ + + float accu; + + void begin () { + accu = 0; yi = y; } - void add_8_components (__m256 x) { - __m256 yiv = _mm256_loadu_ps (yi); - yi += 8; - __m256 tmp = yiv - x; - accu8 += tmp * tmp; + void add_component (float x) { + float tmp = *yi++ - x; + accu += tmp * tmp; } - void add_8_components_2 (__m256 x, __m256 y) { - __m256 tmp = y - x; - accu8 += tmp * tmp; + void add_component_2 (float x1, float x2) { + float tmp = x1 - x2; + accu += tmp * tmp; } - float result_8 () { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return - _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + float result () { + return accu; } }; -/* as same as SimilarityL2<8>, let build pass */ +/* as same as SimilarityL2<1>, let build pass */ template<> struct SimilarityL2<16> { - static constexpr int simdwidth = 8; + static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_L2; const float *y, *yi; explicit SimilarityL2 (const float * y): y(y) {} - __m256 accu8; - void begin_8 () { - accu8 = _mm256_setzero_ps(); + /******* scalar accumulator *******/ + + float accu; + + void begin () { + accu = 0; yi = y; } - void add_8_components (__m256 x) { - __m256 yiv = _mm256_loadu_ps (yi); - yi += 8; - __m256 tmp = yiv - x; - accu8 += tmp * tmp; + void add_component (float x) { + float tmp = *yi++ - x; + accu += tmp * tmp; } - void add_8_components_2 (__m256 x, __m256 y) { - __m256 tmp = y - x; - accu8 += tmp * tmp; + void add_component_2 (float x1, float x2) { + float tmp = x1 - x2; + accu += tmp * tmp; } - float result_8 () { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return - _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + float result () { + return accu; } }; -#endif template struct SimilarityIP {}; - template<> struct SimilarityIP<1> { static constexpr int simdwidth = 1; @@ -553,13 +412,11 @@ struct SimilarityIP<1> { } }; -#ifdef 
USE_AVX - +/* as same as SimilarityIP<1>, let build pass */ template<> struct SimilarityIP<8> { - static constexpr int simdwidth = 8; + static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; float accu; @@ -567,39 +424,29 @@ struct SimilarityIP<8> { explicit SimilarityIP (const float * y): y (y) {} - __m256 accu8; - - void begin_8 () { - accu8 = _mm256_setzero_ps(); + void begin () { + accu = 0; yi = y; } - void add_8_components (__m256 x) { - __m256 yiv = _mm256_loadu_ps (yi); - yi += 8; - accu8 += yiv * x; + void add_component (float x) { + accu += *yi++ * x; } - void add_8_components_2 (__m256 x1, __m256 x2) { - accu8 += x1 * x2; + void add_component_2 (float x1, float x2) { + accu += x1 * x2; } - float result_8 () { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return - _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + float result () { + return accu; } }; -/* as same as SimilarityIP<8>, let build pass */ +/* as same as SimilarityIP<1>, let build pass */ template<> struct SimilarityIP<16> { - static constexpr int simdwidth = 8; + static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; float accu; @@ -607,33 +454,23 @@ struct SimilarityIP<16> { explicit SimilarityIP (const float * y): y (y) {} - __m256 accu8; - - void begin_8 () { - accu8 = _mm256_setzero_ps(); + void begin () { + accu = 0; yi = y; } - void add_8_components (__m256 x) { - __m256 yiv = _mm256_loadu_ps (yi); - yi += 8; - accu8 += yiv * x; + void add_component (float x) { + accu += *yi++ * x; } - void add_8_components_2 (__m256 x1, __m256 x2) { - accu8 += x1 * x2; + void add_component_2 (float x1, float x2) { + accu += x1 * x2; } - float result_8 () { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return - _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + float result () { + return accu; } }; -#endif /******************************************************************* @@ -696,63 +533,6 @@ struct DCTemplate : SQDistanceComputer } }; -#ifdef USE_AVX - -template -struct DCTemplate : SQDistanceComputer -{ - using Sim = Similarity; - - Quantizer quant; - - DCTemplate(size_t d, const std::vector &trained): - quant(d, trained) - {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 x1 = quant.reconstruct_8_components(code1, i); - __m256 x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query (const float *x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - 
return compute_distance (q, code); - } -}; - -#endif - - /******************************************************************* * DistanceComputerByte: computes distances in the integer domain @@ -811,84 +591,12 @@ struct DistanceComputerByte : SQDistanceComputer { } }; -#ifdef USE_AVX - - -template -struct DistanceComputerByte : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte(int d, const std::vector &): d(d), tmp(d) { - } - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m256i accu = _mm256_setzero_si256 (); - for (int i = 0; i < d; i += 16) { - // load 16 bytes, convert to 16 uint16_t - __m256i c1 = _mm256_cvtepu8_epi16 - (_mm_loadu_si128((__m128i*)(code1 + i))); - __m256i c2 = _mm256_cvtepu8_epi16 - (_mm_loadu_si128((__m128i*)(code2 + i))); - __m256i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm256_madd_epi16(c1, c2); - } else { - __m256i diff = _mm256_sub_epi16(c1, c2); - prod32 = _mm256_madd_epi16(diff, diff); - } - accu = _mm256_add_epi32 (accu, prod32); - } - __m128i sum = _mm256_extractf128_si256(accu, 0); - sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1)); - sum = _mm_hadd_epi32 (sum, sum); - sum = _mm_hadd_epi32 (sum, sum); - return _mm_cvtsi128_si32 (sum); - } - - void set_query (const float *x) final { - /* - for (int i = 0; i < d; i += 8) { - __m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_code_distance (tmp.data(), code); - } -}; - -#endif /******************************************************************* * select_distance_computer: runtime selection of template * specialization *******************************************************************/ - template SQDistanceComputer *select_distance_computer ( QuantizerType qtype, diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h new file mode 100644 index 0000000000..024169f62b --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h @@ -0,0 +1,936 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#pragma once + +#include +#include + +#include + +#ifdef __SSE__ +#include +#endif + +#include +#include +#include + +namespace faiss { + + +#ifdef __AVX__ +#define USE_AVX +#endif + + +/******************************************************************* + * Codec: converts between values in [0, 1] and an index in a code + * array. The "i" parameter is the vector component index (not byte + * index). 
+ */ + +struct Codec8bit_avx { + static void encode_component (float x, uint8_t *code, int i) { + code[i] = (int)(255 * x); + } + + static float decode_component (const uint8_t *code, int i) { + return (code[i] + 0.5f) / 255.0f; + } + +#ifdef USE_AVX + static __m256 decode_8_components (const uint8_t *code, int i) { + uint64_t c8 = *(uint64_t*)(code + i); + __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8)); + __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32)); + // __m256i i8 = _mm256_set_m128i(c4lo, c4hi); + __m256i i8 = _mm256_castsi128_si256 (c4lo); + i8 = _mm256_insertf128_si256 (i8, c4hi, 1); + __m256 f8 = _mm256_cvtepi32_ps (i8); + __m256 half = _mm256_set1_ps (0.5f); + f8 += half; + __m256 one_255 = _mm256_set1_ps (1.f / 255.f); + return f8 * one_255; + } +#endif +}; + + +struct Codec4bit_avx { + static void encode_component (float x, uint8_t *code, int i) { + code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); + } + + static float decode_component (const uint8_t *code, int i) { + return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; + } + +#ifdef USE_AVX + static __m256 decode_8_components (const uint8_t *code, int i) { + uint32_t c4 = *(uint32_t*)(code + (i >> 1)); + uint32_t mask = 0x0f0f0f0f; + uint32_t c4ev = c4 & mask; + uint32_t c4od = (c4 >> 4) & mask; + + // the 8 lower bytes of c8 contain the values + __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev), + _mm_set1_epi32(c4od)); + __m128i c4lo = _mm_cvtepu8_epi32 (c8); + __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4)); + __m256i i8 = _mm256_castsi128_si256 (c4lo); + i8 = _mm256_insertf128_si256 (i8, c4hi, 1); + __m256 f8 = _mm256_cvtepi32_ps (i8); + __m256 half = _mm256_set1_ps (0.5f); + f8 += half; + __m256 one_255 = _mm256_set1_ps (1.f / 15.f); + return f8 * one_255; + } +#endif +}; + +struct Codec6bit_avx { + static void encode_component (float x, uint8_t *code, int i) { + int bits = (int)(x * 63.0); + code += (i >> 2) * 3; + switch(i & 3) { + case 0: + code[0] |= bits; + break; + case 1: + code[0] |= bits << 6; + code[1] |= bits >> 2; + break; + case 2: + code[1] |= bits << 4; + code[2] |= bits >> 4; + break; + case 3: + code[2] |= bits << 2; + break; + } + } + + static float decode_component (const uint8_t *code, int i) { + uint8_t bits; + code += (i >> 2) * 3; + switch(i & 3) { + case 0: + bits = code[0] & 0x3f; + break; + case 1: + bits = code[0] >> 6; + bits |= (code[1] & 0xf) << 2; + break; + case 2: + bits = code[1] >> 4; + bits |= (code[2] & 3) << 4; + break; + case 3: + bits = code[2] >> 2; + break; + } + return (bits + 0.5f) / 63.0f; + } + +#ifdef USE_AVX + static __m256 decode_8_components (const uint8_t *code, int i) { + return _mm256_set_ps + (decode_component(code, i + 7), + decode_component(code, i + 6), + decode_component(code, i + 5), + decode_component(code, i + 4), + decode_component(code, i + 3), + decode_component(code, i + 2), + decode_component(code, i + 1), + decode_component(code, i + 0)); + } +#endif +}; + + + +/******************************************************************* + * Quantizer: normalizes scalar vector components, then passes them + * through a codec + *******************************************************************/ + + +template +struct QuantizerTemplate_avx {}; + + +template +struct QuantizerTemplate_avx: Quantizer { + const size_t d; + const float vmin, vdiff; + + QuantizerTemplate_avx(size_t d, const std::vector &trained): + d(d), vmin(trained[0]), vdiff(trained[1]) + { + } + + void encode_vector(const float* x, uint8_t* code) const final 
{ + for (size_t i = 0; i < d; i++) { + float xi = (x[i] - vmin) / vdiff; + if (xi < 0) { + xi = 0; + } + if (xi > 1.0) { + xi = 1.0; + } + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin + xi * vdiff; + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + float xi = Codec::decode_component (code, i); + return vmin + xi * vdiff; + } +}; + + + +#ifdef USE_AVX + +template +struct QuantizerTemplate_avx: QuantizerTemplate_avx { + QuantizerTemplate_avx (size_t d, const std::vector &trained): + QuantizerTemplate_avx (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m256 xi = Codec::decode_8_components (code, i); + return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff); + } +}; + +#endif + + + +template +struct QuantizerTemplate_avx: Quantizer { + const size_t d; + const float *vmin, *vdiff; + + QuantizerTemplate_avx (size_t d, const std::vector &trained): + d(d), vmin(trained.data()), vdiff(trained.data() + d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + float xi = (x[i] - vmin[i]) / vdiff[i]; + if (xi < 0) + xi = 0; + if (xi > 1.0) + xi = 1.0; + Codec::encode_component(xi, code, i); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + float xi = Codec::decode_component(code, i); + x[i] = vmin[i] + xi * vdiff[i]; + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + float xi = Codec::decode_component (code, i); + return vmin[i] + xi * vdiff[i]; + } +}; + + +#ifdef USE_AVX + +template +struct QuantizerTemplate_avx: QuantizerTemplate_avx { + QuantizerTemplate_avx (size_t d, const std::vector &trained): + QuantizerTemplate_avx (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m256 xi = Codec::decode_8_components (code, i); + return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i); + } +}; + +#endif + +/******************************************************************* + * FP16 quantizer + *******************************************************************/ + +template +struct QuantizerFP16_avx {}; + +template<> +struct QuantizerFP16_avx<1>: Quantizer { + const size_t d; + + QuantizerFP16_avx(size_t d, const std::vector & /* unused */): + d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_fp16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_fp16(((uint16_t*)code)[i]); + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + return decode_fp16(((uint16_t*)code)[i]); + } +}; + +#ifdef USE_AVX + +template<> +struct QuantizerFP16_avx<8>: QuantizerFP16_avx<1> { + QuantizerFP16_avx (size_t d, const std::vector &trained): + QuantizerFP16_avx<1> (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i)); + return _mm256_cvtph_ps (codei); + } +}; + +#endif + +/******************************************************************* + * 8bit_direct quantizer + *******************************************************************/ + +template +struct 
Quantizer8bitDirect_avx {}; + +template<> +struct Quantizer8bitDirect_avx<1>: Quantizer { + const size_t d; + + Quantizer8bitDirect_avx(size_t d, const std::vector & /* unused */): + d(d) {} + + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + code[i] = (uint8_t)x[i]; + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = code[i]; + } + } + + float reconstruct_component (const uint8_t * code, int i) const + { + return code[i]; + } +}; + +#ifdef USE_AVX + +template<> +struct Quantizer8bitDirect_avx<8>: Quantizer8bitDirect_avx<1> { + Quantizer8bitDirect_avx (size_t d, const std::vector &trained): + Quantizer8bitDirect_avx<1> (d, trained) {} + + __m256 reconstruct_8_components (const uint8_t * code, int i) const + { + __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 + __m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32 + return _mm256_cvtepi32_ps (y8); // 8 * float32 + } +}; + +#endif + + +template +Quantizer *select_quantizer_1_avx ( + QuantizerType qtype, + size_t d, const std::vector & trained) +{ + switch(qtype) { + case QuantizerType::QT_8bit: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_6bit: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_4bit: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_8bit_uniform: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_4bit_uniform: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_fp16: + return new QuantizerFP16_avx (d, trained); + case QuantizerType::QT_8bit_direct: + return new Quantizer8bitDirect_avx (d, trained); + } + FAISS_THROW_MSG ("unknown qtype"); +} + + + +/******************************************************************* + * Similarity: gets vector components and computes a similarity wrt. a + * query vector stored in the object. The data fields just encapsulate + * an accumulator. 
+ */ + +template +struct SimilarityL2_avx {}; + + +template<> +struct SimilarityL2_avx<1> { + static constexpr int simdwidth = 1; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2_avx (const float * y): y(y) {} + + /******* scalar accumulator *******/ + + float accu; + + void begin () { + accu = 0; + yi = y; + } + + void add_component (float x) { + float tmp = *yi++ - x; + accu += tmp * tmp; + } + + void add_component_2 (float x1, float x2) { + float tmp = x1 - x2; + accu += tmp * tmp; + } + + float result () { + return accu; + } +}; + + +#ifdef USE_AVX +template<> +struct SimilarityL2_avx<8> { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2_avx (const float * y): y(y) {} + __m256 accu8; + + void begin_8 () { + accu8 = _mm256_setzero_ps(); + yi = y; + } + + void add_8_components (__m256 x) { + __m256 yiv = _mm256_loadu_ps (yi); + yi += 8; + __m256 tmp = yiv - x; + accu8 += tmp * tmp; + } + + void add_8_components_2 (__m256 x, __m256 y) { + __m256 tmp = y - x; + accu8 += tmp * tmp; + } + + float result_8 () { + __m256 sum = _mm256_hadd_ps(accu8, accu8); + __m256 sum2 = _mm256_hadd_ps(sum, sum); + // now add the 0th and 4th component + return + _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + + _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + } +}; + +/* as same as SimilarityL2<8>, let build pass */ +template<> +struct SimilarityL2_avx<16> { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_L2; + + const float *y, *yi; + + explicit SimilarityL2_avx (const float * y): y(y) {} + __m256 accu8; + + void begin_8 () { + accu8 = _mm256_setzero_ps(); + yi = y; + } + + void add_8_components (__m256 x) { + __m256 yiv = _mm256_loadu_ps (yi); + yi += 8; + __m256 tmp = yiv - x; + accu8 += tmp * tmp; + } + + void add_8_components_2 (__m256 x, __m256 y) { + __m256 tmp = y - x; + accu8 += tmp * tmp; + } + + float result_8 () { + __m256 sum = _mm256_hadd_ps(accu8, accu8); + __m256 sum2 = _mm256_hadd_ps(sum, sum); + // now add the 0th and 4th component + return + _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + + _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + } +}; +#endif + + +template +struct SimilarityIP_avx {}; + + +template<> +struct SimilarityIP_avx<1> { + static constexpr int simdwidth = 1; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + const float *y, *yi; + + float accu; + + explicit SimilarityIP_avx (const float * y): + y (y) {} + + void begin () { + accu = 0; + yi = y; + } + + void add_component (float x) { + accu += *yi++ * x; + } + + void add_component_2 (float x1, float x2) { + accu += x1 * x2; + } + + float result () { + return accu; + } +}; + +#ifdef USE_AVX + +template<> +struct SimilarityIP_avx<8> { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP_avx (const float * y): + y (y) {} + + __m256 accu8; + + void begin_8 () { + accu8 = _mm256_setzero_ps(); + yi = y; + } + + void add_8_components (__m256 x) { + __m256 yiv = _mm256_loadu_ps (yi); + yi += 8; + accu8 += yiv * x; + } + + void add_8_components_2 (__m256 x1, __m256 x2) { + accu8 += x1 * x2; + } + + float result_8 () { + __m256 sum = _mm256_hadd_ps(accu8, accu8); + __m256 sum2 = _mm256_hadd_ps(sum, sum); + // now add the 0th and 4th component + return + _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + + 
_mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + } +}; + +/* as same as SimilarityIP<8>, let build pass */ +template<> +struct SimilarityIP_avx<16> { + static constexpr int simdwidth = 8; + static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; + + const float *y, *yi; + + float accu; + + explicit SimilarityIP_avx (const float * y): + y (y) {} + + __m256 accu8; + + void begin_8 () { + accu8 = _mm256_setzero_ps(); + yi = y; + } + + void add_8_components (__m256 x) { + __m256 yiv = _mm256_loadu_ps (yi); + yi += 8; + accu8 += yiv * x; + } + + void add_8_components_2 (__m256 x1, __m256 x2) { + accu8 += x1 * x2; + } + + float result_8 () { + __m256 sum = _mm256_hadd_ps(accu8, accu8); + __m256 sum2 = _mm256_hadd_ps(sum, sum); + // now add the 0th and 4th component + return + _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + + _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); + } +}; +#endif + + +/******************************************************************* + * DistanceComputer: combines a similarity and a quantizer to do + * code-to-vector or code-to-code comparisons + *******************************************************************/ + +template +struct DCTemplate_avx : SQDistanceComputer {}; + +template +struct DCTemplate_avx : SQDistanceComputer +{ + using Sim = Similarity; + + Quantizer quant; + + DCTemplate_avx(size_t d, const std::vector &trained): + quant(d, trained) + {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float xi = quant.reconstruct_component(code, i); + sim.add_component(xi); + } + return sim.result(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin(); + for (size_t i = 0; i < quant.d; i++) { + float x1 = quant.reconstruct_component(code1, i); + float x2 = quant.reconstruct_component(code2, i); + sim.add_component_2(x1, x2); + } + return sim.result(); + } + + void set_query (const float *x) final { + q = x; + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_distance (q, code); + } +}; + +#ifdef USE_AVX + +template +struct DCTemplate_avx : SQDistanceComputer +{ + using Sim = Similarity; + + Quantizer quant; + + DCTemplate_avx(size_t d, const std::vector &trained): + quant(d, trained) + {} + + float compute_distance(const float* x, const uint8_t* code) const { + Similarity sim(x); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + __m256 xi = quant.reconstruct_8_components(code, i); + sim.add_8_components(xi); + } + return sim.result_8(); + } + + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + Similarity sim(nullptr); + sim.begin_8(); + for (size_t i = 0; i < quant.d; i += 8) { + __m256 x1 = quant.reconstruct_8_components(code1, i); + __m256 x2 = quant.reconstruct_8_components(code2, i); + sim.add_8_components_2(x1, x2); + } + return sim.result_8(); + } + + void set_query (const float *x) final { + q = x; + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return 
compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_distance (q, code); + } +}; + +#endif + + + +/******************************************************************* + * DistanceComputerByte: computes distances in the integer domain + *******************************************************************/ + +template +struct DistanceComputerByte_avx : SQDistanceComputer {}; + +template +struct DistanceComputerByte_avx : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte_avx(int d, const std::vector &): d(d), tmp(d) { + } + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + int accu = 0; + for (int i = 0; i < d; i++) { + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + accu += int(code1[i]) * code2[i]; + } else { + int diff = int(code1[i]) - code2[i]; + accu += diff * diff; + } + } + return accu; + } + + void set_query (const float *x) final { + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_code_distance (tmp.data(), code); + } +}; + +#ifdef USE_AVX + + +template +struct DistanceComputerByte_avx : SQDistanceComputer { + using Sim = Similarity; + + int d; + std::vector tmp; + + DistanceComputerByte_avx(int d, const std::vector &): d(d), tmp(d) { + } + + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) + const { + // __m256i accu = _mm256_setzero_ps (); + __m256i accu = _mm256_setzero_si256 (); + for (int i = 0; i < d; i += 16) { + // load 16 bytes, convert to 16 uint16_t + __m256i c1 = _mm256_cvtepu8_epi16 + (_mm_loadu_si128((__m128i*)(code1 + i))); + __m256i c2 = _mm256_cvtepu8_epi16 + (_mm_loadu_si128((__m128i*)(code2 + i))); + __m256i prod32; + if (Sim::metric_type == METRIC_INNER_PRODUCT) { + prod32 = _mm256_madd_epi16(c1, c2); + } else { + __m256i diff = _mm256_sub_epi16(c1, c2); + prod32 = _mm256_madd_epi16(diff, diff); + } + accu = _mm256_add_epi32 (accu, prod32); + } + __m128i sum = _mm256_extractf128_si256(accu, 0); + sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1)); + sum = _mm_hadd_epi32 (sum, sum); + sum = _mm_hadd_epi32 (sum, sum); + return _mm_cvtsi128_si32 (sum); + } + + void set_query (const float *x) final { + /* + for (int i = 0; i < d; i += 8) { + __m256 xi = _mm256_loadu_ps (x + i); + __m256i ci = _mm256_cvtps_epi32(xi); + */ + for (int i = 0; i < d; i++) { + tmp[i] = int(x[i]); + } + } + + int compute_distance(const float* x, const uint8_t* code) { + set_query(x); + return compute_code_distance(tmp.data(), code); + } + + /// compute distance of vector i to current query + float operator () (idx_t i) final { + return compute_distance (q, codes + i * code_size); + } + + float symmetric_dis (idx_t i, idx_t j) override { + return compute_code_distance (codes + i * code_size, + codes + j * code_size); + } + + float query_to_code (const uint8_t * code) const { + return compute_code_distance (tmp.data(), code); + } +}; + +#endif + 
+/******************************************************************* + * select_distance_computer: runtime selection of template + * specialization + *******************************************************************/ + + +template +SQDistanceComputer *select_distance_computer_avx ( + QuantizerType qtype, + size_t d, const std::vector & trained) +{ + constexpr int SIMDWIDTH = Sim::simdwidth; + switch(qtype) { + case QuantizerType::QT_8bit_uniform: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_4bit_uniform: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_8bit: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_6bit: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_4bit: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_fp16: + return new DCTemplate_avx + , Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_8bit_direct: + if (d % 16 == 0) { + return new DistanceComputerByte_avx(d, trained); + } else { + return new DCTemplate_avx + , Sim, SIMDWIDTH>(d, trained); + } + } + FAISS_THROW_MSG ("unknown qtype"); + return nullptr; +} + + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp index 12b6089b22..428024d70a 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp @@ -12,52 +12,10 @@ namespace faiss { -#ifdef __AVX__ -#define USE_AVX -#endif - - /******************************************************************* * ScalarQuantizer Distance Computer ********************************************************************/ -/* AVX */ -SQDistanceComputer * -sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector& trained) { -#ifdef USE_AVX - if (dim % 8 == 0) { - return select_distance_computer> (qtype, dim, trained); - } else -#endif - { - return select_distance_computer> (qtype, dim, trained); - } -} - -SQDistanceComputer * -sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector& trained) { -#ifdef USE_AVX - if (dim % 8 == 0) { - return select_distance_computer> (qtype, dim, trained); - } else -#endif - { - return select_distance_computer> (qtype, dim, trained); - } -} - -Quantizer * -sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector& trained) { -#ifdef USE_AVX - if (dim % 8 == 0) { - return select_quantizer_1<8> (qtype, dim, trained); - } else -#endif - { - return select_quantizer_1<1> (qtype, dim, trained); - } -} - /* SSE */ SQDistanceComputer * sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector& trained) { diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h index a99168b8e5..c4ce86b011 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h @@ -13,24 +13,13 @@ namespace faiss { +SQDistanceComputer * +sq_get_distance_computer_L2_sse(QuantizerType qtype, size_t dim, const std::vector& trained); SQDistanceComputer * -sq_get_distance_computer_L2_avx (QuantizerType qtype, size_t dim, const std::vector& trained); - -SQDistanceComputer * -sq_get_distance_computer_IP_avx (QuantizerType qtype, size_t dim, const std::vector& trained); 
+sq_get_distance_computer_IP_sse(QuantizerType qtype, size_t dim, const std::vector& trained); Quantizer * -sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector& trained); - - -SQDistanceComputer * -sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector& trained); - -SQDistanceComputer * -sq_get_distance_computer_IP_sse (QuantizerType qtype, size_t dim, const std::vector& trained); - -Quantizer * -sq_select_quantizer_sse (QuantizerType qtype, size_t dim, const std::vector& trained); +sq_select_quantizer_sse(QuantizerType qtype, size_t dim, const std::vector& trained); } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp new file mode 100644 index 0000000000..74bfc0878a --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp @@ -0,0 +1,46 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include +#include + +namespace faiss { + +/******************************************************************* + * ScalarQuantizer Distance Computer + ********************************************************************/ + +SQDistanceComputer * +sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector& trained) { + if (dim % 8 == 0) { + return select_distance_computer_avx>(qtype, dim, trained); + } else { + return select_distance_computer_avx>(qtype, dim, trained); + } +} + +SQDistanceComputer * +sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector& trained) { + if (dim % 8 == 0) { + return select_distance_computer_avx>(qtype, dim, trained); + } else { + return select_distance_computer_avx>(qtype, dim, trained); + } +} + +Quantizer * +sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector& trained) { + if (dim % 8 == 0) { + return select_quantizer_1_avx<8>(qtype, dim, trained); + } else { + return select_quantizer_1_avx<1> (qtype, dim, trained); + } +} + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h new file mode 100644 index 0000000000..86b9ab7db3 --- /dev/null +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h @@ -0,0 +1,27 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#pragma once + +#include +#include + +namespace faiss { + + +SQDistanceComputer * +sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector& trained); + +SQDistanceComputer * +sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector& trained); + +Quantizer * +sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector& trained); + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp index 19a9eebe45..6b2912c74c 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp @@ -12,65 +12,39 @@ namespace faiss { -#ifdef __AVX__ -#define USE_AVX -#endif - -#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__)) -#define USE_AVX_512 -#endif - /******************************************************************* * ScalarQuantizer Distance Computer ********************************************************************/ SQDistanceComputer * sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained) { -#ifdef USE_AVX_512 if (dim % 16 == 0) { return select_distance_computer_avx512> (qtype, dim, trained); - } else -#endif -#ifdef USE_AVX - if (dim % 8 == 0) { + } else if (dim % 8 == 0) { return select_distance_computer_avx512> (qtype, dim, trained); - } else -#endif - { + } else { return select_distance_computer_avx512> (qtype, dim, trained); } } SQDistanceComputer * sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained) { -#ifdef USE_AVX_512 if (dim % 16 == 0) { return select_distance_computer_avx512> (qtype, dim, trained); - } else -#endif -#ifdef USE_AVX - if (dim % 8 == 0) { + } else if (dim % 8 == 0) { return select_distance_computer_avx512> (qtype, dim, trained); - } else -#endif - { + } else { return select_distance_computer_avx512> (qtype, dim, trained); } } Quantizer * sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained) { -#ifdef USE_AVX_512 if (dim % 16 == 0) { return select_quantizer_1_avx512<16> (qtype, dim, trained); - } else -#endif -#ifdef USE_AVX - if (dim % 8 == 0) { + } else if (dim % 8 == 0) { return select_quantizer_1_avx512<8> (qtype, dim, trained); - } else -#endif - { + } else { return select_quantizer_1_avx512<1> (qtype, dim, trained); } } diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h index bf1f0817ce..308b81ecd0 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h @@ -14,15 +14,13 @@ namespace faiss { +SQDistanceComputer * +sq_get_distance_computer_L2_avx512(QuantizerType qtype, size_t dim, const std::vector& trained); SQDistanceComputer * -sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained); - -SQDistanceComputer * -sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained); +sq_get_distance_computer_IP_avx512(QuantizerType qtype, size_t dim, const std::vector& trained); Quantizer * -sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained); - +sq_select_quantizer_avx512(QuantizerType qtype, size_t dim, const std::vector& trained); } // namespace 
faiss
diff --git a/core/src/index/thirdparty/faiss/makefile.inc.in b/core/src/index/thirdparty/faiss/makefile.inc.in
index 01ff95ac1c..244f94a17c 100644
--- a/core/src/index/thirdparty/faiss/makefile.inc.in
+++ b/core/src/index/thirdparty/faiss/makefile.inc.in
@@ -7,7 +7,7 @@ CXX = @CXX@
 CXXCPP = @CXXCPP@
 CPPFLAGS = -DFINTEGER=int @CPPFLAGS@ @OPENMP_CXXFLAGS@ @NVCC_CPPFLAGS@
 CXXFLAGS = -fPIC @ARCH_CXXFLAGS@ -Wno-sign-compare @CXXFLAGS@
-CPUFLAGS = -mavx2 -mf16c @ARCH_CPUFLAGS@
+CPUFLAGS = @ARCH_CPUFLAGS@
 LDFLAGS = @OPENMP_LDFLAGS@ @LDFLAGS@ @NVCC_LDFLAGS@
 LIBS = @BLAS_LIBS@ @LAPACK_LIBS@ @LIBS@ @NVCC_LIBS@
 PYTHONCFLAGS = @PYTHON_CFLAGS@ -I@NUMPY_INCLUDE@
diff --git a/core/src/index/thirdparty/faiss/utils/distances.h b/core/src/index/thirdparty/faiss/utils/distances.h
index bc4098b050..e227c37514 100644
--- a/core/src/index/thirdparty/faiss/utils/distances.h
+++ b/core/src/index/thirdparty/faiss/utils/distances.h
@@ -24,31 +24,6 @@ namespace faiss {
  * Optimized distance/norm/inner prod computations
  *********************************************************/
 
-#ifdef __AVX__
-/// Squared L2 distance between two vectors
-float fvec_L2sqr_avx (
-        const float * x,
-        const float * y,
-        size_t d);
-
-/// inner product
-float fvec_inner_product_avx (
-        const float * x,
-        const float * y,
-        size_t d);
-
-/// L1 distance
-float fvec_L1_avx (
-        const float * x,
-        const float * y,
-        size_t d);
-
-float fvec_Linf_avx (
-        const float * x,
-        const float * y,
-        size_t d);
-#endif
-
 #ifdef __SSE__
 float fvec_L2sqr_sse (
         const float * x,
diff --git a/core/src/index/thirdparty/faiss/utils/distances_avx.h b/core/src/index/thirdparty/faiss/utils/distances_avx.h
new file mode 100644
index 0000000000..734c38ebe7
--- /dev/null
+++ b/core/src/index/thirdparty/faiss/utils/distances_avx.h
@@ -0,0 +1,32 @@
+
+// -*- c++ -*-
+
+/* All distance functions for L2 and IP distances.
+ * The actual functions are implemented in distances_simd_avx.cpp */
+
+#pragma once
+
+#include
+
+namespace faiss {
+
+/*********************************************************
+ * Optimized distance/norm/inner prod computations
+ *********************************************************/
+
+/// Squared L2 distance between two vectors
+float
+fvec_L2sqr_avx(const float* x, const float* y, size_t d);
+
+/// inner product
+float
+fvec_inner_product_avx(const float* x, const float* y, size_t d);
+
+/// L1 distance
+float
+fvec_L1_avx(const float* x, const float* y, size_t d);
+
+float
+fvec_Linf_avx(const float* x, const float* y, size_t d);
+
+} // namespace faiss
diff --git a/core/src/index/thirdparty/faiss/utils/distances_avx512.h b/core/src/index/thirdparty/faiss/utils/distances_avx512.h
index bd15e78aba..d410f3e821 100644
--- a/core/src/index/thirdparty/faiss/utils/distances_avx512.h
+++ b/core/src/index/thirdparty/faiss/utils/distances_avx512.h
@@ -15,26 +15,18 @@ namespace faiss {
  *********************************************************/
 
 /// Squared L2 distance between two vectors
-float fvec_L2sqr_avx512 (
-        const float * x,
-        const float * y,
-        size_t d);
+float
+fvec_L2sqr_avx512(const float* x, const float* y, size_t d);
 
 /// inner product
-float fvec_inner_product_avx512 (
-        const float * x,
-        const float * y,
-        size_t d);
+float
+fvec_inner_product_avx512(const float * x, const float * y, size_t d);
 
 /// L1 distance
-float fvec_L1_avx512 (
-        const float * x,
-        const float * y,
-        size_t d);
+float
+fvec_L1_avx512(const float* x, const float* y, size_t d);
 
-float fvec_Linf_avx512 (
-        const float * x,
-        const float * y,
-        size_t d);
+float
+fvec_Linf_avx512(const float* x, const float* y, size_t d);
 
 } // namespace faiss
diff --git a/core/src/index/thirdparty/faiss/utils/distances_simd.cpp b/core/src/index/thirdparty/faiss/utils/distances_simd.cpp
index 69356bb490..e33967d5e6 100644
--- a/core/src/index/thirdparty/faiss/utils/distances_simd.cpp
+++ b/core/src/index/thirdparty/faiss/utils/distances_simd.cpp
@@ -27,10 +27,6 @@
 
 namespace faiss {
 
-#ifdef __AVX__
-#define USE_AVX
-#endif
-
 /*********************************************************
  * Optimized distance computations
  *********************************************************/
@@ -313,171 +309,6 @@ void fvec_L2sqr_ny (float * dis, const float * x,
 
 #endif
 
-#if defined(USE_AVX)
-
-// reads 0 <= d < 8 floats as __m256
-static inline __m256 masked_read_8 (int d, const float *x)
-{
-    assert (0 <= d && d < 8);
-    if (d < 4) {
-        __m256 res = _mm256_setzero_ps ();
-        res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
-        return res;
-    } else {
-        __m256 res = _mm256_setzero_ps ();
-        res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
-        res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
-        return res;
-    }
-}
-
-float fvec_inner_product_avx (const float * x,
-                              const float * y,
-                              size_t d)
-{
-    __m256 msum1 = _mm256_setzero_ps();
-
-    while (d >= 8) {
-        __m256 mx = _mm256_loadu_ps (x); x += 8;
-        __m256 my = _mm256_loadu_ps (y); y += 8;
-        msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
-        d -= 8;
-    }
-
-    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
-    msum2 += _mm256_extractf128_ps(msum1, 0);
-
-    if (d >= 4) {
-        __m128 mx = _mm_loadu_ps (x); x += 4;
-        __m128 my = _mm_loadu_ps (y); y += 4;
-        msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
-        d -= 4;
-    }
-
-    if (d > 0) {
-        __m128 mx = masked_read (d, x);
-        __m128 my = masked_read (d, y);
-        msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
-    }
-
-    msum2 = 
_mm_hadd_ps (msum2, msum2); - msum2 = _mm_hadd_ps (msum2, msum2); - return _mm_cvtss_f32 (msum2); -} - -float fvec_L2sqr_avx (const float * x, - const float * y, - size_t d) -{ - __m256 msum1 = _mm256_setzero_ps(); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps (x); x += 8; - __m256 my = _mm256_loadu_ps (y); y += 8; - const __m256 a_m_b1 = mx - my; - msum1 += a_m_b1 * a_m_b1; - d -= 8; - } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 += _mm256_extractf128_ps(msum1, 0); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps (x); x += 4; - __m128 my = _mm_loadu_ps (y); y += 4; - const __m128 a_m_b1 = mx - my; - msum2 += a_m_b1 * a_m_b1; - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read (d, x); - __m128 my = masked_read (d, y); - __m128 a_m_b1 = mx - my; - msum2 += a_m_b1 * a_m_b1; - } - - msum2 = _mm_hadd_ps (msum2, msum2); - msum2 = _mm_hadd_ps (msum2, msum2); - return _mm_cvtss_f32 (msum2); -} - -float fvec_L1_avx (const float * x, const float * y, size_t d) -{ - __m256 msum1 = _mm256_setzero_ps(); - __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL)); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps (x); x += 8; - __m256 my = _mm256_loadu_ps (y); y += 8; - const __m256 a_m_b = mx - my; - msum1 += _mm256_and_ps(signmask, a_m_b); - d -= 8; - } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 += _mm256_extractf128_ps(msum1, 0); - __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL)); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps (x); x += 4; - __m128 my = _mm_loadu_ps (y); y += 4; - const __m128 a_m_b = mx - my; - msum2 += _mm_and_ps(signmask2, a_m_b); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read (d, x); - __m128 my = masked_read (d, y); - __m128 a_m_b = mx - my; - msum2 += _mm_and_ps(signmask2, a_m_b); - } - - msum2 = _mm_hadd_ps (msum2, msum2); - msum2 = _mm_hadd_ps (msum2, msum2); - return _mm_cvtss_f32 (msum2); -} - -float fvec_Linf_avx (const float * x, const float * y, size_t d) -{ - __m256 msum1 = _mm256_setzero_ps(); - __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL)); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps (x); x += 8; - __m256 my = _mm256_loadu_ps (y); y += 8; - const __m256 a_m_b = mx - my; - msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b)); - d -= 8; - } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0)); - __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL)); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps (x); x += 4; - __m128 my = _mm_loadu_ps (y); y += 4; - const __m128 a_m_b = mx - my; - msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b)); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read (d, x); - __m128 my = masked_read (d, y); - __m128 a_m_b = mx - my; - msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b)); - } - - msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2); - msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1)); - return _mm_cvtss_f32 (msum2); -} - -#endif /* defined(USE_AVX) */ - #if defined(__SSE__) // But not AVX float fvec_L1_sse (const float * x, const float * y, size_t d) diff --git a/core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp b/core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp new file mode 100644 index 0000000000..4a3c83a89e --- /dev/null +++ b/core/src/index/thirdparty/faiss/utils/distances_simd_avx.cpp @@ -0,0 +1,213 @@ + +// -*- c++ -*- + +#include +#include + +#include +#include +#include +#include + +#include + +namespace faiss { + +#ifdef __SSE__ +// 
reads 0 <= d < 4 floats as __m128 +static inline __m128 masked_read (int d, const float *x) { + assert (0 <= d && d < 4); + __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0}; + switch (d) { + case 3: + buf[2] = x[2]; + case 2: + buf[1] = x[1]; + case 1: + buf[0] = x[0]; + } + return _mm_load_ps(buf); + // cannot use AVX2 _mm_mask_set1_epi32 +} +#endif + +#ifdef __AVX__ + +// reads 0 <= d < 8 floats as __m256 +static inline __m256 masked_read_8 (int d, const float* x) { + assert (0 <= d && d < 8); + if (d < 4) { + __m256 res = _mm256_setzero_ps (); + res = _mm256_insertf128_ps (res, masked_read (d, x), 0); + return res; + } else { + __m256 res = _mm256_setzero_ps (); + res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0); + res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1); + return res; + } +} + +float fvec_inner_product_avx (const float* x, const float* y, size_t d) { + __m256 msum1 = _mm256_setzero_ps(); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my)); + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 += _mm256_extractf128_ps(msum1, 0); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my)); + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my)); + } + + msum2 = _mm_hadd_ps (msum2, msum2); + msum2 = _mm_hadd_ps (msum2, msum2); + return _mm_cvtss_f32 (msum2); +} + +float fvec_L2sqr_avx (const float* x, const float* y, size_t d) { + __m256 msum1 = _mm256_setzero_ps(); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + const __m256 a_m_b1 = mx - my; + msum1 += a_m_b1 * a_m_b1; + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 += _mm256_extractf128_ps(msum1, 0); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b1 = mx - my; + msum2 += a_m_b1 * a_m_b1; + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b1 = mx - my; + msum2 += a_m_b1 * a_m_b1; + } + + msum2 = _mm_hadd_ps (msum2, msum2); + msum2 = _mm_hadd_ps (msum2, msum2); + return _mm_cvtss_f32 (msum2); +} + +float fvec_L1_avx (const float * x, const float * y, size_t d) +{ + __m256 msum1 = _mm256_setzero_ps(); + __m256 signmask = __m256(_mm256_set1_epi32 (0x7fffffffUL)); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + const __m256 a_m_b = mx - my; + msum1 += _mm256_and_ps(signmask, a_m_b); + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 += _mm256_extractf128_ps(msum1, 0); + __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL)); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b = mx - my; + msum2 += _mm_and_ps(signmask2, a_m_b); + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b = mx - my; + msum2 += _mm_and_ps(signmask2, a_m_b); + } + + msum2 = _mm_hadd_ps (msum2, msum2); + msum2 = _mm_hadd_ps (msum2, msum2); + return _mm_cvtss_f32 (msum2); +} + +float fvec_Linf_avx (const float* x, const float* y, size_t d) { + __m256 msum1 = _mm256_setzero_ps(); + __m256 signmask = 
__m256(_mm256_set1_epi32 (0x7fffffffUL)); + + while (d >= 8) { + __m256 mx = _mm256_loadu_ps (x); x += 8; + __m256 my = _mm256_loadu_ps (y); y += 8; + const __m256 a_m_b = mx - my; + msum1 = _mm256_max_ps(msum1, _mm256_and_ps(signmask, a_m_b)); + d -= 8; + } + + __m128 msum2 = _mm256_extractf128_ps(msum1, 1); + msum2 = _mm_max_ps (msum2, _mm256_extractf128_ps(msum1, 0)); + __m128 signmask2 = __m128(_mm_set1_epi32 (0x7fffffffUL)); + + if (d >= 4) { + __m128 mx = _mm_loadu_ps (x); x += 4; + __m128 my = _mm_loadu_ps (y); y += 4; + const __m128 a_m_b = mx - my; + msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b)); + d -= 4; + } + + if (d > 0) { + __m128 mx = masked_read (d, x); + __m128 my = masked_read (d, y); + __m128 a_m_b = mx - my; + msum2 = _mm_max_ps(msum2, _mm_and_ps(signmask2, a_m_b)); + } + + msum2 = _mm_max_ps(_mm_movehl_ps(msum2, msum2), msum2); + msum2 = _mm_max_ps(msum2, _mm_shuffle_ps (msum2, msum2, 1)); + return _mm_cvtss_f32 (msum2); +} + +#else + +float fvec_inner_product_avx(const float* x, const float* y, size_t d) { + FAISS_ASSERT(false); + return 0.0; +} + +float fvec_L2sqr_avx(const float* x, const float* y, size_t d) { + FAISS_ASSERT(false); + return 0.0; +} + +float fvec_L1_avx(const float* x, const float* y, size_t d) { + FAISS_ASSERT(false); + return 0.0; +} + +float fvec_Linf_avx (const float* x, const float* y, size_t d) { + FAISS_ASSERT(false); + return 0.0; +} + +#endif + +} // namespace faiss diff --git a/core/src/index/thirdparty/faiss/utils/distances_simd_avx512.cpp b/core/src/index/thirdparty/faiss/utils/distances_simd_avx512.cpp index adc1c1738a..a73d9b7da9 100644 --- a/core/src/index/thirdparty/faiss/utils/distances_simd_avx512.cpp +++ b/core/src/index/thirdparty/faiss/utils/distances_simd_avx512.cpp @@ -2,7 +2,6 @@ // -*- c++ -*- #include -#include #include #include @@ -34,10 +33,8 @@ static inline __m128 masked_read (int d, const float *x) { #if (defined(__AVX512F__) && defined(__AVX512DQ__)) -float fvec_inner_product_avx512 (const float * x, - const float * y, - size_t d) -{ +float +fvec_inner_product_avx512(const float* x, const float* y, size_t d) { __m512 msum0 = _mm512_setzero_ps(); while (d >= 16) { @@ -78,10 +75,8 @@ float fvec_inner_product_avx512 (const float * x, return _mm_cvtss_f32 (msum2); } -float fvec_L2sqr_avx512 (const float * x, - const float * y, - size_t d) -{ +float +fvec_L2sqr_avx512(const float* x, const float* y, size_t d) { __m512 msum0 = _mm512_setzero_ps(); while (d >= 16) { @@ -126,8 +121,8 @@ float fvec_L2sqr_avx512 (const float * x, return _mm_cvtss_f32 (msum2); } -float fvec_L1_avx512 (const float * x, const float * y, size_t d) -{ +float +fvec_L1_avx512(const float* x, const float* y, size_t d) { __m512 msum0 = _mm512_setzero_ps(); __m512 signmask0 = __m512(_mm512_set1_epi32 (0x7fffffffUL)); @@ -175,8 +170,8 @@ float fvec_L1_avx512 (const float * x, const float * y, size_t d) return _mm_cvtss_f32 (msum2); } -float fvec_Linf_avx512 (const float * x, const float * y, size_t d) -{ +float +fvec_Linf_avx512(const float* x, const float* y, size_t d) { __m512 msum0 = _mm512_setzero_ps(); __m512 signmask0 = __m512(_mm512_set1_epi32 (0x7fffffffUL)); @@ -226,30 +221,26 @@ float fvec_Linf_avx512 (const float * x, const float * y, size_t d) #else -float fvec_inner_product_avx512 (const float * x, - const float * y, - size_t d) -{ +float +fvec_inner_product_avx512(const float* x, const float* y, size_t d) { FAISS_ASSERT(false); return 0.0; } -float fvec_L2sqr_avx512 (const float * x, - const float * y, - size_t d) -{ +float 
+fvec_L2sqr_avx512(const float* x, const float* y, size_t d) { FAISS_ASSERT(false); return 0.0; } -float fvec_L1_avx512 (const float * x, const float * y, size_t d) -{ +float +fvec_L1_avx512(const float* x, const float* y, size_t d) { FAISS_ASSERT(false); return 0.0; } -float fvec_Linf_avx512 (const float * x, const float * y, size_t d) -{ +float +fvec_Linf_avx512(const float* x, const float* y, size_t d) { FAISS_ASSERT(false); return 0.0; }