From 8254e9ed67242ac4b85e07dc04ac0d841d25e8ec Mon Sep 17 00:00:00 2001 From: Cai Yudong Date: Sat, 6 Jun 2020 15:41:57 +0800 Subject: [PATCH] fix performance (#2499) * optimize sq_get_distance_computer Signed-off-by: yudong.cai * add sq_select_inverted_list_scanner_ref Signed-off-by: yudong.cai * add sq_select_inverted_list_scanner_avx Signed-off-by: yudong.cai * add sq_select_inverted_list_scanner_avx512 Signed-off-by: yudong.cai * optimize Codec Signed-off-by: yudong.cai * optimize ScalarQuantizerCodec_avx.h Signed-off-by: yudong.cai * code format Signed-off-by: yudong.cai * optimize ScalarQuantizerCodec_avx512.h Signed-off-by: yudong.cai * update changelog Signed-off-by: yudong.cai * clean code Signed-off-by: yudong.cai --- CHANGELOG.md | 1 + core/src/index/thirdparty/faiss/FaissHook.cpp | 21 +- core/src/index/thirdparty/faiss/FaissHook.h | 14 +- .../thirdparty/faiss/impl/ScalarQuantizer.cpp | 274 +----- .../thirdparty/faiss/impl/ScalarQuantizer.h | 160 ++++ .../faiss/impl/ScalarQuantizerCodec.h | 119 ++- .../faiss/impl/ScalarQuantizerCodec_avx.h | 630 ++++-------- .../faiss/impl/ScalarQuantizerCodec_avx512.h | 897 ++++-------------- .../faiss/impl/ScalarQuantizerDC.cpp | 20 +- .../thirdparty/faiss/impl/ScalarQuantizerDC.h | 26 +- .../faiss/impl/ScalarQuantizerDC_avx.cpp | 36 +- .../faiss/impl/ScalarQuantizerDC_avx.h | 26 +- .../faiss/impl/ScalarQuantizerDC_avx512.cpp | 43 +- .../faiss/impl/ScalarQuantizerDC_avx512.h | 25 +- 14 files changed, 741 insertions(+), 1551 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index abd700992f..b80e2c4fbf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Please mark all change in change log and use the issue from GitHub ## Improvement - \#2381 Upgrade FAISS to 1.6.3 +- \#2429 Fix Milvus 0.9.1 performance degrade issue - \#2441 Improve Knowhere code coverage - \#2466 optimize k-selection implemention of faiss gpu version - \#2495 Add creating lock file failure reason. diff --git a/core/src/index/thirdparty/faiss/FaissHook.cpp b/core/src/index/thirdparty/faiss/FaissHook.cpp index 0c30a5ceb8..e20ab37e55 100644 --- a/core/src/index/thirdparty/faiss/FaissHook.cpp +++ b/core/src/index/thirdparty/faiss/FaissHook.cpp @@ -26,10 +26,9 @@ fvec_func_ptr fvec_L2sqr = fvec_L2sqr_avx; fvec_func_ptr fvec_L1 = fvec_L1_avx; fvec_func_ptr fvec_Linf = fvec_Linf_avx; -sq_get_func_ptr sq_get_distance_computer_L2 = sq_get_distance_computer_L2_avx; -sq_get_func_ptr sq_get_distance_computer_IP = sq_get_distance_computer_IP_avx; -sq_sel_func_ptr sq_sel_quantizer = sq_select_quantizer_avx; - +sq_get_distance_computer_func_ptr sq_get_distance_computer = sq_get_distance_computer_avx; +sq_sel_quantizer_func_ptr sq_sel_quantizer = sq_select_quantizer_avx; +sq_sel_inv_list_scanner_func_ptr sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx; /*****************************************************************************/ @@ -68,9 +67,9 @@ bool hook_init(std::string& cpu_flag) { fvec_Linf = fvec_Linf_avx512; /* for IVFSQ */ - sq_get_distance_computer_L2 = sq_get_distance_computer_L2_avx512; - sq_get_distance_computer_IP = sq_get_distance_computer_IP_avx512; + sq_get_distance_computer = sq_get_distance_computer_avx512; sq_sel_quantizer = sq_select_quantizer_avx512; + sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx512; cpu_flag = "AVX512"; } else if (support_avx2()) { @@ -81,9 +80,9 @@ bool hook_init(std::string& cpu_flag) { fvec_Linf = fvec_Linf_avx; /* for IVFSQ */ - sq_get_distance_computer_L2 = sq_get_distance_computer_L2_avx; - sq_get_distance_computer_IP = sq_get_distance_computer_IP_avx; + sq_get_distance_computer = sq_get_distance_computer_avx; sq_sel_quantizer = sq_select_quantizer_avx; + sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_avx; cpu_flag = "AVX2"; } else if (support_sse()) { @@ -94,9 +93,9 @@ bool hook_init(std::string& cpu_flag) { fvec_Linf = fvec_Linf_sse; /* for IVFSQ */ - sq_get_distance_computer_L2 = sq_get_distance_computer_L2_sse; - sq_get_distance_computer_IP = sq_get_distance_computer_IP_sse; - sq_sel_quantizer = sq_select_quantizer_sse; + sq_get_distance_computer = sq_get_distance_computer_ref; + sq_sel_quantizer = sq_select_quantizer_ref; + sq_sel_inv_list_scanner = sq_select_inverted_list_scanner_ref; cpu_flag = "SSE42"; } else { diff --git a/core/src/index/thirdparty/faiss/FaissHook.h b/core/src/index/thirdparty/faiss/FaissHook.h index dfd25a9d3a..f1aa98f606 100644 --- a/core/src/index/thirdparty/faiss/FaissHook.h +++ b/core/src/index/thirdparty/faiss/FaissHook.h @@ -6,15 +6,17 @@ #include #include #include +#include #include +#include namespace faiss { typedef float (*fvec_func_ptr)(const float*, const float*, size_t); -typedef SQDistanceComputer* (*sq_get_func_ptr)(QuantizerType, size_t, const std::vector&); -typedef Quantizer* (*sq_sel_func_ptr)(QuantizerType, size_t, const std::vector&); - +typedef SQDistanceComputer* (*sq_get_distance_computer_func_ptr)(MetricType, QuantizerType, size_t, const std::vector&); +typedef Quantizer* (*sq_sel_quantizer_func_ptr)(QuantizerType, size_t, const std::vector&); +typedef InvertedListScanner* (*sq_sel_inv_list_scanner_func_ptr)(MetricType, const ScalarQuantizer*, const Index*, size_t, bool, bool); extern bool faiss_use_avx512; extern bool faiss_use_avx2; @@ -25,9 +27,9 @@ extern fvec_func_ptr fvec_L2sqr; extern fvec_func_ptr fvec_L1; extern fvec_func_ptr fvec_Linf; -extern sq_get_func_ptr sq_get_distance_computer_L2; -extern sq_get_func_ptr sq_get_distance_computer_IP; -extern sq_sel_func_ptr sq_sel_quantizer; +extern sq_get_distance_computer_func_ptr sq_get_distance_computer; +extern sq_sel_quantizer_func_ptr sq_sel_quantizer; +extern sq_sel_inv_list_scanner_func_ptr sq_sel_inv_list_scanner; extern bool support_avx512(); extern bool support_avx2(); diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp index 767bb8485f..54f0903c4b 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.cpp @@ -16,7 +16,6 @@ #include #include #include -#include namespace faiss { @@ -160,11 +159,7 @@ ScalarQuantizer::get_distance_computer (MetricType metric) const { FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); /* use hook to decide use AVX512 or not */ - if (metric == METRIC_L2) { - return sq_get_distance_computer_L2(qtype, d, trained); - } else { - return sq_get_distance_computer_IP(qtype, d, trained); - } + return sq_get_distance_computer(metric, qtype, d, trained); } @@ -175,276 +170,13 @@ ScalarQuantizer::get_distance_computer (MetricType metric) const * IndexScalarQuantizer as well. ********************************************************************/ -namespace { - -template -struct IVFSQScannerIP: InvertedListScanner { - DCClass dc; - bool store_pairs, by_residual; - - size_t code_size; - - idx_t list_no; /// current list (set to 0 for Flat index - float accu0; /// added to all distances - - IVFSQScannerIP(int d, const std::vector & trained, - size_t code_size, bool store_pairs, - bool by_residual): - dc(d, trained), store_pairs(store_pairs), - by_residual(by_residual), - code_size(code_size), list_no(0), accu0(0) - {} - - - void set_query (const float *query) override { - dc.set_query (query); - } - - void set_list (idx_t list_no, float coarse_dis) override { - this->list_no = list_no; - accu0 = by_residual ? coarse_dis : 0; - } - - float distance_to_code (const uint8_t *code) const final { - return accu0 + dc.query_to_code (code); - } - - size_t scan_codes (size_t list_size, - const uint8_t *codes, - const idx_t *ids, - float *simi, idx_t *idxi, - size_t k, - ConcurrentBitsetPtr bitset) const override - { - size_t nup = 0; - - for (size_t j = 0; j < list_size; j++) { - if(!bitset || !bitset->test(ids[j])){ - float accu = accu0 + dc.query_to_code (codes); - - if (accu > simi [0]) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - minheap_swap_top (k, simi, idxi, accu, id); - nup++; - } - } - codes += code_size; - } - return nup; - } - - void scan_codes_range (size_t list_size, - const uint8_t *codes, - const idx_t *ids, - float radius, - RangeQueryResult & res, - ConcurrentBitsetPtr bitset = nullptr) const override - { - for (size_t j = 0; j < list_size; j++) { - float accu = accu0 + dc.query_to_code (codes); - if (accu > radius) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - res.add (accu, id); - } - codes += code_size; - } - } -}; - - -template -struct IVFSQScannerL2: InvertedListScanner { - DCClass dc; - - bool store_pairs, by_residual; - size_t code_size; - const Index *quantizer; - idx_t list_no; /// current inverted list - const float *x; /// current query - - std::vector tmp; - - IVFSQScannerL2(int d, const std::vector & trained, - size_t code_size, const Index *quantizer, - bool store_pairs, bool by_residual): - dc(d, trained), store_pairs(store_pairs), by_residual(by_residual), - code_size(code_size), quantizer(quantizer), - list_no (0), x (nullptr), tmp (d) - { - } - - - void set_query (const float *query) override { - x = query; - if (!quantizer) { - dc.set_query (query); - } - } - - - void set_list (idx_t list_no, float /*coarse_dis*/) override { - if (by_residual) { - this->list_no = list_no; - // shift of x_in wrt centroid - quantizer->Index::compute_residual (x, tmp.data(), list_no); - dc.set_query (tmp.data ()); - } else { - dc.set_query (x); - } - } - - float distance_to_code (const uint8_t *code) const final { - return dc.query_to_code (code); - } - - size_t scan_codes (size_t list_size, - const uint8_t *codes, - const idx_t *ids, - float *simi, idx_t *idxi, - size_t k, - ConcurrentBitsetPtr bitset) const override - { - size_t nup = 0; - for (size_t j = 0; j < list_size; j++) { - if(!bitset || !bitset->test(ids[j])){ - float dis = dc.query_to_code (codes); - - if (dis < simi [0]) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - maxheap_swap_top (k, simi, idxi, dis, id); - nup++; - } - } - codes += code_size; - } - return nup; - } - - void scan_codes_range (size_t list_size, - const uint8_t *codes, - const idx_t *ids, - float radius, - RangeQueryResult & res, - ConcurrentBitsetPtr bitset = nullptr) const override - { - for (size_t j = 0; j < list_size; j++) { - float dis = dc.query_to_code (codes); - if (dis < radius) { - int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; - res.add (dis, id); - } - codes += code_size; - } - } -}; - -template -InvertedListScanner* sel2_InvertedListScanner - (const ScalarQuantizer *sq, - const Index *quantizer, bool store_pairs, bool r) -{ - if (DCClass::Sim::metric_type == METRIC_L2) { - return new IVFSQScannerL2(sq->d, sq->trained, sq->code_size, - quantizer, store_pairs, r); - } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { - return new IVFSQScannerIP(sq->d, sq->trained, sq->code_size, - store_pairs, r); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - -template -InvertedListScanner* sel12_InvertedListScanner - (const ScalarQuantizer *sq, - const Index *quantizer, bool store_pairs, bool r) -{ - constexpr int SIMDWIDTH = Similarity::simdwidth; - using QuantizerClass = QuantizerTemplate; - using DCClass = DCTemplate; - return sel2_InvertedListScanner (sq, quantizer, store_pairs, r); -} - - -template -InvertedListScanner* sel1_InvertedListScanner - (const ScalarQuantizer *sq, const Index *quantizer, - bool store_pairs, bool r) -{ - constexpr int SIMDWIDTH = Similarity::simdwidth; - switch(sq->qtype) { - case QuantizerType::QT_8bit_uniform: - return sel12_InvertedListScanner - (sq, quantizer, store_pairs, r); - case QuantizerType::QT_4bit_uniform: - return sel12_InvertedListScanner - (sq, quantizer, store_pairs, r); - case QuantizerType::QT_8bit: - return sel12_InvertedListScanner - (sq, quantizer, store_pairs, r); - case QuantizerType::QT_4bit: - return sel12_InvertedListScanner - (sq, quantizer, store_pairs, r); - case QuantizerType::QT_6bit: - return sel12_InvertedListScanner - (sq, quantizer, store_pairs, r); - case QuantizerType::QT_fp16: - return sel2_InvertedListScanner - , Similarity, SIMDWIDTH> > - (sq, quantizer, store_pairs, r); - case QuantizerType::QT_8bit_direct: - if (sq->d % 16 == 0) { - return sel2_InvertedListScanner - > - (sq, quantizer, store_pairs, r); - } else { - return sel2_InvertedListScanner - , - Similarity, SIMDWIDTH> > - (sq, quantizer, store_pairs, r); - } - } - - FAISS_THROW_MSG ("unknown qtype"); - return nullptr; -} - -template -InvertedListScanner* sel0_InvertedListScanner - (MetricType mt, const ScalarQuantizer *sq, - const Index *quantizer, bool store_pairs, bool by_residual) -{ - if (mt == METRIC_L2) { - return sel1_InvertedListScanner > - (sq, quantizer, store_pairs, by_residual); - } else if (mt == METRIC_INNER_PRODUCT) { - return sel1_InvertedListScanner > - (sq, quantizer, store_pairs, by_residual); - } else { - FAISS_THROW_MSG("unsupported metric type"); - } -} - - -} // anonymous namespace - - InvertedListScanner* ScalarQuantizer::select_InvertedListScanner (MetricType mt, const Index *quantizer, bool store_pairs, bool by_residual) const { - if (d % 16 == 0 && support_avx512()) { - return sel0_InvertedListScanner<16> - (mt, this, quantizer, store_pairs, by_residual); - } if (d % 8 == 0) { - return sel0_InvertedListScanner<8> - (mt, this, quantizer, store_pairs, by_residual); - } else { - return sel0_InvertedListScanner<1> - (mt, this, quantizer, store_pairs, by_residual); - } + /* use hook to decide use AVX512 or not */ + return sq_sel_inv_list_scanner(mt, this, quantizer, d, store_pairs, by_residual); } - } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h index a8f8c46d5c..cb447c603b 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizer.h @@ -77,6 +77,166 @@ struct ScalarQuantizer { }; +template +struct IVFSQScannerIP: InvertedListScanner { + DCClass dc; + bool store_pairs, by_residual; + + size_t code_size; + + idx_t list_no; /// current list (set to 0 for Flat index + float accu0; /// added to all distances + + IVFSQScannerIP(int d, const std::vector & trained, + size_t code_size, bool store_pairs, + bool by_residual): + dc(d, trained), store_pairs(store_pairs), + by_residual(by_residual), + code_size(code_size), list_no(0), accu0(0) + {} + + + void set_query (const float *query) override { + dc.set_query (query); + } + + void set_list (idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + accu0 = by_residual ? coarse_dis : 0; + } + + float distance_to_code (const uint8_t *code) const final { + return accu0 + dc.query_to_code (code); + } + + size_t scan_codes (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float *simi, idx_t *idxi, + size_t k, + ConcurrentBitsetPtr bitset) const override + { + size_t nup = 0; + + for (size_t j = 0; j < list_size; j++) { + if(!bitset || !bitset->test(ids[j])){ + float accu = accu0 + dc.query_to_code (codes); + + if (accu > simi [0]) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + minheap_swap_top (k, simi, idxi, accu, id); + nup++; + } + } + codes += code_size; + } + return nup; + } + + void scan_codes_range (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & res, + ConcurrentBitsetPtr bitset = nullptr) const override + { + for (size_t j = 0; j < list_size; j++) { + float accu = accu0 + dc.query_to_code (codes); + if (accu > radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add (accu, id); + } + codes += code_size; + } + } +}; + + +template +struct IVFSQScannerL2: InvertedListScanner { + DCClass dc; + + bool store_pairs, by_residual; + size_t code_size; + const Index *quantizer; + idx_t list_no; /// current inverted list + const float *x; /// current query + + std::vector tmp; + + IVFSQScannerL2(int d, const std::vector & trained, + size_t code_size, const Index *quantizer, + bool store_pairs, bool by_residual): + dc(d, trained), store_pairs(store_pairs), by_residual(by_residual), + code_size(code_size), quantizer(quantizer), + list_no (0), x (nullptr), tmp (d) + { + } + + + void set_query (const float *query) override { + x = query; + if (!quantizer) { + dc.set_query (query); + } + } + + + void set_list (idx_t list_no, float /*coarse_dis*/) override { + if (by_residual) { + this->list_no = list_no; + // shift of x_in wrt centroid + quantizer->Index::compute_residual (x, tmp.data(), list_no); + dc.set_query (tmp.data ()); + } else { + dc.set_query (x); + } + } + + float distance_to_code (const uint8_t *code) const final { + return dc.query_to_code (code); + } + + size_t scan_codes (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float *simi, idx_t *idxi, + size_t k, + ConcurrentBitsetPtr bitset) const override + { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + if(!bitset || !bitset->test(ids[j])){ + float dis = dc.query_to_code (codes); + + if (dis < simi [0]) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + maxheap_swap_top (k, simi, idxi, dis, id); + nup++; + } + } + codes += code_size; + } + return nup; + } + + void scan_codes_range (size_t list_size, + const uint8_t *codes, + const idx_t *ids, + float radius, + RangeQueryResult & res, + ConcurrentBitsetPtr bitset = nullptr) const override + { + for (size_t j = 0; j < list_size; j++) { + float dis = dc.query_to_code (codes); + if (dis < radius) { + int64_t id = store_pairs ? (list_no << 32 | j) : ids[j]; + res.add (dis, id); + } + codes += code_size; + } + } +}; } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h index 5baeead978..38abdc7e74 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec.h @@ -15,6 +15,7 @@ #include #include +#include #include namespace faiss { @@ -311,22 +312,6 @@ struct SimilarityL2<1> { } }; -/* as same as SimilarityL2<1>, let build pass */ -template<> -struct SimilarityL2<8> : SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - explicit SimilarityL2 (const float * y) : SimilarityL2<1>(y) {} -}; - -/* as same as SimilarityL2<1>, let build pass */ -template<> -struct SimilarityL2<16> : SimilarityL2<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_L2; - explicit SimilarityL2 (const float * y) : SimilarityL2<1>(y) {} -}; - template struct SimilarityIP {}; @@ -360,22 +345,6 @@ struct SimilarityIP<1> { } }; -/* as same as SimilarityIP<1>, let build pass */ -template<> -struct SimilarityIP<8> : SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - explicit SimilarityIP (const float * y) : SimilarityIP<1>(y) {} -}; - -/* as same as SimilarityIP<1>, let build pass */ -template<> -struct SimilarityIP<16> : SimilarityIP<1> { - static constexpr int simdwidth = 1; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - explicit SimilarityIP (const float * y) : SimilarityIP<1>(y) {} -}; - /******************************************************************* * DistanceComputer: combines a similarity and a quantizer to do @@ -544,5 +513,91 @@ SQDistanceComputer *select_distance_computer ( return nullptr; } +template +InvertedListScanner* sel2_InvertedListScanner ( + const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + if (DCClass::Sim::metric_type == METRIC_L2) { + return new IVFSQScannerL2(sq->d, sq->trained, sq->code_size, + quantizer, store_pairs, r); + } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { + return new IVFSQScannerIP(sq->d, sq->trained, sq->code_size, + store_pairs, r); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + +template +InvertedListScanner* sel12_InvertedListScanner ( + const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + using QuantizerClass = QuantizerTemplate; + using DCClass = DCTemplate; + return sel2_InvertedListScanner (sq, quantizer, store_pairs, r); +} + + +template +InvertedListScanner* sel1_InvertedListScanner ( + const ScalarQuantizer *sq, const Index *quantizer, + bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + switch(sq->qtype) { + case QuantizerType::QT_8bit_uniform: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_4bit_uniform: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_8bit: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_4bit: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_6bit: + return sel12_InvertedListScanner + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_fp16: + return sel2_InvertedListScanner + , Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_8bit_direct: + if (sq->d % 16 == 0) { + return sel2_InvertedListScanner + > + (sq, quantizer, store_pairs, r); + } else { + return sel2_InvertedListScanner + , + Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + } + } + + FAISS_THROW_MSG ("unknown qtype"); + return nullptr; +} + +template +InvertedListScanner* sel0_InvertedListScanner ( + MetricType mt, const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool by_residual) +{ + if (mt == METRIC_L2) { + return sel1_InvertedListScanner > + (sq, quantizer, store_pairs, by_residual); + } else if (mt == METRIC_INNER_PRODUCT) { + return sel1_InvertedListScanner > + (sq, quantizer, store_pairs, by_residual); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h index e9d2791b02..e361376b5f 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx.h @@ -11,41 +11,24 @@ #include #include - #include - -#ifdef __SSE__ #include -#endif #include #include +#include +#include #include namespace faiss { - -#ifdef __AVX__ -#define USE_AVX -#endif - - /******************************************************************* * Codec: converts between values in [0, 1] and an index in a code * array. The "i" parameter is the vector component index (not byte * index). */ -struct Codec8bit_avx { - static void encode_component (float x, uint8_t *code, int i) { - code[i] = (int)(255 * x); - } - - static float decode_component (const uint8_t *code, int i) { - return (code[i] + 0.5f) / 255.0f; - } - -#ifdef USE_AVX +struct Codec8bit_avx : public Codec8bit { static __m256 decode_8_components (const uint8_t *code, int i) { uint64_t c8 = *(uint64_t*)(code + i); __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8)); @@ -59,20 +42,9 @@ struct Codec8bit_avx { __m256 one_255 = _mm256_set1_ps (1.f / 255.f); return f8 * one_255; } -#endif }; - -struct Codec4bit_avx { - static void encode_component (float x, uint8_t *code, int i) { - code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); - } - - static float decode_component (const uint8_t *code, int i) { - return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; - } - -#ifdef USE_AVX +struct Codec4bit_avx : public Codec4bit { static __m256 decode_8_components (const uint8_t *code, int i) { uint32_t c4 = *(uint32_t*)(code + (i >> 1)); uint32_t mask = 0x0f0f0f0f; @@ -92,54 +64,9 @@ struct Codec4bit_avx { __m256 one_255 = _mm256_set1_ps (1.f / 15.f); return f8 * one_255; } -#endif }; -struct Codec6bit_avx { - static void encode_component (float x, uint8_t *code, int i) { - int bits = (int)(x * 63.0); - code += (i >> 2) * 3; - switch(i & 3) { - case 0: - code[0] |= bits; - break; - case 1: - code[0] |= bits << 6; - code[1] |= bits >> 2; - break; - case 2: - code[1] |= bits << 4; - code[2] |= bits >> 4; - break; - case 3: - code[2] |= bits << 2; - break; - } - } - - static float decode_component (const uint8_t *code, int i) { - uint8_t bits; - code += (i >> 2) * 3; - switch(i & 3) { - case 0: - bits = code[0] & 0x3f; - break; - case 1: - bits = code[0] >> 6; - bits |= (code[1] & 0xf) << 2; - break; - case 2: - bits = code[1] >> 4; - bits |= (code[2] & 3) << 4; - break; - case 3: - bits = code[2] >> 2; - break; - } - return (bits + 0.5f) / 63.0f; - } - -#ifdef USE_AVX +struct Codec6bit_avx : public Codec6bit { static __m256 decode_8_components (const uint8_t *code, int i) { return _mm256_set_ps (decode_component(code, i + 7), @@ -151,127 +78,51 @@ struct Codec6bit_avx { decode_component(code, i + 1), decode_component(code, i + 0)); } -#endif }; - /******************************************************************* * Quantizer: normalizes scalar vector components, then passes them * through a codec *******************************************************************/ - template struct QuantizerTemplate_avx {}; - template -struct QuantizerTemplate_avx: Quantizer { - const size_t d; - const float vmin, vdiff; - - QuantizerTemplate_avx(size_t d, const std::vector &trained): - d(d), vmin(trained[0]), vdiff(trained[1]) - { - } - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - float xi = Codec::decode_component (code, i); - return vmin + xi * vdiff; - } +struct QuantizerTemplate_avx : public QuantizerTemplate { + QuantizerTemplate_avx(size_t d, const std::vector &trained) : + QuantizerTemplate (d, trained) {} }; - - -#ifdef USE_AVX - template -struct QuantizerTemplate_avx: QuantizerTemplate_avx { - QuantizerTemplate_avx (size_t d, const std::vector &trained): - QuantizerTemplate_avx (d, trained) {} +struct QuantizerTemplate_avx : public QuantizerTemplate { + QuantizerTemplate_avx (size_t d, const std::vector &trained) : + QuantizerTemplate (d, trained) {} - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { + __m256 reconstruct_8_components (const uint8_t * code, int i) const { __m256 xi = Codec::decode_8_components (code, i); return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff); } }; -#endif - - - template -struct QuantizerTemplate_avx: Quantizer { - const size_t d; - const float *vmin, *vdiff; - - QuantizerTemplate_avx (size_t d, const std::vector &trained): - d(d), vmin(trained.data()), vdiff(trained.data() + d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = (x[i] - vmin[i]) / vdiff[i]; - if (xi < 0) - xi = 0; - if (xi > 1.0) - xi = 1.0; - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin[i] + xi * vdiff[i]; - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - float xi = Codec::decode_component (code, i); - return vmin[i] + xi * vdiff[i]; - } +struct QuantizerTemplate_avx : public QuantizerTemplate { + QuantizerTemplate_avx (size_t d, const std::vector &trained) : + QuantizerTemplate (d, trained) {} }; - -#ifdef USE_AVX - template -struct QuantizerTemplate_avx: QuantizerTemplate_avx { - QuantizerTemplate_avx (size_t d, const std::vector &trained): - QuantizerTemplate_avx (d, trained) {} +struct QuantizerTemplate_avx: public QuantizerTemplate { + QuantizerTemplate_avx (size_t d, const std::vector &trained) : + QuantizerTemplate (d, trained) {} - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { + __m256 reconstruct_8_components (const uint8_t * code, int i) const { __m256 xi = Codec::decode_8_components (code, i); return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i); } }; -#endif /******************************************************************* * FP16 quantizer @@ -281,45 +132,22 @@ template struct QuantizerFP16_avx {}; template<> -struct QuantizerFP16_avx<1>: Quantizer { - const size_t d; - - QuantizerFP16_avx(size_t d, const std::vector & /* unused */): - d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_fp16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_fp16(((uint16_t*)code)[i]); - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - return decode_fp16(((uint16_t*)code)[i]); - } +struct QuantizerFP16_avx<1> : public QuantizerFP16<1> { + QuantizerFP16_avx (size_t d, const std::vector &unused) : + QuantizerFP16<1> (d, unused) {} }; -#ifdef USE_AVX - template<> -struct QuantizerFP16_avx<8>: QuantizerFP16_avx<1> { +struct QuantizerFP16_avx<8>: public QuantizerFP16<1> { QuantizerFP16_avx (size_t d, const std::vector &trained): - QuantizerFP16_avx<1> (d, trained) {} + QuantizerFP16<1> (d, trained) {} - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { + __m256 reconstruct_8_components (const uint8_t * code, int i) const { __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i)); return _mm256_cvtph_ps (codei); } }; -#endif /******************************************************************* * 8bit_direct quantizer @@ -329,75 +157,46 @@ template struct Quantizer8bitDirect_avx {}; template<> -struct Quantizer8bitDirect_avx<1>: Quantizer { - const size_t d; - - Quantizer8bitDirect_avx(size_t d, const std::vector & /* unused */): - d(d) {} - - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i]; - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - return code[i]; - } +struct Quantizer8bitDirect_avx<1> : public Quantizer8bitDirect<1> { + Quantizer8bitDirect_avx (size_t d, const std::vector &unused) : + Quantizer8bitDirect(d, unused) {} }; -#ifdef USE_AVX - template<> -struct Quantizer8bitDirect_avx<8>: Quantizer8bitDirect_avx<1> { - Quantizer8bitDirect_avx (size_t d, const std::vector &trained): - Quantizer8bitDirect_avx<1> (d, trained) {} +struct Quantizer8bitDirect_avx<8>: public Quantizer8bitDirect<1> { + Quantizer8bitDirect_avx (size_t d, const std::vector &trained) : + Quantizer8bitDirect<1> (d, trained) {} - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { + __m256 reconstruct_8_components (const uint8_t * code, int i) const { __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 __m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32 return _mm256_cvtepi32_ps (y8); // 8 * float32 } }; -#endif - - template -Quantizer *select_quantizer_1_avx ( - QuantizerType qtype, - size_t d, const std::vector & trained) -{ +Quantizer *select_quantizer_1_avx (QuantizerType qtype, size_t d, + const std::vector & trained) { switch(qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx(d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx(d, trained); - case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx(d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx(d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx(d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_avx (d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_avx (d, trained); + case QuantizerType::QT_8bit: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_6bit: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_4bit: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_8bit_uniform: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_4bit_uniform: + return new QuantizerTemplate_avx(d, trained); + case QuantizerType::QT_fp16: + return new QuantizerFP16_avx(d, trained); + case QuantizerType::QT_8bit_direct: + return new Quantizer8bitDirect_avx(d, trained); } FAISS_THROW_MSG ("unknown qtype"); } - /******************************************************************* * Similarity: gets vector components and computes a similarity wrt. a * query vector stored in the object. The data fields just encapsulate @@ -407,42 +206,14 @@ Quantizer *select_quantizer_1_avx ( template struct SimilarityL2_avx {}; - template<> -struct SimilarityL2_avx<1> { +struct SimilarityL2_avx<1> : public SimilarityL2<1> { static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_L2; - const float *y, *yi; - - explicit SimilarityL2_avx (const float * y): y(y) {} - - /******* scalar accumulator *******/ - - float accu; - - void begin () { - accu = 0; - yi = y; - } - - void add_component (float x) { - float tmp = *yi++ - x; - accu += tmp * tmp; - } - - void add_component_2 (float x1, float x2) { - float tmp = x1 - x2; - accu += tmp * tmp; - } - - float result () { - return accu; - } + explicit SimilarityL2_avx (const float * y) : SimilarityL2<1>(y) {} }; - -#ifdef USE_AVX template<> struct SimilarityL2_avx<8> { static constexpr int simdwidth = 8; @@ -480,51 +251,18 @@ struct SimilarityL2_avx<8> { } }; -/* as same as SimilarityL2<8>, let build pass */ -template<> -struct SimilarityL2_avx<16> : SimilarityL2_avx<8>{ - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_L2; - explicit SimilarityL2_avx (const float * y) : SimilarityL2_avx<8>(y) {} -}; -#endif - template struct SimilarityIP_avx {}; - template<> -struct SimilarityIP_avx<1> { +struct SimilarityIP_avx<1> : public SimilarityIP<1> { static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - float accu; - - explicit SimilarityIP_avx (const float * y): - y (y) {} - - void begin () { - accu = 0; - yi = y; - } - - void add_component (float x) { - accu += *yi++ * x; - } - - void add_component_2 (float x1, float x2) { - accu += x1 * x2; - } - - float result () { - return accu; - } + explicit SimilarityIP_avx (const float * y) : SimilarityIP<1>(y) {} }; -#ifdef USE_AVX - template<> struct SimilarityIP_avx<8> { static constexpr int simdwidth = 8; @@ -534,8 +272,7 @@ struct SimilarityIP_avx<8> { float accu; - explicit SimilarityIP_avx (const float * y): - y (y) {} + explicit SimilarityIP_avx (const float * y): y (y) {} __m256 accu8; @@ -564,15 +301,6 @@ struct SimilarityIP_avx<8> { } }; -/* as same as SimilarityIP<8>, let build pass */ -template<> -struct SimilarityIP_avx<16> : SimilarityIP_avx<8> { - static constexpr int simdwidth = 8; - static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - explicit SimilarityIP_avx (const float * y) : SimilarityIP_avx<8>(y) {} -}; -#endif - /******************************************************************* * DistanceComputer: combines a similarity and a quantizer to do @@ -583,69 +311,19 @@ template struct DCTemplate_avx : SQDistanceComputer {}; template -struct DCTemplate_avx : SQDistanceComputer -{ - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx(size_t d, const std::vector &trained): - quant(d, trained) - {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float xi = quant.reconstruct_component(code, i); - sim.add_component(xi); - } - return sim.result(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float x1 = quant.reconstruct_component(code1, i); - float x2 = quant.reconstruct_component(code2, i); - sim.add_component_2(x1, x2); - } - return sim.result(); - } - - void set_query (const float *x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_distance (q, code); - } +struct DCTemplate_avx : public DCTemplate { + DCTemplate_avx(size_t d, const std::vector &trained) : + DCTemplate(d, trained) {} }; -#ifdef USE_AVX - template -struct DCTemplate_avx : SQDistanceComputer -{ +struct DCTemplate_avx : SQDistanceComputer { using Sim = Similarity; Quantizer quant; DCTemplate_avx(size_t d, const std::vector &trained): - quant(d, trained) - {} + quant(d, trained) {} float compute_distance(const float* x, const uint8_t* code) const { Similarity sim(x); @@ -688,9 +366,6 @@ struct DCTemplate_avx : SQDistanceComputer } }; -#endif - - /******************************************************************* * DistanceComputerByte: computes distances in the integer domain @@ -700,58 +375,11 @@ template struct DistanceComputerByte_avx : SQDistanceComputer {}; template -struct DistanceComputerByte_avx : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx(int d, const std::vector &): d(d), tmp(d) { - } - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query (const float *x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_code_distance (tmp.data(), code); - } +struct DistanceComputerByte_avx : public DistanceComputerByte { + DistanceComputerByte_avx(int d, const std::vector &unused) : + DistanceComputerByte(d, unused) {} }; -#ifdef USE_AVX - - template struct DistanceComputerByte_avx : SQDistanceComputer { using Sim = Similarity; @@ -759,11 +387,9 @@ struct DistanceComputerByte_avx : SQDistanceComputer { int d; std::vector tmp; - DistanceComputerByte_avx(int d, const std::vector &): d(d), tmp(d) { - } + DistanceComputerByte_avx(int d, const std::vector &): d(d), tmp(d) {} - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) const { // __m256i accu = _mm256_setzero_ps (); __m256i accu = _mm256_setzero_si256 (); for (int i = 0; i < d; i += 16) { @@ -819,14 +445,12 @@ struct DistanceComputerByte_avx : SQDistanceComputer { } }; -#endif /******************************************************************* * select_distance_computer: runtime selection of template * specialization *******************************************************************/ - template SQDistanceComputer *select_distance_computer_avx ( QuantizerType qtype, @@ -834,41 +458,119 @@ SQDistanceComputer *select_distance_computer_avx ( { constexpr int SIMDWIDTH = Sim::simdwidth; switch(qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_avx, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit_uniform: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit_uniform: - return new DCTemplate_avx, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_4bit_uniform: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_8bit: - return new DCTemplate_avx, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_6bit: - return new DCTemplate_avx, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_6bit: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit: - return new DCTemplate_avx, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_4bit: + return new DCTemplate_avx, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_fp16: - return new DCTemplate_avx - , Sim, SIMDWIDTH>(d, trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte_avx(d, trained); - } else { + case QuantizerType::QT_fp16: return new DCTemplate_avx - , Sim, SIMDWIDTH>(d, trained); - } + , Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_8bit_direct: + if (d % 16 == 0) { + return new DistanceComputerByte_avx(d, trained); + } else { + return new DCTemplate_avx + , Sim, SIMDWIDTH>(d, trained); + } } FAISS_THROW_MSG ("unknown qtype"); return nullptr; } +template +InvertedListScanner* sel2_InvertedListScanner_avx ( + const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + return sel2_InvertedListScanner (sq, quantizer, store_pairs, r); +} + +template +InvertedListScanner* sel12_InvertedListScanner_avx ( + const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + using QuantizerClass = QuantizerTemplate_avx; + using DCClass = DCTemplate_avx; + return sel2_InvertedListScanner_avx (sq, quantizer, store_pairs, r); +} + + +template +InvertedListScanner* sel1_InvertedListScanner_avx ( + const ScalarQuantizer *sq, const Index *quantizer, + bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + switch(sq->qtype) { + case QuantizerType::QT_8bit_uniform: + return sel12_InvertedListScanner_avx + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_4bit_uniform: + return sel12_InvertedListScanner_avx + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_8bit: + return sel12_InvertedListScanner_avx + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_4bit: + return sel12_InvertedListScanner_avx + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_6bit: + return sel12_InvertedListScanner_avx + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_fp16: + return sel2_InvertedListScanner_avx + , Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_8bit_direct: + if (sq->d % 16 == 0) { + return sel2_InvertedListScanner_avx + > + (sq, quantizer, store_pairs, r); + } else { + return sel2_InvertedListScanner_avx + , + Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + } + } + + FAISS_THROW_MSG ("unknown qtype"); + return nullptr; +} + +template +InvertedListScanner* sel0_InvertedListScanner_avx ( + MetricType mt, const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool by_residual) +{ + if (mt == METRIC_L2) { + return sel1_InvertedListScanner_avx > + (sq, quantizer, store_pairs, by_residual); + } else if (mt == METRIC_INNER_PRODUCT) { + return sel1_InvertedListScanner_avx > + (sq, quantizer, store_pairs, by_residual); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx512.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx512.h index cd3ec5c9db..3892f9dfdd 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx512.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerCodec_avx512.h @@ -11,15 +11,13 @@ #include #include - #include - -#ifdef __SSE__ #include -#endif #include #include +#include +#include #include namespace faiss { @@ -39,14 +37,6 @@ namespace faiss { * that hides the template mess. ********************************************************************/ -#ifdef __AVX__ -#define USE_AVX -#endif - -#if (defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__)) -#define USE_AVX_512 -#endif - /******************************************************************* * Codec: converts between values in [0, 1] and an index in a code @@ -54,32 +44,7 @@ namespace faiss { * index). */ -struct Codec8bit_avx512 { - static void encode_component (float x, uint8_t *code, int i) { - code[i] = (int)(255 * x); - } - - static float decode_component (const uint8_t *code, int i) { - return (code[i] + 0.5f) / 255.0f; - } - -#ifdef USE_AVX - static __m256 decode_8_components (const uint8_t *code, int i) { - uint64_t c8 = *(uint64_t*)(code + i); - __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8)); - __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32)); - // __m256i i8 = _mm256_set_m128i(c4lo, c4hi); - __m256i i8 = _mm256_castsi128_si256 (c4lo); - i8 = _mm256_insertf128_si256 (i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps (i8); - __m256 half = _mm256_set1_ps (0.5f); - f8 += half; - __m256 one_255 = _mm256_set1_ps (1.f / 255.f); - return f8 * one_255; - } -#endif - -#ifdef USE_AVX_512 +struct Codec8bit_avx512 : public Codec8bit_avx { static __m512 decode_16_components (const uint8_t *code, int i) { uint64_t c8 = *(uint64_t*)(code + i); __m256i c8lo = _mm256_cvtepu8_epi32 (_mm_set1_epi64x(c8)); @@ -94,42 +59,9 @@ struct Codec8bit_avx512 { __m512 one_255 = _mm512_set1_ps (1.f / 255.f); return f16 * one_255; } -#endif }; - -struct Codec4bit_avx512 { - static void encode_component (float x, uint8_t *code, int i) { - code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2); - } - - static float decode_component (const uint8_t *code, int i) { - return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f; - } - -#ifdef USE_AVX - static __m256 decode_8_components (const uint8_t *code, int i) { - uint32_t c4 = *(uint32_t*)(code + (i >> 1)); - uint32_t mask = 0x0f0f0f0f; - uint32_t c4ev = c4 & mask; - uint32_t c4od = (c4 >> 4) & mask; - - // the 8 lower bytes of c8 contain the values - __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev), - _mm_set1_epi32(c4od)); - __m128i c4lo = _mm_cvtepu8_epi32 (c8); - __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4)); - __m256i i8 = _mm256_castsi128_si256 (c4lo); - i8 = _mm256_insertf128_si256 (i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps (i8); - __m256 half = _mm256_set1_ps (0.5f); - f8 += half; - __m256 one_255 = _mm256_set1_ps (1.f / 15.f); - return f8 * one_255; - } -#endif - -#ifdef USE_AVX_512 +struct Codec4bit_avx512 : public Codec4bit_avx { static __m512 decode_16_components (const uint8_t *code, int i) { uint64_t c8 = *(uint64_t*)(code + (i >> 1)); uint64_t mask = 0x0f0f0f0f0f0f0f0f; @@ -149,68 +81,9 @@ struct Codec4bit_avx512 { __m512 one_255 = _mm512_set1_ps (1.f / 15.f); return f16 * one_255; } -#endif }; -struct Codec6bit_avx512 { - static void encode_component (float x, uint8_t *code, int i) { - int bits = (int)(x * 63.0); - code += (i >> 2) * 3; - switch(i & 3) { - case 0: - code[0] |= bits; - break; - case 1: - code[0] |= bits << 6; - code[1] |= bits >> 2; - break; - case 2: - code[1] |= bits << 4; - code[2] |= bits >> 4; - break; - case 3: - code[2] |= bits << 2; - break; - } - } - - static float decode_component (const uint8_t *code, int i) { - uint8_t bits; - code += (i >> 2) * 3; - switch(i & 3) { - case 0: - bits = code[0] & 0x3f; - break; - case 1: - bits = code[0] >> 6; - bits |= (code[1] & 0xf) << 2; - break; - case 2: - bits = code[1] >> 4; - bits |= (code[2] & 3) << 4; - break; - case 3: - bits = code[2] >> 2; - break; - } - return (bits + 0.5f) / 63.0f; - } - -#ifdef USE_AVX - static __m256 decode_8_components (const uint8_t *code, int i) { - return _mm256_set_ps - (decode_component(code, i + 7), - decode_component(code, i + 6), - decode_component(code, i + 5), - decode_component(code, i + 4), - decode_component(code, i + 3), - decode_component(code, i + 2), - decode_component(code, i + 1), - decode_component(code, i + 0)); - } -#endif - -#ifdef USE_AVX_512 +struct Codec6bit_avx512 : public Codec6bit_avx { static __m512 decode_16_components (const uint8_t *code, int i) { return _mm512_set_ps (decode_component(code, i + 15), @@ -230,150 +103,63 @@ struct Codec6bit_avx512 { decode_component(code, i + 1), decode_component(code, i + 0)); } -#endif }; - /******************************************************************* * Quantizer: normalizes scalar vector components, then passes them * through a codec *******************************************************************/ - template struct QuantizerTemplate_avx512 {}; - template -struct QuantizerTemplate_avx512: Quantizer { - const size_t d; - const float vmin, vdiff; - - QuantizerTemplate_avx512(size_t d, const std::vector &trained): - d(d), vmin(trained[0]), vdiff(trained[1]) - { - } - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = (x[i] - vmin) / vdiff; - if (xi < 0) { - xi = 0; - } - if (xi > 1.0) { - xi = 1.0; - } - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin + xi * vdiff; - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - float xi = Codec::decode_component (code, i); - return vmin + xi * vdiff; - } +struct QuantizerTemplate_avx512 : public QuantizerTemplate_avx { + QuantizerTemplate_avx512(size_t d, const std::vector &trained) : + QuantizerTemplate_avx (d, trained) {} }; - -#ifdef USE_AVX template -struct QuantizerTemplate_avx512: QuantizerTemplate_avx512 { - QuantizerTemplate_avx512 (size_t d, const std::vector &trained): - QuantizerTemplate_avx512 (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m256 xi = Codec::decode_8_components (code, i); - return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff); - } +struct QuantizerTemplate_avx512 : public QuantizerTemplate_avx { + QuantizerTemplate_avx512 (size_t d, const std::vector &trained) : + QuantizerTemplate_avx (d, trained) {} }; -#endif -#ifdef USE_AVX_512 template -struct QuantizerTemplate_avx512: QuantizerTemplate_avx512 { - QuantizerTemplate_avx512 (size_t d, const std::vector &trained): - QuantizerTemplate_avx512 (d, trained) {} +struct QuantizerTemplate_avx512 : public QuantizerTemplate_avx { + QuantizerTemplate_avx512 (size_t d, const std::vector &trained) : + QuantizerTemplate_avx (d, trained) {} - __m512 reconstruct_16_components (const uint8_t * code, int i) const - { + __m512 reconstruct_16_components (const uint8_t * code, int i) const { __m512 xi = Codec::decode_16_components (code, i); return _mm512_set1_ps(this->vmin) + xi * _mm512_set1_ps (this->vdiff); } }; -#endif template -struct QuantizerTemplate_avx512: Quantizer { - const size_t d; - const float *vmin, *vdiff; - - QuantizerTemplate_avx512 (size_t d, const std::vector &trained): - d(d), vmin(trained.data()), vdiff(trained.data() + d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - float xi = (x[i] - vmin[i]) / vdiff[i]; - if (xi < 0) - xi = 0; - if (xi > 1.0) - xi = 1.0; - Codec::encode_component(xi, code, i); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - float xi = Codec::decode_component(code, i); - x[i] = vmin[i] + xi * vdiff[i]; - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - float xi = Codec::decode_component (code, i); - return vmin[i] + xi * vdiff[i]; - } +struct QuantizerTemplate_avx512 : public QuantizerTemplate_avx { + QuantizerTemplate_avx512 (size_t d, const std::vector &trained) : + QuantizerTemplate_avx (d, trained) {} }; - -#ifdef USE_AVX template -struct QuantizerTemplate_avx512: QuantizerTemplate_avx512 { +struct QuantizerTemplate_avx512 : public QuantizerTemplate_avx { QuantizerTemplate_avx512 (size_t d, const std::vector &trained): - QuantizerTemplate_avx512 (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m256 xi = Codec::decode_8_components (code, i); - return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i); - } + QuantizerTemplate_avx (d, trained) {} }; -#endif -#ifdef USE_AVX_512 template -struct QuantizerTemplate_avx512: QuantizerTemplate_avx512 { - +struct QuantizerTemplate_avx512: public QuantizerTemplate_avx { QuantizerTemplate_avx512 (size_t d, const std::vector &trained): - QuantizerTemplate_avx512 (d, trained) {} + QuantizerTemplate_avx (d, trained) {} - __m512 reconstruct_16_components (const uint8_t * code, int i) const - { + __m512 reconstruct_16_components (const uint8_t * code, int i) const { __m512 xi = Codec::decode_16_components (code, i); return _mm512_loadu_ps (this->vmin + i) + xi * _mm512_loadu_ps (this->vdiff + i); } }; -#endif /******************************************************************* * FP16 quantizer @@ -383,57 +169,27 @@ template struct QuantizerFP16_avx512 {}; template<> -struct QuantizerFP16_avx512<1>: Quantizer { - const size_t d; - - QuantizerFP16_avx512(size_t d, const std::vector & /* unused */): - d(d) {} - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - ((uint16_t*)code)[i] = encode_fp16(x[i]); - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = decode_fp16(((uint16_t*)code)[i]); - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - return decode_fp16(((uint16_t*)code)[i]); - } +struct QuantizerFP16_avx512<1> : public QuantizerFP16_avx<1> { + QuantizerFP16_avx512(size_t d, const std::vector &unused) : + QuantizerFP16_avx<1> (d, unused) {} }; -#ifdef USE_AVX template<> -struct QuantizerFP16_avx512<8>: QuantizerFP16_avx512<1> { - QuantizerFP16_avx512 (size_t d, const std::vector &trained): - QuantizerFP16_avx512<1> (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i)); - return _mm256_cvtph_ps (codei); - } +struct QuantizerFP16_avx512<8> : public QuantizerFP16_avx<8> { + QuantizerFP16_avx512 (size_t d, const std::vector &trained) : + QuantizerFP16_avx<8> (d, trained) {} }; -#endif -#ifdef USE_AVX_512 template<> -struct QuantizerFP16_avx512<16>: QuantizerFP16_avx512<1> { +struct QuantizerFP16_avx512<16>: public QuantizerFP16_avx<8> { QuantizerFP16_avx512 (size_t d, const std::vector &trained): - QuantizerFP16_avx512<1> (d, trained) {} + QuantizerFP16_avx<8> (d, trained) {} - __m512 reconstruct_16_components (const uint8_t * code, int i) const - { + __m512 reconstruct_16_components (const uint8_t * code, int i) const { __m256i codei = _mm256_loadu_si256 ((const __m256i*)(code + 2 * i)); return _mm512_cvtph_ps (codei); } }; -#endif /******************************************************************* * 8bit_direct quantizer @@ -443,89 +199,54 @@ template struct Quantizer8bitDirect_avx512 {}; template<> -struct Quantizer8bitDirect_avx512<1>: Quantizer { - const size_t d; - - Quantizer8bitDirect_avx512(size_t d, const std::vector & /* unused */): - d(d) {} - - - void encode_vector(const float* x, uint8_t* code) const final { - for (size_t i = 0; i < d; i++) { - code[i] = (uint8_t)x[i]; - } - } - - void decode_vector(const uint8_t* code, float* x) const final { - for (size_t i = 0; i < d; i++) { - x[i] = code[i]; - } - } - - float reconstruct_component (const uint8_t * code, int i) const - { - return code[i]; - } +struct Quantizer8bitDirect_avx512<1> : public Quantizer8bitDirect_avx<1> { + Quantizer8bitDirect_avx512(size_t d, const std::vector &unused) : + Quantizer8bitDirect_avx<1> (d, unused) {} }; -#ifdef USE_AVX template<> -struct Quantizer8bitDirect_avx512<8>: Quantizer8bitDirect_avx512<1> { +struct Quantizer8bitDirect_avx512<8> : public Quantizer8bitDirect_avx<8> { Quantizer8bitDirect_avx512 (size_t d, const std::vector &trained): - Quantizer8bitDirect_avx512<1> (d, trained) {} - - __m256 reconstruct_8_components (const uint8_t * code, int i) const - { - __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8 - __m256i y8 = _mm256_cvtepu8_epi32 (x8); // 8 * int32 - return _mm256_cvtepi32_ps (y8); // 8 * float32 - } + Quantizer8bitDirect_avx<8> (d, trained) {} }; -#endif -#ifdef USE_AVX_512 template<> -struct Quantizer8bitDirect_avx512<16>: Quantizer8bitDirect_avx512<1> { - +struct Quantizer8bitDirect_avx512<16> : public Quantizer8bitDirect_avx<8> { Quantizer8bitDirect_avx512 (size_t d, const std::vector &trained): - Quantizer8bitDirect_avx512<1> (d, trained) {} + Quantizer8bitDirect_avx<8> (d, trained) {} - __m512 reconstruct_16_components (const uint8_t * code, int i) const - { + __m512 reconstruct_16_components (const uint8_t * code, int i) const { __m128i x8 = _mm_load_si128((__m128i*)(code + i)); // 16 * int8 __m512i y8 = _mm512_cvtepu8_epi32 (x8); // 16 * int32 return _mm512_cvtepi32_ps (y8); // 16 * float32 } }; -#endif template -Quantizer *select_quantizer_1_avx512 ( - QuantizerType qtype, - size_t d, const std::vector & trained) +Quantizer *select_quantizer_1_avx512 (QuantizerType qtype, size_t d, + const std::vector & trained) { switch(qtype) { - case QuantizerType::QT_8bit: - return new QuantizerTemplate_avx512(d, trained); - case QuantizerType::QT_6bit: - return new QuantizerTemplate_avx512(d, trained); - case QuantizerType::QT_4bit: - return new QuantizerTemplate_avx512(d, trained); - case QuantizerType::QT_8bit_uniform: - return new QuantizerTemplate_avx512(d, trained); - case QuantizerType::QT_4bit_uniform: - return new QuantizerTemplate_avx512(d, trained); - case QuantizerType::QT_fp16: - return new QuantizerFP16_avx512 (d, trained); - case QuantizerType::QT_8bit_direct: - return new Quantizer8bitDirect_avx512 (d, trained); + case QuantizerType::QT_8bit: + return new QuantizerTemplate_avx512(d, trained); + case QuantizerType::QT_6bit: + return new QuantizerTemplate_avx512(d, trained); + case QuantizerType::QT_4bit: + return new QuantizerTemplate_avx512(d, trained); + case QuantizerType::QT_8bit_uniform: + return new QuantizerTemplate_avx512(d, trained); + case QuantizerType::QT_4bit_uniform: + return new QuantizerTemplate_avx512(d, trained); + case QuantizerType::QT_fp16: + return new QuantizerFP16_avx512(d, trained); + case QuantizerType::QT_8bit_direct: + return new Quantizer8bitDirect_avx512(d, trained); } FAISS_THROW_MSG ("unknown qtype"); } - /******************************************************************* * Similarity: gets vector components and computes a similarity wrt. a * query vector stored in the object. The data fields just encapsulate @@ -537,79 +258,21 @@ struct SimilarityL2_avx512 {}; template<> -struct SimilarityL2_avx512<1> { +struct SimilarityL2_avx512<1> : public SimilarityL2_avx<1> { static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_L2; - const float *y, *yi; - - explicit SimilarityL2_avx512 (const float * y): y(y) {} - - /******* scalar accumulator *******/ - - float accu; - - void begin () { - accu = 0; - yi = y; - } - - void add_component (float x) { - float tmp = *yi++ - x; - accu += tmp * tmp; - } - - void add_component_2 (float x1, float x2) { - float tmp = x1 - x2; - accu += tmp * tmp; - } - - float result () { - return accu; - } + explicit SimilarityL2_avx512 (const float * y) : SimilarityL2_avx<1> (y) {} }; - -#ifdef USE_AVX template<> -struct SimilarityL2_avx512<8> { +struct SimilarityL2_avx512<8> : public SimilarityL2_avx<8> { static constexpr int simdwidth = 8; static constexpr MetricType metric_type = METRIC_L2; - const float *y, *yi; - - explicit SimilarityL2_avx512 (const float * y): y(y) {} - __m256 accu8; - - void begin_8 () { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - void add_8_components (__m256 x) { - __m256 yiv = _mm256_loadu_ps (yi); - yi += 8; - __m256 tmp = yiv - x; - accu8 += tmp * tmp; - } - - void add_8_components_2 (__m256 x, __m256 y) { - __m256 tmp = y - x; - accu8 += tmp * tmp; - } - - float result_8 () { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return - _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); - } + explicit SimilarityL2_avx512 (const float * y) : SimilarityL2_avx<8> (y) {} }; -#endif -#ifdef USE_AVX_512 template<> struct SimilarityL2_avx512<16> { static constexpr int simdwidth = 16; @@ -647,7 +310,6 @@ struct SimilarityL2_avx512<16> { _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); } }; -#endif template @@ -655,76 +317,21 @@ struct SimilarityIP_avx512 {}; template<> -struct SimilarityIP_avx512<1> { +struct SimilarityIP_avx512<1> : public SimilarityIP_avx<1> { static constexpr int simdwidth = 1; static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - float accu; - - explicit SimilarityIP_avx512 (const float * y): - y (y) {} - - void begin () { - accu = 0; - yi = y; - } - - void add_component (float x) { - accu += *yi++ * x; - } - - void add_component_2 (float x1, float x2) { - accu += x1 * x2; - } - - float result () { - return accu; - } + explicit SimilarityIP_avx512 (const float * y) : SimilarityIP_avx<1> (y) {} }; -#ifdef USE_AVX template<> -struct SimilarityIP_avx512<8> { +struct SimilarityIP_avx512<8> : public SimilarityIP_avx<8> { static constexpr int simdwidth = 8; static constexpr MetricType metric_type = METRIC_INNER_PRODUCT; - const float *y, *yi; - - float accu; - - explicit SimilarityIP_avx512 (const float * y): - y (y) {} - - __m256 accu8; - - void begin_8 () { - accu8 = _mm256_setzero_ps(); - yi = y; - } - - void add_8_components (__m256 x) { - __m256 yiv = _mm256_loadu_ps (yi); - yi += 8; - accu8 += yiv * x; - } - - void add_8_components_2 (__m256 x1, __m256 x2) { - accu8 += x1 * x2; - } - - float result_8 () { - __m256 sum = _mm256_hadd_ps(accu8, accu8); - __m256 sum2 = _mm256_hadd_ps(sum, sum); - // now add the 0th and 4th component - return - _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) + - _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); - } + explicit SimilarityIP_avx512 (const float * y) : SimilarityIP_avx<8> (y) {} }; -#endif -#ifdef USE_AVX_512 template<> struct SimilarityIP_avx512<16> { static constexpr int simdwidth = 16; @@ -734,8 +341,7 @@ struct SimilarityIP_avx512<16> { float accu; - explicit SimilarityIP_avx512 (const float * y): - y (y) {} + explicit SimilarityIP_avx512 (const float * y) : y (y) {} __m512 accu16; @@ -764,7 +370,6 @@ struct SimilarityIP_avx512<16> { _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1)); } }; -#endif /******************************************************************* @@ -776,116 +381,19 @@ template struct DCTemplate_avx512 : SQDistanceComputer {}; template -struct DCTemplate_avx512 : SQDistanceComputer -{ - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx512(size_t d, const std::vector &trained): - quant(d, trained) - {} - - float compute_distance(const float* x, const uint8_t* code) const { - Similarity sim(x); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float xi = quant.reconstruct_component(code, i); - sim.add_component(xi); - } - return sim.result(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin(); - for (size_t i = 0; i < quant.d; i++) { - float x1 = quant.reconstruct_component(code1, i); - float x2 = quant.reconstruct_component(code2, i); - sim.add_component_2(x1, x2); - } - return sim.result(); - } - - void set_query (const float *x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_distance (q, code); - } +struct DCTemplate_avx512 : public DCTemplate_avx { + DCTemplate_avx512(size_t d, const std::vector &trained) : + DCTemplate_avx (d, trained) {} }; -#ifdef USE_AVX template -struct DCTemplate_avx512 : SQDistanceComputer -{ - using Sim = Similarity; - - Quantizer quant; - - DCTemplate_avx512(size_t d, const std::vector &trained): - quant(d, trained) - {} - - float compute_distance(const float* x, const uint8_t* code) const { - - Similarity sim(x); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 xi = quant.reconstruct_8_components(code, i); - sim.add_8_components(xi); - } - return sim.result_8(); - } - - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - Similarity sim(nullptr); - sim.begin_8(); - for (size_t i = 0; i < quant.d; i += 8) { - __m256 x1 = quant.reconstruct_8_components(code1, i); - __m256 x2 = quant.reconstruct_8_components(code2, i); - sim.add_8_components_2(x1, x2); - } - return sim.result_8(); - } - - void set_query (const float *x) final { - q = x; - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_distance (q, code); - } +struct DCTemplate_avx512 : public DCTemplate_avx { + DCTemplate_avx512(size_t d, const std::vector &trained) : + DCTemplate_avx (d, trained) {} }; -#endif -#ifdef USE_AVX_512 template -struct DCTemplate_avx512 : SQDistanceComputer -{ +struct DCTemplate_avx512 : SQDistanceComputer { using Sim = Similarity; Quantizer quant; @@ -904,8 +412,7 @@ struct DCTemplate_avx512 : SQDistanceComputer return sim.result_16(); } - float compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { + float compute_code_distance(const uint8_t* code1, const uint8_t* code2) const { Similarity sim(nullptr); sim.begin_16(); for (size_t i = 0; i < quant.d; i += 16) { @@ -934,7 +441,6 @@ struct DCTemplate_avx512 : SQDistanceComputer return compute_distance (q, code); } }; -#endif /******************************************************************* @@ -945,125 +451,17 @@ template struct DistanceComputerByte_avx512 : SQDistanceComputer {}; template -struct DistanceComputerByte_avx512 : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx512(int d, const std::vector &): d(d), tmp(d) { - } - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - int accu = 0; - for (int i = 0; i < d; i++) { - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - accu += int(code1[i]) * code2[i]; - } else { - int diff = int(code1[i]) - code2[i]; - accu += diff * diff; - } - } - return accu; - } - - void set_query (const float *x) final { - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_code_distance (tmp.data(), code); - } +struct DistanceComputerByte_avx512 : public DistanceComputerByte_avx { + DistanceComputerByte_avx512(int d, const std::vector &unused) : + DistanceComputerByte_avx (d, unused) {} }; -#ifdef USE_AVX template -struct DistanceComputerByte_avx512 : SQDistanceComputer { - using Sim = Similarity; - - int d; - std::vector tmp; - - DistanceComputerByte_avx512(int d, const std::vector &): d(d), tmp(d) { - } - - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { - // __m256i accu = _mm256_setzero_ps (); - __m256i accu = _mm256_setzero_si256 (); - for (int i = 0; i < d; i += 16) { - // load 16 bytes, convert to 16 uint16_t - __m256i c1 = _mm256_cvtepu8_epi16 - (_mm_loadu_si128((__m128i*)(code1 + i))); - __m256i c2 = _mm256_cvtepu8_epi16 - (_mm_loadu_si128((__m128i*)(code2 + i))); - __m256i prod32; - if (Sim::metric_type == METRIC_INNER_PRODUCT) { - prod32 = _mm256_madd_epi16(c1, c2); - } else { - __m256i diff = _mm256_sub_epi16(c1, c2); - prod32 = _mm256_madd_epi16(diff, diff); - } - accu = _mm256_add_epi32 (accu, prod32); - } - __m128i sum = _mm256_extractf128_si256(accu, 0); - sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1)); - sum = _mm_hadd_epi32 (sum, sum); - sum = _mm_hadd_epi32 (sum, sum); - return _mm_cvtsi128_si32 (sum); - } - - void set_query (const float *x) final { - /* - for (int i = 0; i < d; i += 8) { - __m256 xi = _mm256_loadu_ps (x + i); - __m256i ci = _mm256_cvtps_epi32(xi); - */ - for (int i = 0; i < d; i++) { - tmp[i] = int(x[i]); - } - } - - int compute_distance(const float* x, const uint8_t* code) { - set_query(x); - return compute_code_distance(tmp.data(), code); - } - - /// compute distance of vector i to current query - float operator () (idx_t i) final { - return compute_distance (q, codes + i * code_size); - } - - float symmetric_dis (idx_t i, idx_t j) override { - return compute_code_distance (codes + i * code_size, - codes + j * code_size); - } - - float query_to_code (const uint8_t * code) const { - return compute_code_distance (tmp.data(), code); - } +struct DistanceComputerByte_avx512 : public DistanceComputerByte_avx { + DistanceComputerByte_avx512(int d, const std::vector &unused) : + DistanceComputerByte_avx (d, unused) {} }; -#endif -#ifdef USE_AVX_512 template struct DistanceComputerByte_avx512 : SQDistanceComputer { using Sim = Similarity; @@ -1071,11 +469,9 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { int d; std::vector tmp; - DistanceComputerByte_avx512(int d, const std::vector &): d(d), tmp(d) { - } + DistanceComputerByte_avx512(int d, const std::vector &): d(d), tmp(d) {} - int compute_code_distance(const uint8_t* code1, const uint8_t* code2) - const { + int compute_code_distance(const uint8_t* code1, const uint8_t* code2) const { // __m256i accu = _mm256_setzero_ps (); __m512i accu = _mm512_setzero_si512 (); for (int i = 0; i < d; i += 32) { @@ -1132,7 +528,6 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { return compute_code_distance (tmp.data(), code); } }; -#endif /******************************************************************* @@ -1140,7 +535,6 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { * specialization *******************************************************************/ - template SQDistanceComputer *select_distance_computer_avx512 ( QuantizerType qtype, @@ -1148,41 +542,120 @@ SQDistanceComputer *select_distance_computer_avx512 ( { constexpr int SIMDWIDTH = Sim::simdwidth; switch(qtype) { - case QuantizerType::QT_8bit_uniform: - return new DCTemplate_avx512, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit_uniform: + return new DCTemplate_avx512, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit_uniform: - return new DCTemplate_avx512, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_4bit_uniform: + return new DCTemplate_avx512, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_8bit: - return new DCTemplate_avx512, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_8bit: + return new DCTemplate_avx512, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_6bit: - return new DCTemplate_avx512, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_6bit: + return new DCTemplate_avx512, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit: - return new DCTemplate_avx512, - Sim, SIMDWIDTH>(d, trained); + case QuantizerType::QT_4bit: + return new DCTemplate_avx512, + Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_fp16: - return new DCTemplate_avx512 - , Sim, SIMDWIDTH>(d, trained); - - case QuantizerType::QT_8bit_direct: - if (d % 16 == 0) { - return new DistanceComputerByte_avx512(d, trained); - } else { + case QuantizerType::QT_fp16: return new DCTemplate_avx512 - , Sim, SIMDWIDTH>(d, trained); - } + , Sim, SIMDWIDTH>(d, trained); + + case QuantizerType::QT_8bit_direct: + if (d % 16 == 0) { + return new DistanceComputerByte_avx512(d, trained); + } else { + return new DCTemplate_avx512 + , Sim, SIMDWIDTH>(d, trained); + } } FAISS_THROW_MSG ("unknown qtype"); return nullptr; } +template +InvertedListScanner* sel2_InvertedListScanner_avx512 ( + const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + return sel2_InvertedListScanner (sq, quantizer, store_pairs, r); +} + +template +InvertedListScanner* sel12_InvertedListScanner_avx512 ( + const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + using QuantizerClass = QuantizerTemplate_avx512; + using DCClass = DCTemplate_avx512; + return sel2_InvertedListScanner_avx512 (sq, quantizer, store_pairs, r); +} + + +template +InvertedListScanner* sel1_InvertedListScanner_avx512 ( + const ScalarQuantizer *sq, const Index *quantizer, + bool store_pairs, bool r) +{ + constexpr int SIMDWIDTH = Similarity::simdwidth; + switch(sq->qtype) { + case QuantizerType::QT_8bit_uniform: + return sel12_InvertedListScanner_avx512 + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_4bit_uniform: + return sel12_InvertedListScanner_avx512 + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_8bit: + return sel12_InvertedListScanner_avx512 + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_4bit: + return sel12_InvertedListScanner_avx512 + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_6bit: + return sel12_InvertedListScanner_avx512 + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_fp16: + return sel2_InvertedListScanner_avx512 + , Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + case QuantizerType::QT_8bit_direct: + if (sq->d % 16 == 0) { + return sel2_InvertedListScanner_avx512 + > + (sq, quantizer, store_pairs, r); + } else { + return sel2_InvertedListScanner_avx512 + , + Similarity, SIMDWIDTH> > + (sq, quantizer, store_pairs, r); + } + } + + FAISS_THROW_MSG ("unknown qtype"); + return nullptr; +} + +template +InvertedListScanner* sel0_InvertedListScanner_avx512 ( + MetricType mt, const ScalarQuantizer *sq, + const Index *quantizer, bool store_pairs, bool by_residual) +{ + if (mt == METRIC_L2) { + return sel1_InvertedListScanner_avx512 > + (sq, quantizer, store_pairs, by_residual); + } else if (mt == METRIC_INNER_PRODUCT) { + return sel1_InvertedListScanner_avx512 > + (sq, quantizer, store_pairs, by_residual); + } else { + FAISS_THROW_MSG("unsupported metric type"); + } +} + } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp index 428024d70a..71fc4807b9 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.cpp @@ -18,18 +18,22 @@ namespace faiss { /* SSE */ SQDistanceComputer * -sq_get_distance_computer_L2_sse (QuantizerType qtype, size_t dim, const std::vector& trained) { - return select_distance_computer> (qtype, dim, trained); -} - -SQDistanceComputer * -sq_get_distance_computer_IP_sse (QuantizerType qtype, size_t dim, const std::vector& trained) { - return select_distance_computer> (qtype, dim, trained); +sq_get_distance_computer_ref (MetricType metric, QuantizerType qtype, size_t dim, const std::vector& trained) { + if (metric == METRIC_L2) { + return select_distance_computer>(qtype, dim, trained); + } else { + return select_distance_computer>(qtype, dim, trained); + } } Quantizer * -sq_select_quantizer_sse (QuantizerType qtype, size_t dim, const std::vector& trained) { +sq_select_quantizer_ref (QuantizerType qtype, size_t dim, const std::vector& trained) { return select_quantizer_1<1> (qtype, dim, trained); } +InvertedListScanner* +sq_select_inverted_list_scanner_ref (MetricType mt, const ScalarQuantizer *sq, const Index *quantizer, size_t dim, bool store_pairs, bool by_residual) { + return sel0_InvertedListScanner<1> (mt, sq, quantizer, store_pairs, by_residual); +} + } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h index c4ce86b011..d088d54bc9 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC.h @@ -9,17 +9,33 @@ #pragma once +#include +#include #include +#include namespace faiss { SQDistanceComputer * -sq_get_distance_computer_L2_sse(QuantizerType qtype, size_t dim, const std::vector& trained); - -SQDistanceComputer * -sq_get_distance_computer_IP_sse(QuantizerType qtype, size_t dim, const std::vector& trained); +sq_get_distance_computer_ref( + MetricType metric, + QuantizerType qtype, + size_t dim, + const std::vector& trained); Quantizer * -sq_select_quantizer_sse(QuantizerType qtype, size_t dim, const std::vector& trained); +sq_select_quantizer_ref( + QuantizerType qtype, + size_t dim, + const std::vector& trained); + +InvertedListScanner* +sq_select_inverted_list_scanner_ref( + MetricType mt, + const ScalarQuantizer *sq, + const Index *quantizer, + size_t dim, + bool store_pairs, + bool by_residual); } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp index 74bfc0878a..2da2af6f60 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.cpp @@ -17,25 +17,24 @@ namespace faiss { ********************************************************************/ SQDistanceComputer * -sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector& trained) { - if (dim % 8 == 0) { - return select_distance_computer_avx>(qtype, dim, trained); +sq_get_distance_computer_avx (MetricType metric, QuantizerType qtype, size_t dim, const std::vector& trained) { + if (metric == METRIC_L2) { + if (dim % 8 == 0) { + return select_distance_computer_avx>(qtype, dim, trained); + } else { + return select_distance_computer_avx>(qtype, dim, trained); + } } else { - return select_distance_computer_avx>(qtype, dim, trained); - } -} - -SQDistanceComputer * -sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector& trained) { - if (dim % 8 == 0) { - return select_distance_computer_avx>(qtype, dim, trained); - } else { - return select_distance_computer_avx>(qtype, dim, trained); + if (dim % 8 == 0) { + return select_distance_computer_avx>(qtype, dim, trained); + } else { + return select_distance_computer_avx>(qtype, dim, trained); + } } } Quantizer * -sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector& trained) { +sq_select_quantizer_avx (QuantizerType qtype, size_t dim, const std::vector& trained) { if (dim % 8 == 0) { return select_quantizer_1_avx<8>(qtype, dim, trained); } else { @@ -43,4 +42,13 @@ sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector (mt, sq, quantizer, store_pairs, by_residual); + } else { + return sel0_InvertedListScanner_avx<1> (mt, sq, quantizer, store_pairs, by_residual); + } +} + } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h index 86b9ab7db3..3b04aa4d2e 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx.h @@ -10,18 +10,32 @@ #pragma once #include +#include #include +#include namespace faiss { - SQDistanceComputer * -sq_get_distance_computer_L2_avx(QuantizerType qtype, size_t dim, const std::vector& trained); - -SQDistanceComputer * -sq_get_distance_computer_IP_avx(QuantizerType qtype, size_t dim, const std::vector& trained); +sq_get_distance_computer_avx( + MetricType metric, + QuantizerType qtype, + size_t dim, + const std::vector& trained); Quantizer * -sq_select_quantizer_avx(QuantizerType qtype, size_t dim, const std::vector& trained); +sq_select_quantizer_avx( + QuantizerType qtype, + size_t dim, + const std::vector& trained); + +InvertedListScanner* +sq_select_inverted_list_scanner_avx( + MetricType mt, + const ScalarQuantizer *sq, + const Index *quantizer, + size_t dim, + bool store_pairs, + bool by_residual); } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp index 6b2912c74c..6a62847c1d 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.cpp @@ -17,24 +17,23 @@ namespace faiss { ********************************************************************/ SQDistanceComputer * -sq_get_distance_computer_L2_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained) { - if (dim % 16 == 0) { - return select_distance_computer_avx512> (qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_distance_computer_avx512> (qtype, dim, trained); +sq_get_distance_computer_avx512 (MetricType metric, QuantizerType qtype, size_t dim, const std::vector& trained) { + if (metric == METRIC_L2) { + if (dim % 16 == 0) { + return select_distance_computer_avx512>(qtype, dim, trained); + } else if (dim % 8 == 0) { + return select_distance_computer_avx512>(qtype, dim, trained); + } else { + return select_distance_computer_avx512>(qtype, dim, trained); + } } else { - return select_distance_computer_avx512> (qtype, dim, trained); - } -} - -SQDistanceComputer * -sq_get_distance_computer_IP_avx512 (QuantizerType qtype, size_t dim, const std::vector& trained) { - if (dim % 16 == 0) { - return select_distance_computer_avx512> (qtype, dim, trained); - } else if (dim % 8 == 0) { - return select_distance_computer_avx512> (qtype, dim, trained); - } else { - return select_distance_computer_avx512> (qtype, dim, trained); + if (dim % 16 == 0) { + return select_distance_computer_avx512>(qtype, dim, trained); + } else if (dim % 8 == 0) { + return select_distance_computer_avx512>(qtype, dim, trained); + } else { + return select_distance_computer_avx512>(qtype, dim, trained); + } } } @@ -49,5 +48,15 @@ sq_select_quantizer_avx512 (QuantizerType qtype, size_t dim, const std::vector (mt, sq, quantizer, store_pairs, by_residual); + } else if (dim % 8 == 0) { + return sel0_InvertedListScanner_avx512<8> (mt, sq, quantizer, store_pairs, by_residual); + } else { + return sel0_InvertedListScanner_avx512<1> (mt, sq, quantizer, store_pairs, by_residual); + } +} } // namespace faiss diff --git a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h index 308b81ecd0..f1b03027a9 100644 --- a/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h +++ b/core/src/index/thirdparty/faiss/impl/ScalarQuantizerDC_avx512.h @@ -10,17 +10,32 @@ #pragma once #include +#include #include +#include namespace faiss { SQDistanceComputer * -sq_get_distance_computer_L2_avx512(QuantizerType qtype, size_t dim, const std::vector& trained); - -SQDistanceComputer * -sq_get_distance_computer_IP_avx512(QuantizerType qtype, size_t dim, const std::vector& trained); +sq_get_distance_computer_avx512( + MetricType metric, + QuantizerType qtype, + size_t dim, + const std::vector& trained); Quantizer * -sq_select_quantizer_avx512(QuantizerType qtype, size_t dim, const std::vector& trained); +sq_select_quantizer_avx512( + QuantizerType qtype, + size_t dim, + const std::vector& trained); + +InvertedListScanner* +sq_select_inverted_list_scanner_avx512( + MetricType mt, + const ScalarQuantizer *sq, + const Index *quantizer, + size_t dim, + bool store_pairs, + bool by_residual); } // namespace faiss