From cae5722229af34358786fb89f3c40e4defb6908b Mon Sep 17 00:00:00 2001 From: Alexander Guzhva Date: Fri, 5 Apr 2024 17:19:22 -0400 Subject: [PATCH] enhance: performance improvements for the bitset (#31753) Issue: #31752 This PR improves the performance for bitset utilities (introduced in PR #30454), including varchar filtering Signed-off-by: Alexandr Guzhva --- .../src/bitset/detail/element_vectorized.h | 2 +- .../core/src/bitset/detail/element_wise.h | 127 +++++-- internal/core/unittest/test_bitset.cpp | 310 +++++++++++------- 3 files changed, 291 insertions(+), 148 deletions(-) diff --git a/internal/core/src/bitset/detail/element_vectorized.h b/internal/core/src/bitset/detail/element_vectorized.h index 393f9d01ae..e21aca883b 100644 --- a/internal/core/src/bitset/detail/element_vectorized.h +++ b/internal/core/src/bitset/detail/element_vectorized.h @@ -410,7 +410,7 @@ struct VectorizedElementWiseBitsetPolicy { // process the first element if (start_shift != 0) { // it is possible to do vectorized masking here, but it is not worth it - func_baseline(start, 0, size); + func_baseline(start, 0, data_bits - start_shift); // start from the next element start_element += 1; diff --git a/internal/core/src/bitset/detail/element_wise.h b/internal/core/src/bitset/detail/element_wise.h index 062b144290..62e49b5a93 100644 --- a/internal/core/src/bitset/detail/element_wise.h +++ b/internal/core/src/bitset/detail/element_wise.h @@ -678,10 +678,6 @@ struct ElementWiseBitsetPolicy { const size_t start_left, const size_t start_right, const size_t size) { - if (size == 0) { - return; - } - op_func(left, right, start_left, @@ -775,10 +771,9 @@ struct ElementWiseBitsetPolicy { const T* const __restrict t, const U* const __restrict u, const size_type size) { - for (size_type i = 0; i < size; i++) { - get_proxy(data, start + i) = - CompareOperator::compare(t[i], u[i]); - } + op_func(data, start, size, [t, u](const size_type bit_idx) { + return CompareOperator::compare(t[bit_idx], u[bit_idx]); + }); } // @@ -789,10 +784,9 @@ struct ElementWiseBitsetPolicy { const T* const __restrict t, const size_type size, const T& value) { - for (size_type i = 0; i < size; i++) { - get_proxy(data, start + i) = - CompareOperator::compare(t[i], value); - } + op_func(data, start, size, [t, value](const size_type bit_idx) { + return CompareOperator::compare(t[bit_idx], value); + }); } // @@ -804,10 +798,11 @@ struct ElementWiseBitsetPolicy { const T* const __restrict upper, const T* const __restrict values, const size_type size) { - for (size_type i = 0; i < size; i++) { - get_proxy(data, start + i) = - RangeOperator::within_range(lower[i], upper[i], values[i]); - } + op_func( + data, start, size, [lower, upper, values](const size_type bit_idx) { + return RangeOperator::within_range( + lower[bit_idx], upper[bit_idx], values[bit_idx]); + }); } // @@ -819,10 +814,11 @@ struct ElementWiseBitsetPolicy { const T& upper, const T* const __restrict values, const size_type size) { - for (size_type i = 0; i < size; i++) { - get_proxy(data, start + i) = - RangeOperator::within_range(lower, upper, values[i]); - } + op_func( + data, start, size, [lower, upper, values](const size_type bit_idx) { + return RangeOperator::within_range( + lower, upper, values[bit_idx]); + }); } // @@ -834,11 +830,13 @@ struct ElementWiseBitsetPolicy { const ArithHighPrecisionType& right_operand, const ArithHighPrecisionType& value, const size_type size) { - for (size_type i = 0; i < size; i++) { - get_proxy(data, start + i) = - ArithCompareOperator::compare( - src[i], right_operand, value); - } + op_func(data, + start, + size, + [src, right_operand, value](const size_type bit_idx) { + return ArithCompareOperator::compare( + src[bit_idx], right_operand, value); + }); } // @@ -972,6 +970,85 @@ struct ElementWiseBitsetPolicy { op_write(left, start_left + size_b, size - size_b, result_v); } } + + // bool Func(const size_type bit_idx); + template + static inline void + op_func(data_type* const __restrict data, + const size_type start, + const size_t size, + Func func) { + if (size == 0) { + return; + } + + auto start_element = get_element(start); + const auto end_element = get_element(start + size); + + const auto start_shift = get_shift(start); + const auto end_shift = get_shift(start + size); + + if (start_element == end_element) { + data_type bits = 0; + for (size_type j = 0; j < size; j++) { + const bool bit = func(j); + // // a curious example where the compiler does not optimize the code properly + // bits |= (bit ? (data_type(1) << j) : 0); + // + // use the following code + bits |= (data_type(bit ? 1 : 0) << j); + } + + op_write(data, start, size, bits); + return; + } + + // + uintptr_t ptr_offset = 0; + + // process the first element + if (start_shift != 0) { + const size_type n_bits = data_bits - start_shift; + + data_type bits = 0; + for (size_type j = 0; j < n_bits; j++) { + const bool bit = func(j); + bits |= (data_type(bit ? 1 : 0) << j); + } + + op_write(data, start, n_bits, bits); + + // start from the next element + start_element += 1; + ptr_offset += n_bits; + } + + // process the middle + { + for (size_type i = start_element; i < end_element; i++) { + data_type bits = 0; + for (size_type j = 0; j < data_bits; j++) { + const bool bit = func(ptr_offset + j); + bits |= (data_type(bit ? 1 : 0) << j); + } + + data[i] = bits; + ptr_offset += data_bits; + } + } + + // process the last element + if (end_shift != 0) { + data_type bits = 0; + for (size_type j = 0; j < end_shift; j++) { + const bool bit = func(ptr_offset + j); + bits |= (data_type(bit ? 1 : 0) << j); + } + + const size_t starting_bit_idx = end_element * data_bits; + op_write(data, starting_bit_idx, end_shift, bits); + } + } }; } // namespace detail diff --git a/internal/core/unittest/test_bitset.cpp b/internal/core/unittest/test_bitset.cpp index 0ebe660b6c..a5f93a9f83 100644 --- a/internal/core/unittest/test_bitset.cpp +++ b/internal/core/unittest/test_bitset.cpp @@ -119,7 +119,7 @@ static constexpr bool print_log = false; static constexpr bool print_timing = true; static constexpr size_t typical_sizes[] = {10000000}; -static constexpr size_t typical_offsets[] = {}; +static constexpr size_t typical_offsets[] = {1}; static constexpr CompareOpType typical_compare_ops[] = {CompareOpType::EQ, CompareOpType::GE, CompareOpType::GT, @@ -186,6 +186,7 @@ using Ttypes2 = ::testing::Types< std::tuple, std::tuple, std::tuple, + std::tuple, #endif std::tuple, @@ -193,7 +194,8 @@ using Ttypes2 = ::testing::Types< std::tuple, std::tuple, std::tuple, - std::tuple + std::tuple, + std::tuple #if FULL_TESTS == 1 , @@ -203,13 +205,15 @@ using Ttypes2 = ::testing::Types< std::tuple, std::tuple, std::tuple, + std::tuple, std::tuple, std::tuple, std::tuple, std::tuple, std::tuple, - std::tuple + std::tuple, + std::tuple #endif >; @@ -222,6 +226,7 @@ using Ttypes1 = ::testing::Types< std::tuple, std::tuple, std::tuple, + std::tuple, #endif std::tuple, @@ -229,7 +234,8 @@ using Ttypes1 = ::testing::Types< std::tuple, std::tuple, std::tuple, - std::tuple + std::tuple, + std::tuple #if FULL_TESTS == 1 , @@ -239,13 +245,15 @@ using Ttypes1 = ::testing::Types< std::tuple, std::tuple, std::tuple, + std::tuple, std::tuple, std::tuple, std::tuple, std::tuple, std::tuple, - std::tuple + std::tuple, + std::tuple #endif >; @@ -284,6 +292,17 @@ FillRandom(std::vector& t, } } +template <> +void +FillRandom(std::vector& t, + std::default_random_engine& rng, + const size_t max_v) { + std::uniform_int_distribution tt(0, max_v); + for (size_t i = 0; i < t.size(); i++) { + t[i] = std::to_string(tt(rng)); + } +} + template void FillRandom(BitsetT& bitset, std::default_random_engine& rng) { @@ -293,6 +312,19 @@ FillRandom(BitsetT& bitset, std::default_random_engine& rng) { } } +// +template +T +from_i32(const int32_t i) { + return T(i); +} + +template <> +std::string +from_i32(const int32_t i) { + return std::to_string(i); +} + ////////////////////////////////////////////////////////////////////////////////////////// // @@ -396,8 +428,8 @@ TestInplaceCompareColumnImpl(BitsetT& bitset, CompareOpType op) { const size_t n = bitset.size(); constexpr size_t max_v = 2; - std::vector t(n, 0); - std::vector u(n, 0); + std::vector t(n, from_i32(0)); + std::vector u(n, from_i32(0)); std::default_random_engine rng(123); FillRandom(t, rng, max_v); @@ -597,9 +629,9 @@ void TestInplaceCompareValImpl(BitsetT& bitset, CompareOpType op) { const size_t n = bitset.size(); constexpr size_t max_v = 3; - constexpr T value = 1; + const T value = from_i32(1); - std::vector t(n, 0); + std::vector t(n, from_i32(0)); std::default_random_engine rng(123); FillRandom(t, rng, max_v); @@ -783,11 +815,11 @@ TestInplaceWithinRangeColumnImpl(BitsetT& bitset, RangeType op) { const size_t n = bitset.size(); constexpr size_t max_v = 3; - std::vector range(n, 0); - std::vector values(n, 0); + std::vector range(n, from_i32(0)); + std::vector values(n, from_i32(0)); - std::vector lower(n, 0); - std::vector upper(n, 0); + std::vector lower(n, from_i32(0)); + std::vector upper(n, from_i32(0)); std::default_random_engine rng(123); FillRandom(lower, rng, max_v); @@ -977,10 +1009,10 @@ void TestInplaceWithinRangeValImpl(BitsetT& bitset, RangeType op) { const size_t n = bitset.size(); constexpr size_t max_v = 10; - constexpr T lower_v = 3; - constexpr T upper_v = 7; + const T lower_v = from_i32(3); + const T upper_v = from_i32(7); - std::vector values(n, 0); + std::vector values(n, from_i32(0)); std::default_random_engine rng(123); FillRandom(values, rng, max_v); @@ -1157,122 +1189,155 @@ INSTANTIATE_TYPED_TEST_SUITE_P(InplaceWithinRangeValTest, ////////////////////////////////////////////////////////////////////////////////////////// -// template -void -TestInplaceArithCompareImpl(BitsetT& bitset, - ArithOpType a_op, - CompareOpType cmp_op) { - using HT = ArithHighPrecisionType; +struct TestInplaceArithCompareImplS { + static void + process(BitsetT& bitset, ArithOpType a_op, CompareOpType cmp_op) { + using HT = ArithHighPrecisionType; - const size_t n = bitset.size(); - constexpr size_t max_v = 10; + const size_t n = bitset.size(); + constexpr size_t max_v = 10; - std::vector left(n, 0); - HT right_operand = 2; - HT value = 5; + std::vector left(n, 0); + const HT right_operand = from_i32(2); + const HT value = from_i32(5); - std::default_random_engine rng(123); - FillRandom(left, rng, max_v); + std::default_random_engine rng(123); + FillRandom(left, rng, max_v); - StopWatch sw; - bitset.inplace_arith_compare( - left.data(), right_operand, value, n, a_op, cmp_op); + StopWatch sw; + bitset.inplace_arith_compare( + left.data(), right_operand, value, n, a_op, cmp_op); - if (print_timing) { - printf("elapsed %f\n", sw.elapsed()); - } + if (print_timing) { + printf("elapsed %f\n", sw.elapsed()); + } - for (size_t i = 0; i < n; i++) { - if (a_op == ArithOpType::Add) { - if (cmp_op == CompareOpType::EQ) { - ASSERT_EQ((left[i] + right_operand) == value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GE) { - ASSERT_EQ((left[i] + right_operand) >= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GT) { - ASSERT_EQ((left[i] + right_operand) > value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LE) { - ASSERT_EQ((left[i] + right_operand) <= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LT) { - ASSERT_EQ((left[i] + right_operand) < value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::NE) { - ASSERT_EQ((left[i] + right_operand) != value, bitset[i]) << i; + for (size_t i = 0; i < n; i++) { + if (a_op == ArithOpType::Add) { + if (cmp_op == CompareOpType::EQ) { + ASSERT_EQ((left[i] + right_operand) == value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GE) { + ASSERT_EQ((left[i] + right_operand) >= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GT) { + ASSERT_EQ((left[i] + right_operand) > value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LE) { + ASSERT_EQ((left[i] + right_operand) <= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LT) { + ASSERT_EQ((left[i] + right_operand) < value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::NE) { + ASSERT_EQ((left[i] + right_operand) != value, bitset[i]) + << i; + } else { + ASSERT_TRUE(false) << "Not implemented"; + } + } else if (a_op == ArithOpType::Sub) { + if (cmp_op == CompareOpType::EQ) { + ASSERT_EQ((left[i] - right_operand) == value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GE) { + ASSERT_EQ((left[i] - right_operand) >= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GT) { + ASSERT_EQ((left[i] - right_operand) > value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LE) { + ASSERT_EQ((left[i] - right_operand) <= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LT) { + ASSERT_EQ((left[i] - right_operand) < value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::NE) { + ASSERT_EQ((left[i] - right_operand) != value, bitset[i]) + << i; + } else { + ASSERT_TRUE(false) << "Not implemented"; + } + } else if (a_op == ArithOpType::Mul) { + if (cmp_op == CompareOpType::EQ) { + ASSERT_EQ((left[i] * right_operand) == value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GE) { + ASSERT_EQ((left[i] * right_operand) >= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GT) { + ASSERT_EQ((left[i] * right_operand) > value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LE) { + ASSERT_EQ((left[i] * right_operand) <= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LT) { + ASSERT_EQ((left[i] * right_operand) < value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::NE) { + ASSERT_EQ((left[i] * right_operand) != value, bitset[i]) + << i; + } else { + ASSERT_TRUE(false) << "Not implemented"; + } + } else if (a_op == ArithOpType::Div) { + if (cmp_op == CompareOpType::EQ) { + ASSERT_EQ((left[i] / right_operand) == value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GE) { + ASSERT_EQ((left[i] / right_operand) >= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GT) { + ASSERT_EQ((left[i] / right_operand) > value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LE) { + ASSERT_EQ((left[i] / right_operand) <= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LT) { + ASSERT_EQ((left[i] / right_operand) < value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::NE) { + ASSERT_EQ((left[i] / right_operand) != value, bitset[i]) + << i; + } else { + ASSERT_TRUE(false) << "Not implemented"; + } + } else if (a_op == ArithOpType::Mod) { + if (cmp_op == CompareOpType::EQ) { + ASSERT_EQ(fmod(left[i], right_operand) == value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GE) { + ASSERT_EQ(fmod(left[i], right_operand) >= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::GT) { + ASSERT_EQ(fmod(left[i], right_operand) > value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LE) { + ASSERT_EQ(fmod(left[i], right_operand) <= value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::LT) { + ASSERT_EQ(fmod(left[i], right_operand) < value, bitset[i]) + << i; + } else if (cmp_op == CompareOpType::NE) { + ASSERT_EQ(fmod(left[i], right_operand) != value, bitset[i]) + << i; + } else { + ASSERT_TRUE(false) << "Not implemented"; + } } else { ASSERT_TRUE(false) << "Not implemented"; } - } else if (a_op == ArithOpType::Sub) { - if (cmp_op == CompareOpType::EQ) { - ASSERT_EQ((left[i] - right_operand) == value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GE) { - ASSERT_EQ((left[i] - right_operand) >= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GT) { - ASSERT_EQ((left[i] - right_operand) > value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LE) { - ASSERT_EQ((left[i] - right_operand) <= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LT) { - ASSERT_EQ((left[i] - right_operand) < value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::NE) { - ASSERT_EQ((left[i] - right_operand) != value, bitset[i]) << i; - } else { - ASSERT_TRUE(false) << "Not implemented"; - } - } else if (a_op == ArithOpType::Mul) { - if (cmp_op == CompareOpType::EQ) { - ASSERT_EQ((left[i] * right_operand) == value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GE) { - ASSERT_EQ((left[i] * right_operand) >= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GT) { - ASSERT_EQ((left[i] * right_operand) > value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LE) { - ASSERT_EQ((left[i] * right_operand) <= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LT) { - ASSERT_EQ((left[i] * right_operand) < value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::NE) { - ASSERT_EQ((left[i] * right_operand) != value, bitset[i]) << i; - } else { - ASSERT_TRUE(false) << "Not implemented"; - } - } else if (a_op == ArithOpType::Div) { - if (cmp_op == CompareOpType::EQ) { - ASSERT_EQ((left[i] / right_operand) == value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GE) { - ASSERT_EQ((left[i] / right_operand) >= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::GT) { - ASSERT_EQ((left[i] / right_operand) > value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LE) { - ASSERT_EQ((left[i] / right_operand) <= value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LT) { - ASSERT_EQ((left[i] / right_operand) < value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::NE) { - ASSERT_EQ((left[i] / right_operand) != value, bitset[i]) << i; - } else { - ASSERT_TRUE(false) << "Not implemented"; - } - } else if (a_op == ArithOpType::Mod) { - if (cmp_op == CompareOpType::EQ) { - ASSERT_EQ(fmod(left[i], right_operand) == value, bitset[i]) - << i; - } else if (cmp_op == CompareOpType::GE) { - ASSERT_EQ(fmod(left[i], right_operand) >= value, bitset[i]) - << i; - } else if (cmp_op == CompareOpType::GT) { - ASSERT_EQ(fmod(left[i], right_operand) > value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::LE) { - ASSERT_EQ(fmod(left[i], right_operand) <= value, bitset[i]) - << i; - } else if (cmp_op == CompareOpType::LT) { - ASSERT_EQ(fmod(left[i], right_operand) < value, bitset[i]) << i; - } else if (cmp_op == CompareOpType::NE) { - ASSERT_EQ(fmod(left[i], right_operand) != value, bitset[i]) - << i; - } else { - ASSERT_TRUE(false) << "Not implemented"; - } - } else { - ASSERT_TRUE(false) << "Not implemented"; } } -} +}; + +template +struct TestInplaceArithCompareImplS { + static void + process(BitsetT&, ArithOpType, CompareOpType) { + // does nothing + } +}; template void @@ -1288,7 +1353,8 @@ TestInplaceArithCompareImpl() { "Testing bitset, n=%zd, a_op=%zd\n", n, (size_t)a_op); } - TestInplaceArithCompareImpl(bitset, a_op, cmp_op); + TestInplaceArithCompareImplS::process( + bitset, a_op, cmp_op); for (const size_t offset : typical_offsets) { if (offset >= n) { @@ -1308,7 +1374,7 @@ TestInplaceArithCompareImpl() { (size_t)cmp_op); } - TestInplaceArithCompareImpl( + TestInplaceArithCompareImplS::process( view, a_op, cmp_op); } }