enhance: performance improvements for the bitset (#31753)

Issue: #31752

This PR improves the performance of the bitset utilities (introduced in PR
#30454), including varchar filtering.

Signed-off-by: Alexandr Guzhva <alexanderguzhva@gmail.com>
This commit is contained in:
Alexander Guzhva 2024-04-05 17:19:22 -04:00 committed by GitHub
parent db698756dc
commit cae5722229
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 291 additions and 148 deletions

View File

@ -410,7 +410,7 @@ struct VectorizedElementWiseBitsetPolicy {
// process the first element
if (start_shift != 0) {
// it is possible to do vectorized masking here, but it is not worth it
func_baseline(start, 0, size);
func_baseline(start, 0, data_bits - start_shift);
// start from the next element
start_element += 1;

View File

@ -678,10 +678,6 @@ struct ElementWiseBitsetPolicy {
const size_t start_left,
const size_t start_right,
const size_t size) {
if (size == 0) {
return;
}
op_func(left,
right,
start_left,
@ -775,10 +771,9 @@ struct ElementWiseBitsetPolicy {
const T* const __restrict t,
const U* const __restrict u,
const size_type size) {
for (size_type i = 0; i < size; i++) {
get_proxy(data, start + i) =
CompareOperator<Op>::compare(t[i], u[i]);
}
op_func(data, start, size, [t, u](const size_type bit_idx) {
return CompareOperator<Op>::compare(t[bit_idx], u[bit_idx]);
});
}
//
@ -789,10 +784,9 @@ struct ElementWiseBitsetPolicy {
const T* const __restrict t,
const size_type size,
const T& value) {
for (size_type i = 0; i < size; i++) {
get_proxy(data, start + i) =
CompareOperator<Op>::compare(t[i], value);
}
op_func(data, start, size, [t, value](const size_type bit_idx) {
return CompareOperator<Op>::compare(t[bit_idx], value);
});
}
//
@ -804,10 +798,11 @@ struct ElementWiseBitsetPolicy {
const T* const __restrict upper,
const T* const __restrict values,
const size_type size) {
for (size_type i = 0; i < size; i++) {
get_proxy(data, start + i) =
RangeOperator<Op>::within_range(lower[i], upper[i], values[i]);
}
op_func(
data, start, size, [lower, upper, values](const size_type bit_idx) {
return RangeOperator<Op>::within_range(
lower[bit_idx], upper[bit_idx], values[bit_idx]);
});
}
//
@ -819,10 +814,11 @@ struct ElementWiseBitsetPolicy {
const T& upper,
const T* const __restrict values,
const size_type size) {
for (size_type i = 0; i < size; i++) {
get_proxy(data, start + i) =
RangeOperator<Op>::within_range(lower, upper, values[i]);
}
op_func(
data, start, size, [lower, upper, values](const size_type bit_idx) {
return RangeOperator<Op>::within_range(
lower, upper, values[bit_idx]);
});
}
//
@ -834,11 +830,13 @@ struct ElementWiseBitsetPolicy {
const ArithHighPrecisionType<T>& right_operand,
const ArithHighPrecisionType<T>& value,
const size_type size) {
for (size_type i = 0; i < size; i++) {
get_proxy(data, start + i) =
ArithCompareOperator<AOp, CmpOp>::compare(
src[i], right_operand, value);
}
op_func(data,
start,
size,
[src, right_operand, value](const size_type bit_idx) {
return ArithCompareOperator<AOp, CmpOp>::compare(
src[bit_idx], right_operand, value);
});
}
//
@ -972,6 +970,85 @@ struct ElementWiseBitsetPolicy {
op_write(left, start_left + size_b, size - size_b, result_v);
}
}
// Fills `size` consecutive bits of `data`, beginning at bit index `start`,
// with values produced by a callback.
//
// The callback has the signature
//     bool Func(const size_type bit_idx);
// It is invoked once for every index in [0, size), in increasing order, and
// the value returned for index k is stored at bit position (start + k).
//
// Partially covered elements at either end of the range are written through
// op_write(); elements that are covered completely are assigned directly.
template <typename Func>
static inline void
op_func(data_type* const __restrict data,
        const size_type start,
        const size_t size,
        Func func) {
    // an empty range requires no work
    if (size == 0) {
        return;
    }

    // Evaluates the callback for `count` consecutive indices starting at
    // `first_idx` and packs the answers into a single element, lowest bit
    // first.
    //
    // The 0/1 value is converted to data_type first and shifted afterwards.
    // The alternative form, (flag ? (data_type(1) << k) : 0), is a curious
    // example where the compiler does not optimize the code properly.
    auto pack = [&func](const size_type first_idx, const size_type count) {
        data_type word = 0;
        for (size_type k = 0; k < count; k++) {
            const bool flag = func(first_idx + k);
            word |= (data_type(flag ? 1 : 0) << k);
        }
        return word;
    };

    const auto head_shift = get_shift(start);
    const auto tail_shift = get_shift(start + size);
    auto word_idx = get_element(start);
    const auto last_word_idx = get_element(start + size);

    // the whole range fits into a single element
    if (word_idx == last_word_idx) {
        const data_type packed = pack(0, size);
        op_write(data, start, size, packed);
        return;
    }

    // number of callback indices that have been consumed so far
    uintptr_t consumed = 0;

    // leading element, covered only partially
    if (head_shift != 0) {
        // it is not worth doing anything smarter than a bit-by-bit gather here
        const size_type head_bits = data_bits - head_shift;
        const data_type packed = pack(0, head_bits);
        op_write(data, start, head_bits, packed);

        // continue from the next element
        word_idx += 1;
        consumed += head_bits;
    }

    // elements that are covered completely are overwritten as a whole
    for (size_type w = word_idx; w < last_word_idx; w++) {
        data[w] = pack(consumed, data_bits);
        consumed += data_bits;
    }

    // trailing element, covered only partially
    if (tail_shift != 0) {
        const data_type packed = pack(consumed, tail_shift);
        const size_t tail_first_bit = last_word_idx * data_bits;
        op_write(data, tail_first_bit, tail_shift, packed);
    }
}
};
} // namespace detail

View File

@ -119,7 +119,7 @@ static constexpr bool print_log = false;
static constexpr bool print_timing = true;
static constexpr size_t typical_sizes[] = {10000000};
static constexpr size_t typical_offsets[] = {};
static constexpr size_t typical_offsets[] = {1};
static constexpr CompareOpType typical_compare_ops[] = {CompareOpType::EQ,
CompareOpType::GE,
CompareOpType::GT,
@ -186,6 +186,7 @@ using Ttypes2 = ::testing::Types<
std::tuple<int64_t, int64_t, uint8_t, uint8_t>,
std::tuple<float, float, uint8_t, uint8_t>,
std::tuple<double, double, uint8_t, uint8_t>,
std::tuple<std::string, std::string, uint8_t, uint8_t>,
#endif
std::tuple<int8_t, int8_t, uint64_t, uint8_t>,
@ -193,7 +194,8 @@ using Ttypes2 = ::testing::Types<
std::tuple<int32_t, int32_t, uint64_t, uint8_t>,
std::tuple<int64_t, int64_t, uint64_t, uint8_t>,
std::tuple<float, float, uint64_t, uint8_t>,
std::tuple<double, double, uint64_t, uint8_t>
std::tuple<double, double, uint64_t, uint8_t>,
std::tuple<std::string, std::string, uint64_t, uint8_t>
#if FULL_TESTS == 1
,
@ -203,13 +205,15 @@ using Ttypes2 = ::testing::Types<
std::tuple<int64_t, int64_t, uint8_t, uint64_t>,
std::tuple<float, float, uint8_t, uint64_t>,
std::tuple<double, double, uint8_t, uint64_t>,
std::tuple<std::string, std::string, uint8_t, uint64_t>,
std::tuple<int8_t, int8_t, uint64_t, uint64_t>,
std::tuple<int16_t, int16_t, uint64_t, uint64_t>,
std::tuple<int32_t, int32_t, uint64_t, uint64_t>,
std::tuple<int64_t, int64_t, uint64_t, uint64_t>,
std::tuple<float, float, uint64_t, uint64_t>,
std::tuple<double, double, uint64_t, uint64_t>
std::tuple<double, double, uint64_t, uint64_t>,
std::tuple<std::string, std::string, uint64_t, uint64_t>
#endif
>;
@ -222,6 +226,7 @@ using Ttypes1 = ::testing::Types<
std::tuple<int64_t, uint8_t, uint8_t>,
std::tuple<float, uint8_t, uint8_t>,
std::tuple<double, uint8_t, uint8_t>,
std::tuple<std::string, uint8_t, uint8_t>,
#endif
std::tuple<int8_t, uint64_t, uint8_t>,
@ -229,7 +234,8 @@ using Ttypes1 = ::testing::Types<
std::tuple<int32_t, uint64_t, uint8_t>,
std::tuple<int64_t, uint64_t, uint8_t>,
std::tuple<float, uint64_t, uint8_t>,
std::tuple<double, uint64_t, uint8_t>
std::tuple<double, uint64_t, uint8_t>,
std::tuple<std::string, uint64_t, uint8_t>
#if FULL_TESTS == 1
,
@ -239,13 +245,15 @@ using Ttypes1 = ::testing::Types<
std::tuple<int64_t, uint8_t, uint64_t>,
std::tuple<float, uint8_t, uint64_t>,
std::tuple<double, uint8_t, uint64_t>,
std::tuple<std::string, uint8_t, uint64_t>,
std::tuple<int8_t, uint64_t, uint64_t>,
std::tuple<int16_t, uint64_t, uint64_t>,
std::tuple<int32_t, uint64_t, uint64_t>,
std::tuple<int64_t, uint64_t, uint64_t>,
std::tuple<float, uint64_t, uint64_t>,
std::tuple<double, uint64_t, uint64_t>
std::tuple<double, uint64_t, uint64_t>,
std::tuple<std::string, uint64_t, uint64_t>
#endif
>;
@ -284,6 +292,17 @@ FillRandom(std::vector<T>& t,
}
}
// Specialization for std::string: replaces every element of `t` with the
// decimal text of a random integer drawn uniformly from [0, max_v].
//
// t     - vector to fill; its size is left unchanged.
// rng   - random engine that supplies the randomness (its state advances).
// max_v - inclusive upper bound of the generated integers.
//
// The distribution is instantiated over `int`, not `uint8_t`: the C++
// standard leaves std::uniform_int_distribution undefined for character-sized
// integer types, and some standard libraries reject them at compile time.
// Using `int` also avoids silently truncating `max_v` to 8 bits.
template <>
void
FillRandom<std::string>(std::vector<std::string>& t,
                        std::default_random_engine& rng,
                        const size_t max_v) {
    std::uniform_int_distribution<int> tt(0, static_cast<int>(max_v));
    for (std::string& s : t) {
        s = std::to_string(tt(rng));
    }
}
template <typename BitsetT>
void
FillRandom(BitsetT& bitset, std::default_random_engine& rng) {
@ -293,6 +312,19 @@ FillRandom(BitsetT& bitset, std::default_random_engine& rng) {
}
}
//
// Builds a value of type T out of a 32-bit integer.
// The generic version converts the integer to T directly.
template <typename T>
T
from_i32(const int32_t i) {
    return static_cast<T>(i);
}
// Specialization selected when T is std::string: the result is the decimal
// text of the integer, as produced by std::to_string().
template <>
std::string
from_i32(const int32_t i) {
    std::string text = std::to_string(i);
    return text;
}
//////////////////////////////////////////////////////////////////////////////////////////
//
@ -396,8 +428,8 @@ TestInplaceCompareColumnImpl(BitsetT& bitset, CompareOpType op) {
const size_t n = bitset.size();
constexpr size_t max_v = 2;
std::vector<T> t(n, 0);
std::vector<U> u(n, 0);
std::vector<T> t(n, from_i32<T>(0));
std::vector<U> u(n, from_i32<T>(0));
std::default_random_engine rng(123);
FillRandom(t, rng, max_v);
@ -597,9 +629,9 @@ void
TestInplaceCompareValImpl(BitsetT& bitset, CompareOpType op) {
const size_t n = bitset.size();
constexpr size_t max_v = 3;
constexpr T value = 1;
const T value = from_i32<T>(1);
std::vector<T> t(n, 0);
std::vector<T> t(n, from_i32<T>(0));
std::default_random_engine rng(123);
FillRandom(t, rng, max_v);
@ -783,11 +815,11 @@ TestInplaceWithinRangeColumnImpl(BitsetT& bitset, RangeType op) {
const size_t n = bitset.size();
constexpr size_t max_v = 3;
std::vector<T> range(n, 0);
std::vector<T> values(n, 0);
std::vector<T> range(n, from_i32<T>(0));
std::vector<T> values(n, from_i32<T>(0));
std::vector<T> lower(n, 0);
std::vector<T> upper(n, 0);
std::vector<T> lower(n, from_i32<T>(0));
std::vector<T> upper(n, from_i32<T>(0));
std::default_random_engine rng(123);
FillRandom(lower, rng, max_v);
@ -977,10 +1009,10 @@ void
TestInplaceWithinRangeValImpl(BitsetT& bitset, RangeType op) {
const size_t n = bitset.size();
constexpr size_t max_v = 10;
constexpr T lower_v = 3;
constexpr T upper_v = 7;
const T lower_v = from_i32<T>(3);
const T upper_v = from_i32<T>(7);
std::vector<T> values(n, 0);
std::vector<T> values(n, from_i32<T>(0));
std::default_random_engine rng(123);
FillRandom(values, rng, max_v);
@ -1157,122 +1189,155 @@ INSTANTIATE_TYPED_TEST_SUITE_P(InplaceWithinRangeValTest,
//////////////////////////////////////////////////////////////////////////////////////////
//
template <typename BitsetT, typename T>
void
TestInplaceArithCompareImpl(BitsetT& bitset,
ArithOpType a_op,
CompareOpType cmp_op) {
using HT = ArithHighPrecisionType<T>;
struct TestInplaceArithCompareImplS {
static void
process(BitsetT& bitset, ArithOpType a_op, CompareOpType cmp_op) {
using HT = ArithHighPrecisionType<T>;
const size_t n = bitset.size();
constexpr size_t max_v = 10;
const size_t n = bitset.size();
constexpr size_t max_v = 10;
std::vector<T> left(n, 0);
HT right_operand = 2;
HT value = 5;
std::vector<T> left(n, 0);
const HT right_operand = from_i32<HT>(2);
const HT value = from_i32<HT>(5);
std::default_random_engine rng(123);
FillRandom(left, rng, max_v);
std::default_random_engine rng(123);
FillRandom(left, rng, max_v);
StopWatch sw;
bitset.inplace_arith_compare(
left.data(), right_operand, value, n, a_op, cmp_op);
StopWatch sw;
bitset.inplace_arith_compare(
left.data(), right_operand, value, n, a_op, cmp_op);
if (print_timing) {
printf("elapsed %f\n", sw.elapsed());
}
if (print_timing) {
printf("elapsed %f\n", sw.elapsed());
}
for (size_t i = 0; i < n; i++) {
if (a_op == ArithOpType::Add) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] + right_operand) == value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] + right_operand) >= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] + right_operand) > value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] + right_operand) <= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] + right_operand) < value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] + right_operand) != value, bitset[i]) << i;
for (size_t i = 0; i < n; i++) {
if (a_op == ArithOpType::Add) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] + right_operand) == value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] + right_operand) >= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] + right_operand) > value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] + right_operand) <= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] + right_operand) < value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] + right_operand) != value, bitset[i])
<< i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Sub) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] - right_operand) == value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] - right_operand) >= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] - right_operand) > value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] - right_operand) <= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] - right_operand) < value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] - right_operand) != value, bitset[i])
<< i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Mul) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] * right_operand) == value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] * right_operand) >= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] * right_operand) > value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] * right_operand) <= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] * right_operand) < value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] * right_operand) != value, bitset[i])
<< i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Div) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] / right_operand) == value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] / right_operand) >= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] / right_operand) > value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] / right_operand) <= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] / right_operand) < value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] / right_operand) != value, bitset[i])
<< i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Mod) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ(fmod(left[i], right_operand) == value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ(fmod(left[i], right_operand) >= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ(fmod(left[i], right_operand) > value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ(fmod(left[i], right_operand) <= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ(fmod(left[i], right_operand) < value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ(fmod(left[i], right_operand) != value, bitset[i])
<< i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Sub) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] - right_operand) == value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] - right_operand) >= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] - right_operand) > value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] - right_operand) <= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] - right_operand) < value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] - right_operand) != value, bitset[i]) << i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Mul) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] * right_operand) == value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] * right_operand) >= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] * right_operand) > value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] * right_operand) <= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] * right_operand) < value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] * right_operand) != value, bitset[i]) << i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Div) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ((left[i] / right_operand) == value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ((left[i] / right_operand) >= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ((left[i] / right_operand) > value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ((left[i] / right_operand) <= value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ((left[i] / right_operand) < value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ((left[i] / right_operand) != value, bitset[i]) << i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else if (a_op == ArithOpType::Mod) {
if (cmp_op == CompareOpType::EQ) {
ASSERT_EQ(fmod(left[i], right_operand) == value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GE) {
ASSERT_EQ(fmod(left[i], right_operand) >= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::GT) {
ASSERT_EQ(fmod(left[i], right_operand) > value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::LE) {
ASSERT_EQ(fmod(left[i], right_operand) <= value, bitset[i])
<< i;
} else if (cmp_op == CompareOpType::LT) {
ASSERT_EQ(fmod(left[i], right_operand) < value, bitset[i]) << i;
} else if (cmp_op == CompareOpType::NE) {
ASSERT_EQ(fmod(left[i], right_operand) != value, bitset[i])
<< i;
} else {
ASSERT_TRUE(false) << "Not implemented";
}
} else {
ASSERT_TRUE(false) << "Not implemented";
}
}
}
};
// Partial specialization of TestInplaceArithCompareImplS for std::string
// elements. process() takes the same argument list as the primary template's
// process(), but ignores every argument and performs no checks, so the
// arithmetic-compare test is a no-op when the element type is std::string.
template <typename BitsetT>
struct TestInplaceArithCompareImplS<BitsetT, std::string> {
static void
process(BitsetT&, ArithOpType, CompareOpType) {
// intentionally empty: no arithmetic-compare checks are run for std::string
}
};
template <typename BitsetT, typename T>
void
@ -1288,7 +1353,8 @@ TestInplaceArithCompareImpl() {
"Testing bitset, n=%zd, a_op=%zd\n", n, (size_t)a_op);
}
TestInplaceArithCompareImpl<BitsetT, T>(bitset, a_op, cmp_op);
TestInplaceArithCompareImplS<BitsetT, T>::process(
bitset, a_op, cmp_op);
for (const size_t offset : typical_offsets) {
if (offset >= n) {
@ -1308,7 +1374,7 @@ TestInplaceArithCompareImpl() {
(size_t)cmp_op);
}
TestInplaceArithCompareImpl<decltype(view), T>(
TestInplaceArithCompareImplS<decltype(view), T>::process(
view, a_op, cmp_op);
}
}