Skip to content

Commit 474dc03

Browse files
committed
Clean up xsimd bpacking
1 parent 17f66b3 commit 474dc03

1 file changed

Lines changed: 15 additions & 108 deletions

File tree

cpp/src/arrow/util/bpacking_simd_kernel_internal.h

Lines changed: 15 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -73,22 +73,6 @@ ARROW_FORCE_INLINE constexpr T max_value(const std::array<T, N>& arr) {
7373
return out;
7474
}
7575

76-
template <std::array kArr, typename Arch, std::size_t... Is>
77-
ARROW_FORCE_INLINE constexpr auto array_to_batch_constant_impl(
78-
std::index_sequence<Is...>) {
79-
using Array = std::decay_t<decltype(kArr)>;
80-
using value_type = typename Array::value_type;
81-
82-
return xsimd::batch_constant<value_type, Arch, kArr[Is]...>{};
83-
}
84-
85-
/// Make a ``xsimd::batch_constant`` from a static constexpr array.
86-
template <std::array kArr, typename Arch>
87-
ARROW_FORCE_INLINE constexpr auto array_to_batch_constant() {
88-
return array_to_batch_constant_impl<kArr, Arch>(
89-
std::make_index_sequence<kArr.size()>());
90-
}
91-
9276
template <typename Uint, typename Arch>
9377
ARROW_FORCE_INLINE xsimd::batch<uint8_t, Arch> load_val_as(const uint8_t* in) {
9478
const Uint val = util::SafeLoadAs<Uint>(in);
@@ -139,7 +123,7 @@ ARROW_FORCE_INLINE constexpr auto select_stride(
139123
constexpr auto kStridesArr =
140124
select_stride_impl<ToInt, kOffset, sizeof(ToInt) / sizeof(Int)>(
141125
std::array{kShifts...});
142-
return array_to_batch_constant<kStridesArr, Arch>();
126+
return xsimd::make_batch_constant<kStridesArr, Arch>();
143127
}
144128

145129
/// Whether we are compiling for SSE2 or above in the SSE family (not AVX).
@@ -151,73 +135,6 @@ constexpr bool IsSse2 = std::is_base_of_v<xsimd::sse2, Arch>;
151135
template <typename Arch>
152136
constexpr bool IsAvx2 = std::is_base_of_v<xsimd::avx2, Arch>;
153137

154-
/// Whether we are compiling for Neon or above in the arm64 family.
155-
template <typename Arch>
156-
constexpr bool IsNeon = std::is_base_of_v<xsimd::neon, Arch>;
157-
158-
/// Wrapper around ``xsimd::bitwise_lshift`` with optimizations for unimplemented sizes.
159-
///
160-
/// We replace the variable left shift by a variable multiply with a power of two.
161-
///
162-
/// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of
163-
/// integers per second through vectorization, Software Practice & Experience 45 (1),
164-
/// 2015. http://arxiv.org/abs/1209.2137
165-
///
166-
/// TODO(xsimd) Tracking in https://github.com/xtensor-stack/xsimd/pull/1220
167-
/// When migrating, be sure to use batch_constant overload, and not the batch one.
168-
template <typename Arch, typename Int, Int... kShifts>
169-
ARROW_FORCE_INLINE auto left_shift(const xsimd::batch<Int, Arch>& batch,
170-
xsimd::batch_constant<Int, Arch, kShifts...> shifts)
171-
-> xsimd::batch<Int, Arch> {
172-
constexpr bool kIsSse2 = IsSse2<Arch>;
173-
constexpr bool kIsAvx2 = IsAvx2<Arch>;
174-
static_assert(
175-
!(kIsSse2 && kIsAvx2),
176-
"In xsimd, an x86 arch is either part of the SSE family or of the AVX family,"
177-
"not both. If this check fails, it means the assumptions made here to detect SSE "
178-
"and AVX are out of date.");
179-
180-
constexpr auto kMults = xsimd::make_batch_constant<Int, 1, Arch>() << shifts;
181-
182-
constexpr auto IntSize = sizeof(Int);
183-
184-
// Sizes and architecture for which there is no variable left shift and there is a
185-
// multiplication
186-
if constexpr ( //
187-
(kIsSse2 && (IntSize == sizeof(uint16_t) || IntSize == sizeof(uint32_t))) //
188-
|| (kIsAvx2 && (IntSize == sizeof(uint16_t))) //
189-
) {
190-
return batch * kMults;
191-
}
192-
193-
// Architecture for which there is no variable left shift on uint8_t but a fallback
194-
// exists for uint16_t.
195-
if constexpr ((kIsSse2 || kIsAvx2) && (IntSize == sizeof(uint8_t))) {
196-
const auto batch16 = xsimd::bitwise_cast<uint16_t>(batch);
197-
198-
constexpr auto kShifts0 = select_stride<uint16_t, 0>(shifts);
199-
const auto shifted0 = left_shift(batch16, kShifts0) & 0x00FF;
200-
201-
constexpr auto kShifts1 = select_stride<uint16_t, 1>(shifts);
202-
const auto shifted1 = left_shift(batch16 & 0xFF00, kShifts1);
203-
204-
return xsimd::bitwise_cast<Int>(shifted0 | shifted1);
205-
}
206-
207-
// TODO(xsimd) bug fixed in xsimd 14.1.0
208-
// https://github.com/xtensor-stack/xsimd/pull/1266
209-
#if XSIMD_VERSION_MAJOR < 14 || ((XSIMD_VERSION_MAJOR == 14) && XSIMD_VERSION_MINOR == 0)
210-
if constexpr (IsNeon<Arch>) {
211-
using SInt = std::make_signed_t<Int>;
212-
constexpr auto signed_shifts =
213-
xsimd::batch_constant<SInt, Arch, static_cast<SInt>(kShifts)...>();
214-
return xsimd::kernel::bitwise_lshift(batch, signed_shifts.as_batch(), Arch{});
215-
}
216-
#endif
217-
218-
return batch << shifts;
219-
}
220-
221138
/// Fallback for variable shift right.
222139
///
223140
/// When we know that the relevant bits will not overflow, we can instead shift left all
@@ -243,9 +160,8 @@ ARROW_FORCE_INLINE auto right_shift_by_excess(
243160

244161
constexpr auto IntSize = sizeof(Int);
245162

246-
// Architecture for which there is no variable right shift but a larger fallback exists.
247-
// TODO(xsimd) Tracking for Avx2 in https://github.com/xtensor-stack/xsimd/pull/1220
248-
// When migrating, be sure to use batch_constant overload, and not the batch one.
163+
// Architectures for which there is no variable right shift but a larger fallback
164+
// exists.
249165
if constexpr (kIsAvx2 && (IntSize == sizeof(uint8_t) || IntSize == sizeof(uint16_t))) {
250166
using twice_uint = SizedUint<2 * IntSize>;
251167

@@ -262,27 +178,17 @@ ARROW_FORCE_INLINE auto right_shift_by_excess(
262178
return xsimd::bitwise_cast<Int>(shifted0 | shifted1);
263179
}
264180

265-
// These conditions are the ones matched in `left_shift`, i.e. the ones where variable
266-
// shift right will not be available but a left shift (fallback) exists.
181+
// Architectures for which there is no variable right shift but a left shift exists
182+
// (possibly using the multiply trick).
183+
// We use a variable left shift followed by a fixed right shift.
267184
if constexpr (kIsSse2 && (IntSize != sizeof(uint64_t))) {
268185
constexpr Int kMaxRShift = max_value(std::array{kShifts...});
269186

270187
constexpr auto kLShifts =
271188
xsimd::make_batch_constant<Int, kMaxRShift, Arch>() - shifts;
272189

273-
return xsimd::bitwise_rshift<kMaxRShift>(left_shift(batch, kLShifts));
274-
}
275-
276-
// TODO(xsimd) bug fixed in xsimd 14.1.0
277-
// https://github.com/xtensor-stack/xsimd/pull/1266
278-
#if XSIMD_VERSION_MAJOR < 14 || ((XSIMD_VERSION_MAJOR == 14) && XSIMD_VERSION_MINOR == 0)
279-
if constexpr (IsNeon<Arch>) {
280-
using SInt = std::make_signed_t<Int>;
281-
constexpr auto signed_shifts =
282-
xsimd::batch_constant<SInt, Arch, static_cast<SInt>(kShifts)...>();
283-
return xsimd::kernel::bitwise_rshift(batch, signed_shifts.as_batch(), Arch{});
190+
return xsimd::bitwise_rshift<kMaxRShift>(batch << kLShifts);
284191
}
285-
#endif
286192

287193
return batch >> shifts;
288194
}
@@ -728,7 +634,8 @@ struct MediumKernel {
728634
unpacked_type* out) {
729635
constexpr auto kRightShiftsArr =
730636
kPlan.shifts.at(kReadIdx).at(kSwizzleIdx).at(kShiftIdx);
731-
constexpr auto kRightShifts = array_to_batch_constant<kRightShiftsArr, arch_type>();
637+
constexpr auto kRightShifts =
638+
xsimd::make_batch_constant<kRightShiftsArr, arch_type>();
732639
constexpr auto kMask = kPlan.mask;
733640
constexpr auto kOutOffset = (kReadIdx * kPlan.unpacked_per_read() +
734641
kSwizzleIdx * kPlan.unpacked_per_swizzle() +
@@ -752,7 +659,7 @@ struct MediumKernel {
752659
const simd_bytes& bytes, unpacked_type* out,
753660
std::integer_sequence<int, kShiftIds...>) {
754661
constexpr auto kSwizzlesArr = kPlan.swizzles.at(kReadIdx).at(kSwizzleIdx);
755-
constexpr auto kSwizzles = array_to_batch_constant<kSwizzlesArr, arch_type>();
662+
constexpr auto kSwizzles = xsimd::make_batch_constant<kSwizzlesArr, arch_type>();
756663

757664
const auto swizzled = xsimd::swizzle(bytes, kSwizzles);
758665
const auto words = xsimd::bitwise_cast<uint_type>(swizzled);
@@ -1016,13 +923,13 @@ struct LargeKernel {
1016923
ARROW_FORCE_INLINE static void unpack_one_read_impl(const uint8_t* in,
1017924
unpacked_type* out) {
1018925
constexpr auto kLowSwizzles =
1019-
array_to_batch_constant<kPlan.low_swizzles.at(kReadIdx), arch_type>();
926+
xsimd::make_batch_constant<kPlan.low_swizzles.at(kReadIdx), arch_type>();
1020927
constexpr auto kLowRShifts =
1021-
array_to_batch_constant<kPlan.low_rshifts.at(kReadIdx), arch_type>();
928+
xsimd::make_batch_constant<kPlan.low_rshifts.at(kReadIdx), arch_type>();
1022929
constexpr auto kHighSwizzles =
1023-
array_to_batch_constant<kPlan.high_swizzles.at(kReadIdx), arch_type>();
930+
xsimd::make_batch_constant<kPlan.high_swizzles.at(kReadIdx), arch_type>();
1024931
constexpr auto kHighLShifts =
1025-
array_to_batch_constant<kPlan.high_lshifts.at(kReadIdx), arch_type>();
932+
xsimd::make_batch_constant<kPlan.high_lshifts.at(kReadIdx), arch_type>();
1026933

1027934
const auto bytes =
1028935
safe_load_bytes<kPlan.bytes_per_read(), arch_type>(in + kPlan.reads.at(kReadIdx));
@@ -1040,7 +947,7 @@ struct LargeKernel {
1040947

1041948
const auto high_swizzled = xsimd::swizzle(bytes, kHighSwizzles);
1042949
const auto high_words = xsimd::bitwise_cast<unpacked_type>(high_swizzled);
1043-
const auto high_shifted = left_shift(high_words, kHighLShifts);
950+
const auto high_shifted = high_words << kHighLShifts;
1044951

1045952
// We can have a single mask and apply it after OR because the shifts will ensure that
1046953
// there are zeros where the high/low values are incomplete.

0 commit comments

Comments
 (0)