@@ -73,22 +73,6 @@ ARROW_FORCE_INLINE constexpr T max_value(const std::array<T, N>& arr) {
7373 return out;
7474}
7575
76- template <std::array kArr , typename Arch, std::size_t ... Is>  // kArr: compile-time array, Is: indices into it
77- ARROW_FORCE_INLINE constexpr auto array_to_batch_constant_impl (
78- std::index_sequence<Is...>) {  // the unnamed argument only carries the index pack Is
79- using Array = std::decay_t <decltype (kArr )>;
80- using value_type = typename Array::value_type;
81- // Expand kArr element by element (kArr[Is]...) into the value list of the batch_constant.
82- return xsimd::batch_constant<value_type, Arch, kArr [Is]...>{};
83- }
84-
85- /// Make a ``xsimd::batch_constant`` whose values are the elements of the constexpr array ``kArr``, in order.
86- template <std::array kArr , typename Arch>
87- ARROW_FORCE_INLINE constexpr auto array_to_batch_constant () {
88- return array_to_batch_constant_impl<kArr , Arch>(
89- std::make_index_sequence<kArr .size ()>());  // one index per element of kArr
90- }
91-
9276template <typename Uint, typename Arch>
9377ARROW_FORCE_INLINE xsimd::batch<uint8_t , Arch> load_val_as (const uint8_t * in) {
9478 const Uint val = util::SafeLoadAs<Uint>(in);
@@ -139,7 +123,7 @@ ARROW_FORCE_INLINE constexpr auto select_stride(
139123 constexpr auto kStridesArr =
140124 select_stride_impl<ToInt, kOffset , sizeof (ToInt) / sizeof (Int)>(
141125 std::array{kShifts ...});
142- return array_to_batch_constant <kStridesArr , Arch>();
126+ return xsimd::make_batch_constant <kStridesArr , Arch>();
143127}
144128
145129/// Whether we are compiling for SSE2 or above in the SSE family (not AVX).
@@ -151,73 +135,6 @@ constexpr bool IsSse2 = std::is_base_of_v<xsimd::sse2, Arch>;
151135template <typename Arch>
152136constexpr bool IsAvx2 = std::is_base_of_v<xsimd::avx2, Arch>;
153137
154- /// Whether we are compiling for Neon or above in the arm64 family.
155- template <typename Arch>
156- constexpr bool IsNeon = std::is_base_of_v<xsimd::neon, Arch>;  // true when Arch is xsimd::neon or derives from it
157-
158- /// Wrapper around ``xsimd::bitwise_lshift`` with optimizations for integer sizes that
159- /// have no variable left shift on the target architecture.
160- /// Where possible, the variable left shift is replaced by a variable multiplication by a power of two.
161- ///
162- /// This trick is borrowed from Daniel Lemire and Leonid Boytsov, Decoding billions of
163- /// integers per second through vectorization, Software Practice & Experience 45 (1),
164- /// 2015. http://arxiv.org/abs/1209.2137
165- ///
166- /// TODO(xsimd) Tracking in https://github.com/xtensor-stack/xsimd/pull/1220
167- /// When migrating, be sure to use the batch_constant overload, and not the batch one.
168- template <typename Arch, typename Int, Int... kShifts >
169- ARROW_FORCE_INLINE auto left_shift (const xsimd::batch<Int, Arch>& batch,
170- xsimd::batch_constant<Int, Arch, kShifts ...> shifts)
171- -> xsimd::batch<Int, Arch> {
172- constexpr bool kIsSse2 = IsSse2<Arch>;
173- constexpr bool kIsAvx2 = IsAvx2<Arch>;
174- static_assert (
175- !(kIsSse2 && kIsAvx2 ),
176- " In xsimd, an x86 arch is either part of the SSE family or of the AVX family,"
177- " not both. If this check fails, it means the assumptions made here to detect SSE "
178- " and AVX are out of date." );
179- // Per-lane power of two (1 << shift), computed at compile time and used as a multiplier below.
180- constexpr auto kMults = xsimd::make_batch_constant<Int, 1 , Arch>() << shifts;
181-
182- constexpr auto IntSize = sizeof (Int);
183-
184- // Sizes and architectures for which there is no variable left shift but there is a
185- // multiplication: multiply by the matching power of two instead.
186- if constexpr ( //
187- (kIsSse2 && (IntSize == sizeof (uint16_t ) || IntSize == sizeof (uint32_t ))) //
188- || (kIsAvx2 && (IntSize == sizeof (uint16_t ))) //
189- ) {
190- return batch * kMults ;
191- }
192-
193- // Architectures for which there is no variable left shift on uint8_t but a fallback
194- // exists for uint16_t: reinterpret as 16-bit lanes and shift each byte separately.
195- if constexpr ((kIsSse2 || kIsAvx2 ) && (IntSize == sizeof (uint8_t ))) {
196- const auto batch16 = xsimd::bitwise_cast<uint16_t >(batch);
197- // Shifts picked by select_stride<uint16_t, 0>; the result is masked to the low byte of each 16-bit lane.
198- constexpr auto kShifts0 = select_stride<uint16_t , 0 >(shifts);
199- const auto shifted0 = left_shift (batch16, kShifts0 ) & 0x00FF ;
200- // Shifts picked by select_stride<uint16_t, 1>, applied to the high byte of each 16-bit lane only.
201- constexpr auto kShifts1 = select_stride<uint16_t , 1 >(shifts);
202- const auto shifted1 = left_shift (batch16 & 0xFF00 , kShifts1 );
203- // shifted0 only has low-byte bits and shifted1 only high-byte bits, so OR recombines them.
204- return xsimd::bitwise_cast<Int>(shifted0 | shifted1);
205- }
206-
207- // TODO(xsimd) workaround for a bug fixed in xsimd 14.1.0, only compiled for older versions:
208- // https://github.com/xtensor-stack/xsimd/pull/1266
209- #if XSIMD_VERSION_MAJOR < 14 || ((XSIMD_VERSION_MAJOR == 14) && XSIMD_VERSION_MINOR == 0)
210- if constexpr (IsNeon<Arch>) {
211- using SInt = std::make_signed_t <Int>;  // the kernel below is called with signed shift amounts
212- constexpr auto signed_shifts =
213- xsimd::batch_constant<SInt, Arch, static_cast <SInt>(kShifts )...>();
214- return xsimd::kernel::bitwise_lshift (batch, signed_shifts.as_batch (), Arch{});
215- }
216- #endif
217- // Default: rely on xsimd's own left shift by a batch_constant.
218- return batch << shifts;
219- }
220-
221138/// Fallback for variable right shift.
222139///
223140/// When we know that the relevant bits will not overflow, we can instead shift left all
@@ -243,9 +160,8 @@ ARROW_FORCE_INLINE auto right_shift_by_excess(
243160
244161 constexpr auto IntSize = sizeof (Int);
245162
246- // Architecture for which there is no variable right shift but a larger fallback exists.
247- // TODO(xsimd) Tracking for Avx2 in https://github.com/xtensor-stack/xsimd/pull/1220
248- // When migrating, be sure to use batch_constant overload, and not the batch one.
163+ // Architectures for which there is no variable right shift at this integer size but
164+ // a fallback through the twice-as-wide integer type exists.
249165 if constexpr (kIsAvx2 && (IntSize == sizeof (uint8_t ) || IntSize == sizeof (uint16_t ))) {
250166 using twice_uint = SizedUint<2 * IntSize>;
251167
@@ -262,27 +178,17 @@ ARROW_FORCE_INLINE auto right_shift_by_excess(
262178 return xsimd::bitwise_cast<Int>(shifted0 | shifted1);
263179 }
264180
265- // These conditions are the ones matched in `left_shift`, i.e. the ones where variable
266- // shift right will not be available but a left shift (fallback) exists.
181+ // Architectures for which there is no variable right shift but a left shift exists
182+ // (possibly implemented with the multiply trick).
183+ // We use a variable left shift followed by a fixed right shift.
267184 if constexpr (kIsSse2 && (IntSize != sizeof (uint64_t ))) {
268185 constexpr Int kMaxRShift = max_value (std::array{kShifts ...});
269186
270187 constexpr auto kLShifts =
271188 xsimd::make_batch_constant<Int, kMaxRShift , Arch>() - shifts;
272189
273- return xsimd::bitwise_rshift<kMaxRShift >(left_shift (batch, kLShifts ));
274- }
275-
276- // TODO(xsimd) bug fixed in xsimd 14.1.0
277- // https://github.com/xtensor-stack/xsimd/pull/1266
278- #if XSIMD_VERSION_MAJOR < 14 || ((XSIMD_VERSION_MAJOR == 14) && XSIMD_VERSION_MINOR == 0)
279- if constexpr (IsNeon<Arch>) {
280- using SInt = std::make_signed_t <Int>;
281- constexpr auto signed_shifts =
282- xsimd::batch_constant<SInt, Arch, static_cast <SInt>(kShifts )...>();
283- return xsimd::kernel::bitwise_rshift (batch, signed_shifts.as_batch (), Arch{});
190+ return xsimd::bitwise_rshift<kMaxRShift >(batch << kLShifts );
284191 }
285- #endif
286192
287193 return batch >> shifts;
288194}
@@ -728,7 +634,8 @@ struct MediumKernel {
728634 unpacked_type* out) {
729635 constexpr auto kRightShiftsArr =
730636 kPlan .shifts .at (kReadIdx ).at (kSwizzleIdx ).at (kShiftIdx );
731- constexpr auto kRightShifts = array_to_batch_constant<kRightShiftsArr , arch_type>();
637+ constexpr auto kRightShifts =
638+ xsimd::make_batch_constant<kRightShiftsArr , arch_type>();
732639 constexpr auto kMask = kPlan .mask ;
733640 constexpr auto kOutOffset = (kReadIdx * kPlan .unpacked_per_read () +
734641 kSwizzleIdx * kPlan .unpacked_per_swizzle () +
@@ -752,7 +659,7 @@ struct MediumKernel {
752659 const simd_bytes& bytes, unpacked_type* out,
753660 std::integer_sequence<int , kShiftIds ...>) {
754661 constexpr auto kSwizzlesArr = kPlan .swizzles .at (kReadIdx ).at (kSwizzleIdx );
755- constexpr auto kSwizzles = array_to_batch_constant <kSwizzlesArr , arch_type>();
662+ constexpr auto kSwizzles = xsimd::make_batch_constant <kSwizzlesArr , arch_type>();
756663
757664 const auto swizzled = xsimd::swizzle (bytes, kSwizzles );
758665 const auto words = xsimd::bitwise_cast<uint_type>(swizzled);
@@ -1016,13 +923,13 @@ struct LargeKernel {
1016923 ARROW_FORCE_INLINE static void unpack_one_read_impl (const uint8_t * in,
1017924 unpacked_type* out) {
1018925 constexpr auto kLowSwizzles =
1019- array_to_batch_constant <kPlan .low_swizzles .at (kReadIdx ), arch_type>();
926+ xsimd::make_batch_constant <kPlan .low_swizzles .at (kReadIdx ), arch_type>();
1020927 constexpr auto kLowRShifts =
1021- array_to_batch_constant <kPlan .low_rshifts .at (kReadIdx ), arch_type>();
928+ xsimd::make_batch_constant <kPlan .low_rshifts .at (kReadIdx ), arch_type>();
1022929 constexpr auto kHighSwizzles =
1023- array_to_batch_constant <kPlan .high_swizzles .at (kReadIdx ), arch_type>();
930+ xsimd::make_batch_constant <kPlan .high_swizzles .at (kReadIdx ), arch_type>();
1024931 constexpr auto kHighLShifts =
1025- array_to_batch_constant <kPlan .high_lshifts .at (kReadIdx ), arch_type>();
932+ xsimd::make_batch_constant <kPlan .high_lshifts .at (kReadIdx ), arch_type>();
1026933
1027934 const auto bytes =
1028935 safe_load_bytes<kPlan .bytes_per_read (), arch_type>(in + kPlan .reads .at (kReadIdx ));
@@ -1040,7 +947,7 @@ struct LargeKernel {
1040947
1041948 const auto high_swizzled = xsimd::swizzle (bytes, kHighSwizzles );
1042949 const auto high_words = xsimd::bitwise_cast<unpacked_type>(high_swizzled);
1043- const auto high_shifted = left_shift ( high_words, kHighLShifts ) ;
950+ const auto high_shifted = high_words << kHighLShifts ;
1044951
1045952 // We can have a single mask and apply it after OR because the shifts will ensure that
1046953 // there are zeros where the high/low values are incomplete.
0 commit comments