Skip to content

Commit 845b7e1

Browse files
committed
Add single shift optimization
1 parent 1d0dd83 commit 845b7e1

3 files changed

Lines changed: 40 additions & 4 deletions

File tree

include/xsimd/arch/utils/shifts.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,26 @@ namespace xsimd
3838
/**
 * Builds a mask whose @p bit_index least-significant bits are set.
 *
 * @tparam I Integral type of both the argument and the returned mask.
 * @param bit_index Number of low bits to set.
 * @return A value of type @c I with its lowest @p bit_index bits set to 1.
 *         Any @p bit_index that, once converted to @c std::size_t, is at or
 *         beyond the bit width of @c I yields an all-ones mask.
 */
template <typename I>
constexpr I lsb_mask(I bit_index)
{
    // Shifting by the full width of the type (or more) is undefined, so
    // saturate to all-ones. The explicit cast keeps the comparison unsigned
    // even when I is a signed type.
    if (static_cast<std::size_t>(bit_index) >= 8 * sizeof(I))
    {
        return static_cast<I>(~I { 0 });
    }
    return static_cast<I>((I { 1 } << bit_index) - I { 1 });
}
4347

48+
template <class T, class A, T... Vs>
49+
constexpr bool all_equals(batch_constant<T, A, Vs...> c)
50+
{
51+
static_assert(sizeof...(Vs) > 0);
52+
53+
bool out = true;
54+
for (std::size_t k = 0; k < sizeof...(Vs); ++k)
55+
{
56+
out &= c.get(k) == c.get(0);
57+
}
58+
return out;
59+
}
60+
4461
template <class T, class T2, class A, T... Vs>
4562
XSIMD_INLINE batch<T, A> bitwise_lshift_as_twice_larger(
4663
batch<T, A> const& self, batch_constant<T, A, Vs...>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,14 @@ namespace xsimd
337337
// Missing implementations are dispatched to the `batch` overload in xsimd_api.
338338
template <class T, class A, T... Vs, detail::enable_sized_integral_t<T, 2> = 0>
339339
XSIMD_INLINE batch<T, A> bitwise_lshift(
340-
batch<T, A> const& self, batch_constant<T, A, Vs...>, requires_arch<avx2>) noexcept
340+
batch<T, A> const& self, batch_constant<T, A, Vs...> shifts, requires_arch<avx2> req) noexcept
341341
{
342342
using uint_t = typename std::make_unsigned<T>::type;
343+
344+
XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
345+
{
346+
return bitwise_lshift<shifts.get(0), A>(self, req);
347+
}
343348
return bitwise_cast<T>(
344349
utils::bitwise_lshift_as_twice_larger<uint_t, uint32_t>(
345350
bitwise_cast<uint_t>(self),
@@ -348,9 +353,14 @@ namespace xsimd
348353

349354
template <class T, class A, T... Vs, detail::enable_sized_integral_t<T, 1> = 0>
350355
XSIMD_INLINE batch<T, A> bitwise_lshift(
351-
batch<T, A> const& self, batch_constant<T, A, Vs...>, requires_arch<avx2>) noexcept
356+
batch<T, A> const& self, batch_constant<T, A, Vs...> shifts, requires_arch<avx2> req) noexcept
352357
{
353358
using uint_t = typename std::make_unsigned<T>::type;
359+
360+
XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
361+
{
362+
return bitwise_lshift<shifts.get(0), A>(self, req);
363+
}
354364
return bitwise_cast<T>(
355365
utils::bitwise_lshift_as_twice_larger<uint_t, uint16_t>(
356366
bitwise_cast<uint_t>(self),

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -328,18 +328,27 @@ namespace xsimd
328328
}
329329

330330
// bitwise_lshift multiple (constant)
331+
// Missing implementations are dispatched to the `batch` overload in xsimd_api.
331332
template <class A, uint16_t... Vs>
332333
XSIMD_INLINE batch<uint16_t, A> bitwise_lshift(
333-
batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...>, requires_arch<sse2>) noexcept
334+
batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> shifts, requires_arch<sse2> req) noexcept
334335
{
336+
XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
337+
{
338+
return bitwise_lshift<shifts.get(0), A>(self, req);
339+
}
335340
constexpr auto mults = batch_constant<uint16_t, A, static_cast<uint16_t>(1u << Vs)...>();
336341
return _mm_mullo_epi16(self, mults.as_batch());
337342
}
338343

339344
template <class A, uint8_t... Vs>
340345
XSIMD_INLINE batch<uint8_t, A> bitwise_lshift(
341-
batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> shifts, requires_arch<sse2>) noexcept
346+
batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> shifts, requires_arch<sse2> req) noexcept
342347
{
348+
XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
349+
{
350+
return bitwise_lshift<shifts.get(0), A>(self, req);
351+
}
343352
return utils::bitwise_lshift_as_twice_larger<uint8_t, uint16_t>(self, shifts);
344353
}
345354

0 commit comments

Comments
 (0)