Skip to content

Commit adead38

Browse files
committed
Add x86 optimizations
1 parent bff3a8f commit adead38

File tree

4 files changed

+123
-0
lines changed

4 files changed

+123
-0
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* Copyright (c) Serge Guelton *
6+
* Copyright (c) Marco Barbone *
7+
* *
8+
* Distributed under the terms of the BSD 3-Clause License. *
9+
* *
10+
* The full license is in the file LICENSE, distributed with this software. *
11+
****************************************************************************/
12+
13+
#ifndef XSIMD_UTILS_SHIFTS_HPP
14+
#define XSIMD_UTILS_SHIFTS_HPP
15+
16+
#include "../../config/xsimd_inline.hpp"
17+
#include "../../types/xsimd_batch.hpp"
18+
#include "../../types/xsimd_batch_constant.hpp"
19+
20+
namespace xsimd
21+
{
22+
namespace kernel
23+
{
24+
namespace utils
25+
{
26+
// Compile-time generator that picks every `length`-th value out of `Vs...`,
// starting at position `offset`.
//
// `get(i, n)` yields `Vs[length * i + offset]` converted to `K`; its second
// argument is ignored. No bounds check is performed: the caller must keep
// `length * i + offset` below `sizeof...(Vs)`.
template <typename I, I offset, I length, I... Vs>
struct select_stride
{
    // The values to pick from, in declaration order.
    static constexpr I values_array[] = { Vs... };

    template <typename K>
    static constexpr K get(K i, K /* unused */)
    {
        return static_cast<K>(values_array[offset + length * i]);
    }
};
37+
38+
// Returns a value of type `I` whose `bit_index` least significant bits are set
// and all other bits are clear (e.g. `lsb_mask<uint16_t>(8) == 0x00FF`).
//
// A `bit_index` equal to or larger than the number of bits in `I` returns a
// value with every bit set. Shifting by the full width of the operand is
// undefined behaviour in C++ (and rejected in constant expressions), so that
// case is handled explicitly instead of being computed with a shift.
//
// The callers in this file instantiate it with unsigned integer types.
template <typename I>
constexpr I lsb_mask(I bit_index)
{
    return bit_index >= static_cast<I>(8 * sizeof(I))
        ? static_cast<I>(~I { 0 })
        : static_cast<I>((I { 1 } << bit_index) - I { 1 });
}
43+
44+
template <class T, class T2, class A, T... Vs>
45+
XSIMD_INLINE batch<T, A> bitwise_lshift_as_twice_larger(
46+
batch<T, A> const& self, batch_constant<T, A, Vs...>) noexcept
47+
{
48+
static_assert(sizeof(T2) == 2 * sizeof(T), "One size must be twice the other");
49+
50+
const auto self2 = bitwise_cast<T2>(self);
51+
52+
// Lower byte: shift as twice the size and mask bits flowing to higher byte.
53+
constexpr auto shifts_lo = make_batch_constant<T2, select_stride<T, 0, 2, Vs...>, A>();
54+
constexpr auto mask_lo = lsb_mask<T2>(8 * sizeof(T));
55+
const auto shifted_lo = bitwise_lshift(self2, shifts_lo);
56+
constexpr auto batch_mask_lo = make_batch_constant<T2, mask_lo, A>();
57+
const auto masked_lo = bitwise_and(shifted_lo, batch_mask_lo.as_batch());
58+
59+
// Higher byte: mask bits that would flow from lower byte and shift as twice the size.
60+
constexpr auto shifts_hi = make_batch_constant<T2, select_stride<T, 1, 2, Vs...>, A>();
61+
constexpr auto mask_hi = mask_lo << (8 * sizeof(T));
62+
constexpr auto batch_mask_hi = make_batch_constant<T2, mask_hi, A>();
63+
const auto masked_hi = bitwise_and(self2, batch_mask_hi.as_batch());
64+
const auto shifted_hi = bitwise_lshift(masked_hi, shifts_hi);
65+
66+
return bitwise_cast<T>(bitwise_or(masked_lo, shifted_hi));
67+
}
68+
}
69+
}
70+
}
71+
72+
#endif

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "../types/xsimd_avx2_register.hpp"
1919
#include "../types/xsimd_batch_constant.hpp"
20+
#include "./utils/shifts.hpp"
2021

2122
#include <limits>
2223

@@ -332,6 +333,30 @@ namespace xsimd
332333
}
333334
}
334335

336+
// bitwise_lshift multiple (constant) specific implementations.
337+
// Missing implementations are dispatched to the `batch` overload in xsimd_api.
338+
template <class T, class A, T... Vs, detail::enable_sized_integral_t<T, 2> = 0>
339+
XSIMD_INLINE batch<T, A> bitwise_lshift(
340+
batch<T, A> const& self, batch_constant<T, A, Vs...>, requires_arch<avx2>) noexcept
341+
{
342+
using uint_t = typename std::make_unsigned<T>::type;
343+
return bitwise_cast<T>(
344+
utils::bitwise_lshift_as_twice_larger<uint_t, uint32_t>(
345+
bitwise_cast<uint_t>(self),
346+
batch_constant<uint_t, A, static_cast<uint_t>(Vs)...> {}));
347+
}
348+
349+
template <class T, class A, T... Vs, detail::enable_sized_integral_t<T, 1> = 0>
350+
XSIMD_INLINE batch<T, A> bitwise_lshift(
351+
batch<T, A> const& self, batch_constant<T, A, Vs...>, requires_arch<avx2>) noexcept
352+
{
353+
using uint_t = typename std::make_unsigned<T>::type;
354+
return bitwise_cast<T>(
355+
utils::bitwise_lshift_as_twice_larger<uint_t, uint16_t>(
356+
bitwise_cast<uint_t>(self),
357+
batch_constant<uint_t, A, static_cast<uint_t>(Vs)...> {}));
358+
}
359+
335360
// bitwise_or
336361
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
337362
XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include "../types/xsimd_batch_constant.hpp"
2020
#include "../types/xsimd_sse2_register.hpp"
21+
#include "./utils/shifts.hpp"
2122

2223
namespace xsimd
2324
{
@@ -326,6 +327,22 @@ namespace xsimd
326327
return bitwise_lshift<shift>(self, common {});
327328
}
328329

330+
// bitwise_lshift multiple (constant)
331+
template <class A, uint16_t... Vs>
332+
XSIMD_INLINE batch<uint16_t, A> bitwise_lshift(
333+
batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...>, requires_arch<sse2>) noexcept
334+
{
335+
constexpr auto mults = batch_constant<uint16_t, A, static_cast<uint16_t>(1u << Vs)...>();
336+
return _mm_mullo_epi16(self, mults.as_batch());
337+
}
338+
339+
template <class A, uint8_t... Vs>
340+
XSIMD_INLINE batch<uint8_t, A> bitwise_lshift(
341+
batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> shifts, requires_arch<sse2>) noexcept
342+
{
343+
return utils::bitwise_lshift_as_twice_larger<uint8_t, uint16_t>(self, shifts);
344+
}
345+
329346
// bitwise_not
330347
template <class A>
331348
XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,15 @@ namespace xsimd
4141
return _mm_ceil_pd(self);
4242
}
4343

44+
// bitwise_lshift multiple (constant)
45+
template <class A, uint32_t... Vs>
46+
XSIMD_INLINE batch<uint32_t, A> bitwise_lshift(
47+
batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...>, requires_arch<sse4_1>) noexcept
48+
{
49+
constexpr auto mults = batch_constant<uint32_t, A, static_cast<uint32_t>(1u << Vs)...>();
50+
return _mm_mullo_epi32(self, mults.as_batch());
51+
}
52+
4453
// fast_cast
4554
namespace detail
4655
{

0 commit comments

Comments
 (0)