Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/source/api/arithmetic_index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ Binary operations:
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mul` | per slot multiply |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mullo` | low N bits of the 2N-bit integer product |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mulhi` | high N bits of the 2N-bit integer product |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mulhilo` | pair {hi, lo} of the 2N-bit integer product |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`div` | per slot division |
+---------------------------------------+----------------------------------------------------+
| :cpp:func:`mod` | per slot modulo |
Expand Down
112 changes: 112 additions & 0 deletions include/xsimd/arch/common/xsimd_common_arithmetic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,118 @@ namespace xsimd
self, other);
}

// mulhi
namespace detail
{
// Scalar reference implementation of mulhi: for an N-bit integer type T,
// compute the high N bits of the exact 2N-bit product x * y.
template <class T>
struct mulhi_helper
{
// default: use a wider native integer type
// (8 -> 16, 16 -> 32, 32 -> 64 bits, preserving signedness).
using wider = typename std::conditional<
std::is_signed<T>::value,
typename std::conditional<sizeof(T) == 1, int16_t,
typename std::conditional<sizeof(T) == 2, int32_t, int64_t>::type>::type,
typename std::conditional<sizeof(T) == 1, uint16_t,
typename std::conditional<sizeof(T) == 2, uint32_t, uint64_t>::type>::type>::type;

static XSIMD_INLINE T compute(T x, T y) noexcept
{
// Widen, multiply exactly, then keep the upper half of the product.
// For signed T this relies on >> being an arithmetic shift on the
// wider signed type (true on all supported compilers).
constexpr int shift = 8 * sizeof(T);
return static_cast<T>((static_cast<wider>(x) * static_cast<wider>(y)) >> shift);
}
};

// 64-bit unsigned software mulhi via 32-bit splits
// (fallback when no 128-bit integer type is available).
XSIMD_INLINE uint64_t mulhi_u64(uint64_t x, uint64_t y) noexcept
{
#if defined(__SIZEOF_INT128__)
return static_cast<uint64_t>((static_cast<unsigned __int128>(x) * static_cast<unsigned __int128>(y)) >> 64);
#else
// Schoolbook 2x2 limb multiply with 32-bit limbs: x = xh:xl, y = yh:yl.
uint64_t xl = x & 0xffffffffULL;
uint64_t xh = x >> 32;
uint64_t yl = y & 0xffffffffULL;
uint64_t yh = y >> 32;
uint64_t ll = xl * yl;
uint64_t lh = xl * yh;
uint64_t hl = xh * yl;
uint64_t hh = xh * yh;
// Carry out of bits [32, 64) of the full product. The three addends are
// each < 2^32, so this sum cannot overflow 64 bits.
uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
#endif
}

// 64-bit signed mulhi, derived from the unsigned one.
XSIMD_INLINE int64_t mulhi_i64(int64_t x, int64_t y) noexcept
{
#if defined(__SIZEOF_INT128__)
return static_cast<int64_t>((static_cast<__int128>(x) * static_cast<__int128>(y)) >> 64);
#else
// Standard sign fixup: reinterpreting a negative x (resp. y) as unsigned
// adds 2^64 * y (resp. 2^64 * x) to the 128-bit product, so subtract the
// other operand from the unsigned high half to compensate.
uint64_t uhi = mulhi_u64(static_cast<uint64_t>(x), static_cast<uint64_t>(y));
if (x < 0)
uhi -= static_cast<uint64_t>(y);
if (y < 0)
uhi -= static_cast<uint64_t>(x);
return static_cast<int64_t>(uhi);
#endif
}

// 64-bit lanes have no wider native type: route to the split
// implementations above instead of the generic widening helper.
template <>
struct mulhi_helper<uint64_t>
{
static XSIMD_INLINE uint64_t compute(uint64_t x, uint64_t y) noexcept { return mulhi_u64(x, y); }
};

template <>
struct mulhi_helper<int64_t>
{
static XSIMD_INLINE int64_t compute(int64_t x, int64_t y) noexcept { return mulhi_i64(x, y); }
};

// Compute the high 64 bits of each lane-wise 64x64 unsigned product,
// given a "widening mul" functor WMul that takes two batch<uint64_t,A>
// and returns batch<uint64_t,A> containing the 64-bit product of the
// low 32 bits of each 64-bit lane (i.e. _mm*_mul_epu32 wrapped).
template <class A, class WMul>
XSIMD_INLINE batch<uint64_t, A> mulhi_u64_core(batch<uint64_t, A> const& x,
batch<uint64_t, A> const& y,
WMul mul_epu32) noexcept
{
// Vectorized version of the 32-bit-limb scheme used by mulhi_u64 above.
using B = batch<uint64_t, A>;
const B mask(uint64_t(0xffffffffULL));
B xl = x & mask;
B xh = x >> 32;
B yl = y & mask;
B yh = y >> 32;
B ll = mul_epu32(xl, yl);
B lh = mul_epu32(xl, yh);
B hl = mul_epu32(xh, yl);
B hh = mul_epu32(xh, yh);
// Per-lane carry from the middle 32 bits into the high half.
B mid = (ll >> 32) + (lh & mask) + (hl & mask);
return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}

// Signed variant: unsigned core + sign fixup via arithmetic shift-by-63.
template <class A, class WMul>
XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
batch<int64_t, A> const& y,
WMul mul_epu32) noexcept
{
auto ux = ::xsimd::bitwise_cast<uint64_t>(x);
auto uy = ::xsimd::bitwise_cast<uint64_t>(y);
auto uhi = mulhi_u64_core<A>(ux, uy, mul_epu32);
// sa / sb are all-ones masks in the lanes where x / y is negative;
// subtracting (uy & sa) and (ux & sb) mirrors the scalar mulhi_i64 fixup.
auto sa = ::xsimd::bitwise_cast<uint64_t>(x >> 63);
auto sb = ::xsimd::bitwise_cast<uint64_t>(y >> 63);
return ::xsimd::bitwise_cast<int64_t>(uhi - (uy & sa) - (ux & sb));
}
}

template <class A, class T, class /*=std::enable_if_t<std::is_integral<T>::value>*/>
XSIMD_INLINE batch<T, A> mulhi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
{
    // Generic fallback: run the scalar reference implementation lane by lane.
    auto scalar_mulhi = [](T lhs, T rhs) noexcept -> T
    {
        return detail::mulhi_helper<T>::compute(lhs, rhs);
    };
    return detail::apply(scalar_mulhi, self, other);
}

// rotl
template <class A, class T, class STy>
XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<common>) noexcept
Expand Down
44 changes: 44 additions & 0 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,50 @@ namespace xsimd
}
}

// mulhi
template <class A>
XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx2>) noexcept
{
    // vpmulhw: per-lane high 16 bits of the signed 32-bit product.
    __m256i product_hi = _mm256_mulhi_epi16(self, other);
    return product_hi;
}
template <class A>
XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx2>) noexcept
{
    // vpmulhuw: unsigned counterpart.
    __m256i product_hi = _mm256_mulhi_epu16(self, other);
    return product_hi;
}
template <class A>
XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx2>) noexcept
{
    // No 32x32->hi instruction exists, so widen with vpmuldq: even dword
    // lanes are multiplied in place, odd lanes after being shuffled down
    // into even position.
    __m256i prod_even = _mm256_mul_epi32(self, other);
    __m256i prod_odd = _mm256_mul_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)),
                                        _mm256_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1)));
    __m256i hi_even = _mm256_srli_epi64(prod_even, 32);
    // hi_even carries the result in the low half of each 64-bit lane,
    // prod_odd in the high half; blend mask 0xCC keeps 16-bit elements
    // 2,3 and 6,7 (i.e. the odd dwords) from prod_odd.
    return _mm256_blend_epi16(hi_even, prod_odd, 0xCC);
}
template <class A>
XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx2>) noexcept
{
    // Unsigned variant of the scheme above, using vpmuludq; odd dword
    // lanes are brought into even position by a 64-bit right shift.
    __m256i prod_even = _mm256_mul_epu32(self, other);
    __m256i prod_odd = _mm256_mul_epu32(_mm256_srli_epi64(self, 32), _mm256_srli_epi64(other, 32));
    __m256i hi_even = _mm256_srli_epi64(prod_even, 32);
    return _mm256_blend_epi16(hi_even, prod_odd, 0xCC);
}

template <class A>
XSIMD_INLINE batch<uint64_t, A> mulhi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx2>) noexcept
{
    // Delegate to the shared 64-bit core, feeding it vpmuludq as the
    // 32x32->64 widening-multiply primitive.
    auto widening_mul = [](batch<uint64_t, A> a, batch<uint64_t, A> b)
    {
        return batch<uint64_t, A>(_mm256_mul_epu32(a, b));
    };
    return detail::mulhi_u64_core<A>(self, other, widening_mul);
}
template <class A>
XSIMD_INLINE batch<int64_t, A> mulhi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx2>) noexcept
{
    // Signed core = unsigned core + per-lane sign fixup.
    auto widening_mul = [](batch<uint64_t, A> a, batch<uint64_t, A> b)
    {
        return batch<uint64_t, A>(_mm256_mul_epu32(a, b));
    };
    return detail::mulhi_i64_core<A>(self, other, widening_mul);
}

// reduce_add
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
Expand Down
12 changes: 12 additions & 0 deletions include/xsimd/arch/xsimd_avx512bw.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,18 @@ namespace xsimd
}
}

// mulhi
template <class A>
XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx512bw>) noexcept
{
    // vpmulhw: per-lane high 16 bits of the signed 32-bit product.
    __m512i product_hi = _mm512_mulhi_epi16(self, other);
    return product_hi;
}
template <class A>
XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx512bw>) noexcept
{
    // vpmulhuw: unsigned counterpart.
    __m512i product_hi = _mm512_mulhi_epu16(self, other);
    return product_hi;
}

// neq
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
Expand Down
35 changes: 35 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1772,6 +1772,41 @@ namespace xsimd
}
}

// mulhi
template <class A>
XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx512f>) noexcept
{
    // No 32x32->hi instruction: widen with vpmuldq on even dword lanes,
    // and on odd lanes shuffled down into even position.
    __m512i prod_even = _mm512_mul_epi32(self, other);
    __m512i prod_odd = _mm512_mul_epi32(_mm512_shuffle_epi32(self, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))),
                                        _mm512_shuffle_epi32(other, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))));
    __m512i hi_even = _mm512_srli_epi64(prod_even, 32);
    // merge: hi_even has the result in the low 32 bits of each 64-bit lane,
    // prod_odd in the high 32 bits; mask 0xAAAA keeps the odd dwords of the latter.
    return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), hi_even, prod_odd);
}
template <class A>
XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx512f>) noexcept
{
    // Unsigned variant via vpmuludq; odd lanes reached by a 64-bit shift.
    __m512i prod_even = _mm512_mul_epu32(self, other);
    __m512i prod_odd = _mm512_mul_epu32(_mm512_srli_epi64(self, 32), _mm512_srli_epi64(other, 32));
    __m512i hi_even = _mm512_srli_epi64(prod_even, 32);
    return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), hi_even, prod_odd);
}

template <class A>
XSIMD_INLINE batch<uint64_t, A> mulhi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f>) noexcept
{
    // Shared 64-bit core with vpmuludq as the widening-multiply primitive.
    auto widening_mul = [](batch<uint64_t, A> a, batch<uint64_t, A> b)
    {
        return batch<uint64_t, A>(_mm512_mul_epu32(a, b));
    };
    return detail::mulhi_u64_core<A>(self, other, widening_mul);
}
template <class A>
XSIMD_INLINE batch<int64_t, A> mulhi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512f>) noexcept
{
    // Signed core = unsigned core + per-lane sign fixup.
    auto widening_mul = [](batch<uint64_t, A> a, batch<uint64_t, A> b)
    {
        return batch<uint64_t, A>(_mm512_mul_epu32(a, b));
    };
    return detail::mulhi_i64_core<A>(self, other, widening_mul);
}

// nearbyint
template <class A>
XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
Expand Down
14 changes: 14 additions & 0 deletions include/xsimd/arch/xsimd_common_fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ namespace xsimd
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
// mulhi: per-lane high N bits of the exact 2N-bit integer product.
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch<T, A> mulhi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
Expand Down Expand Up @@ -120,6 +122,18 @@ namespace xsimd
XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant<T, A, Vs...>) noexcept;
template <typename T, class A, T... Vs>
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;

// Shared 64-bit mulhi cores, defined in xsimd_common_arithmetic.hpp.
// Forward-declared here so arch-specific kernels (SSE4.1, AVX2,
// AVX-512) can name them with an explicit template argument.
// WMul is a callable (batch<uint64_t, A>, batch<uint64_t, A>) ->
// batch<uint64_t, A> that multiplies the low 32 bits of each 64-bit
// lane into a full 64-bit product.
template <class A, class WMul>
XSIMD_INLINE batch<uint64_t, A> mulhi_u64_core(batch<uint64_t, A> const& x,
batch<uint64_t, A> const& y,
WMul mul_epu32) noexcept;
template <class A, class WMul>
XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
batch<int64_t, A> const& y,
WMul mul_epu32) noexcept;
}
}
}
Expand Down
48 changes: 48 additions & 0 deletions include/xsimd/arch/xsimd_neon.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1037,6 +1037,54 @@ namespace xsimd
return wrap::x_vmulq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
}

/*********
 * mulhi *
 *********/

template <class A>
XSIMD_INLINE batch<int8_t, A> mulhi(batch<int8_t, A> const& lhs, batch<int8_t, A> const& rhs, requires_arch<neon>) noexcept
{
    // Widen each half to 16-bit products, then narrow back keeping bits [15:8].
    int16x8_t wide_lo = vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs));
    int16x8_t wide_hi = vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs));
    return vcombine_s8(vshrn_n_s16(wide_lo, 8), vshrn_n_s16(wide_hi, 8));
}
template <class A>
XSIMD_INLINE batch<uint8_t, A> mulhi(batch<uint8_t, A> const& lhs, batch<uint8_t, A> const& rhs, requires_arch<neon>) noexcept
{
    // Unsigned counterpart of the signed version above.
    uint16x8_t wide_lo = vmull_u8(vget_low_u8(lhs), vget_low_u8(rhs));
    uint16x8_t wide_hi = vmull_u8(vget_high_u8(lhs), vget_high_u8(rhs));
    return vcombine_u8(vshrn_n_u16(wide_lo, 8), vshrn_n_u16(wide_hi, 8));
}
template <class A>
XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& lhs, batch<int16_t, A> const& rhs, requires_arch<neon>) noexcept
{
    // Widen each half to 32-bit products, then narrow back keeping bits [31:16].
    int32x4_t wide_lo = vmull_s16(vget_low_s16(lhs), vget_low_s16(rhs));
    int32x4_t wide_hi = vmull_s16(vget_high_s16(lhs), vget_high_s16(rhs));
    return vcombine_s16(vshrn_n_s32(wide_lo, 16), vshrn_n_s32(wide_hi, 16));
}
template <class A>
XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& lhs, batch<uint16_t, A> const& rhs, requires_arch<neon>) noexcept
{
    // Unsigned counterpart of the signed version above.
    uint32x4_t wide_lo = vmull_u16(vget_low_u16(lhs), vget_low_u16(rhs));
    uint32x4_t wide_hi = vmull_u16(vget_high_u16(lhs), vget_high_u16(rhs));
    return vcombine_u16(vshrn_n_u32(wide_lo, 16), vshrn_n_u32(wide_hi, 16));
}
template <class A>
XSIMD_INLINE batch<int32_t, A> mulhi(batch<int32_t, A> const& lhs, batch<int32_t, A> const& rhs, requires_arch<neon>) noexcept
{
    // Widen each half to 64-bit products, then narrow back keeping bits [63:32].
    int64x2_t wide_lo = vmull_s32(vget_low_s32(lhs), vget_low_s32(rhs));
    int64x2_t wide_hi = vmull_s32(vget_high_s32(lhs), vget_high_s32(rhs));
    return vcombine_s32(vshrn_n_s64(wide_lo, 32), vshrn_n_s64(wide_hi, 32));
}
template <class A>
XSIMD_INLINE batch<uint32_t, A> mulhi(batch<uint32_t, A> const& lhs, batch<uint32_t, A> const& rhs, requires_arch<neon>) noexcept
{
    // Unsigned counterpart of the signed version above.
    uint64x2_t wide_lo = vmull_u32(vget_low_u32(lhs), vget_low_u32(rhs));
    uint64x2_t wide_hi = vmull_u32(vget_high_u32(lhs), vget_high_u32(rhs));
    return vcombine_u32(vshrn_n_u64(wide_lo, 32), vshrn_n_u64(wide_hi, 32));
}
// 64-bit intentionally falls through to the common scalar fallback

/*******
* div *
*******/
Expand Down
10 changes: 10 additions & 0 deletions include/xsimd/arch/xsimd_rvv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,9 @@ namespace xsimd
(__riscv_vmul),
(__riscv_vmul),
(__riscv_vfmul), , vec(vec, vec))
// vmulh / vmulhu: per-lane high half of the widened integer product.
XSIMD_RVV_OVERLOAD2(rvvmulh,
(__riscv_vmulh),
(__riscv_vmulhu), , vec(vec, vec))
XSIMD_RVV_OVERLOAD3(rvvdiv,
(__riscv_vdiv),
(__riscv_vdivu),
Expand Down Expand Up @@ -659,6 +662,13 @@ namespace xsimd
return detail_rvv::rvvmul(lhs, rhs);
}

// mulhi
template <class A, class T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
XSIMD_INLINE batch<T, A> mulhi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
{
    // High half of the per-lane 2N-bit product (vmulh / vmulhu).
    // Fix: the generated wrappers live in detail_rvv (see rvvmul above,
    // called as detail_rvv::rvvmul); qualifying with detail:: does not
    // name the rvvmulh overload set produced by XSIMD_RVV_OVERLOAD2.
    return detail_rvv::rvvmulh(lhs, rhs);
}

// div
template <class A, class T, typename detail::enable_arithmetic_t<T> = 0>
XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
Expand Down
12 changes: 12 additions & 0 deletions include/xsimd/arch/xsimd_sse2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1438,6 +1438,18 @@ namespace xsimd
return _mm_mullo_epi16(self, other);
}

// mulhi
template <class A>
XSIMD_INLINE batch<int16_t, A> mulhi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
{
    // pmulhw: per-lane high 16 bits of the signed 32-bit product.
    __m128i product_hi = _mm_mulhi_epi16(self, other);
    return product_hi;
}
template <class A>
XSIMD_INLINE batch<uint16_t, A> mulhi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<sse2>) noexcept
{
    // pmulhuw: unsigned counterpart.
    __m128i product_hi = _mm_mulhi_epu16(self, other);
    return product_hi;
}

// nearbyint_as_int
template <class A>
XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
Expand Down
Loading
Loading