Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ jobs:
- { compiler: 'clang', version: '17', flags: 'avx' }
- { compiler: 'clang', version: '17', flags: 'sse3' }
- { compiler: 'clang', version: '18', flags: 'avx512' }
- { compiler: 'clang', version: '18', flags: 'avx_128' }
steps:
- name: Setup compiler
if: ${{ matrix.sys.compiler == 'gcc' }}
Expand Down Expand Up @@ -76,6 +77,9 @@ jobs:
if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
fi
if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
fi
if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
fi
Expand Down
16 changes: 8 additions & 8 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,18 +994,18 @@ namespace xsimd
using int_t = as_integer_t<T>;
constexpr size_t half_size = batch<T, A>::size / 2;

// confined to lower 128-bit half → forward to SSE2
// confined to lower 128-bit half → forward to 128 bit
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
}
// confined to upper 128-bit half → forward to SSE2
// confined to upper 128-bit half → forward to 128 bit
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
return detail::zero_extend<A>(hi);
}
else
Expand Down Expand Up @@ -1036,19 +1036,19 @@ namespace xsimd
{
constexpr size_t half_size = batch<T, A>::size / 2;

// confined to lower 128-bit half → forward to SSE2
// confined to lower 128-bit half → forward to 128 bit
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
}
// confined to upper 128-bit half → forward to SSE2
// confined to upper 128-bit half → forward to 128 bit
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = detail::upper_half(src);
store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
}
else
{
Expand Down
163 changes: 163 additions & 0 deletions include/xsimd/arch/xsimd_avx_128.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* Copyright (c) Marco Barbone *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX_128_HPP
#define XSIMD_AVX_128_HPP

#include <type_traits>

#include "../types/xsimd_avx_register.hpp"
#include "../types/xsimd_batch_constant.hpp"

namespace xsimd
{
namespace kernel
{
using namespace types;

// broadcast
template <class A, class T, class = std::enable_if_t<std::is_same<T, float>::value>>
XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx_128>) noexcept
{
return _mm_broadcast_ss(&val);
}

// eq
template <class A>
XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_ps(self, other, _CMP_EQ_OQ);
}
template <class A>
XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_pd(self, other, _CMP_EQ_OQ);
}

// gt
template <class A>
XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_ps(self, other, _CMP_GT_OQ);
}
template <class A>
XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_pd(self, other, _CMP_GT_OQ);
}

// ge
template <class A>
XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_ps(self, other, _CMP_GE_OQ);
}
template <class A>
XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_pd(self, other, _CMP_GE_OQ);
}

// lt
template <class A>
XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_ps(self, other, _CMP_LT_OQ);
}
template <class A>
XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_pd(self, other, _CMP_LT_OQ);
}

// le
template <class A>
XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_ps(self, other, _CMP_LE_OQ);
}
template <class A>
XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_pd(self, other, _CMP_LE_OQ);
}

// neq
template <class A>
XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_ps(self, other, _CMP_NEQ_UQ);
}
template <class A>
XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
{
return _mm_cmp_pd(self, other, _CMP_NEQ_UQ);
}

// load_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskload_ps(mem, mask.as_batch());
}
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskload_pd(mem, mask.as_batch());
}

// store_masked
template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskstore_ps(mem, mask.as_batch(), src);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
{
return _mm_maskstore_pd(mem, mask.as_batch(), src);
}

// swizzle (dynamic mask)
template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
{
XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
{
return _mm_permutevar_ps(self, mask);
}
else
{
// FIXME: _mm_permutevar_pd fails validation, but it shouldn't o_O
return swizzle(self, mask, sse4_2 {});
// return _mm_permutevar_pd(self, mask);
}
}

// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<avx_128>) noexcept
{
return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3));
}

template <class A, uint32_t V0, uint32_t V1>
XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<avx_128>) noexcept
{
return _mm_permute_pd(self, detail::mod_shuffle(V0, V1));
}

}
}

#endif
1 change: 1 addition & 0 deletions include/xsimd/arch/xsimd_isa.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@

#if XSIMD_WITH_AVX
#include "./xsimd_avx.hpp"
#include "./xsimd_avx_128.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX
Expand Down
15 changes: 15 additions & 0 deletions include/xsimd/types/xsimd_avx_register.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#define XSIMD_AVX_REGISTER_HPP

#include "./xsimd_common_arch.hpp"
#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
Expand All @@ -30,6 +31,18 @@ namespace xsimd
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr char const* name() noexcept { return "avx"; }
};

/**
* @ingroup architectures
*
* AVX instructions extension for 128 bits registers
*/
struct avx_128 : sse4_2
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
static constexpr bool available() noexcept { return true; }
static constexpr char const* name() noexcept { return "avx/128"; }
};
}

#if XSIMD_WITH_AVX
Expand Down Expand Up @@ -58,6 +71,8 @@ namespace xsimd
XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);

XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx_128, sse4_2);
}
}
#endif
Expand Down
Loading