diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index c25690046..39ca049e2 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -29,6 +29,7 @@ jobs:
           - { compiler: 'clang', version: '17', flags: 'avx' }
           - { compiler: 'clang', version: '17', flags: 'sse3' }
           - { compiler: 'clang', version: '18', flags: 'avx512' }
+          - { compiler: 'clang', version: '18', flags: 'avx_128' }
     steps:
       - name: Setup compiler
         if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -76,6 +77,9 @@ jobs:
           if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
             CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
           fi
+          if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
+            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge -DXSIMD_DEFAULT_ARCH=avx_128"
+          fi
           if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
             CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
           fi
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index a1a8cf26d..305041f11 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -994,18 +994,18 @@ namespace xsimd
             using int_t = as_integer_t<T>;
             constexpr size_t half_size = batch<T, A>::size / 2;
 
-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128-bit kernels
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant<int_t, A, Values...> {});
-                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
+                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
                 return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128-bit kernels
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half(mask);
-                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
+                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
                 return detail::zero_extend(hi);
             }
             else
@@ -1036,19 +1036,19 @@
         {
             constexpr size_t half_size = batch<T, A>::size / 2;
 
-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128-bit kernels
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half(mask);
                 const auto lo = detail::lower_half(src);
-                store_masked(mem, lo, mlo, Mode {}, sse4_2 {});
+                store_masked(mem, lo, mlo, Mode {}, avx_128 {});
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128-bit kernels
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half(mask);
                 const auto hi = detail::upper_half(src);
-                store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
+                store_masked(mem + half_size, hi, mhi, Mode {}, avx_128 {});
             }
             else
             {
diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp
new file mode 100644
index 000000000..dea490357
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx_128.hpp
@@ -0,0 +1,163 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ * Copyright (c) Marco Barbone                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_128_HPP
+#define XSIMD_AVX_128_HPP
+
+#include <type_traits>
+
+#include "../types/xsimd_avx_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
+        // broadcast
+        template <class A, class T, class = std::enable_if_t<std::is_same<T, float>::value>>
+        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx_128>) noexcept
+        {
+            return _mm_broadcast_ss(&val);
+        }
+
+        // eq
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_EQ_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_EQ_OQ);
+        }
+
+        // gt
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_GT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_GT_OQ);
+        }
+
+        // ge
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_GE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_GE_OQ);
+        }
+
+        // lt
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_LT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_LT_OQ);
+        }
+
+        // le
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_LE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_LE_OQ);
+        }
+
+        // neq
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_NEQ_UQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_NEQ_UQ);
+        }
+
+        // load_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_ps(mem, mask.as_batch());
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_pd(mem, mask.as_batch());
+        }
+
+        // store_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskstore_ps(mem, mask.as_batch(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskstore_pd(mem, mask.as_batch(), src);
+        }
+
+        // swizzle (dynamic mask)
+        template <class A, class T, class ITy, class = std::enable_if_t<std::is_arithmetic<T>::value && sizeof(T) == sizeof(ITy)>>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
+            {
+                return _mm_permutevar_ps(self, mask);
+            }
+            else
+            {
+                // FIXME: _mm_permutevar_pd fails validation, but it shouldn't o_O
+                return swizzle(self, mask, sse4_2 {});
+                // return _mm_permutevar_pd(self, mask);
+            }
+        }
+
+        // swizzle (constant mask)
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<avx_128>) noexcept
+        {
+            return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3));
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<avx_128>) noexcept
+        {
+            return _mm_permute_pd(self, detail::mod_shuffle(V0, V1));
+        }
+
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp
index 8ab4261ea..6f276c1f3 100644
--- a/include/xsimd/arch/xsimd_isa.hpp
+++ b/include/xsimd/arch/xsimd_isa.hpp
@@ -50,6 +50,7 @@
 
 #if XSIMD_WITH_AVX
 #include "./xsimd_avx.hpp"
+#include "./xsimd_avx_128.hpp"
 #endif
 
 #if XSIMD_WITH_FMA3_AVX
diff --git a/include/xsimd/types/xsimd_avx_register.hpp b/include/xsimd/types/xsimd_avx_register.hpp
index 47997ee76..515b60901 100644
--- a/include/xsimd/types/xsimd_avx_register.hpp
+++ b/include/xsimd/types/xsimd_avx_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVX_REGISTER_HPP
 
 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_sse4_2_register.hpp"
 
 namespace xsimd
 {
@@ -30,6 +31,18 @@ namespace xsimd
         static constexpr bool requires_alignment() noexcept { return true; }
         static constexpr char const* name() noexcept { return "avx"; }
     };
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX instruction set extension operating on 128-bit registers
+     */
+    struct avx_128 : sse4_2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx/128"; }
+    };
 }
 
 #if XSIMD_WITH_AVX
@@ -58,6 +71,8 @@
         XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
         XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx_128, sse4_2);
     }
 }
 #endif
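
Usage note for reviewers: the sketch below shows how the new arch is meant to be selected from user code. It is illustrative only and not part of the patch; it assumes xsimd's public batch/swizzle API and the batch_constant<uint32_t, arch, ...> form the kernels above dispatch on, and it must be built with AVX enabled (e.g. -mavx, matching the sandybridge target the CI change selects) so that avx_128 is usable at compile time.

// Hypothetical example, not part of this patch: exercising the new avx_128 arch.
// Build: g++ -std=c++14 -mavx example.cpp
#include <cstdint>
#include <cstdio>
#include <xsimd/xsimd.hpp>

int main()
{
    using arch = xsimd::avx_128;              // 128-bit batches, VEX-encoded AVX instructions
    using fbatch = xsimd::batch<float, arch>; // 4 floats in an XMM register

    alignas(16) float in[4] = { 1.f, 2.f, 3.f, 4.f };
    alignas(16) float out[4] = {};

    fbatch v = fbatch::load_aligned(in);

    // Constant-mask swizzle: should dispatch to the _mm_permute_ps kernel added above.
    auto rev = xsimd::swizzle(v, xsimd::batch_constant<uint32_t, arch, 3, 2, 1, 0> {});

    // Operations with no avx_128 kernel fall back to the sse4_2 base arch.
    (v + rev).store_aligned(out);

    for (float f : out)
        std::printf("%g ", f); // prints: 5 5 5 5
    std::printf("\n");
    return 0;
}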