From ac12cbc5f19900d6f13216659ac4111f21e782d0 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 27 Mar 2025 09:15:24 +0100 Subject: [PATCH 1/3] Specialize avx512bw bool loader for aligned access --- .../arch/generic/xsimd_generic_memory.hpp | 8 ++++++- include/xsimd/arch/xsimd_avx512bw.hpp | 23 +++++++++---------- include/xsimd/arch/xsimd_avx512f.hpp | 6 ++--- include/xsimd/types/xsimd_batch.hpp | 4 ++-- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp index 7f1f09c63..7377881d7 100644 --- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -262,7 +262,7 @@ namespace xsimd // load template - XSIMD_INLINE batch_bool load(bool const* mem, batch_bool, requires_arch) noexcept + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { using batch_type = batch; batch_type ref(0); @@ -273,6 +273,12 @@ namespace xsimd return ref != batch_type::load_aligned(&buffer[0]); } + template + XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool b, requires_arch) noexcept + { + return load_unaligned(mem, b, A {}); + } + // load_aligned namespace detail { diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 6a76b597a..1c9416d4d 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -316,19 +316,18 @@ namespace xsimd } // load - template - XSIMD_INLINE batch_bool load(bool const* mem, batch_bool, requires_arch) noexcept + template ::size == 64, void>::type> + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { - using register_type = typename batch_bool::register_type; - XSIMD_IF_CONSTEXPR(batch_bool::size == 64) - { - __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem); - return (register_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); - } - else - { - return load(mem, batch_bool(), avx512dq {}); - } + __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem); + return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); + } + + template ::size == 64, void>::type> + XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool, requires_arch) noexcept + { + __m512i bool_val = _mm512_load_si512((__m512i const*)mem); + return _mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); } // max diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 1d2ee2dc4..79e29b7b0 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1209,14 +1209,14 @@ namespace xsimd } } - // load + // load mask template - XSIMD_INLINE batch_bool load(bool const* mem, batch_bool, requires_arch) noexcept + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { using register_type = typename batch_bool::register_type; constexpr auto size = batch_bool::size; constexpr auto iter = size / 8; - static_assert(size % 8 == 0, "incorrect size of bool batch"); + static_assert((size % 8) == 0, "incorrect size of bool batch"); register_type mask = 0; for (std::size_t i = 0; i < iter; ++i) { diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index c3c0d7fcd..b54d84aae 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -968,13 +968,13 @@ namespace xsimd template XSIMD_INLINE batch_bool batch_bool::load_aligned(bool const* mem) noexcept { - return kernel::load(mem, batch_bool(), A {}); + return kernel::load_aligned(mem, batch_bool(), A {}); } template XSIMD_INLINE batch_bool batch_bool::load_unaligned(bool const* mem) noexcept { - return load_aligned(mem); + return kernel::load_unaligned(mem, batch_bool(), A {}); } /** From 5dfa97f4e6cf0e582a9ea11d87a987626112ddc2 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 27 Mar 2025 10:45:35 +0100 Subject: [PATCH 2/3] Require c++17 for xtl testing --- test/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8fffeed1d..8a4ce50d5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -54,11 +54,11 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" # Users may override the c++ standard: if(NOT DEFINED CMAKE_CXX_STANDARD OR "${CMAKE_CXX_STANDARD}" STREQUAL "") if (ENABLE_XTL_COMPLEX) - CHECK_CXX_COMPILER_FLAG("-std=c++14" HAS_CPP14_FLAG) - if (NOT HAS_CPP14_FLAG) - message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++14 support when xtl complex support is enabled") + CHECK_CXX_COMPILER_FLAG("-std=c++17" HAS_CPP17_FLAG) + if (NOT HAS_CPP17_FLAG) + message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++17 support when xtl complex support is enabled") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") else() CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG) if (NOT HAS_CPP11_FLAG) From 31ff1f576d2b06de82328832d751b294c91cdd53 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 27 Mar 2025 10:57:12 +0100 Subject: [PATCH 3/3] Fix c++17 related warnings --- include/xsimd/arch/xsimd_avx.hpp | 72 +++++++++++++++---------------- include/xsimd/arch/xsimd_avx2.hpp | 3 +- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 469cbcaf9..a5fa3266d 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -633,44 +633,26 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { - alignas(A::alignment()) static const uint32_t lut32[] = { - 0x00000000, - 0x000000FF, - 0x0000FF00, - 0x0000FFFF, - 0x00FF0000, - 0x00FF00FF, - 0x00FFFF00, - 0x00FFFFFF, - 0xFF000000, - 0xFF0000FF, - 0xFF00FF00, - 0xFF00FFFF, - 0xFFFF0000, - 0xFFFF00FF, - 0xFFFFFF00, - 0xFFFFFFFF, - }; - alignas(A::alignment()) static const uint64_t lut64[] = { - 0x0000000000000000ul, - 0x000000000000FFFFul, - 0x00000000FFFF0000ul, - 0x00000000FFFFFFFFul, - 0x0000FFFF00000000ul, - 0x0000FFFF0000FFFFul, - 0x0000FFFFFFFF0000ul, - 0x0000FFFFFFFFFFFFul, - 0xFFFF000000000000ul, - 0xFFFF00000000FFFFul, - 0xFFFF0000FFFF0000ul, - 0xFFFF0000FFFFFFFFul, - 0xFFFFFFFF00000000ul, - 0xFFFFFFFF0000FFFFul, - 0xFFFFFFFFFFFF0000ul, - 0xFFFFFFFFFFFFFFFFul, - }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; assert(!(mask & ~0xFFFFFFFFul) && "inbound mask"); return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF], @@ -679,6 +661,24 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000ul, + 0x000000000000FFFFul, + 0x00000000FFFF0000ul, + 0x00000000FFFFFFFFul, + 0x0000FFFF00000000ul, + 0x0000FFFF0000FFFFul, + 0x0000FFFFFFFF0000ul, + 0x0000FFFFFFFFFFFFul, + 0xFFFF000000000000ul, + 0xFFFF00000000FFFFul, + 0xFFFF0000FFFF0000ul, + 0xFFFF0000FFFFFFFFul, + 0xFFFFFFFF00000000ul, + 0xFFFFFFFF0000FFFFul, + 0xFFFFFFFFFFFF0000ul, + 0xFFFFFFFFFFFFFFFFul, + }; assert(!(mask & ~0xFFFFul) && "inbound mask"); return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]); } diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 506299a0d..d1f89d223 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -731,16 +731,17 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { - constexpr int mask = batch_bool_constant::mask(); // FIXME: for some reason mask here is not considered as an immediate, // but it's okay for _mm256_blend_epi32 // case 2: return _mm256_blend_epi16(false_br, true_br, mask); XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { + constexpr int mask = batch_bool_constant::mask(); return _mm256_blend_epi32(false_br, true_br, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { + constexpr int mask = batch_bool_constant::mask(); constexpr int imask = detail::interleave(mask); return _mm256_blend_epi32(false_br, true_br, imask); }