diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp index badd0e384..7f1f09c63 100644 --- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp +++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -260,6 +260,19 @@ namespace xsimd return buffer[i]; } + // load + /* Generic fallback: copies mem[0..size) into an aligned stack buffer of T as 1 or 0, loads that buffer as a batch and returns the lanes that compare unequal to a zero batch. */ template + XSIMD_INLINE batch_bool load(bool const* mem, batch_bool, requires_arch) noexcept + { + using batch_type = batch; + batch_type ref(0); + constexpr auto size = batch_bool::size; + alignas(A::alignment()) T buffer[size]; + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mem[i] ? 1 : 0; + return ref != batch_type::load_aligned(&buffer[0]); + } + // load_aligned namespace detail { diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 7097d9d1d..6a76b597a 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -315,6 +315,22 @@ namespace xsimd return detail::compare_int_avx512bw(self, other); } + // load + /* When the batch has 64 lanes, the 64 bools are read with one _mm512_loadu_si512 and the result mask comes from an unsigned byte compare against zero; every other lane count is forwarded to the load overload tagged avx512dq {}. */ template + XSIMD_INLINE batch_bool load(bool const* mem, batch_bool, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + XSIMD_IF_CONSTEXPR(batch_bool::size == 64) + { + __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem); + return (register_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); + } + else + { + return load(mem, batch_bool(), avx512dq {}); + } + } + // max template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept @@ -628,6 +644,16 @@ namespace xsimd } } + // store + /* Expands self.data into bytes (0x01 where the mask bit is set, zero elsewhere) and writes them to mem through a masked store whose write mask has the low size bits set, or all 64 bits when size >= 64. */ template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + constexpr auto size = batch_bool::size; + __m512i bool_val = _mm512_maskz_set1_epi8(self.data, 0x01); + __mmask64 mask = size >= 64 ? 
~(__mmask64)0 : (1ULL << size) - 1; + _mm512_mask_storeu_epi8((void*)mem, mask, bool_val); + } + // sub template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 524b5676e..1d2ee2dc4 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1193,6 +1193,39 @@ namespace xsimd return detail::compare_int_avx512f(self, other); } + namespace detail + { + // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array + // Generate a bitset from an array of boolean. + /* Copies 8 input bytes into a uint64_t with memcpy, multiplies by the magic constant and returns bits 56..63 of the product as one byte. NOTE(review): the packing looks like it needs every input byte to be exactly 0 or 1 and a little-endian uint64_t layout; confirm. */ XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8]) + { + uint64_t data; + memcpy(&data, unpacked, sizeof(uint64_t)); + + const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000); + + unsigned char res = ((data * magic) >> 56) & 0xFF; + return res; + } + } + + // load + /* Builds the mask eight lanes at a time: each group of 8 bools is packed into one byte by detail::tobitset and OR-ed into the result at bit offset i * 8. The static_assert rejects lane counts that are not a multiple of 8. NOTE(review): the cast to unsigned char* drops the const qualifier of mem because tobitset takes a non-const array parameter. */ template + XSIMD_INLINE batch_bool load(bool const* mem, batch_bool, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + constexpr auto iter = size / 8; + static_assert(size % 8 == 0, "incorrect size of bool batch"); + register_type mask = 0; + for (std::size_t i = 0; i < iter; ++i) + { + unsigned char block = detail::tobitset((unsigned char*)mem + i * 8); + mask |= (register_type(block) << (i * 8)); + } + return mask; + } + // load_aligned template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index c9b6d1149..c3c0d7fcd 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -968,11 +968,7 @@ namespace xsimd template XSIMD_INLINE batch_bool batch_bool::load_aligned(bool const* mem) noexcept { 
- batch_type ref(0); - alignas(A::alignment()) T buffer[size]; - for (std::size_t i = 0; i < size; ++i) - buffer[i] = mem[i] ? 1 : 0; - return ref != batch_type::load_aligned(&buffer[0]); + return kernel::load(mem, batch_bool(), A {});/* The inline bool-to-T conversion removed above is replaced by a call to kernel::load, passing a default-constructed batch_bool and A {} as extra arguments. */ } template