Merged
13 changes: 13 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -260,6 +260,19 @@ namespace xsimd
return buffer[i];
}

// load
template <class A, class T>
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type ref(0);
constexpr auto size = batch_bool<T, A>::size;
alignas(A::alignment()) T buffer[size];
for (std::size_t i = 0; i < size; ++i)
buffer[i] = mem[i] ? 1 : 0;
return ref != batch_type::load_aligned(&buffer[0]);
Contributor:
I wonder if this is going to be faster than using _load_mask16 and the like?

Contributor Author:
IIUC, you mean loading/storing batch_bool as bits rather than bytes? Apart from the load/store bandwidth, there seems to be little difference between the two.

Contributor Author (junparser, Mar 21, 2025):
Do you mean something like
__mmask64 mask = _mm512_cmpgt_epu8_mask(vecInput, _mm512_setzero_epi32()); for AVX-512? _mm512_cmpgt_epu8_mask is under avx512bw; I'll add it later.

Contributor Author:

OK, implemented in xsimd_avx512bw.hpp.

Contributor:

If you have the chance to check performance, I've written another approach to computing masks that looks quite efficient: https://github.com/serge-sans-paille/fast-bitset-from-bool-array

Contributor Author:

I've tested the cases with sizes 8, 16, 32, and 64 (cache hit); it is 2x~8x faster than the original version, and also better than the avx512bw version for sizes 8, 16, and 32. It may be better to handle size 64 in avx512bw.
Would you like to send another patch, or should I just update this one?

Contributor:

Please just update this one, that's fine.

}

// load_aligned
namespace detail
{
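The generic path above can be sketched in portable scalar code. This is a minimal sketch under stated assumptions: load_bool_mask and the fixed uint64_t mask type are illustrative stand-ins, not xsimd API.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Sketch of the generic kernel::load above: widen each bool into a lane
// buffer, then compare against zero so each nonzero lane yields a mask bit.
template <std::size_t N>
uint64_t load_bool_mask(const bool* mem)
{
    static_assert(N <= 64, "mask modeled as a uint64_t");
    uint32_t buffer[N]; // stands in for the aligned T buffer
    for (std::size_t i = 0; i < N; ++i)
        buffer[i] = mem[i] ? 1 : 0;
    uint64_t mask = 0;
    for (std::size_t i = 0; i < N; ++i)
        mask |= uint64_t(buffer[i] != 0) << i; // models ref != load_aligned(...)
    return mask;
}
```

The detour through a widened buffer mirrors the kernel: batch_bool has no byte-level load of its own, so the bools are first materialized as lane values and the comparison produces the mask.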
26 changes: 26 additions & 0 deletions include/xsimd/arch/xsimd_avx512bw.hpp
@@ -315,6 +315,22 @@ namespace xsimd
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
}

// load
template <class A, class T>
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512bw>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size == 64)
{
__m512i bool_val = _mm512_loadu_si512((__m512i const*)mem);
return (register_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512());
}
else
{
return load(mem, batch_bool<T, A>(), avx512dq {});
}
}
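As a sanity model for the intrinsic used above (not a replacement for it): _mm512_cmpgt_epu8_mask compares unsigned bytes lane-wise and returns one bit per byte, so comparing against zero simply flags the nonzero bytes. A scalar equivalent:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar model of _mm512_cmpgt_epu8_mask(v, zero): bit i of the result is
// set when byte i is (unsigned-)greater than zero, i.e. nonzero.
uint64_t cmpgt_zero_mask(const unsigned char* bytes, std::size_t n)
{
    uint64_t mask = 0;
    for (std::size_t i = 0; i < n; ++i)
        mask |= uint64_t(bytes[i] > 0) << i;
    return mask;
}
```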

// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
@@ -628,6 +644,16 @@ namespace xsimd
}
}

// store
template <class T, class A>
XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512bw>) noexcept
{
constexpr auto size = batch_bool<T, A>::size;
__m512i bool_val = _mm512_maskz_set1_epi8(self.data, 0x01);
__mmask64 mask = size >= 64 ? ~(__mmask64)0 : (1ULL << size) - 1;
_mm512_mask_storeu_epi8((void*)mem, mask, bool_val);
}
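The store above can be modeled in scalar code (a sketch; store_bool_mask is an illustrative name): _mm512_maskz_set1_epi8 broadcasts 0x01 into the lanes selected by self.data and zeroes the rest, and the masked store then writes only the first size bytes.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// Scalar model of the AVX-512BW store above: expand one mask bit per lane
// into a 0x01/0x00 byte and write only the first `size` lanes.
void store_bool_mask(uint64_t mask, bool* mem, std::size_t size)
{
    for (std::size_t i = 0; i < size; ++i)
        mem[i] = ((mask >> i) & 1) != 0;
}
```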

// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
33 changes: 33 additions & 0 deletions include/xsimd/arch/xsimd_avx512f.hpp
@@ -1193,6 +1193,39 @@ namespace xsimd
return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
}

namespace detail
{
// Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array
// Generate a bitset from an array of boolean.
XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8])
{
uint64_t data;
memcpy(&data, unpacked, sizeof(uint64_t));

const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000);

unsigned char res = ((data * magic) >> 56) & 0xFF;
return res;
}
}
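The magic constant above is the sum of 2^(7+7j) for j = 0..7. The 0/1 byte at position i sits at bit 8i of data, so the multiply copies it to bits 8i+7+7j; no two (i, j) pairs collide and the bytes are at most 1, so there are no carries, and exactly the term j = 7-i lands byte i at bit 56+i. The final >> 56 therefore yields result bit i = unpacked[i]. A standalone copy for experimenting (assumes a little-endian host, as the memcpy does; tobitset_copy is an illustrative name):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Standalone copy of the tobitset trick above: pack eight 0/1 bytes into
// one mask byte with a single multiply and shift (little-endian assumed).
unsigned char tobitset_copy(const unsigned char unpacked[8])
{
    uint64_t data;
    std::memcpy(&data, unpacked, sizeof(uint64_t));
    const uint64_t magic = 0x0102040810204080ULL; // sum of 2^(7 + 7j), j = 0..7
    return (unsigned char)((data * magic) >> 56);
}
```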

// load
template <class A, class T>
XSIMD_INLINE batch_bool<T, A> load(bool const* mem, batch_bool<T, A>, requires_arch<avx512f>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
constexpr auto size = batch_bool<T, A>::size;
constexpr auto iter = size / 8;
static_assert(size % 8 == 0, "incorrect size of bool batch");
register_type mask = 0;
for (std::size_t i = 0; i < iter; ++i)
{
unsigned char block = detail::tobitset((unsigned char*)mem + i * 8);
mask |= (register_type(block) << (i * 8));
}
return mask;
}

// load_aligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
6 changes: 1 addition & 5 deletions include/xsimd/types/xsimd_batch.hpp
@@ -968,11 +968,7 @@ namespace xsimd
template <class T, class A>
XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
{
batch_type ref(0);
alignas(A::alignment()) T buffer[size];
for (std::size_t i = 0; i < size; ++i)
buffer[i] = mem[i] ? 1 : 0;
return ref != batch_type::load_aligned(&buffer[0]);
return kernel::load<A>(mem, batch_bool<T, A>(), A {});
Contributor:
Hell, I missed one point: given the implementation specialization, we now need to distinguish between the aligned and unaligned cases.
Sorry for the extra back-and-forth :-/

The fact that CI passes probably means we need extra tests for this.

Contributor Author:
I don't get the point: do we need separate aligned and unaligned cases for bool load?

Contributor Author:

And we already have template <class T, class A> XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_unaligned(bool const* mem) noexcept in https://github.com/xtensor-stack/xsimd/pull/1095/files#diff-4225c7e955e5d693f65f483a7d38b0f6af3e229544cd037f4da05d6b5de2e8b2R975

Contributor:

Well, mem may be unaligned, and the current unaligned loader calls the aligned one. If we end up (not the case in your avx512 implementation, but it actually could/should happen) calling an aligned load in that case, we're blessed with a segfault :-)

Contributor Author:

OK, you mean A::alignment() for the access of bool const* mem. I'm not sure how we should deal with it here, since the names kernel::load/store suggest to me that alignment is ignored at this level. Any suggestions?

Contributor Author (junparser, Mar 26, 2025):

Since this is implementation-defined behavior, how about we deliberately make mem unaligned when testing batch_bool_type::load_unaligned, as a guard?

Contributor:

> OK, you mean A::alignment() for the access of bool const* mem. I'm not sure how we should deal with it here, since the names kernel::load/store suggest to me that alignment is ignored at this level. Any suggestions?

We generally provide two versions of the function: load_aligned and load_unaligned.

Contributor Author:

> We generally provide two versions of the function: load_aligned and load_unaligned.

batch_bool was using plain load/store before this patch. If we want to unify batch_bool load/store into load_aligned and load_unaligned, it's better to do that in another PR.
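The guard suggested earlier can be sketched as follows (illustrative, not xsimd test code; is_misaligned and the 64-byte figure are assumptions standing in for A::alignment()): take over-aligned storage and offset the pointer by one element so it cannot satisfy the vector alignment, then feed that pointer to load_unaligned.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>

// True when p does not satisfy `align`-byte alignment.
bool is_misaligned(const void* p, std::size_t align)
{
    return reinterpret_cast<std::uintptr_t>(p) % align != 0;
}

// Over-aligned storage: storage itself is 64-byte aligned, so storage + 1
// is guaranteed to miss that alignment; an unaligned-load test would pass
// this pointer and must not reach an aligned vector load.
alignas(64) bool storage[65] = {};
```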

}

template <class T, class A>