@@ -119,7 +119,6 @@ namespace xsimd
119119 }
120120
121121 // load_masked
122- // AVX2 low-level helpers (operate on raw SIMD registers)
123122 namespace detail
124123 {
125124 XSIMD_INLINE __m256i maskload (const int32_t * mem, __m256i mask) noexcept
@@ -138,14 +137,12 @@ namespace xsimd
138137 }
139138 }
140139
141- // single templated implementation for integer masked loads (32/64-bit)
142140 template <class A , class T , bool ... Values, class Mode >
143141 XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) >= 4 ), batch<T, A>>
144142 load_masked (T const * mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
145143 {
146144 static_assert (sizeof (T) == 4 || sizeof (T) == 8 , " load_masked supports only 32/64-bit integers on AVX2" );
147145 using int_t = std::conditional_t <sizeof (T) == 4 , int32_t , long long >;
148- // Use the raw register-level maskload helpers for the remaining cases.
149146 return detail::maskload (reinterpret_cast <const int_t *>(mem), mask.as_batch ());
150147 }
151148
@@ -175,6 +172,20 @@ namespace xsimd
175172 return bitwise_cast<uint64_t >(r);
176173 }
177174
175+ // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers
176+ // fall back to the scalar common path: AVX2 has no native maskload for
177+ // those widths, and a load-then-blend would break fault-suppression at
178+ // page boundaries (the main reason callers ask for a masked load).
179+ // Both aligned_mode and unaligned_mode route to the same intrinsic —
180+ // masked-off lanes do not fault regardless of alignment.
181+ template <class A , class T , class Mode >
182+ XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) == 4 || sizeof (T) == 8 ), batch<T, A>>
183+ load_masked (T const * mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
184+ {
185+ using int_t = std::conditional_t <sizeof (T) == 4 , int32_t , long long >;
186+ return detail::maskload (reinterpret_cast <const int_t *>(mem), __m256i (mask));
187+ }
188+
178189 // store_masked
179190 namespace detail
180191 {
@@ -196,14 +207,12 @@ namespace xsimd
196207 {
197208 constexpr size_t lanes_per_half = batch<T, A>::size / 2 ;
198209
199- // confined to lower 128-bit half → forward to SSE
200210 XSIMD_IF_CONSTEXPR (mask.countl_zero () >= lanes_per_half)
201211 {
202212 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
203213 const auto lo = detail::lower_half (src);
204214 store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
205215 }
206- // confined to upper 128-bit half → forward to SSE
207216 else XSIMD_IF_CONSTEXPR (mask.countr_zero () >= lanes_per_half)
208217 {
209218 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
@@ -230,6 +239,20 @@ namespace xsimd
230239 store_masked<A>(reinterpret_cast <int64_t *>(mem), s64, batch_bool_constant<int64_t , A, Values...> {}, Mode {}, avx2 {});
231240 }
232241
242+ template <class A , class T , class Mode >
243+ XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) == 4 || sizeof (T) == 8 ), void >
244+ store_masked (T* mem, batch<T, A> const & src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
245+ {
246+ XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
247+ {
248+ _mm256_maskstore_epi32 (reinterpret_cast <int *>(mem), __m256i (mask), __m256i (src));
249+ }
250+ else
251+ {
252+ _mm256_maskstore_epi64 (reinterpret_cast <long long *>(mem), __m256i (mask), __m256i (src));
253+ }
254+ }
255+
233256 // load_stream
234257 template <class A , class T , class = std::enable_if_t <std::is_integral<T>::value, void >>
235258 XSIMD_INLINE batch<T, A> load_stream (T const * mem, convert<T>, requires_arch<avx2>) noexcept
0 commit comments