@@ -117,18 +117,34 @@ namespace xsimd
117117 }
118118 }
119119
120- // load_masked
121- // AVX2 low-level helpers (operate on raw SIMD registers)
120+ // load_masked / store_masked: AVX2 has _mm256_maskload/maskstore_epi{32,64};
121+ // 8/16-bit integers fall back to the common scalar path.
122122 namespace detail
123123 {
124- XSIMD_INLINE __m256i maskload (const int32_t * mem, __m256i mask) noexcept
124+ template <class T >
125+ XSIMD_INLINE __m256i maskload (T const * mem, __m256i mask) noexcept
125126 {
126- return _mm256_maskload_epi32 (mem, mask);
127+ XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
128+ {
129+ return _mm256_maskload_epi32 (reinterpret_cast <int const *>(mem), mask);
130+ }
131+ else
132+ {
133+ return _mm256_maskload_epi64 (reinterpret_cast <long long const *>(mem), mask);
134+ }
127135 }
128136
129- XSIMD_INLINE __m256i maskload (const long long * mem, __m256i mask) noexcept
137+ template <class T >
138+ XSIMD_INLINE void maskstore (T* mem, __m256i mask, __m256i src) noexcept
130139 {
131- return _mm256_maskload_epi64 (reinterpret_cast <long long const *>(mem), mask);
140+ XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
141+ {
142+ _mm256_maskstore_epi32 (reinterpret_cast <int *>(mem), mask, src);
143+ }
144+ else
145+ {
146+ _mm256_maskstore_epi64 (reinterpret_cast <long long *>(mem), mask, src);
147+ }
132148 }
133149
134150 XSIMD_INLINE __m256i zero_extend (__m128i hi) noexcept
@@ -137,72 +153,22 @@ namespace xsimd
137153 }
138154 }
139155
140- // single templated implementation for integer masked loads (32/64-bit)
141156 template <class A , class T , bool ... Values, class Mode >
142- XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) >= 4 ), batch<T, A>>
157+ XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) == 4 || sizeof (T) == 8 ), batch<T, A>>
143158 load_masked (T const * mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
144159 {
145- static_assert (sizeof (T) == 4 || sizeof (T) == 8 , " load_masked supports only 32/64-bit integers on AVX2" );
146- using int_t = std::conditional_t <sizeof (T) == 4 , int32_t , long long >;
147- // Use the raw register-level maskload helpers for the remaining cases.
148- return detail::maskload (reinterpret_cast <const int_t *>(mem), mask.as_batch ());
149- }
150-
151- template <class A , bool ... Values, class Mode >
152- XSIMD_INLINE batch<int32_t , A> load_masked (int32_t const * mem, batch_bool_constant<int32_t , A, Values...> mask, convert<int32_t >, Mode, requires_arch<avx2>) noexcept
153- {
154- return load_masked<A, int32_t >(mem, mask, convert<int32_t > {}, Mode {}, avx2 {});
155- }
156-
157- template <class A , bool ... Values, class Mode >
158- XSIMD_INLINE batch<uint32_t , A> load_masked (uint32_t const * mem, batch_bool_constant<uint32_t , A, Values...>, convert<uint32_t >, Mode, requires_arch<avx2>) noexcept
159- {
160- const auto r = load_masked<A, int32_t >(reinterpret_cast <int32_t const *>(mem), batch_bool_constant<int32_t , A, Values...> {}, convert<int32_t > {}, Mode {}, avx2 {});
161- return bitwise_cast<uint32_t >(r);
160+ return detail::maskload (mem, mask.as_batch ());
162161 }
163162
164- template <class A , bool ... Values, class Mode >
165- XSIMD_INLINE batch<int64_t , A> load_masked (int64_t const * mem, batch_bool_constant<int64_t , A, Values...> mask, convert<int64_t >, Mode, requires_arch<avx2>) noexcept
166- {
167- return load_masked<A, int64_t >(mem, mask, convert<int64_t > {}, Mode {}, avx2 {});
168- }
169-
170- template <class A , bool ... Values, class Mode >
171- XSIMD_INLINE batch<uint64_t , A> load_masked (uint64_t const * mem, batch_bool_constant<uint64_t , A, Values...>, convert<uint64_t >, Mode, requires_arch<avx2>) noexcept
172- {
173- const auto r = load_masked<A, int64_t >(reinterpret_cast <int64_t const *>(mem), batch_bool_constant<int64_t , A, Values...> {}, convert<int64_t > {}, Mode {}, avx2 {});
174- return bitwise_cast<uint64_t >(r);
175- }
176-
177- // Runtime-mask load for 32/64-bit integers on AVX2; narrower widths fall
178- // back to the scalar common path. Aligned and unaligned share the same
179- // intrinsic — masked-off lanes do not fault regardless of alignment.
180163 template <class A , class T , class Mode >
181164 XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) == 4 || sizeof (T) == 8 ), batch<T, A>>
182165 load_masked (T const * mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
183166 {
184- using int_t = std::conditional_t <sizeof (T) == 4 , int32_t , long long >;
185- return detail::maskload (reinterpret_cast <const int_t *>(mem), __m256i (mask));
186- }
187-
188- // store_masked
189- namespace detail
190- {
191- template <class T , class A >
192- XSIMD_INLINE void maskstore (int32_t * mem, __m256i mask, __m256i src) noexcept
193- {
194- _mm256_maskstore_epi32 (reinterpret_cast <int *>(mem), mask, src);
195- }
196-
197- template <class T , class A >
198- XSIMD_INLINE void maskstore (int64_t * mem, __m256i mask, __m256i src) noexcept
199- {
200- _mm256_maskstore_epi64 (reinterpret_cast <long long *>(mem), mask, src);
201- }
167+ return detail::maskload (mem, __m256i (mask));
202168 }
203169
204170 template <class A , class T , bool ... Values, class Mode ,
205- typename = std::enable_if_t <std::is_integral<T>::value && (sizeof (T) >= 4 )>>
171+ typename = std::enable_if_t <std::is_integral<T>::value && (sizeof (T) == 4 || sizeof (T) == 8 )>>
206172 XSIMD_INLINE void store_masked (T* mem, batch<T, A> const & src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
207173 {
208174 constexpr size_t lanes_per_half = batch<T, A>::size / 2 ;
@@ -225,33 +191,15 @@ namespace xsimd
225191 }
226192 else
227193 {
228- detail::maskstore<T, A> (mem, mask.as_batch (), src);
194+ detail::maskstore (mem, mask.as_batch (), src);
229195 }
230196 }
231197
232- template <class A , bool ... Values, class Mode >
233- XSIMD_INLINE void store_masked (uint32_t * mem, batch<uint32_t , A> const & src, batch_bool_constant<uint32_t , A, Values...>, Mode, requires_arch<avx2>) noexcept
234- {
235- const auto s32 = bitwise_cast<int32_t >(src);
236- store_masked<A>(reinterpret_cast <int32_t *>(mem), s32, batch_bool_constant<int32_t , A, Values...> {}, Mode {}, avx2 {});
237- }
238-
239- template <class A , bool ... Values, class Mode >
240- XSIMD_INLINE void store_masked (uint64_t * mem, batch<uint64_t , A> const & src, batch_bool_constant<uint64_t , A, Values...>, Mode, requires_arch<avx2>) noexcept
241- {
242- const auto s64 = bitwise_cast<int64_t >(src);
243- store_masked<A>(reinterpret_cast <int64_t *>(mem), s64, batch_bool_constant<int64_t , A, Values...> {}, Mode {}, avx2 {});
244- }
245-
246- // Runtime-mask store for 32/64-bit integers on AVX2; narrower widths fall
247- // back to the scalar common path. Same fault-suppression semantics as the
248- // masked loads above; alignment mode is irrelevant.
249198 template <class A , class T , class Mode >
250199 XSIMD_INLINE std::enable_if_t <std::is_integral<T>::value && (sizeof (T) == 4 || sizeof (T) == 8 ), void >
251200 store_masked (T* mem, batch<T, A> const & src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
252201 {
253- using int_t = std::conditional_t <sizeof (T) == 4 , int32_t , int64_t >;
254- detail::maskstore<int_t , A>(reinterpret_cast <int_t *>(mem), __m256i (mask), __m256i (src));
202+ detail::maskstore (mem, __m256i (mask), __m256i (src));
255203 }
256204
257205 // load_stream
0 commit comments