Skip to content

Commit e592d54

Browse files
committed
perf: native AVX512BW masked load/store for 8/16-bit integers
8/16-bit int masked load/store on AVX512BW previously fell through to the branchy common scalar fallback because xsimd_avx512bw.hpp had no load_masked/store_masked overloads. Add four requires_arch<avx512bw> overloads (runtime batch_bool + compile-time batch_bool_constant, load + store) constrained to sizeof(T)==1||2, emitting the native vmovdqu8 / vmovdqu16 predicated moves (2 instructions, no branch). The size branch lives only in the runtime overloads; the constant overloads delegate via mask.as_batch_bool(), which also avoids batch_bool_constant::mask() (return type int) truncating a 64-lane int8 compile-time mask. 32/64-bit stays on the avx512f path; SSE/AVX2 8/16-bit scalar fallback is hardware-forced and unchanged.
1 parent 262f5a7 commit e592d54

4 files changed

Lines changed: 113 additions & 1 deletion

File tree

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,53 @@ namespace xsimd
378378
}
379379
}
380380

381+
// load_masked / store_masked: native vmovdqu8 / vmovdqu16 predication for
382+
// 8/16-bit, replacing the common scalar fallback. No aligned masked 8/16
383+
// intrinsic exists and masked moves never fault, so loadu fits both modes.
384+
template <class A, class T, class Mode,
385+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
386+
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx512bw>) noexcept
387+
{
388+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
389+
{
390+
return _mm512_maskz_loadu_epi8((__mmask64)mask.mask(), mem);
391+
}
392+
else
393+
{
394+
return _mm512_maskz_loadu_epi16((__mmask32)mask.mask(), mem);
395+
}
396+
}
397+
398+
template <class A, class T, class Mode,
399+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
400+
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx512bw>) noexcept
401+
{
402+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
403+
{
404+
_mm512_mask_storeu_epi8((void*)mem, (__mmask64)mask.mask(), src);
405+
}
406+
else
407+
{
408+
_mm512_mask_storeu_epi16((void*)mem, (__mmask32)mask.mask(), src);
409+
}
410+
}
411+
412+
// Constant masks reuse the runtime overloads; as_batch_bool() also avoids
413+
// batch_bool_constant::mask() truncating a 64-lane int8 mask to int.
414+
template <class A, class T, bool... Values, class Mode,
415+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
416+
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx512bw>) noexcept
417+
{
418+
return load_masked(mem, mask.as_batch_bool(), convert<T> {}, Mode {}, avx512bw {});
419+
}
420+
421+
template <class A, class T, bool... Values, class Mode,
422+
class = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 1 || sizeof(T) == 2)>>
423+
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx512bw>) noexcept
424+
{
425+
store_masked(mem, src, mask.as_batch_bool(), Mode {}, avx512bw {});
426+
}
427+
381428
// max
382429
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
383430
XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept

include/xsimd/arch/xsimd_avx512f.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,8 @@ namespace xsimd
356356

357357
// Runtime-mask load/store: same native k-register path as the constant
358358
// overloads above, minus the compile-time half-forwarding. 8/16-bit
359-
// elements fall back to the common scalar path.
359+
// elements are handled natively by avx512bw (vmovdqu8 / vmovdqu16);
360+
// without AVX512BW they fall back to the common scalar path.
360361
template <class A, class T, class Mode,
361362
typename = std::enable_if_t<(sizeof(T) >= 4)>>
362363
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx512f>) noexcept

include/xsimd/arch/xsimd_avx512vl_128.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,66 +212,98 @@ namespace xsimd
212212
XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept
213213
{
214214
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
215+
{
215216
return _mm_maskz_load_epi32((__mmask8)m, mem);
217+
}
216218
else
219+
{
217220
return _mm_maskz_loadu_epi32((__mmask8)m, mem);
221+
}
218222
}
219223
template <class T, class Mode, enable_sized_integral_t<T, 8> = 0>
220224
XSIMD_INLINE __m128i maskload128(T const* mem, uint64_t m, Mode) noexcept
221225
{
222226
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
227+
{
223228
return _mm_maskz_load_epi64((__mmask8)m, mem);
229+
}
224230
else
231+
{
225232
return _mm_maskz_loadu_epi64((__mmask8)m, mem);
233+
}
226234
}
227235
template <class Mode>
228236
XSIMD_INLINE __m128 maskload128(float const* mem, uint64_t m, Mode) noexcept
229237
{
230238
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
239+
{
231240
return _mm_maskz_load_ps((__mmask8)m, mem);
241+
}
232242
else
243+
{
233244
return _mm_maskz_loadu_ps((__mmask8)m, mem);
245+
}
234246
}
235247
template <class Mode>
236248
XSIMD_INLINE __m128d maskload128(double const* mem, uint64_t m, Mode) noexcept
237249
{
238250
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
251+
{
239252
return _mm_maskz_load_pd((__mmask8)m, mem);
253+
}
240254
else
255+
{
241256
return _mm_maskz_loadu_pd((__mmask8)m, mem);
257+
}
242258
}
243259

244260
template <class T, class Mode, enable_sized_integral_t<T, 4> = 0>
245261
XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept
246262
{
247263
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
264+
{
248265
_mm_mask_store_epi32(mem, (__mmask8)m, src);
266+
}
249267
else
268+
{
250269
_mm_mask_storeu_epi32(mem, (__mmask8)m, src);
270+
}
251271
}
252272
template <class T, class Mode, enable_sized_integral_t<T, 8> = 0>
253273
XSIMD_INLINE void maskstore128(T* mem, __m128i src, uint64_t m, Mode) noexcept
254274
{
255275
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
276+
{
256277
_mm_mask_store_epi64(mem, (__mmask8)m, src);
278+
}
257279
else
280+
{
258281
_mm_mask_storeu_epi64(mem, (__mmask8)m, src);
282+
}
259283
}
260284
template <class Mode>
261285
XSIMD_INLINE void maskstore128(float* mem, __m128 src, uint64_t m, Mode) noexcept
262286
{
263287
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
288+
{
264289
_mm_mask_store_ps(mem, (__mmask8)m, src);
290+
}
265291
else
292+
{
266293
_mm_mask_storeu_ps(mem, (__mmask8)m, src);
294+
}
267295
}
268296
template <class Mode>
269297
XSIMD_INLINE void maskstore128(double* mem, __m128d src, uint64_t m, Mode) noexcept
270298
{
271299
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
300+
{
272301
_mm_mask_store_pd(mem, (__mmask8)m, src);
302+
}
273303
else
304+
{
274305
_mm_mask_storeu_pd(mem, (__mmask8)m, src);
306+
}
275307
}
276308
}
277309

include/xsimd/arch/xsimd_avx512vl_256.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,66 +211,98 @@ namespace xsimd
211211
XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept
212212
{
213213
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
214+
{
214215
return _mm256_maskz_load_epi32((__mmask8)m, mem);
216+
}
215217
else
218+
{
216219
return _mm256_maskz_loadu_epi32((__mmask8)m, mem);
220+
}
217221
}
218222
template <class T, class Mode, enable_sized_integral_t<T, 8> = 0>
219223
XSIMD_INLINE __m256i maskload256(T const* mem, uint64_t m, Mode) noexcept
220224
{
221225
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
226+
{
222227
return _mm256_maskz_load_epi64((__mmask8)m, mem);
228+
}
223229
else
230+
{
224231
return _mm256_maskz_loadu_epi64((__mmask8)m, mem);
232+
}
225233
}
226234
template <class Mode>
227235
XSIMD_INLINE __m256 maskload256(float const* mem, uint64_t m, Mode) noexcept
228236
{
229237
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
238+
{
230239
return _mm256_maskz_load_ps((__mmask8)m, mem);
240+
}
231241
else
242+
{
232243
return _mm256_maskz_loadu_ps((__mmask8)m, mem);
244+
}
233245
}
234246
template <class Mode>
235247
XSIMD_INLINE __m256d maskload256(double const* mem, uint64_t m, Mode) noexcept
236248
{
237249
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
250+
{
238251
return _mm256_maskz_load_pd((__mmask8)m, mem);
252+
}
239253
else
254+
{
240255
return _mm256_maskz_loadu_pd((__mmask8)m, mem);
256+
}
241257
}
242258

243259
template <class T, class Mode, enable_sized_integral_t<T, 4> = 0>
244260
XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept
245261
{
246262
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
263+
{
247264
_mm256_mask_store_epi32(mem, (__mmask8)m, src);
265+
}
248266
else
267+
{
249268
_mm256_mask_storeu_epi32(mem, (__mmask8)m, src);
269+
}
250270
}
251271
template <class T, class Mode, enable_sized_integral_t<T, 8> = 0>
252272
XSIMD_INLINE void maskstore256(T* mem, __m256i src, uint64_t m, Mode) noexcept
253273
{
254274
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
275+
{
255276
_mm256_mask_store_epi64(mem, (__mmask8)m, src);
277+
}
256278
else
279+
{
257280
_mm256_mask_storeu_epi64(mem, (__mmask8)m, src);
281+
}
258282
}
259283
template <class Mode>
260284
XSIMD_INLINE void maskstore256(float* mem, __m256 src, uint64_t m, Mode) noexcept
261285
{
262286
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
287+
{
263288
_mm256_mask_store_ps(mem, (__mmask8)m, src);
289+
}
264290
else
291+
{
265292
_mm256_mask_storeu_ps(mem, (__mmask8)m, src);
293+
}
266294
}
267295
template <class Mode>
268296
XSIMD_INLINE void maskstore256(double* mem, __m256d src, uint64_t m, Mode) noexcept
269297
{
270298
XSIMD_IF_CONSTEXPR(std::is_same<Mode, aligned_mode>::value)
299+
{
271300
_mm256_mask_store_pd(mem, (__mmask8)m, src);
301+
}
272302
else
303+
{
273304
_mm256_mask_storeu_pd(mem, (__mmask8)m, src);
305+
}
274306
}
275307
}
276308

0 commit comments

Comments
 (0)