Commit d5f21c7

feat: add runtime batch_bool mask overloads for avx_128 / avx2_128
Mirror the AVX/AVX2 runtime-mask load_masked / store_masked overloads on the new 128-bit SSE-register variants of those ISAs:

- avx_128: float / double via _mm_maskload_ps/pd and _mm_maskstore_ps/pd
- avx2_128: 32/64-bit integers via _mm_maskload_epi32/64 and _mm_maskstore_epi32/64

8/16-bit integers continue to fall through to the scalar common path (there is no native maskload/store intrinsic at those widths). Both alignment modes route to the same intrinsic, since masked-off lanes do not fault.
1 parent e227346 commit d5f21c7

2 files changed: 68 additions & 0 deletions
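Not part of the commit itself, but a minimal standalone sketch of the property the message leans on: _mm_maskload_ps / _mm_maskstore_ps never touch masked-off lanes, so the same intrinsic is safe for aligned and unaligned pointers alike. Everything below is plain AVX intrinsics (compile with e.g. -mavx); none of the names come from xsimd.

#include <immintrin.h>
#include <cstdio>

int main()
{
    // A 3-element "tail": a full 4-lane load could read one float past the end.
    float tail[3] = { 1.0f, 2.0f, 3.0f };
    // A lane participates when the sign bit of its 32-bit mask element is set.
    __m128i mask = _mm_setr_epi32(-1, -1, -1, 0);
    __m128 v = _mm_maskload_ps(tail, mask);   // lane 3 is never read (loads as 0.0f)
    v = _mm_add_ps(v, v);
    float out[3] = { 0.0f, 0.0f, 0.0f };
    _mm_maskstore_ps(out, mask, v);           // lane 3 is never written
    std::printf("%g %g %g\n", out[0], out[1], out[2]);   // 2 4 6
    return 0;
}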

include/xsimd/arch/xsimd_avx2_128.hpp

Lines changed: 37 additions & 0 deletions
@@ -133,6 +133,43 @@ namespace xsimd
             return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src);
         }
 
+        // Runtime-mask load for 32/64-bit integers on AVX2-128. 8/16-bit
+        // integers fall back to the scalar common path: there is no native
+        // _mm_maskload for those widths, and a load-then-blend would break
+        // fault-suppression at page boundaries (the main reason callers ask
+        // for a masked load). Both aligned_mode and unaligned_mode route to
+        // the same intrinsic — masked-off lanes do not fault regardless of
+        // alignment.
+        template <class A, class T, class Mode>
+        XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
+        load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_maskload_epi32(reinterpret_cast<const int*>(mem), __m128i(mask));
+            }
+            else
+            {
+                return _mm_maskload_epi64(reinterpret_cast<const long long*>(mem), __m128i(mask));
+            }
+        }
+
+        // Runtime-mask store for 32/64-bit integers on AVX2-128. Same
+        // fault-suppression semantics as the masked loads above.
+        template <class A, class T, class Mode>
+        XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
+        store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                _mm_maskstore_epi32(reinterpret_cast<int*>(mem), __m128i(mask), __m128i(src));
+            }
+            else
+            {
+                _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), __m128i(mask), __m128i(src));
+            }
+        }
+
         // gather
         template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
         XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
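For illustration only (not from the diff): the same pattern with the AVX2 integer intrinsics the overloads above wrap, using a runtime element count to build the mask. tail_mask_epi32 is a hypothetical helper for this sketch; xsimd derives the mask from a batch_bool instead. Assumes an AVX2-enabled build (e.g. -mavx2).

#include <immintrin.h>
#include <cstdio>

// Hypothetical helper: first `n` of 4 lanes active (all-ones), the rest zero.
static __m128i tail_mask_epi32(int n)
{
    const __m128i lane_ids = _mm_setr_epi32(0, 1, 2, 3);
    return _mm_cmpgt_epi32(_mm_set1_epi32(n), lane_ids); // lane i active iff i < n
}

int main()
{
    int src[3] = { 10, 20, 30 };   // only 3 valid elements, no 4th to over-read
    int dst[3] = { 0, 0, 0 };
    __m128i mask = tail_mask_epi32(3);
    __m128i v = _mm_maskload_epi32(src, mask);             // lane 3 is never read
    v = _mm_add_epi32(v, _mm_set1_epi32(1));
    _mm_maskstore_epi32(dst, mask, v);                     // lane 3 is never written
    std::printf("%d %d %d\n", dst[0], dst[1], dst[2]);     // 11 21 31
    return 0;
}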

include/xsimd/arch/xsimd_avx_128.hpp

Lines changed: 31 additions & 0 deletions
@@ -115,6 +115,22 @@ namespace xsimd
             return _mm_maskload_pd(mem, mask.as_batch());
         }
 
+        // Runtime-mask load for float/double on AVX-128. Both aligned_mode and
+        // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault
+        // on masked-off lanes, so partial loads across page boundaries are safe.
+        template <class A, class Mode>
+        XSIMD_INLINE batch<float, A>
+        load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_ps(mem, _mm_castps_si128(mask));
+        }
+        template <class A, class Mode>
+        XSIMD_INLINE batch<double, A>
+        load_masked(double const* mem, batch_bool<double, A> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_pd(mem, _mm_castpd_si128(mask));
+        }
+
         // store_masked
         template <class A, bool... Values, class Mode>
         XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
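A quick standalone look at the cast the two load overloads above perform (not from the diff): the runtime mask lives in an FP register (that is what a batch_bool over float/double is on this ISA), and _mm_castpd_si128 only reinterprets its bits for the __m128i mask operand of _mm_maskload_pd. Plain AVX intrinsics, -mavx assumed.

#include <immintrin.h>
#include <cstdio>

int main()
{
    double tail[1] = { 42.0 };   // a 1-element tail of doubles
    // Runtime mask as the FP side sees it: all-ones in lane 0, zero in lane 1.
    __m128d fp_mask = _mm_castsi128_pd(_mm_set_epi64x(0, -1));
    // Bit-cast only; no conversion and no extra instruction in practice.
    __m128d v = _mm_maskload_pd(tail, _mm_castpd_si128(fp_mask));  // lane 1 never read
    double out[2] = { -1.0, -1.0 };
    _mm_storeu_pd(out, v);
    std::printf("%g %g\n", out[0], out[1]);   // 42 0  (masked-off lane loads as zero)
    return 0;
}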
@@ -128,6 +144,21 @@
             return _mm_maskstore_pd(mem, mask.as_batch(), src);
         }
 
+        // Runtime-mask store for float/double on AVX-128. Same fault-suppression
+        // semantics as the masked loads above; alignment mode is irrelevant.
+        template <class A, class Mode>
+        XSIMD_INLINE void
+        store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            _mm_maskstore_ps(mem, _mm_castps_si128(mask), src);
+        }
+        template <class A, class Mode>
+        XSIMD_INLINE void
+        store_masked(double* mem, batch<double, A> const& src, batch_bool<double, A> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src);
+        }
+
         // swizzle (dynamic mask)
         template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
         XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
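Again for illustration (not part of the diff): a data-dependent runtime mask is the typical use for these store overloads. Here a comparison produces the mask and _mm_maskstore_ps overwrites only the lanes that matched, leaving the other elements' memory untouched. Plain AVX intrinsics, -mavx assumed.

#include <immintrin.h>
#include <cstdio>

int main()
{
    float data[4] = { -1.5f, 2.0f, -0.25f, 3.0f };
    __m128 v = _mm_loadu_ps(data);
    // Mask is active exactly where the element is negative.
    __m128 is_neg = _mm_cmplt_ps(v, _mm_setzero_ps());
    // Clamp negatives to zero in memory; positive lanes are never written.
    _mm_maskstore_ps(data, _mm_castps_si128(is_neg), _mm_setzero_ps());
    std::printf("%g %g %g %g\n", data[0], data[1], data[2], data[3]);  // 0 2 0 3
    return 0;
}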
