Skip to content

Commit 262f5a7

Browse files
committed
refactor: trim masked load/store comments and consolidate AVX2-128 helpers
Shorten verbose comments around masked load/store paths, drop the sizeof(int)/sizeof(long long) static_asserts (intrinsic boundaries now reinterpret_cast at the call site), and collapse the four maskload_128/maskstore_128 detail overloads into two XSIMD_IF_CONSTEXPR- dispatched templates. Public surface unchanged.
1 parent e71227e commit 262f5a7

11 files changed

Lines changed: 282 additions & 373 deletions

File tree

.github/workflows/doxygen.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ jobs:
99
steps:
1010
- uses: actions/checkout@v6
1111
- name: Install dependencies
12-
run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme
12+
run: |
13+
sudo apt-get update
14+
sudo apt-get install -y doxygen python3-breathe python3-sphinx-rtd-theme
1315
- name: Render
1416
run: make -C docs

include/xsimd/arch/common/xsimd_common_memory.hpp

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -441,12 +441,8 @@ namespace xsimd
441441
XSIMD_INLINE batch<T, A>
442442
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<common>) noexcept
443443
{
444-
// Per-lane validity contract: only active lanes of ``mem`` are
445-
// required to be addressable. An unconditional whole-vector load
446-
// would touch inactive lanes and trip ASan/Valgrind on partial
447-
// buffers, so stay scalar. Arches with hardware predicated loads
448-
// (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single
449-
// intrinsic that suppresses inactive-lane reads in hardware.
444+
// Scalar fallback: only active lanes are touched. Arches with
445+
// hardware predicated loads override this.
450446
constexpr std::size_t size = batch<T, A>::size;
451447
alignas(A::alignment()) std::array<T, size> buffer;
452448
for (std::size_t i = 0; i < size; ++i)
@@ -465,12 +461,8 @@ namespace xsimd
465461
XSIMD_INLINE void
466462
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<common>) noexcept
467463
{
468-
// Per-lane validity contract (matches native masked-store APIs):
469-
// only active lanes of ``mem`` are touched. A load+select+store
470-
// RMW would both read and write inactive bytes, breaking that
471-
// contract — stay scalar. Arches with hardware predicated stores
472-
// override this with a single intrinsic that suppresses inactive
473-
// lanes in hardware.
464+
// Scalar fallback: only active lanes are touched. Arches with
465+
// hardware predicated stores override this.
474466
constexpr std::size_t size = batch<T, A>::size;
475467
alignas(A::alignment()) std::array<T, size> src_buf;
476468
src.store_aligned(src_buf.data());

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -987,9 +987,7 @@ namespace xsimd
987987
}
988988
}
989989

990-
// Runtime-mask load for float/double on AVX. Both aligned_mode and
991-
// unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault
992-
// on masked-off lanes, so partial loads across page boundaries are safe.
990+
// Runtime-mask load (float/double).
993991
template <class A, class Mode>
994992
XSIMD_INLINE batch<float, A>
995993
load_masked(float const* mem, batch_bool<float, A> mask, convert<float>, Mode, requires_arch<avx>) noexcept
@@ -1036,12 +1034,8 @@ namespace xsimd
10361034
// store_masked
10371035
namespace detail
10381036
{
1039-
// True when batch_bool<T, A> is the legacy VEX vector mask, i.e. it is stored
1040-
// in the same register as the data (__m256 / __m256d) rather than in an EVEX
1041-
// k-register (__mmask8) as on the avx512vl architectures. The _mm256_cast*_si256
1042-
// path below is only well-formed for the vector-mask representation. This names
1043-
// no architecture — it tests the mask's representation, in the spirit of
1044-
// detail::masked_memory_uses_fp_bitcast.
1037+
// True when batch_bool<T, A> shares the data register (__m256/__m256d) rather
1038+
// than an EVEX k-register; the _mm256_cast*_si256 path below needs the former.
10451039
template <class T, class A>
10461040
using uses_vector_mask = std::is_same<typename batch_bool<T, A>::register_type,
10471041
typename batch<T, A>::register_type>;
@@ -1087,8 +1081,7 @@ namespace xsimd
10871081
}
10881082
}
10891083

1090-
// Runtime-mask store for float/double on AVX. Same fault-suppression
1091-
// semantics as the masked loads above; alignment mode is irrelevant.
1084+
// Runtime-mask store (float/double).
10921085
template <class A, class Mode>
10931086
XSIMD_INLINE void
10941087
store_masked(float* mem, batch<float, A> const& src, batch_bool<float, A> mask, Mode, requires_arch<avx>) noexcept

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 28 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,34 @@ namespace xsimd
117117
}
118118
}
119119

120-
// load_masked
121-
// AVX2 low-level helpers (operate on raw SIMD registers)
120+
// load_masked / store_masked: AVX2 has _mm256_maskload/maskstore_epi{32,64};
121+
// 8/16-bit integers fall back to the common scalar path.
122122
namespace detail
123123
{
124-
XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
124+
template <class T>
125+
XSIMD_INLINE __m256i maskload(T const* mem, __m256i mask) noexcept
125126
{
126-
return _mm256_maskload_epi32(mem, mask);
127+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
128+
{
129+
return _mm256_maskload_epi32(reinterpret_cast<int const*>(mem), mask);
130+
}
131+
else
132+
{
133+
return _mm256_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
134+
}
127135
}
128136

129-
XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept
137+
template <class T>
138+
XSIMD_INLINE void maskstore(T* mem, __m256i mask, __m256i src) noexcept
130139
{
131-
return _mm256_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
140+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
141+
{
142+
_mm256_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
143+
}
144+
else
145+
{
146+
_mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
147+
}
132148
}
133149

134150
XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept
@@ -137,72 +153,22 @@ namespace xsimd
137153
}
138154
}
139155

140-
// single templated implementation for integer masked loads (32/64-bit)
141156
template <class A, class T, bool... Values, class Mode>
142-
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
157+
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
143158
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
144159
{
145-
static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
146-
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
147-
// Use the raw register-level maskload helpers for the remaining cases.
148-
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
149-
}
150-
151-
template <class A, bool... Values, class Mode>
152-
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept
153-
{
154-
return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});
155-
}
156-
157-
template <class A, bool... Values, class Mode>
158-
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept
159-
{
160-
const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});
161-
return bitwise_cast<uint32_t>(r);
160+
return detail::maskload(mem, mask.as_batch());
162161
}
163162

164-
template <class A, bool... Values, class Mode>
165-
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept
166-
{
167-
return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});
168-
}
169-
170-
template <class A, bool... Values, class Mode>
171-
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept
172-
{
173-
const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});
174-
return bitwise_cast<uint64_t>(r);
175-
}
176-
177-
// Runtime-mask load for 32/64-bit integers on AVX2; narrower widths fall
178-
// back to the scalar common path. Aligned and unaligned share the same
179-
// intrinsic — masked-off lanes do not fault regardless of alignment.
180163
template <class A, class T, class Mode>
181164
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
182165
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
183166
{
184-
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
185-
return detail::maskload(reinterpret_cast<const int_t*>(mem), __m256i(mask));
186-
}
187-
188-
// store_masked
189-
namespace detail
190-
{
191-
template <class T, class A>
192-
XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept
193-
{
194-
_mm256_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
195-
}
196-
197-
template <class T, class A>
198-
XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept
199-
{
200-
_mm256_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
201-
}
167+
return detail::maskload(mem, __m256i(mask));
202168
}
203169

204170
template <class A, class T, bool... Values, class Mode,
205-
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4)>>
171+
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>>
206172
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
207173
{
208174
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
@@ -225,33 +191,15 @@ namespace xsimd
225191
}
226192
else
227193
{
228-
detail::maskstore<T, A>(mem, mask.as_batch(), src);
194+
detail::maskstore(mem, mask.as_batch(), src);
229195
}
230196
}
231197

232-
template <class A, bool... Values, class Mode>
233-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
234-
{
235-
const auto s32 = bitwise_cast<int32_t>(src);
236-
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
237-
}
238-
239-
template <class A, bool... Values, class Mode>
240-
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
241-
{
242-
const auto s64 = bitwise_cast<int64_t>(src);
243-
store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
244-
}
245-
246-
// Runtime-mask store for 32/64-bit integers on AVX2; narrower widths fall
247-
// back to the scalar common path. Same fault-suppression semantics as the
248-
// masked loads above; alignment mode is irrelevant.
249198
template <class A, class T, class Mode>
250199
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
251200
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2>) noexcept
252201
{
253-
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, int64_t>;
254-
detail::maskstore<int_t, A>(reinterpret_cast<int_t*>(mem), __m256i(mask), __m256i(src));
202+
detail::maskstore(mem, __m256i(mask), __m256i(src));
255203
}
256204

257205
// load_stream

include/xsimd/arch/xsimd_avx2_128.hpp

Lines changed: 40 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -89,91 +89,65 @@ namespace xsimd
8989
}
9090
}
9191

92-
// load_masked — native 128-bit integer masked loads. Tagged on avx2_128
93-
// because the vpmaskmov* intrinsics require AVX2; an AVX1-only build routes
94-
// integer masked memory through the float path in xsimd_common_memory.hpp.
95-
// Any arch with a native masked path provides its own exact-tag overload that
96-
// out-ranks this one, so no cross-arch exclusion is needed here.
97-
template <class A, bool... Values, class Mode>
98-
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2_128>) noexcept
99-
{
100-
return _mm_maskload_epi32(mem, mask.as_batch());
101-
}
102-
template <class A, bool... Values, class Mode>
103-
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx2_128>) noexcept
104-
{
105-
return _mm_maskload_epi32(reinterpret_cast<int32_t const*>(mem), mask.as_batch());
106-
}
107-
template <class A, bool... Values, class Mode>
108-
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2_128>) noexcept
109-
{
110-
return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask.as_batch());
111-
}
112-
template <class A, bool... Values, class Mode>
113-
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<uint64_t>, Mode, requires_arch<avx2_128>) noexcept
92+
// load_masked / store_masked: native 128-bit integer masked memory.
93+
// Tagged on avx2_128 because vpmaskmov* needs AVX2; an AVX1-only build
94+
// routes integer masked memory through the float path in
95+
// xsimd_common_memory.hpp. 8/16-bit fall back to the common scalar path.
96+
namespace detail
11497
{
115-
return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask.as_batch());
116-
}
98+
template <class T>
99+
XSIMD_INLINE __m128i maskload_avx2_128(T const* mem, __m128i mask) noexcept
100+
{
101+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
102+
{
103+
return _mm_maskload_epi32(reinterpret_cast<int const*>(mem), mask);
104+
}
105+
else
106+
{
107+
return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask);
108+
}
109+
}
117110

118-
// store_masked — native 128-bit integer masked stores (see load note above).
119-
template <class A, bool... Values, class Mode>
120-
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
121-
{
122-
return _mm_maskstore_epi32(mem, mask.as_batch(), src);
123-
}
124-
template <class A, bool... Values, class Mode>
125-
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
126-
{
127-
return _mm_maskstore_epi32(reinterpret_cast<int32_t*>(mem), mask.as_batch(), src);
128-
}
129-
template <class A, bool... Values, class Mode>
130-
XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
131-
{
132-
return _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask.as_batch(), src);
111+
template <class T>
112+
XSIMD_INLINE void maskstore_avx2_128(T* mem, __m128i mask, __m128i src) noexcept
113+
{
114+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
115+
{
116+
_mm_maskstore_epi32(reinterpret_cast<int*>(mem), mask, src);
117+
}
118+
else
119+
{
120+
_mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask, src);
121+
}
122+
}
133123
}
134-
template <class A, bool... Values, class Mode>
135-
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
124+
125+
template <class A, class T, bool... Values, class Mode,
126+
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>>
127+
XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
136128
{
137-
return _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask.as_batch(), src);
129+
return detail::maskload_avx2_128(mem, mask.as_batch());
138130
}
139131

140-
// Runtime-mask path for 32/64-bit integers; narrower widths fall back to
141-
// the common scalar path. Aligned and unaligned share the same intrinsic
142-
// — masked-off lanes do not fault regardless of alignment.
143-
namespace detail
132+
template <class A, class T, bool... Values, class Mode,
133+
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8)>>
134+
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
144135
{
145-
XSIMD_INLINE __m128i maskload_128(int32_t const* mem, __m128i mask) noexcept
146-
{
147-
return _mm_maskload_epi32(mem, mask);
148-
}
149-
XSIMD_INLINE __m128i maskload_128(long long const* mem, __m128i mask) noexcept
150-
{
151-
return _mm_maskload_epi64(mem, mask);
152-
}
153-
XSIMD_INLINE void maskstore_128(int32_t* mem, __m128i mask, __m128i src) noexcept
154-
{
155-
_mm_maskstore_epi32(mem, mask, src);
156-
}
157-
XSIMD_INLINE void maskstore_128(long long* mem, __m128i mask, __m128i src) noexcept
158-
{
159-
_mm_maskstore_epi64(mem, mask, src);
160-
}
136+
detail::maskstore_avx2_128(mem, mask.as_batch(), __m128i(src));
161137
}
162138

163139
template <class A, class T, class Mode>
164140
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), batch<T, A>>
165141
load_masked(T const* mem, batch_bool<T, A> mask, convert<T>, Mode, requires_arch<avx2_128>) noexcept
166142
{
167-
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
168-
return detail::maskload_128(reinterpret_cast<int_t const*>(mem), __m128i(mask));
143+
return detail::maskload_avx2_128(mem, __m128i(mask));
169144
}
170145

171146
template <class A, class T, class Mode>
172147
XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 4 || sizeof(T) == 8), void>
173148
store_masked(T* mem, batch<T, A> const& src, batch_bool<T, A> mask, Mode, requires_arch<avx2_128>) noexcept
174149
{
175-
using int_t = std::conditional_t<sizeof(T) == 4, int32_t, long long>;
176-
detail::maskstore_128(reinterpret_cast<int_t*>(mem), __m128i(mask), __m128i(src));
150+
detail::maskstore_avx2_128(mem, __m128i(mask), __m128i(src));
177151
}
178152

179153
// gather

0 commit comments

Comments
 (0)