Skip to content

Commit d571f2f

Browse files
committed
fix: harden fast-math reassociation barriers
Use arch-specific register constraints to prevent -ffast-math from reassociating arithmetic without forcing a register spill to the stack. Each platform's base arch header provides a reassociation_barrier overload using the tightest register constraint for that target: - x86 (sse2.hpp): "+x" — XMM/YMM/ZMM - ARM (neon.hpp): "+w" — NEON vector - ARM (sve.hpp): "+w" — SVE Z-register - PPC (vsx.hpp): "+wa" — VS register - fallback (common): "r"(&x) + "memory" clobber The x86 overload uses template<T,A> to catch all x86 arches (sse2, avx, avx512f and descendants) via overload resolution against the common fallback's requires_arch<common>. Also adds a mandatory const char* reason parameter to document why each barrier exists at each call site, and removes the now-unused memory_barrier_tag.
1 parent e2f0536 commit d571f2f

File tree

10 files changed

+112
-39
lines changed

10 files changed

+112
-39
lines changed

include/xsimd/arch/common/xsimd_common_details.hpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,48 @@ namespace xsimd
111111

112112
namespace detail
113113
{
114+
// Prevent -ffast-math from reassociating arithmetic across this
115+
// point. The const char* argument documents *why* the barrier
116+
// exists at each call site; it is unused at runtime.
117+
//
118+
// Two overloads:
119+
// reassociation_barrier(reg, reason) – raw register
120+
// reassociation_barrier(batch, reason) – extracts .data
121+
//
122+
// Uses the tightest register-class constraint for the target so
123+
// the value stays in its native SIMD register (no spill):
124+
// x86 (SSE/AVX/AVX-512) : "+x" – XMM / YMM / ZMM
125+
// ARM (NEON / SVE) : "+w" – vector / SVE Z-reg
126+
// PPC (VSX) : "+wa" – VS register
127+
// other / MSVC : address + memory clobber (fallback)
128+
template <class T>
129+
XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept
130+
{
131+
#if XSIMD_WITH_INLINE_ASM
132+
#if !XSIMD_WITH_EMULATED && !defined(__EMSCRIPTEN__)
133+
#if XSIMD_WITH_SSE2
134+
__asm__ volatile("" : "+x"(x));
135+
#elif XSIMD_WITH_NEON || XSIMD_WITH_SVE
136+
__asm__ volatile("" : "+w"(x));
137+
#elif XSIMD_WITH_VSX
138+
__asm__ volatile("" : "+wa"(x));
139+
#else
140+
__asm__ volatile("" : : "r"(&x) : "memory");
141+
#endif
142+
#else
143+
__asm__ volatile("" : : "r"(&x) : "memory");
144+
#endif
145+
#else
146+
(void)x;
147+
#endif
148+
}
149+
150+
template <class T, class A>
151+
XSIMD_INLINE void reassociation_barrier(batch<T, A>& b, const char* reason) noexcept
152+
{
153+
reassociation_barrier(b.data, reason);
154+
}
155+
114156
template <class F, class A, class T, class... Batches>
115157
XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
116158
{

include/xsimd/arch/common/xsimd_common_math.hpp

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,7 @@ namespace xsimd
878878
{
879879
batch_type k = nearbyint(a);
880880
x = (a - k) * constants::log_2<batch_type>();
881+
detail::reassociation_barrier(x, "keep reduced exponent ordered before finalize");
881882
return k;
882883
}
883884

@@ -937,7 +938,10 @@ namespace xsimd
937938
template <class A, class T>
938939
XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& self, requires_arch<common>) noexcept
939940
{
940-
return detail::exp<detail::exp10_tag>(self);
941+
using batch_type = batch<T, A>;
942+
batch_type out = detail::exp<detail::exp10_tag>(self);
943+
detail::reassociation_barrier(out, "prevent folding exp10 for literal inputs");
944+
return out;
941945
}
942946

943947
// exp2
@@ -1494,6 +1498,7 @@ namespace xsimd
14941498
batch_type R = t2 + t1;
14951499
batch_type hfsq = batch_type(0.5) * f * f;
14961500
batch_type dk = to_float(k);
1501+
detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling");
14971502
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
14981503
#ifdef __FAST_MATH__
14991504
return r;
@@ -1525,6 +1530,7 @@ namespace xsimd
15251530
hx += 0x3ff00000 - 0x3fe6a09e;
15261531
k += (hx >> 20) - 0x3ff;
15271532
batch_type dk = to_float(k);
1533+
detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling");
15281534
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
15291535
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
15301536

@@ -1705,6 +1711,7 @@ namespace xsimd
17051711
batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
17061712
batch_type R = t2 + t1;
17071713
batch_type dk = to_float(k);
1714+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
17081715
batch_type hfsq = batch_type(0.5) * f * f;
17091716
batch_type hibits = f - hfsq;
17101717
hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
@@ -1752,10 +1759,11 @@ namespace xsimd
17521759
#endif
17531760
hx += 0x3ff00000 - 0x3fe6a09e;
17541761
k += (hx >> 20) - 0x3ff;
1762+
batch_type dk = to_float(k);
1763+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
17551764
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
17561765
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
17571766
batch_type f = --x;
1758-
batch_type dk = to_float(k);
17591767
batch_type s = f / (batch_type(2.) + f);
17601768
batch_type z = s * s;
17611769
batch_type w = z * z;
@@ -1818,6 +1826,7 @@ namespace xsimd
18181826
batch_type R = t2 + t1;
18191827
batch_type hfsq = batch_type(0.5) * f * f;
18201828
batch_type dk = to_float(k);
1829+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
18211830
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
18221831
batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
18231832
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
@@ -1853,6 +1862,7 @@ namespace xsimd
18531862
batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
18541863
batch_type R = t2 + t1;
18551864
batch_type dk = to_float(k);
1865+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
18561866
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
18571867
#ifdef __FAST_MATH__
18581868
return r;
@@ -1900,17 +1910,9 @@ namespace xsimd
19001910
batch_type s = bitofsign(self);
19011911
batch_type v = self ^ s;
19021912
batch_type t2n = constants::twotonmb<batch_type>();
1903-
// Under fast-math, reordering is possible and the compiler optimizes d
1904-
// to v. That's not what we want, so prevent compiler optimization here.
1905-
// FIXME: it may be better to emit a memory barrier here (?).
1906-
#ifdef __FAST_MATH__
19071913
batch_type d0 = v + t2n;
1908-
asm volatile("" ::"r"(&d0) : "memory");
1914+
detail::reassociation_barrier(d0, "prevent collapsing (v + 2^n) - 2^n back to v");
19091915
batch_type d = d0 - t2n;
1910-
#else
1911-
batch_type d0 = v + t2n;
1912-
batch_type d = d0 - t2n;
1913-
#endif
19141916
return s ^ select(v < t2n, d, v);
19151917
}
19161918
}
@@ -2192,12 +2194,16 @@ namespace xsimd
21922194
template <class A>
21932195
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
21942196
{
2195-
return fnma(nearbyint(self / other), other, self);
2197+
batch<float, A> q = nearbyint(self / other);
2198+
detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient");
2199+
return fnma(q, other, self);
21962200
}
21972201
template <class A>
21982202
XSIMD_INLINE batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<common>) noexcept
21992203
{
2200-
return fnma(nearbyint(self / other), other, self);
2204+
batch<double, A> q = nearbyint(self / other);
2205+
detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient");
2206+
return fnma(q, other, self);
22012207
}
22022208
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
22032209
XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept

include/xsimd/arch/common/xsimd_common_trigo.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,33 +551,45 @@ namespace xsimd
551551
{
552552
auto test = x > constants::pio4<B>();
553553
xr = x - constants::pio2_1<B>();
554+
detail::reassociation_barrier(xr, "ordered pio2 subtraction");
554555
xr -= constants::pio2_2<B>();
556+
detail::reassociation_barrier(xr, "ordered pio2 subtraction");
555557
xr -= constants::pio2_3<B>();
558+
detail::reassociation_barrier(xr, "ordered pio2 subtraction");
556559
xr = select(test, xr, x);
557560
return select(test, B(1.), B(0.));
558561
}
559562
else if (all(x <= constants::twentypi<B>()))
560563
{
561564
B xi = nearbyint(x * constants::twoopi<B>());
565+
detail::reassociation_barrier(xi, "preserve quadrant selection");
562566
xr = fnma(xi, constants::pio2_1<B>(), x);
567+
detail::reassociation_barrier(xr, "compensated range reduction");
563568
xr -= xi * constants::pio2_2<B>();
569+
detail::reassociation_barrier(xr, "compensated range reduction");
564570
xr -= xi * constants::pio2_3<B>();
571+
detail::reassociation_barrier(xr, "compensated range reduction");
565572
return quadrant(xi);
566573
}
567574
else if (all(x <= constants::mediumpi<B>()))
568575
{
569576
B fn = nearbyint(x * constants::twoopi<B>());
577+
detail::reassociation_barrier(fn, "multi-term range reduction");
570578
B r = x - fn * constants::pio2_1<B>();
579+
detail::reassociation_barrier(r, "multi-term range reduction");
571580
B w = fn * constants::pio2_1t<B>();
572581
B t = r;
573582
w = fn * constants::pio2_2<B>();
574583
r = t - w;
584+
detail::reassociation_barrier(r, "multi-term range reduction");
575585
w = fn * constants::pio2_2t<B>() - ((t - r) - w);
576586
t = r;
577587
w = fn * constants::pio2_3<B>();
578588
r = t - w;
589+
detail::reassociation_barrier(r, "multi-term range reduction");
579590
w = fn * constants::pio2_3t<B>() - ((t - r) - w);
580591
xr = r - w;
592+
detail::reassociation_barrier(xr, "multi-term range reduction");
581593
return quadrant(fn);
582594
}
583595
else

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -552,13 +552,7 @@ namespace xsimd
552552
0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
553553
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
554554
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
555-
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
556-
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
557-
// The asm barrier forces f into a register before the add, blocking
558-
// the reorder. It emits zero instructions.
559-
#if defined(__GNUC__)
560-
__asm__ volatile("" : "+x"(f));
561-
#endif
555+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
562556
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
563557
}
564558

@@ -574,10 +568,7 @@ namespace xsimd
574568
0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
575569
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
576570
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
577-
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
578-
#if defined(__GNUC__)
579-
__asm__ volatile("" : "+x"(f));
580-
#endif
571+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
581572
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
582573
}
583574
}

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,11 @@ namespace xsimd
101101
// Forward declarations for pack-level helpers
102102
namespace detail
103103
{
104+
template <class T>
105+
XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept;
106+
template <class T, class A>
107+
XSIMD_INLINE void reassociation_barrier(batch<T, A>& b, const char* reason) noexcept;
108+
104109
template <typename T, T... Vs>
105110
XSIMD_INLINE constexpr bool is_identity() noexcept;
106111
template <typename T, class A, T... Vs>
@@ -115,7 +120,6 @@ namespace xsimd
115120
XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant<T, A, Vs...>) noexcept;
116121
template <typename T, class A, T... Vs>
117122
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;
118-
119123
}
120124
}
121125
}

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,7 @@ namespace xsimd
716716
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
717717
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
718718
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
719+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
719720
return _mm_add_pd(f, _mm_castsi128_pd(xL));
720721
}
721722

@@ -730,6 +731,7 @@ namespace xsimd
730731
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
731732
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
732733
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
734+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
733735
return _mm_add_pd(f, _mm_castsi128_pd(xL));
734736
}
735737

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,7 @@ namespace xsimd
6262
xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
6363
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
6464
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
65-
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
66-
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
67-
// The asm barrier forces f into a register before the add, blocking
68-
// the reorder. It emits zero instructions.
69-
#if defined(__GNUC__)
70-
__asm__ volatile("" : "+x"(f));
71-
#endif
65+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
7266
return _mm_add_pd(f, _mm_castsi128_pd(xL));
7367
}
7468

@@ -80,10 +74,7 @@ namespace xsimd
8074
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
8175
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
8276
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
83-
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
84-
#if defined(__GNUC__)
85-
__asm__ volatile("" : "+x"(f));
86-
#endif
77+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
8778
return _mm_add_pd(f, _mm_castsi128_pd(xL));
8879
}
8980
}

include/xsimd/config/xsimd_config.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,27 @@
4444
#define XSIMD_TARGET_X86 0
4545
#endif
4646

47+
/**
48+
* @ingroup xsimd_config_macro
49+
*
50+
* Set to 1 if GNU-style inline assembly is available, to 0 otherwise.
51+
*/
52+
/* Use __clang__ || __GNUC__ for GNU-style inline asm. clang-cl runs in
53+
* MSVC-compatibility mode and does not define __GNUC__ by default, but it
54+
 * still defines __clang__. Clang documents support for both the asm and
 * __asm__ spellings of GNU-style inline assembly, and broad
55+
* GCC-extension compatibility:
56+
* https://clang.llvm.org/docs/LanguageExtensions.html
57+
* Clang only emits __GNUC__ when GNUCVersion != 0:
58+
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/lib/Frontend/InitPreprocessor.cpp
59+
* and GNUCVersion defaults to 0:
60+
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/include/clang/Basic/LangOptions.def
61+
*/
62+
#if defined(__clang__) || defined(__GNUC__)
63+
#define XSIMD_WITH_INLINE_ASM 1
64+
#else
65+
#define XSIMD_WITH_INLINE_ASM 0
66+
#endif
67+
4768
/**
4869
* @ingroup xsimd_config_macro
4970
*

include/xsimd/config/xsimd_cpu_features_x86.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -533,7 +533,7 @@ namespace xsimd
533533
// It was decided to keep the inline ASM version for maximum compatibility, as the difference
534534
// in ASM is negligible compared to the cost of CPUID.
535535
// https://github.com/xtensor-stack/xsimd/pull/1278
536-
#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
536+
#elif XSIMD_WITH_INLINE_ASM
537537

538538
#if defined(__i386__) && defined(__PIC__)
539539
// %ebx may be the PIC register
@@ -561,7 +561,7 @@ namespace xsimd
561561
#error "_MSC_VER < 1400 is not supported"
562562
#endif
563563

564-
#elif defined(__GNUC__)
564+
#elif XSIMD_WITH_INLINE_ASM
565565
x86_reg32_t xcr0 = {};
566566
__asm__(
567567
"xorl %%ecx, %%ecx\n"

test/test_xsimd_api.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -591,12 +591,16 @@ struct xsimd_api_float_types_functions
591591
void test_exp()
592592
{
593593
value_type val(2);
594+
#ifdef __FAST_MATH__
595+
CHECK_EQ(extract(xsimd::exp(T(val))), doctest::Approx(std::exp(val)));
596+
#else
594597
CHECK_EQ(extract(xsimd::exp(T(val))), std::exp(val));
598+
#endif
595599
}
596600
void test_exp10()
597601
{
598602
value_type val(2);
599-
#ifdef EMSCRIPTEN
603+
#if defined(EMSCRIPTEN) || defined(__FAST_MATH__)
600604
CHECK_EQ(extract(xsimd::exp10(T(val))), doctest::Approx(std::pow(value_type(10), val)));
601605
#else
602606
CHECK_EQ(extract(xsimd::exp10(T(val))), std::pow(value_type(10), val));

0 commit comments

Comments
 (0)