Skip to content

Commit edba47b

Browse files
committed
fix: harden fast-math reassociation barriers
- Add zero-cost register constraints for all supported architectures: x86 "+x", ARM NEON/SVE "+w", PPC VSX "+wa", RISC-V scalar "+f", RISC-V RVV "+vr" (GCC 15+ / Clang 20+).
- Replace the old "r"(&x):"memory" fallback with "+m", guarded by the new XSIMD_REASSOCIATIVE_MATH macro, so unknown targets only spill when the compiler can actually reassociate.
- Add the XSIMD_REASSOCIATIVE_MATH config macro, auto-detected from __FAST_MATH__ / __ASSOCIATIVE_MATH__ and user-overridable for Clang with standalone -fassociative-math.
- Add a std::array overload so emulated batches get per-element barriers instead of spilling the whole array.
- Add missing barriers in exp/exp2/exp10 range reduction (float and double) after nearbyint() to prevent compensated subtraction reordering.
- Add missing barriers in log2 (float and double) after to_float(k) to protect the Kahan compensated summation.
1 parent e2f0536 commit edba47b

File tree

10 files changed

+166
-39
lines changed

10 files changed

+166
-39
lines changed

include/xsimd/arch/common/xsimd_common_details.hpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,72 @@ namespace xsimd
111111

112112
namespace detail
113113
{
114+
// Prevent -ffast-math from reassociating floating-point
115+
// arithmetic across this point. The reason string
116+
// documents *why* at each call site; unused at runtime.
117+
//
118+
// Zero-cost register constraints per target:
119+
// x86 "+x" (XMM/YMM/ZMM, also scalar float/double)
120+
// ARM "+w" (V-reg / SVE Z-reg, also scalar float/double)
121+
// PPC "+wa" (VS register, also scalar float/double)
122+
// RISC-V "+f" (F/D register, scalar float/double)
123+
// RISC-V RVV "+vr" (V register; GCC 15+ / Clang 20+)
124+
//
125+
// On unknown targets the "+m" fallback spills; it is
126+
// guarded by XSIMD_REASSOCIATIVE_MATH so it is only
127+
// emitted when the compiler can actually reassociate.
128+
template <class T>
129+
XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept
130+
{
131+
#if XSIMD_WITH_INLINE_ASM && !defined(__EMSCRIPTEN__)
132+
#if XSIMD_WITH_SSE2
133+
__asm__ volatile("" : "+x"(x));
134+
#elif XSIMD_WITH_NEON || XSIMD_WITH_SVE
135+
__asm__ volatile("" : "+w"(x));
136+
#elif XSIMD_WITH_VSX
137+
__asm__ volatile("" : "+wa"(x));
138+
#elif XSIMD_REASSOCIATIVE_MATH
139+
__asm__ volatile("" : "+m"(x));
140+
#else
141+
(void)x;
142+
#endif
143+
#else
144+
(void)x;
145+
#endif
146+
}
147+
148+
// RISC-V scalar float/double: use F/D registers instead of
149+
// spilling through "+m". These overloads also serve
150+
// emulated batches on RISC-V via the std::array overload.
151+
#if XSIMD_WITH_INLINE_ASM && defined(__riscv)
152+
XSIMD_INLINE void reassociation_barrier(float& x, const char*) noexcept
153+
{
154+
__asm__ volatile("" : "+f"(x));
155+
}
156+
XSIMD_INLINE void reassociation_barrier(double& x, const char*) noexcept
157+
{
158+
__asm__ volatile("" : "+f"(x));
159+
}
160+
#endif
161+
162+
template <class T, size_t N>
163+
XSIMD_INLINE void reassociation_barrier(std::array<T, N>& arr, const char* reason) noexcept
164+
{
165+
for (auto& v : arr)
166+
reassociation_barrier(v, reason);
167+
}
168+
169+
template <class T, class A>
170+
XSIMD_INLINE void reassociation_barrier(batch<T, A>& b, const char* reason) noexcept
171+
{
172+
#if XSIMD_WITH_RVV && XSIMD_WITH_INLINE_ASM && ((__GNUC__ >= 15) || (__clang_major__ >= 20))
173+
__asm__ volatile("" : "+vr"(b.data.value.value));
174+
(void)reason;
175+
#else
176+
reassociation_barrier(b.data, reason);
177+
#endif
178+
}
179+
114180
template <class F, class A, class T, class... Batches>
115181
XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
116182
{

include/xsimd/arch/common/xsimd_common_math.hpp

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,9 @@ namespace xsimd
743743
static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
744744
{
745745
batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
746+
detail::reassociation_barrier(k, "compensated exp range reduction");
746747
x = fnma(k, constants::log_2hi<batch_type>(), a);
748+
detail::reassociation_barrier(x, "compensated exp range reduction");
747749
x = fnma(k, constants::log_2lo<batch_type>(), x);
748750
return k;
749751
}
@@ -769,7 +771,9 @@ namespace xsimd
769771
static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
770772
{
771773
batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
774+
detail::reassociation_barrier(k, "compensated exp10 range reduction");
772775
x = fnma(k, constants::log10_2hi<batch_type>(), a);
776+
detail::reassociation_barrier(x, "compensated exp10 range reduction");
773777
x -= k * constants::log10_2lo<batch_type>();
774778
return k;
775779
}
@@ -794,6 +798,7 @@ namespace xsimd
794798
static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
795799
{
796800
batch_type k = nearbyint(a);
801+
detail::reassociation_barrier(k, "compensated exp2 range reduction");
797802
x = (a - k);
798803
return k;
799804
}
@@ -819,7 +824,9 @@ namespace xsimd
819824
static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept
820825
{
821826
batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
827+
detail::reassociation_barrier(k, "compensated exp range reduction");
822828
hi = fnma(k, constants::log_2hi<batch_type>(), a);
829+
detail::reassociation_barrier(hi, "compensated exp range reduction");
823830
lo = k * constants::log_2lo<batch_type>();
824831
x = hi - lo;
825832
return k;
@@ -846,7 +853,9 @@ namespace xsimd
846853
static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
847854
{
848855
batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
856+
detail::reassociation_barrier(k, "compensated exp10 range reduction");
849857
x = fnma(k, constants::log10_2hi<batch_type>(), a);
858+
detail::reassociation_barrier(x, "compensated exp10 range reduction");
850859
x = fnma(k, constants::log10_2lo<batch_type>(), x);
851860
return k;
852861
}
@@ -878,6 +887,7 @@ namespace xsimd
878887
{
879888
batch_type k = nearbyint(a);
880889
x = (a - k) * constants::log_2<batch_type>();
890+
detail::reassociation_barrier(x, "keep reduced exponent ordered before finalize");
881891
return k;
882892
}
883893

@@ -937,7 +947,10 @@ namespace xsimd
937947
template <class A, class T>
938948
XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& self, requires_arch<common>) noexcept
939949
{
940-
return detail::exp<detail::exp10_tag>(self);
950+
using batch_type = batch<T, A>;
951+
batch_type out = detail::exp<detail::exp10_tag>(self);
952+
detail::reassociation_barrier(out, "prevent folding exp10 for literal inputs");
953+
return out;
941954
}
942955

943956
// exp2
@@ -1494,6 +1507,7 @@ namespace xsimd
14941507
batch_type R = t2 + t1;
14951508
batch_type hfsq = batch_type(0.5) * f * f;
14961509
batch_type dk = to_float(k);
1510+
detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling");
14971511
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
14981512
#ifdef __FAST_MATH__
14991513
return r;
@@ -1525,6 +1539,7 @@ namespace xsimd
15251539
hx += 0x3ff00000 - 0x3fe6a09e;
15261540
k += (hx >> 20) - 0x3ff;
15271541
batch_type dk = to_float(k);
1542+
detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling");
15281543
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
15291544
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
15301545

@@ -1584,6 +1599,7 @@ namespace xsimd
15841599
batch_type R = t1 + t2;
15851600
batch_type hfsq = batch_type(0.5) * f * f;
15861601
batch_type dk = to_float(k);
1602+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
15871603
batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2<batch_type>(), dk);
15881604
#ifdef __FAST_MATH__
15891605
return r;
@@ -1629,7 +1645,9 @@ namespace xsimd
16291645
batch_type val_hi = hi * constants::invlog_2hi<batch_type>();
16301646
batch_type val_lo = fma(lo + hi, constants::invlog_2lo<batch_type>(), lo * constants::invlog_2hi<batch_type>());
16311647
batch_type dk = to_float(k);
1648+
detail::reassociation_barrier(dk, "Kahan compensated log2 summation");
16321649
batch_type w1 = dk + val_hi;
1650+
detail::reassociation_barrier(w1, "Kahan compensated log2 summation");
16331651
val_lo += (dk - w1) + val_hi;
16341652
val_hi = w1;
16351653
batch_type r = val_lo + val_hi;
@@ -1705,6 +1723,7 @@ namespace xsimd
17051723
batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
17061724
batch_type R = t2 + t1;
17071725
batch_type dk = to_float(k);
1726+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
17081727
batch_type hfsq = batch_type(0.5) * f * f;
17091728
batch_type hibits = f - hfsq;
17101729
hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
@@ -1752,10 +1771,11 @@ namespace xsimd
17521771
#endif
17531772
hx += 0x3ff00000 - 0x3fe6a09e;
17541773
k += (hx >> 20) - 0x3ff;
1774+
batch_type dk = to_float(k);
1775+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
17551776
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
17561777
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
17571778
batch_type f = --x;
1758-
batch_type dk = to_float(k);
17591779
batch_type s = f / (batch_type(2.) + f);
17601780
batch_type z = s * s;
17611781
batch_type w = z * z;
@@ -1818,6 +1838,7 @@ namespace xsimd
18181838
batch_type R = t2 + t1;
18191839
batch_type hfsq = batch_type(0.5) * f * f;
18201840
batch_type dk = to_float(k);
1841+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
18211842
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
18221843
batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
18231844
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
@@ -1853,6 +1874,7 @@ namespace xsimd
18531874
batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
18541875
batch_type R = t2 + t1;
18551876
batch_type dk = to_float(k);
1877+
detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
18561878
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
18571879
#ifdef __FAST_MATH__
18581880
return r;
@@ -1900,17 +1922,9 @@ namespace xsimd
19001922
batch_type s = bitofsign(self);
19011923
batch_type v = self ^ s;
19021924
batch_type t2n = constants::twotonmb<batch_type>();
1903-
// Under fast-math, reordering is possible and the compiler optimizes d
1904-
// to v. That's not what we want, so prevent compiler optimization here.
1905-
// FIXME: it may be better to emit a memory barrier here (?).
1906-
#ifdef __FAST_MATH__
19071925
batch_type d0 = v + t2n;
1908-
asm volatile("" ::"r"(&d0) : "memory");
1926+
detail::reassociation_barrier(d0, "prevent collapsing (v + 2^n) - 2^n back to v");
19091927
batch_type d = d0 - t2n;
1910-
#else
1911-
batch_type d0 = v + t2n;
1912-
batch_type d = d0 - t2n;
1913-
#endif
19141928
return s ^ select(v < t2n, d, v);
19151929
}
19161930
}
@@ -2192,12 +2206,16 @@ namespace xsimd
21922206
template <class A>
21932207
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
21942208
{
2195-
return fnma(nearbyint(self / other), other, self);
2209+
batch<float, A> q = nearbyint(self / other);
2210+
detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient");
2211+
return fnma(q, other, self);
21962212
}
21972213
template <class A>
21982214
XSIMD_INLINE batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<common>) noexcept
21992215
{
2200-
return fnma(nearbyint(self / other), other, self);
2216+
batch<double, A> q = nearbyint(self / other);
2217+
detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient");
2218+
return fnma(q, other, self);
22012219
}
22022220
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
22032221
XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept

include/xsimd/arch/common/xsimd_common_trigo.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,33 +551,45 @@ namespace xsimd
551551
{
552552
auto test = x > constants::pio4<B>();
553553
xr = x - constants::pio2_1<B>();
554+
detail::reassociation_barrier(xr, "ordered pio2 subtraction");
554555
xr -= constants::pio2_2<B>();
556+
detail::reassociation_barrier(xr, "ordered pio2 subtraction");
555557
xr -= constants::pio2_3<B>();
558+
detail::reassociation_barrier(xr, "ordered pio2 subtraction");
556559
xr = select(test, xr, x);
557560
return select(test, B(1.), B(0.));
558561
}
559562
else if (all(x <= constants::twentypi<B>()))
560563
{
561564
B xi = nearbyint(x * constants::twoopi<B>());
565+
detail::reassociation_barrier(xi, "preserve quadrant selection");
562566
xr = fnma(xi, constants::pio2_1<B>(), x);
567+
detail::reassociation_barrier(xr, "compensated range reduction");
563568
xr -= xi * constants::pio2_2<B>();
569+
detail::reassociation_barrier(xr, "compensated range reduction");
564570
xr -= xi * constants::pio2_3<B>();
571+
detail::reassociation_barrier(xr, "compensated range reduction");
565572
return quadrant(xi);
566573
}
567574
else if (all(x <= constants::mediumpi<B>()))
568575
{
569576
B fn = nearbyint(x * constants::twoopi<B>());
577+
detail::reassociation_barrier(fn, "multi-term range reduction");
570578
B r = x - fn * constants::pio2_1<B>();
579+
detail::reassociation_barrier(r, "multi-term range reduction");
571580
B w = fn * constants::pio2_1t<B>();
572581
B t = r;
573582
w = fn * constants::pio2_2<B>();
574583
r = t - w;
584+
detail::reassociation_barrier(r, "multi-term range reduction");
575585
w = fn * constants::pio2_2t<B>() - ((t - r) - w);
576586
t = r;
577587
w = fn * constants::pio2_3<B>();
578588
r = t - w;
589+
detail::reassociation_barrier(r, "multi-term range reduction");
579590
w = fn * constants::pio2_3t<B>() - ((t - r) - w);
580591
xr = r - w;
592+
detail::reassociation_barrier(xr, "multi-term range reduction");
581593
return quadrant(fn);
582594
}
583595
else

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -552,13 +552,7 @@ namespace xsimd
552552
0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
553553
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
554554
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
555-
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
556-
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
557-
// The asm barrier forces f into a register before the add, blocking
558-
// the reorder. It emits zero instructions.
559-
#if defined(__GNUC__)
560-
__asm__ volatile("" : "+x"(f));
561-
#endif
555+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
562556
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
563557
}
564558

@@ -574,10 +568,7 @@ namespace xsimd
574568
0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
575569
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
576570
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
577-
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
578-
#if defined(__GNUC__)
579-
__asm__ volatile("" : "+x"(f));
580-
#endif
571+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
581572
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
582573
}
583574
}

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,11 @@ namespace xsimd
101101
// Forward declarations for pack-level helpers
102102
namespace detail
103103
{
104+
template <class T>
105+
XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept;
106+
template <class T, class A>
107+
XSIMD_INLINE void reassociation_barrier(batch<T, A>& b, const char* reason) noexcept;
108+
104109
template <typename T, T... Vs>
105110
XSIMD_INLINE constexpr bool is_identity() noexcept;
106111
template <typename T, class A, T... Vs>
@@ -115,7 +120,6 @@ namespace xsimd
115120
XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant<T, A, Vs...>) noexcept;
116121
template <typename T, class A, T... Vs>
117122
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;
118-
119123
}
120124
}
121125
}

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,7 @@ namespace xsimd
716716
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
717717
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
718718
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
719+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
719720
return _mm_add_pd(f, _mm_castsi128_pd(xL));
720721
}
721722

@@ -730,6 +731,7 @@ namespace xsimd
730731
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
731732
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
732733
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
734+
detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
733735
return _mm_add_pd(f, _mm_castsi128_pd(xL));
734736
}
735737

0 commit comments

Comments
 (0)