Skip to content

Commit 2f9a431

Browse files
committed
fix: harden fast-math reassociation barriers
1 parent 4050d94 commit 2f9a431

File tree

11 files changed

+178
-33
lines changed

11 files changed

+178
-33
lines changed

.github/workflows/windows.yml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,70 @@ jobs:
9292
run: |
9393
cd _build && ./test/test_xsimd
9494
95+
build-windows-clang-cl:
96+
name: 'clang-cl x64 AVX2'
97+
defaults:
98+
run:
99+
shell: bash {0}
100+
runs-on: windows-2025
101+
steps:
102+
- name: Setup compiler
103+
uses: ilammy/msvc-dev-cmd@v1
104+
with:
105+
arch: amd64
106+
- name: Check clang-cl
107+
run: |
108+
command -v clang-cl
109+
clang-cl --version
110+
- name: Setup Ninja
111+
run: |
112+
python3 -m pip install --upgrade pip setuptools wheel
113+
python3 -m pip install ninja
114+
- name: Checkout xsimd
115+
uses: actions/checkout@v3
116+
- name: Setup
117+
run: |
118+
mkdir _build
119+
cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_CXX_FLAGS="/arch:AVX2" -G Ninja
120+
- name: Build
121+
run: |
122+
cd _build && cmake --build .
123+
- name: Testing xsimd
124+
run: |
125+
cd _build && ./test/test_xsimd
126+
127+
build-windows-clang-cl-fast-math:
128+
name: 'clang-cl x64 /fp:fast'
129+
defaults:
130+
run:
131+
shell: bash {0}
132+
runs-on: windows-2025
133+
steps:
134+
- name: Setup compiler
135+
uses: ilammy/msvc-dev-cmd@v1
136+
with:
137+
arch: amd64
138+
- name: Check clang-cl
139+
run: |
140+
command -v clang-cl
141+
clang-cl --version
142+
- name: Setup Ninja
143+
run: |
144+
python3 -m pip install --upgrade pip setuptools wheel
145+
python3 -m pip install ninja
146+
- name: Checkout xsimd
147+
uses: actions/checkout@v3
148+
- name: Setup
149+
run: |
150+
mkdir _build
151+
cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=OFF -DBUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_CXX_FLAGS="/fp:fast" -G Ninja
152+
- name: Build
153+
run: |
154+
cd _build && cmake --build .
155+
- name: Testing xsimd
156+
run: |
157+
cd _build && ./test/test_xsimd
158+
95159
build-windows-arm64:
96160
name: 'MSVC arm64'
97161
defaults:

include/xsimd/arch/common/xsimd_common_details.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,22 @@ namespace xsimd
111111

112112
namespace detail
113113
{
114+
template <class T>
115+
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept
116+
{
117+
#if XSIMD_WITH_INLINE_ASM
118+
__asm__ volatile("" : : "r"(&x) : "memory");
119+
#else
120+
(void)x;
121+
#endif
122+
}
123+
124+
template <class T, class A>
125+
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept
126+
{
127+
detail::reassociation_barrier(x, memory_barrier_tag {});
128+
}
129+
114130
template <class F, class A, class T, class... Batches>
115131
XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
116132
{

include/xsimd/arch/common/xsimd_common_math.hpp

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -878,6 +878,8 @@ namespace xsimd
878878
{
879879
batch_type k = nearbyint(a);
880880
x = (a - k) * constants::log_2<batch_type>();
881+
// Keep the reduced exponent offset from being reassociated before finalize().
882+
detail::reassociation_barrier(x, A {});
881883
return k;
882884
}
883885

@@ -937,7 +939,11 @@ namespace xsimd
937939
template <class A, class T>
938940
XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& self, requires_arch<common>) noexcept
939941
{
940-
return detail::exp<detail::exp10_tag>(self);
942+
using batch_type = batch<T, A>;
943+
batch_type out = detail::exp<detail::exp10_tag>(self);
944+
// Prevent -ffast-math from folding the whole exp10 batch path for literal inputs.
945+
detail::reassociation_barrier(out, A {});
946+
return out;
941947
}
942948

943949
// exp2
@@ -1494,6 +1500,8 @@ namespace xsimd
14941500
batch_type R = t2 + t1;
14951501
batch_type hfsq = batch_type(0.5) * f * f;
14961502
batch_type dk = to_float(k);
1503+
// Keep the compensated k -> float conversion intact before scaling by split log(2).
1504+
detail::reassociation_barrier(dk, A {});
14971505
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
14981506
#ifdef __FAST_MATH__
14991507
return r;
@@ -1525,6 +1533,8 @@ namespace xsimd
15251533
hx += 0x3ff00000 - 0x3fe6a09e;
15261534
k += (hx >> 20) - 0x3ff;
15271535
batch_type dk = to_float(k);
1536+
// Keep the compensated k -> double conversion intact before scaling by split log(2).
1537+
detail::reassociation_barrier(dk, A {});
15281538
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
15291539
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
15301540

@@ -1705,6 +1715,8 @@ namespace xsimd
17051715
batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
17061716
batch_type R = t2 + t1;
17071717
batch_type dk = to_float(k);
1718+
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1719+
detail::reassociation_barrier(dk, A {});
17081720
batch_type hfsq = batch_type(0.5) * f * f;
17091721
batch_type hibits = f - hfsq;
17101722
hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
@@ -1752,10 +1764,12 @@ namespace xsimd
17521764
#endif
17531765
hx += 0x3ff00000 - 0x3fe6a09e;
17541766
k += (hx >> 20) - 0x3ff;
1767+
batch_type dk = to_float(k);
1768+
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1769+
detail::reassociation_barrier(dk, A {});
17551770
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
17561771
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
17571772
batch_type f = --x;
1758-
batch_type dk = to_float(k);
17591773
batch_type s = f / (batch_type(2.) + f);
17601774
batch_type z = s * s;
17611775
batch_type w = z * z;
@@ -1818,6 +1832,8 @@ namespace xsimd
18181832
batch_type R = t2 + t1;
18191833
batch_type hfsq = batch_type(0.5) * f * f;
18201834
batch_type dk = to_float(k);
1835+
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1836+
detail::reassociation_barrier(dk, A {});
18211837
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
18221838
batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
18231839
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
@@ -1853,6 +1869,8 @@ namespace xsimd
18531869
batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
18541870
batch_type R = t2 + t1;
18551871
batch_type dk = to_float(k);
1872+
// Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1873+
detail::reassociation_barrier(dk, A {});
18561874
batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
18571875
#ifdef __FAST_MATH__
18581876
return r;
@@ -1900,17 +1918,10 @@ namespace xsimd
19001918
batch_type s = bitofsign(self);
19011919
batch_type v = self ^ s;
19021920
batch_type t2n = constants::twotonmb<batch_type>();
1903-
// Under fast-math, reordering is possible and the compiler optimizes d
1904-
// to v. That's not what we want, so prevent compiler optimization here.
1905-
// FIXME: it may be better to emit a memory barrier here (?).
1906-
#ifdef __FAST_MATH__
19071921
batch_type d0 = v + t2n;
1908-
asm volatile("" ::"r"(&d0) : "memory");
1922+
// Prevent fast-math from collapsing (v + 2^n) - 2^n back to v.
1923+
detail::reassociation_barrier(d0.data, A {});
19091924
batch_type d = d0 - t2n;
1910-
#else
1911-
batch_type d0 = v + t2n;
1912-
batch_type d = d0 - t2n;
1913-
#endif
19141925
return s ^ select(v < t2n, d, v);
19151926
}
19161927
}
@@ -2192,12 +2203,18 @@ namespace xsimd
21922203
template <class A>
21932204
XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
21942205
{
2195-
return fnma(nearbyint(self / other), other, self);
2206+
batch<float, A> q = nearbyint(self / other);
2207+
// Prevent fast-math from pulling the later multiply back through the rounded quotient.
2208+
detail::reassociation_barrier(q, A {});
2209+
return fnma(q, other, self);
21962210
}
21972211
template <class A>
21982212
XSIMD_INLINE batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<common>) noexcept
21992213
{
2200-
return fnma(nearbyint(self / other), other, self);
2214+
batch<double, A> q = nearbyint(self / other);
2215+
// Prevent fast-math from pulling the later multiply back through the rounded quotient.
2216+
detail::reassociation_barrier(q, A {});
2217+
return fnma(q, other, self);
22012218
}
22022219
template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
22032220
XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept

include/xsimd/arch/common/xsimd_common_trigo.hpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -550,34 +550,49 @@ namespace xsimd
550550
else if (all(x <= constants::pio2<B>()))
551551
{
552552
auto test = x > constants::pio4<B>();
553+
// Keep the compensated pio2 subtraction sequence ordered under -ffast-math.
553554
xr = x - constants::pio2_1<B>();
555+
detail::reassociation_barrier(xr, typename B::arch_type {});
554556
xr -= constants::pio2_2<B>();
557+
detail::reassociation_barrier(xr, typename B::arch_type {});
555558
xr -= constants::pio2_3<B>();
559+
detail::reassociation_barrier(xr, typename B::arch_type {});
556560
xr = select(test, xr, x);
557561
return select(test, B(1.), B(0.));
558562
}
559563
else if (all(x <= constants::twentypi<B>()))
560564
{
561565
B xi = nearbyint(x * constants::twoopi<B>());
566+
// Preserve the quadrant selection and compensated range reduction under -ffast-math.
567+
detail::reassociation_barrier(xi, typename B::arch_type {});
562568
xr = fnma(xi, constants::pio2_1<B>(), x);
569+
detail::reassociation_barrier(xr, typename B::arch_type {});
563570
xr -= xi * constants::pio2_2<B>();
571+
detail::reassociation_barrier(xr, typename B::arch_type {});
564572
xr -= xi * constants::pio2_3<B>();
573+
detail::reassociation_barrier(xr, typename B::arch_type {});
565574
return quadrant(xi);
566575
}
567576
else if (all(x <= constants::mediumpi<B>()))
568577
{
569578
B fn = nearbyint(x * constants::twoopi<B>());
579+
// Keep the multi-term range reduction from being reassociated across correction terms.
580+
detail::reassociation_barrier(fn, typename B::arch_type {});
570581
B r = x - fn * constants::pio2_1<B>();
582+
detail::reassociation_barrier(r, typename B::arch_type {});
571583
B w = fn * constants::pio2_1t<B>();
572584
B t = r;
573585
w = fn * constants::pio2_2<B>();
574586
r = t - w;
587+
detail::reassociation_barrier(r, typename B::arch_type {});
575588
w = fn * constants::pio2_2t<B>() - ((t - r) - w);
576589
t = r;
577590
w = fn * constants::pio2_3<B>();
578591
r = t - w;
592+
detail::reassociation_barrier(r, typename B::arch_type {});
579593
w = fn * constants::pio2_3t<B>() - ((t - r) - w);
580594
xr = r - w;
595+
detail::reassociation_barrier(xr, typename B::arch_type {});
581596
return quadrant(fn);
582597
}
583598
else

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -554,11 +554,8 @@ namespace xsimd
554554
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
555555
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
556556
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
557-
// The asm barrier forces f into a register before the add, blocking
558-
// the reorder. It emits zero instructions.
559-
#if defined(__GNUC__)
560-
__asm__ volatile("" : "+x"(f));
561-
#endif
557+
// Barrier the intermediate before the final add.
558+
detail::reassociation_barrier(f, A {});
562559
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
563560
}
564561

@@ -575,9 +572,7 @@ namespace xsimd
575572
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
576573
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
577574
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
578-
#if defined(__GNUC__)
579-
__asm__ volatile("" : "+x"(f));
580-
#endif
575+
detail::reassociation_barrier(f, A {});
581576
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
582577
}
583578
}

include/xsimd/arch/xsimd_common_fwd.hpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ namespace xsimd
2323
class batch;
2424
template <class T, class A>
2525
class batch_bool;
26+
namespace kernel
27+
{
28+
namespace detail
29+
{
30+
struct memory_barrier_tag
31+
{
32+
};
33+
}
34+
}
2635
template <class T, class A, T... Vs>
2736
struct batch_constant;
2837
template <class T, class A, bool... Vs>
@@ -101,6 +110,12 @@ namespace xsimd
101110
// Forward declarations for pack-level helpers
102111
namespace detail
103112
{
113+
template <class T>
114+
XSIMD_INLINE void reassociation_barrier(T& x, memory_barrier_tag) noexcept;
115+
116+
template <class T, class A>
117+
XSIMD_INLINE void reassociation_barrier(T& x, A const&) noexcept;
118+
104119
template <typename T, T... Vs>
105120
XSIMD_INLINE constexpr bool is_identity() noexcept;
106121
template <typename T, class A, T... Vs>
@@ -115,7 +130,6 @@ namespace xsimd
115130
XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant<T, A, Vs...>) noexcept;
116131
template <typename T, class A, T... Vs>
117132
XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;
118-
119133
}
120134
}
121135
}

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,8 @@ namespace xsimd
716716
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
717717
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
718718
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
719+
// Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C).
720+
detail::reassociation_barrier(f, A {});
719721
return _mm_add_pd(f, _mm_castsi128_pd(xL));
720722
}
721723

@@ -730,6 +732,8 @@ namespace xsimd
730732
__m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
731733
__m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
732734
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
735+
// Prevent -ffast-math from reassociating (xH-C)+xL into xH+(xL-C).
736+
detail::reassociation_barrier(f, A {});
733737
return _mm_add_pd(f, _mm_castsi128_pd(xL));
734738
}
735739

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,8 @@ namespace xsimd
6464
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
6565
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
6666
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
67-
// The asm barrier forces f into a register before the add, blocking
68-
// the reorder. It emits zero instructions.
69-
#if defined(__GNUC__)
70-
__asm__ volatile("" : "+x"(f));
71-
#endif
67+
// Barrier the intermediate before the final add.
68+
detail::reassociation_barrier(f, A {});
7269
return _mm_add_pd(f, _mm_castsi128_pd(xL));
7370
}
7471

@@ -81,9 +78,7 @@ namespace xsimd
8178
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
8279
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
8380
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
84-
#if defined(__GNUC__)
85-
__asm__ volatile("" : "+x"(f));
86-
#endif
81+
detail::reassociation_barrier(f, A {});
8782
return _mm_add_pd(f, _mm_castsi128_pd(xL));
8883
}
8984
}

include/xsimd/config/xsimd_config.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,27 @@
4444
#define XSIMD_TARGET_X86 0
4545
#endif
4646

47+
/**
48+
* @ingroup xsimd_config_macro
49+
*
50+
* Set to 1 if GNU-style inline assembly is available, to 0 otherwise.
51+
*/
52+
/* Use __clang__ || __GNUC__ for GNU-style inline asm. clang-cl runs in
53+
* MSVC-compatibility mode and does not define __GNUC__ by default, but it
54+
* still defines __clang__. Clang documents __asm__/__asm__ support and broad
55+
* GCC-extension compatibility:
56+
* https://clang.llvm.org/docs/LanguageExtensions.html
57+
* Clang only emits __GNUC__ when GNUCVersion != 0:
58+
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/lib/Frontend/InitPreprocessor.cpp
59+
* and GNUCVersion defaults to 0:
60+
* https://raw.githubusercontent.com/llvm/llvm-project/main/clang/include/clang/Basic/LangOptions.def
61+
*/
62+
#if defined(__clang__) || defined(__GNUC__)
63+
#define XSIMD_WITH_INLINE_ASM 1
64+
#else
65+
#define XSIMD_WITH_INLINE_ASM 0
66+
#endif
67+
4768
/**
4869
* @ingroup xsimd_config_macro
4970
*

0 commit comments

Comments
 (0)