@@ -878,6 +878,8 @@ namespace xsimd
878878 {
879879 batch_type k = nearbyint (a);
880880 x = (a - k) * constants::log_2<batch_type>();
881+ // Keep the reduced exponent offset from being reassociated before finalize().
882+ detail::reassociation_barrier (x, A {});
881883 return k;
882884 }
883885
@@ -937,7 +939,11 @@ namespace xsimd
// exp10, common-arch implementation: evaluates detail::exp<detail::exp10_tag>(self) and returns it.
// In this hunk the single-statement return (the `-` line) is replaced by the `+` lines, which
// store the result in `out`, hand it to detail::reassociation_barrier together with an
// arch tag `A {}`, and then return `out`.
// NOTE(review): reassociation_barrier is defined outside this view; presumably it is an
// optimizer barrier that leaves the value of `out` unchanged — confirm against its definition.
937939 template <class A , class T >
938940 XSIMD_INLINE batch<T, A> exp10 (batch<T, A> const & self, requires_arch<common>) noexcept
939941 {
940- return detail::exp<detail::exp10_tag>(self);
942+ using batch_type = batch<T, A>;
943+ batch_type out = detail::exp<detail::exp10_tag>(self);
944+ // Prevent -ffast-math from folding the whole exp10 batch path for literal inputs.
945+ detail::reassociation_barrier (out, A {});
946+ return out;
941947 }
942948
943949 // exp2
@@ -1494,6 +1500,8 @@ namespace xsimd
14941500 batch_type R = t2 + t1;
14951501 batch_type hfsq = batch_type (0.5 ) * f * f;
14961502 batch_type dk = to_float (k);
1503+ // Keep the compensated k -> float conversion intact before scaling by split log(2).
1504+ detail::reassociation_barrier (dk, A {});
14971505 batch_type r = fma (dk, constants::log_2hi<batch_type>(), fma (s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
14981506#ifdef __FAST_MATH__
14991507 return r;
@@ -1525,6 +1533,8 @@ namespace xsimd
15251533 hx += 0x3ff00000 - 0x3fe6a09e ;
15261534 k += (hx >> 20 ) - 0x3ff ;
15271535 batch_type dk = to_float (k);
1536+ // Keep the compensated k -> double conversion intact before scaling by split log(2).
1537+ detail::reassociation_barrier (dk, A {});
15281538 hx = (hx & i_type (0x000fffff )) + 0x3fe6a09e ;
15291539 x = ::xsimd::bitwise_cast<double >(hx << 32 | (i_type (0xffffffff ) & ::xsimd::bitwise_cast<int_type>(x)));
15301540
@@ -1705,6 +1715,8 @@ namespace xsimd
17051715 batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa , 0x3e91e9ee >(w);
17061716 batch_type R = t2 + t1;
17071717 batch_type dk = to_float (k);
1718+ // Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1719+ detail::reassociation_barrier (dk, A {});
17081720 batch_type hfsq = batch_type (0.5 ) * f * f;
17091721 batch_type hibits = f - hfsq;
17101722 hibits &= ::xsimd::bitwise_cast<float >(i_type (0xfffff000 ));
@@ -1752,10 +1764,12 @@ namespace xsimd
17521764#endif
17531765 hx += 0x3ff00000 - 0x3fe6a09e ;
17541766 k += (hx >> 20 ) - 0x3ff ;
1767+ batch_type dk = to_float (k);
1768+ // Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1769+ detail::reassociation_barrier (dk, A {});
17551770 hx = (hx & i_type (0x000fffff )) + 0x3fe6a09e ;
17561771 x = ::xsimd::bitwise_cast<double >(hx << 32 | (i_type (0xffffffff ) & ::xsimd::bitwise_cast<int_type>(x)));
17571772 batch_type f = --x;
1758- batch_type dk = to_float (k);
17591773 batch_type s = f / (batch_type (2 .) + f);
17601774 batch_type z = s * s;
17611775 batch_type w = z * z;
@@ -1818,6 +1832,8 @@ namespace xsimd
18181832 batch_type R = t2 + t1;
18191833 batch_type hfsq = batch_type (0.5 ) * f * f;
18201834 batch_type dk = to_float (k);
1835+ // Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1836+ detail::reassociation_barrier (dk, A {});
18211837 /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
18221838 batch_type c = select (batch_bool_cast<float >(k >= i_type (2 )), batch_type (1 .) - (uf - self), self - (uf - batch_type (1 .))) / uf;
18231839 batch_type r = fma (dk, constants::log_2hi<batch_type>(), fma (s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
@@ -1853,6 +1869,8 @@ namespace xsimd
18531869 batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll , 0x3fd2492494229359ll , 0x3fc7466496cb03dell , 0x3fc2f112df3e5244ll >(w);
18541870 batch_type R = t2 + t1;
18551871 batch_type dk = to_float (k);
1872+ // Prevent fast-math from distributing later multiplies through the compensated exponent conversion.
1873+ detail::reassociation_barrier (dk, A {});
18561874 batch_type r = fma (dk, constants::log_2hi<batch_type>(), fma (s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
18571875#ifdef __FAST_MATH__
18581876 return r;
@@ -1900,17 +1918,10 @@ namespace xsimd
19001918 batch_type s = bitofsign (self);
19011919 batch_type v = self ^ s;
19021920 batch_type t2n = constants::twotonmb<batch_type>();
1903- // Under fast-math, reordering is possible and the compiler optimizes d
1904- // to v. That's not what we want, so prevent compiler optimization here.
1905- // FIXME: it may be better to emit a memory barrier here (?).
1906- #ifdef __FAST_MATH__
19071921 batch_type d0 = v + t2n;
1908- asm volatile (" " ::" r" (&d0) : " memory" );
1922+ // Prevent fast-math from collapsing (v + 2^n) - 2^n back to v.
1923+ detail::reassociation_barrier (d0.data , A {});
19091924 batch_type d = d0 - t2n;
1910- #else
1911- batch_type d0 = v + t2n;
1912- batch_type d = d0 - t2n;
1913- #endif
19141925 return s ^ select (v < t2n, d, v);
19151926 }
19161927 }
@@ -2192,12 +2203,18 @@ namespace xsimd
// remainder for batch<float, A>, common-arch implementation: rounds the quotient
// self / other with nearbyint and returns fnma(quotient, other, self).
// In this hunk the one-line form (the `-` line) is replaced by the `+` lines, which name the
// rounded quotient `q` and pass it through detail::reassociation_barrier before the fnma.
// NOTE(review): fnma is presumably -(q * other) + self, i.e. self - q * other — confirm.
// NOTE(review): reassociation_barrier is defined outside this view; presumably it leaves
// `q` numerically unchanged and only constrains the optimizer — confirm.
21922203 template <class A >
21932204 XSIMD_INLINE batch<float , A> remainder (batch<float , A> const & self, batch<float , A> const & other, requires_arch<common>) noexcept
21942205 {
2195- return fnma (nearbyint (self / other), other, self);
2206+ batch<float , A> q = nearbyint (self / other);
2207+ // Prevent fast-math from pulling the later multiply back through the rounded quotient.
2208+ detail::reassociation_barrier (q, A {});
2209+ return fnma (q, other, self);
21962210 }
// remainder for batch<double, A>, common-arch implementation: rounds the quotient
// self / other with nearbyint and returns fnma(quotient, other, self).
// In this hunk the one-line form (the `-` line) is replaced by the `+` lines, which name the
// rounded quotient `q` and pass it through detail::reassociation_barrier before the fnma.
// NOTE(review): fnma is presumably -(q * other) + self, i.e. self - q * other — confirm.
// NOTE(review): reassociation_barrier is defined outside this view; presumably it leaves
// `q` numerically unchanged and only constrains the optimizer — confirm.
21972211 template <class A >
21982212 XSIMD_INLINE batch<double , A> remainder (batch<double , A> const & self, batch<double , A> const & other, requires_arch<common>) noexcept
21992213 {
2200- return fnma (nearbyint (self / other), other, self);
2214+ batch<double , A> q = nearbyint (self / other);
2215+ // Prevent fast-math from pulling the later multiply back through the rounded quotient.
2216+ detail::reassociation_barrier (q, A {});
2217+ return fnma (q, other, self);
22012218 }
22022219 template <class A , class T , class = std::enable_if_t <std::is_integral<T>::value>>
22032220 XSIMD_INLINE batch<T, A> remainder (batch<T, A> const & self, batch<T, A> const & other, requires_arch<common>) noexcept
0 commit comments