Skip to content

Commit c3a8d37

Browse files
DiamonDinoiaserge-sans-paille
authored andcommitted
Fix fast_cast int64/uint64→double under -ffast-math
With -ffast-math (-fassociative-math), the compiler may reassociate (xH-C)+xL into xH+(xL-C). Since xL<<C this causes catastrophic cancellation, zeroing the result. Add an asm register barrier between the sub and add to prevent reassociation. The barrier forces the intermediate into a register without emitting any instructions. Fixes #1264
1 parent 548b05f commit c3a8d37

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,13 @@ namespace xsimd
528528
0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
529529
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
530530
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
531+
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
532+
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
533+
// The asm barrier forces f into a register before the add, blocking
534+
// the reorder. It emits zero instructions.
535+
#if defined(__GNUC__)
536+
__asm__ volatile("" : "+x"(f));
537+
#endif
531538
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
532539
}
533540

@@ -543,6 +550,10 @@ namespace xsimd
543550
0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
544551
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
545552
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
553+
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
554+
#if defined(__GNUC__)
555+
__asm__ volatile("" : "+x"(f));
556+
#endif
546557
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
547558
}
548559
}

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ namespace xsimd
5353
xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
5454
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
5555
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
56+
// With -ffast-math, the compiler may reassociate (xH-C)+xL into
57+
// xH+(xL-C). Since xL<<C this causes catastrophic cancellation.
58+
// The asm barrier forces f into a register before the add, blocking
59+
// the reorder. It emits zero instructions.
60+
#if defined(__GNUC__)
61+
__asm__ volatile("" : "+x"(f));
62+
#endif
5663
return _mm_add_pd(f, _mm_castsi128_pd(xL));
5764
}
5865

@@ -64,6 +71,10 @@ namespace xsimd
6471
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
6572
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
6673
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
74+
// See above: prevent -ffast-math from reassociating (xH-C)+xL.
75+
#if defined(__GNUC__)
76+
__asm__ volatile("" : "+x"(f));
77+
#endif
6778
return _mm_add_pd(f, _mm_castsi128_pd(xL));
6879
}
6980
}

0 commit comments

Comments
 (0)