@@ -598,6 +598,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(const int128::uint128_t rhs) noexcep
598598 return result;
599599}
600600
601+ namespace detail {
602+
603+ // Packs a (coeff, exp, sign) triple straight into the bit pattern of a decimal128_t,
604+ // skipping the constructor's bounds check + dead-branch handling. For d128
605+ // the precision (34 digits, 113 bits) always fits in not_11_significand_mask,
606+ // so no combination-field branch is needed. Nothing is validated here: the caller guarantees
607+ // coeff <= d128_max_significand_value and (exp + bias) is in [0, d128_max_biased_exponent]; bits outside either mask are silently dropped.
608+ template <typename T2>
609+ BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d128 (int128::uint128_t coeff, T2 exp, bool sign) noexcept -> decimal128_t
610+ {
611+ const auto biased_exp {static_cast <std::uint64_t >(static_cast <int >(exp) + bias_v<decimal128_t >)}; // bias is added in int, then the sum is widened to 64 bits
612+ int128::uint128_t bits {coeff & d128_not_11_significand_mask}; // start from the coefficient, keeping only the bits under the significand mask
613+ bits.high |= (sign ? d128_sign_mask : UINT64_C (0 )); // sign == true sets the sign mask in the high word
614+ bits.high |= (biased_exp << d128_not_11_exp_high_word_shift) & d128_not_11_exp_mask; // exponent field is shifted into the high word and clipped to its mask
615+ return from_bits (bits); // build the decimal128_t from the assembled 128-bit pattern
616+ }
617+
618+ // Definition of the pack_in_range<decimal128_t> overload declared in
619+ // add_impl.hpp. Lives here so the `decimal128_t{coeff, exp, sign}` fallback
620+ // is parsed only after decimal128_t is complete (see add_impl.hpp rationale). Only the biased exponent is range-checked here; coeff is forwarded unchecked.
621+ template <typename ReturnType, typename SigType, typename ExpType>
622+ BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range (SigType coeff, ExpType exp, bool sign) noexcept
623+ -> std::enable_if_t<std::is_same<ReturnType, decimal128_t>::value, decimal128_t> // overload is only viable when ReturnType is decimal128_t
624+ {
625+ const auto biased_exp_check {static_cast <int >(exp) + bias_v<decimal128_t >}; // signed on purpose so a too-small exponent shows up as a negative value
626+ if (BOOST_DECIMAL_LIKELY (biased_exp_check >= 0
627+ && biased_exp_check <= static_cast <int >(max_biased_exp_v<decimal128_t >)))
628+ {
629+ return direct_pack_d128 (static_cast <int128::uint128_t >(coeff), exp, sign); // fast path: exponent is representable, pack the fields directly
630+ }
631+ return decimal128_t {coeff, exp, sign}; // exponent out of range: fall back to the full constructor
632+ }
633+
634+ } // namespace detail
635+
601636BOOST_DECIMAL_CUDA_CONSTEXPR auto decimal128_t::unbiased_exponent () const noexcept -> exponent_type
602637{
603638 exponent_type expval {};
@@ -819,9 +854,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal128_t::decimal128_t(T1 coeff, T2 exp, const
819854 }
820855 else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision_v<decimal128_t >)
821856 {
857+ // Same overflow-fold pattern as d32/d64: post-shift coeff is <= max_significand_v
858+ // and biased_exp lands in [0, max], so pack_in_range routes to direct_pack.
822859 exp -= digit_delta;
823860 reduced_coeff *= detail::pow10 (static_cast <significand_type>(digit_delta));
824- *this = decimal128_t (reduced_coeff, exp, is_negative);
861+ *this = detail::pack_in_range< decimal128_t > (reduced_coeff, exp, is_negative);
825862 }
826863 else if (coeff_digits + biased_exp <= detail::precision_v<decimal128_t >)
827864 {
@@ -856,10 +893,12 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal128_t::decimal128_t(T1 coeff, T2 exp, const
856893 }
857894 else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision_v<decimal128_t >)
858895 {
896+ // Expand to use the full precision; biased_exp ends up in [0, max] and
897+ // coeff <= max_significand_v. pack_in_range routes to direct_pack.
859898 const auto offset {detail::precision_v<decimal128_t > - coeff_digits};
860899 exp -= offset;
861900 reduced_coeff *= detail::pow10 (static_cast <significand_type>(offset));
862- *this = decimal128_t (reduced_coeff, exp, is_negative);
901+ *this = detail::pack_in_range< decimal128_t > (reduced_coeff, exp, is_negative);
863902 }
864903 else
865904 {
@@ -1337,6 +1376,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<(const decimal128_t& lhs, const decim
13371376 }
13381377 #endif
13391378
1379+ if (BOOST_DECIMAL_UNLIKELY (lhs.bits_ == rhs.bits_ ))
1380+ {
1381+ return false ;
1382+ }
1383+
13401384 return less_parts_impl<decimal128_t >(lhs.full_significand (), lhs.biased_exponent (), lhs.isneg (),
13411385 rhs.full_significand (), rhs.biased_exponent (), rhs.isneg ());
13421386}
@@ -1469,16 +1513,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal128_t& lhs, const dec
14691513 {
14701514 return std::partial_ordering::less;
14711515 }
1472- else if (lhs > rhs )
1516+ if (rhs < lhs )
14731517 {
14741518 return std::partial_ordering::greater;
14751519 }
1476- else if (lhs == rhs)
1520+ #ifndef BOOST_DECIMAL_FAST_MATH
1521+ if (isnan (lhs) || isnan (rhs))
14771522 {
1478- return std::partial_ordering::equivalent ;
1523+ return std::partial_ordering::unordered ;
14791524 }
1480-
1481- return std::partial_ordering::unordered ;
1525+ # endif
1526+ return std::partial_ordering::equivalent ;
14821527}
14831528
14841529template <typename Integer>
@@ -1682,11 +1727,45 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal128_t& lhs, const decim
16821727 {
16831728 return from_bits (detail::d128_nan_mask);
16841729 }
1685-
1730+
16861731 return detail::check_non_finite (lhs, rhs);
16871732 }
16881733 #endif
16891734
1735+ // Two fast paths (see decimal64_t.hpp:operator+ for full explanation).
1736+ // Both gated on non-zero operands so zero short-circuit logic is preserved
1737+ // by falling through to d128_add_impl_new.
1738+ {
1739+ const auto lhs_sig {lhs.full_significand ()};
1740+ const auto rhs_sig {rhs.full_significand ()};
1741+ if (BOOST_DECIMAL_LIKELY (lhs_sig != 0U && rhs_sig != 0U ))
1742+ {
1743+ const auto lhs_exp {lhs.biased_exponent ()};
1744+ const auto rhs_exp {rhs.biased_exponent ()};
1745+ const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
1746+ if (exp_diff > 75 || exp_diff <= 3 )
1747+ {
1748+ auto round {_boost_decimal_global_rounding_mode};
1749+ #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
1750+ if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED (lhs))
1751+ {
1752+ round = _boost_decimal_global_runtime_rounding_mode;
1753+ }
1754+ #endif
1755+ if (BOOST_DECIMAL_LIKELY (round == rounding_mode::fe_dec_to_nearest))
1756+ {
1757+ if (exp_diff > 75 )
1758+ {
1759+ return lhs_exp > rhs_exp ? lhs : rhs;
1760+ }
1761+ return detail::aligned_add_kernel<decimal128_t , int128::uint128_t >(
1762+ lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast <unsigned >(exp_diff),
1763+ lhs.isneg (), rhs.isneg ());
1764+ }
1765+ }
1766+ }
1767+ }
1768+
16901769 auto lhs_components {lhs.to_components ()};
16911770 detail::expand_significand<decimal128_t >(lhs_components.sig , lhs_components.exp );
16921771 auto rhs_components {rhs.to_components ()};
@@ -1741,11 +1820,45 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal128_t& lhs, const decim
17411820 {
17421821 return -rhs;
17431822 }
1744-
1823+
17451824 return detail::check_non_finite (lhs, rhs);
17461825 }
17471826 #endif
17481827
1828+ // Two fast paths; see operator+ above. Both gated on non-zero operands so
1829+ // zero short-circuit logic is preserved by falling through. For operator-,
1830+ // rhs sign is flipped before kernel dispatch.
1831+ {
1832+ const auto lhs_sig {lhs.full_significand ()};
1833+ const auto rhs_sig {rhs.full_significand ()};
1834+ if (BOOST_DECIMAL_LIKELY (lhs_sig != 0U && rhs_sig != 0U ))
1835+ {
1836+ const auto lhs_exp {lhs.biased_exponent ()};
1837+ const auto rhs_exp {rhs.biased_exponent ()};
1838+ const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
1839+ if (exp_diff > 75 || exp_diff <= 3 )
1840+ {
1841+ auto round {_boost_decimal_global_rounding_mode};
1842+ #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
1843+ if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED (lhs))
1844+ {
1845+ round = _boost_decimal_global_runtime_rounding_mode;
1846+ }
1847+ #endif
1848+ if (BOOST_DECIMAL_LIKELY (round == rounding_mode::fe_dec_to_nearest))
1849+ {
1850+ if (exp_diff > 75 )
1851+ {
1852+ return lhs_exp > rhs_exp ? lhs : -rhs;
1853+ }
1854+ return detail::aligned_add_kernel<decimal128_t , int128::uint128_t >(
1855+ lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast <unsigned >(exp_diff),
1856+ lhs.isneg (), !rhs.isneg ());
1857+ }
1858+ }
1859+ }
1860+ }
1861+
17491862 auto lhs_components {lhs.to_components ()};
17501863 detail::expand_significand<decimal128_t >(lhs_components.sig , lhs_components.exp );
17511864 auto rhs_components {rhs.to_components ()};
@@ -1861,14 +1974,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator*(const decimal128_t lhs, const Intege
18611974
18621975 auto lhs_sig {lhs.full_significand ()};
18631976 auto lhs_exp {lhs.biased_exponent ()};
1864- const auto lhs_zeros {detail::remove_trailing_zeros (lhs_sig)};
1865- lhs_sig = lhs_zeros.trimmed_number ;
1866- lhs_exp += static_cast <std::int32_t >(lhs_zeros.number_of_removed_zeros );
1977+ detail::expand_significand<decimal128_t >(lhs_sig, lhs_exp);
18671978
18681979 auto rhs_sig {static_cast <int128::uint128_t >(detail::make_positive_unsigned (rhs))};
1869- const auto rhs_zeros {detail::remove_trailing_zeros (rhs_sig)};
1870- rhs_sig = rhs_zeros.trimmed_number ;
1871- const auto rhs_exp = static_cast <exp_type>(rhs_zeros.number_of_removed_zeros );
1980+ exp_type rhs_exp {0 };
1981+ detail::normalize<decimal128_t >(rhs_sig, rhs_exp);
18721982
18731983 return detail::d128_mul_impl<decimal128_t >(
18741984 lhs_sig, lhs_exp, lhs.isneg (),
0 commit comments