diff --git a/.drone.jsonnet b/.drone.jsonnet index d7b0c56f..6e443018 100644 --- a/.drone.jsonnet +++ b/.drone.jsonnet @@ -24,9 +24,8 @@ local linux_pipeline(name, image, environment, packages = "", sources = [], arch os: "linux", arch: arch }, - clone: - { - retries: 5, + "clone": { + "retries": 5 }, steps: [ @@ -38,7 +37,9 @@ local linux_pipeline(name, image, environment, packages = "", sources = [], arch commands: [ 'set -e', - 'wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -', + 'echo $DRONE_STAGE_MACHINE', + 'uname -a', + 'curl -sSL --retry 5 https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/llvm-snapshot.gpg', ] + (if sources != [] then [ ('apt-add-repository "' + source + '"') for source in sources ] else []) + (if packages != "" then [ 'apt-get update', 'apt-get -y install ' + packages ] else []) + @@ -268,6 +269,34 @@ local windows_pipeline(name, image, environment, arch = "amd64") = "g++-14-multilib", ), + linux_pipeline( + "Linux 26.04 GCC 15 32", + "cppalliance/droneubuntu2604:1", + { TOOLSET: 'gcc', COMPILER: 'g++-15', CXXSTD: '03,11,14,17,20,23', ADDRMD: '32', CXXFLAGS: "-fexcess-precision=fast" }, + "g++-15-multilib", + ), + + linux_pipeline( + "Linux 26.04 GCC 15 64", + "cppalliance/droneubuntu2604:1", + { TOOLSET: 'gcc', COMPILER: 'g++-15', CXXSTD: '03,11,14,17,20,23', ADDRMD: '64', CXXFLAGS: "-fexcess-precision=fast" }, + "g++-15-multilib", + ), + + linux_pipeline( + "Linux 26.04 GCC 16 32", + "cppalliance/droneubuntu2604:1", + { TOOLSET: 'gcc', COMPILER: 'g++-16', CXXSTD: '03,11,14,17,20,23', ADDRMD: '32', CXXFLAGS: "-fexcess-precision=fast" }, + "g++-16-multilib", + ), + + linux_pipeline( + "Linux 26.04 GCC 16 64", + "cppalliance/droneubuntu2604:1", + { TOOLSET: 'gcc', COMPILER: 'g++-16', CXXSTD: '03,11,14,17,20,23', ADDRMD: '64', CXXFLAGS: "-fexcess-precision=fast" }, + "g++-16-multilib libabsl-dev", + ), + linux_pipeline( "Linux 18.04 Clang 5.0", "cppalliance/droneubuntu1804:1", @@ 
-387,17 +416,27 @@ local windows_pipeline(name, image, environment, arch = "amd64") = ), linux_pipeline( - "Linux 24.04 Clang 20 ASAN", + "Linux 24.04 Clang 21", "cppalliance/droneubuntu2404:1", - { TOOLSET: 'clang', COMPILER: 'clang++-20', CXXSTD: '03,11,14,17,20,23,2c' } + asan, - "clang-20", - ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main"], + { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' }, + "clang-21", + ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"], + ), + + linux_pipeline( + "Linux 24.04 Clang 21 UBSAN", + "cppalliance/droneubuntu2404:1", + { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' } + ubsan, + "clang-21", + ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"], ), - macos_pipeline( - "MacOS Xcode 14.3.1", - { TOOLSET: 'clang', COMPILER: 'clang++', CXXSTD: '03,11,14,17,20,2b' } + asan, - xcode_version = "14.3.1", osx_version = "sonoma", arch = "arm64", + linux_pipeline( + "Linux 24.04 Clang 21 ASAN", + "cppalliance/droneubuntu2404:1", + { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' } + asan, + "clang-21", + ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"], ), windows_pipeline( diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png index dc6a7617..c3bcc0cb 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png index 83c8c583..f415d0f3 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png 
b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png index 24535136..9763188a 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png index 53de90ef..a0850e12 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png index fcda4936..be27885c 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png index d3882063..9bc0fa08 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png index 4c6e4a0b..a4acb9e8 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png index 6e4afd1f..48b13255 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png and 
b/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png index 04006510..a617985c 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png index b0c804dc..ff8ea077 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png index 21d31774..ba00ac99 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png index 426e25e2..070f0ec9 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png index 7c489191..a87c426d 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png index 623bb465..19eb0bd6 100644 Binary files 
a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/macos/x64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/macos/x64_benchmarks.png deleted file mode 100644 index e377a803..00000000 Binary files a/doc/modules/ROOT/images/i128_graphs/macos/x64_benchmarks.png and /dev/null differ diff --git a/doc/modules/ROOT/images/i128_graphs/macos/x64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/macos/x64_relative_performance.png deleted file mode 100644 index f33de2ca..00000000 Binary files a/doc/modules/ROOT/images/i128_graphs/macos/x64_relative_performance.png and /dev/null differ diff --git a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png index 136cbe9e..1771b3af 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png index 02fb8df2..28156d8b 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png index 811ed34b..d12d4ad9 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png index 6d8d4b7b..44cab7da 100644 Binary 
files a/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png index f267154e..f6061bff 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png differ diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png index aadf4d5f..7b05c54e 100644 Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png index 35b756ef..656c35ad 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png index ab77fb2b..43f03412 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png index 7144e238..60353279 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png 
b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png index 706a4de8..c86d7035 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png index ad886c80..a2142d97 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png index b3b87b3b..6950202b 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png index e99ab249..1f8be96e 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png index 08ed1a3c..db03e704 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png index 5d6194f9..9dfc748e 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png 
differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png index ed9cbc71..eb200f34 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png index d3567a5e..93b48c97 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png index 3780492b..5b498e0e 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png index 989c040f..756dc31a 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png index 15f49776..36047908 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/macos/x64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/macos/x64_benchmarks.png deleted file mode 100644 index fd5c43e1..00000000 Binary files 
a/doc/modules/ROOT/images/u128_graphs/macos/x64_benchmarks.png and /dev/null differ diff --git a/doc/modules/ROOT/images/u128_graphs/macos/x64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/macos/x64_relative_performance.png deleted file mode 100644 index f8123403..00000000 Binary files a/doc/modules/ROOT/images/u128_graphs/macos/x64_relative_performance.png and /dev/null differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/ARM64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_benchmarks.png new file mode 100644 index 00000000..0ccdcf58 Binary files /dev/null and b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/ARM64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_relative_performance.png new file mode 100644 index 00000000..75ef018b Binary files /dev/null and b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/arm64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/arm64_benchmarks.png deleted file mode 100644 index 7f6b0ff7..00000000 Binary files a/doc/modules/ROOT/images/u128_graphs/windows/arm64_benchmarks.png and /dev/null differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/arm64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/arm64_relative_performance.png deleted file mode 100644 index 3338a211..00000000 Binary files a/doc/modules/ROOT/images/u128_graphs/windows/arm64_relative_performance.png and /dev/null differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png index 9c6fba5b..aa3d9c30 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png differ diff --git 
a/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png index 514ce6f3..5dc1c090 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png index 1841898b..038ff287 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png differ diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png index 9352658a..2446939c 100644 Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png differ diff --git a/doc/modules/ROOT/pages/i128_benchmarks.adoc b/doc/modules/ROOT/pages/i128_benchmarks.adoc index 915e1be5..af9b1f3c 100644 --- a/doc/modules/ROOT/pages/i128_benchmarks.adoc +++ b/doc/modules/ROOT/pages/i128_benchmarks.adoc @@ -24,12 +24,12 @@ On MSVC platforms we use as reference `std::_Signed128` from the header `<__msvc |=== | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128` -| Comparisons | 879535 | 748787 | 2210502 | 741269 -| Addition | 92165 | 92441 | 283528 | 92323 -| Subtraction | 92514 | 88390 | 668953 | 90394 -| Multiplication | 115727 | 90897 | 312723 | 89558 -| Division | 1234838 | 1352795 | 1320695 | 1200439 -| Modulo | 1193529 | 1256687 | 1287093 | 1293439 +| Comparisons | 2232997 | 1970941 | 5478483 | 1944089 +| Addition | 244246 | 292081 | 650160 | 227720 +| Subtraction | 220957 | 196953 | 1625774 | 315611 +| Multiplication | 433431 | 321168 | 1595688 | 304069 +| Division | 
4462364 | 4983165 | 4992819 | 4986970 +| Modulo | 4803576 | 5257406 | 4988844 | 5081814 |=== //// @@ -44,12 +44,12 @@ image::i128_graphs/linux/x64_relative_performance.png[x64 Relative Performance, |=== | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128` -| Comparisons | 3495621 | 2279914 | 5910287 | 3749448 -| Addition | 191514 | 133319 | 566860 | 164848 -| Subtraction | 131380 | 193984 | 1066509 | 193467 -| Multiplication | 236071 | 234594 | 864526 | 237676 -| Division | 2412757 | 2434752 | 2508755 | 2484139 -| Modulo | 2501357 | 2171828 | 2571959 | 2158203 +| Comparisons | 4115337 | 2169531 | 5914108 | 3725321 +| Addition | 194461 | 196244 | 543680 | 195216 +| Subtraction | 151441 | 97565 | 1161677 | 192729 +| Multiplication | 334847 | 232518 | 904461 | 240980 +| Division | 2403064 | 1848517 | 2493904 | 2431322 +| Modulo | 2235322 | 2159401 | 2535438 | 2321638 |=== //// @@ -64,12 +64,12 @@ image::i128_graphs/linux/ARM64_relative_performance.png[ARM64 Relative Performan |=== | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128` -| Comparisons | 14099505 | 12588237 | 21074294 | 13972778 -| Addition | 1151086 | 1374984 | 3303931 | 1195725 -| Subtraction | 1223119 | 753561 | 4224613 | 1295929 -| Multiplication | 1904542 | 2060986 | 3034387 | 1733150 -| Division | 8768877 | 7080113 | 7306287 | 7968543 -| Modulo | 8661233 | 7180650 | 8801605 | 8175497 +| Comparisons | 5171094 | 5069329 | 7457296 | 5343843 +| Addition | 625328 | 785936 | 1286888 | 670826 +| Subtraction | 667538 | 356865 | 2555881 | 741947 +| Multiplication | 904480 | 729911 | 1562062 | 786829 +| Division | 3758577 | 2211087 | 3095993 | 3940264 +| Modulo | 4218409 | 2330114 | 3684163 | 3849849 |=== //// @@ -107,12 +107,12 @@ NOTE: This platform has no hardware type so we compare relative to `boost::mp::i |=== | Operation | `int128_t` | `boost::mp::int128_t` -| Comparisons | 9530060 | 12168353 -| Addition | 785799 | 7777469 -| Subtraction | 778881 | 
8214089 -| Multiplication | 1148024 | 9477355 -| Division | 10337258 | 22857709 -| Modulo | 10438037 | 14848256 +| Comparisons | 10310201 | 14160000 +| Addition | 786499 | 7379646 +| Subtraction | 907051 | 7890190 +| Multiplication | 855780 | 10826565 +| Division | 10254664 | 24702433 +| Modulo | 10851123 | 17348307 |=== //// @@ -152,12 +152,12 @@ image::i128_graphs/linux/ARM32_relative_performance.png[ARM32 Relative Performan |=== | Operation | `std::_Signed128` | `int128_t` | `boost::mp::int128_t` -| Comparisons | 2186843 | 2142626 | 4854983 -| Addition | 186771 | 184598 | 2645943 -| Subtraction | 193660 | 186335 | 2925784 -| Multiplication | 402806 | 117413 | 3887479 -| Division | 1612873 | 2369701 | 6437280 -| Modulo | 1637135 | 2218627 | 6236026 +| Comparisons | 1879694 | 1894168 | 5198915 +| Addition | 141120 | 143877 | 2846799 +| Subtraction | 157649 | 156965 | 3027203 +| Multiplication | 266740 | 138754 | 4080611 +| Division | 1387560 | 1752869 | 6924406 +| Modulo | 1616895 | 1908345 | 6397442 |=== //// image::i128_graphs/windows/x64_benchmarks.png[x64 Benchmark Results, width=100%] @@ -171,12 +171,12 @@ image::i128_graphs/windows/x64_relative_performance.png[x64 Relative Performance |=== | Operation | `std::_Signed128` | `int128_t` | `boost::mp::int128_t` -| Comparisons | 911829 | 368104 | 2376802 -| Addition | 33233 | 34001 | 121700 -| Subtraction | 33411 | 34130 | 1488822 -| Multiplication | 117586 | 56324 | 1564799 -| Division | 1127267 | 1500725 | 2808293 -| Modulo | 1287100 | 1548073 | 2997474 +| Comparisons | 991273 | 391918 | 2551137 +| Addition | 34519 | 48953 | 1243326 +| Subtraction | 34184 | 36278 | 1387708 +| Multiplication | 126490 | 36781 | 1632232 +| Division | 1128432 | 1107571 | 2472959 +| Modulo | 1427629 | 1310481 | 2926904 |=== //// image::i128_graphs/windows/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100%] @@ -190,12 +190,12 @@ image::i128_graphs/windows/ARM64_relative_performance.png[ARM64 Relative Perform |=== | Operation | 
`std::_Signed128` | `int128_t` | `boost::mp::int128_t` -| Comparisons | 3187340 | 3046252 | 4269507 -| Addition | 185960 | 189165 | 2488618 -| Subtraction | 979025 | 192609 | 2783600 -| Multiplication | 1896082 | 3569921 | 4908622 -| Division | 5566403 | 4348306 | 6835035 -| Modulo | 4697289 | 4793845 | 6476032 +| Comparisons | 3832024 | 3823023 | 5568151 +| Addition | 232554 | 197092 | 3488510 +| Subtraction | 1198377 | 145823 | 4011233 +| Multiplication | 2921104 | 428925 | 6219931 +| Division | 7174578 | 7189000 | 9748526 +| Modulo | 5528639 | 7028725 | 9205892 |=== //// image::i128_graphs/windows/x86_benchmarks.png[x86_32 Benchmark Results, width=100%] @@ -212,12 +212,12 @@ image::i128_graphs/windows/x86_relative_performance.png[x86_32 Relative Performa |=== | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128` -| Comparisons | 133275 | 131953 | 340555 | 133509 -| Addition | 20203 | 17797 | 169909 | 20208 -| Subtraction | 20203 | 17832 | 172497 | 22199 -| Multiplication | 21496 | 20202 | 78269 | 20364 -| Division | 662767 | 682891 | 969277 | 663602 -| Modulo | 719179 | 692509 | 1026090 | 717897 +| Comparisons | 135259 | 134127 | 340037 | 136845 +| Addition | 20399 | 18575 | 169575 | 20429 +| Subtraction | 20156 | 18983 | 168041 | 20875 +| Multiplication | 20654 | 20860 | 69443 | 20651 +| Division | 668004 | 659823 | 976248 | 660963 +| Modulo | 664356 | 662282 | 1026487 | 665474 |=== //// @@ -225,23 +225,3 @@ image::i128_graphs/macos/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100 //// image::i128_graphs/macos/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%] - -=== x86_64 - -[cols="1,1,1,1"] -|=== -| Operation | `__int128` | `int128_t` | `boost::mp::int128_t` - -| Comparisons | 1628142 | 1748005 | 4318109 -| Addition | 224648 | 180393 | 925013 -| Subtraction | 212849 | 131062 | 1876834 -| Multiplication | 432205 | 407829 | 651209 -| Division | 3924951 | 2409106 | 3719183 -| Modulo | 3042060 | 2423738 | 
4443402 -|=== - -//// -image::i128_graphs/macos/x64_benchmarks.png[x64 Benchmark Results, width=100%] -//// - -image::i128_graphs/macos/x64_relative_performance.png[x64 Relative Performance, width=100%] diff --git a/doc/modules/ROOT/pages/u128_benchmarks.adoc b/doc/modules/ROOT/pages/u128_benchmarks.adoc index 88f9a03b..ff2e0089 100644 --- a/doc/modules/ROOT/pages/u128_benchmarks.adoc +++ b/doc/modules/ROOT/pages/u128_benchmarks.adoc @@ -24,12 +24,12 @@ On MSVC platforms we use as reference `std::_Unsigned128` from the header `<__ms |=== | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128` -| Comparisons | 785130 | 765065 | 1363581 | 766205 -| Addition | 90260 | 85758 | 89958 | 89255 -| Subtraction | 91143 | 91449 | 91224 | 89716 -| Multiplication | 111803 | 90069 | 113559 | 89660 -| Division | 1058435 | 901516 | 1040071 | 1044710 -| Modulo | 1003366 | 830830 | 1001701 | 978533 +| Comparisons | 2555576 | 2404372 | 3576079 | 2099066 +| Addition | 242772 | 241336 | 328546 | 301186 +| Subtraction | 372481 | 260064 | 287267 | 282908 +| Multiplication | 356366 | 312736 | 326328 | 277284 +| Division | 4481403 | 4498211 | 4602586 | 4290212 +| Modulo | 3965562 | 4506879 | 4487023 | 4247367 |=== //// @@ -44,12 +44,12 @@ image::u128_graphs/linux/x64_relative_performance.png[x64 Relative Performance, |=== | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128` -| Comparisons | 3427201 | 2078586 | 5026689 | 3753922 -| Addition | 194968 | 159662 | 587373 | 194070 -| Subtraction | 193067 | 161903 | 330052 | 140777 -| Multiplication | 263187 | 201333 | 972009 | 244420 -| Division | 2338258 | 2247175 | 2190856 | 2223032 -| Modulo | 2260200 | 2097760 | 2227961 | 2186750 +| Comparisons | 4077924 | 2335044 | 5360167 | 4184235 +| Addition | 137276 | 151553 | 184406 | 151276 +| Subtraction | 155498 | 133470 | 186793 | 149111 +| Multiplication | 218009 | 233811 | 324341 | 293431 +| Division | 2254781 | 1819447 | 
2211225 | 2152312 +| Modulo | 2274294 | 1743274 | 2324356 | 2381378 |=== //// @@ -64,12 +64,12 @@ image::u128_graphs/linux/ARM64_relative_performance.png[ARM64 Relative Performan |=== | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128` -| Comparisons | 6803419 | 6280326 | 7965082 | 10515929 -| Addition | 546801 | 618774 | 621572 | 1744226 -| Subtraction | 590011 | 359100 | 691515 | 1527622 -| Multiplication | 891753 | 1192196 | 944289 | 1839038 -| Division | 3827125 | 3201674 | 3997037 | 4913142 -| Modulo | 4925696 | 3360251 | 5144403 | 5422155 +| Comparisons | 7293935 | 6198402 | 8182815 | 13820009 +| Addition | 636224 | 707436 | 611849 | 1530136 +| Subtraction | 572225 | 350035 | 595266 | 1211168 +| Multiplication | 1040424 | 741789 | 899957 | 1843000 +| Division | 4191637 | 2593472 | 4106663 | 4883553 +| Modulo | 4156643 | 2133029 | 4398856 | 5011442 |=== //// @@ -107,12 +107,12 @@ NOTE: This platform has no hardware type so we compare relative to `boost::mp::u |=== | Operation | `uint128_t` | `boost::mp::uint128_t` -| Comparisons | 9000979 | 8722814 -| Addition | 898718 | 9912175 -| Subtraction | 778881 | 9773677 -| Multiplication | 1778273 | 8678420 -| Division | 8496503 | 18133965 -| Modulo | 9081442 | 11257837 +| Comparisons | 9545542 | 8582001 +| Addition | 686648 | 7261481 +| Subtraction | 618456 | 7968678 +| Multiplication | 859253 | 6746697 +| Division | 8271920 | 15931092 +| Modulo | 9932867 | 10242720 |=== //// @@ -152,12 +152,12 @@ image::u128_graphs/linux/ARM32_relative_performance.png[ARM32 Relative Performan |=== | Operation | `std::_Unsigned128` | `uint128_t` | `boost::mp::uint128_t` -| Comparisons | 2060556 | 1921174 | 3009890 -| Addition | 261475 | 106545 | 2710279 -| Subtraction | 178724 | 124181 | 3059187 -| Multiplication | 146063 | 136115 | 3495634 -| Division | 1332838 | 1360295 | 4852899 -| Modulo | 1465138 | 1471169 | 3926336 +| Comparisons | 2055229 | 1714007 | 2490543 +| Addition | 152603 | 116444 
| 2596037 +| Subtraction | 150576 | 116367 | 2901567 +| Multiplication | 131223 | 123694 | 3300491 +| Division | 1476783 | 1489919 | 4898388 +| Modulo | 1421066 | 1411521 | 3793762 |=== //// image::u128_graphs/windows/x64_benchmarks.png[x64 Benchmark Results, width=100%] @@ -171,18 +171,18 @@ image::u128_graphs/windows/x64_relative_performance.png[x64 Relative Performance |=== | Operation | `std::_Unsigned128` | `uint128_t` | `boost::mp::uint128_t` -| Comparisons | 3424403 | 2062167 | 5026689 -| Addition | 123659 | 133084 | 587373 -| Subtraction | 171721 | 99453 | 330052 -| Multiplication | 329287 | 283443 | 972009 -| Division | 2044821 | 1825020 | 2190856 -| Modulo | 2176318 | 1897933 | 2227961 +| Comparisons | 945196 | 405891 | 1306884 +| Addition | 37403 | 40039 | 1351728 +| Subtraction | 33927 | 38887 | 1594845 +| Multiplication | 74384 | 46406 | 1281286 +| Division | 992963 | 790846 | 2035065 +| Modulo | 1087702 | 861121 | 1702396 |=== //// -image::u128_graphs/windows/arm64_benchmarks.png[ARM64 Benchmark Results, width=100%] +image::u128_graphs/windows/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100%] //// -image::u128_graphs/windows/arm64_relative_performance.png[ARM64 Relative Performance, width=100%] +image::u128_graphs/windows/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%] === x86_32 @@ -190,12 +190,12 @@ image::u128_graphs/windows/arm64_relative_performance.png[ARM64 Relative Perform |=== | Operation | `std::_Unsigned128` | `uint128_t` | `boost::mp::uint128_t` -| Comparisons | 4215438 | 3883846 | 2852442 -| Addition | 199945 | 208436 | 3242910 -| Subtraction | 1206168 | 210874 | 3851129 -| Multiplication | 2282869 | 2680359 | 5378001 -| Division | 5516964 | 4328917 | 6948267 -| Modulo | 4551146 | 4330152 | 6294325 +| Comparisons | 4806287 | 3940703 | 2624013 +| Addition | 254275 | 202421 | 2961566 +| Subtraction | 1322877 | 207351 | 3703369 +| Multiplication | 2327500 | 2312040 | 4375417 +| Division | 5596877 | 5629510 | 
6756883 +| Modulo | 4616488 | 5696116 | 6409969 |=== //// image::u128_graphs/windows/x86_benchmarks.png[x86_32 Benchmark Results, width=100%] @@ -212,12 +212,12 @@ image::u128_graphs/windows/x86_relative_performance.png[x86_32 Relative Performa |=== | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128` -| Comparisons | 131902 | 133564 | 134182 | 132366 -| Addition | 20613 | 17912 | 40176 | 20178 -| Subtraction | 20484 | 18237 | 40311 | 20207 -| Multiplication | 20160 | 20580 | 43285 | 20049 -| Division | 686521 | 699201 | 945928 | 672398 -| Modulo | 777084 | 724648 | 953117 | 734229 +| Comparisons | 134425 | 134742 | 133107 | 135182 +| Addition | 20754 | 18389 | 20653 | 20929 +| Subtraction | 20552 | 18573 | 20590 | 20439 +| Multiplication | 20264 | 20150 | 20181 | 20228 +| Division | 685358 | 740877 | 913877 | 718985 +| Modulo | 733080 | 699666 | 951657 | 719500 |=== //// @@ -225,23 +225,3 @@ image::u128_graphs/macos/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100 //// image::u128_graphs/macos/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%] - -=== x86_64 - -[cols="1,1,1,1"] -|=== -| Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` - -| Comparisons | 688225 | 712352 | 689146 -| Addition | 104921 | 124992 | 137819 -| Subtraction | 129150 | 102302 | 153484 -| Multiplication | 120363 | 119652 | 164100 -| Division | 2333812 | 1981469 | 2784139 -| Modulo | 2621949 | 2219481 | 2736682 -|=== - -//// -image::u128_graphs/macos/x64_benchmarks.png[x64 Benchmark Results, width=100%] -//// - -image::u128_graphs/macos/x64_relative_performance.png[x64 Relative Performance, width=100%] diff --git a/doc/plots.py b/doc/plots.py index f1150102..5196c0ab 100644 --- a/doc/plots.py +++ b/doc/plots.py @@ -1,225 +1,356 @@ +#!/usr/bin/env python3 +"""Generate every Boost.Int128 benchmark graph and write it straight into the +documentation images tree. 
+ +Each entry produces two PNGs whose names match the image:: directives in the +.adoc pages: + + modules/ROOT/images/{sign}_graphs/{os}/{arch}_benchmarks.png + modules/ROOT/images/{sign}_graphs/{os}/{arch}_relative_performance.png + +To refresh a platform's numbers, edit its 'data' block here and re-run; the +right file is overwritten automatically. +""" + +import os + +import matplotlib +matplotlib.use('Agg') # headless backend: write files, never open a window import matplotlib.pyplot as plt import numpy as np import pandas as pd -""" -# ARM64 MSVC -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'std::_Unsigned128': [878929, 32788, 33627, 68120, 925583, 1104772], - 'uint128_t': [259725, 33723, 36799, 35334, 1020148, 1143344], - 'boost::mp::uint128_t': [1246502, 1437452, 1648131, 1459418, 2216648, 2089105] -} -""" +# Operation order shared by every dataset (matches the x-axis of all charts). +OPERATIONS = ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'] -""" -# x86 MSVC -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'std::_Unsigned128': [4215438, 199945, 1206168, 2282869, 5516964, 4551146], - 'uint128_t': [3883846, 208436, 210874, 2680359, 4328917, 4330152], - 'boost::mp::uint128_t': [2852442, 3242910, 3851129, 5378001, 6948267, 6294325] -} -""" +# One entry per published graph. Fields: +# sign : 'u128' (unsigned) or 'i128' (signed) -> selects the *_graphs folder +# os : 'linux' | 'macos' | 'windows' -> selects the sub-folder +# arch : file stem used by the .adoc image:: directives (casing must match the +# image:: targets exactly; ARM stems are always upper-case, e.g. 'ARM64') +# title : chart heading prefix, e.g. 
'GCC 14 - x64' +# data : implementation -> timings in microseconds, in OPERATIONS order +# The normalization baseline for the relative chart is detected automatically +# (native type where present, otherwise Boost.Multiprecision). +DATASETS = [ + # ----------------------------- unsigned, Linux ----------------------------- + { + 'sign': 'u128', 'os': 'linux', 'arch': 'x64', 'title': 'GCC 16 - x64', + 'data': { + 'unsigned __int128': [2555576, 242772, 372481, 356366, 4481403, 3965562], + 'uint128_t': [2404372, 241336, 260064, 312736, 4498211, 4506879], + 'boost::mp::uint128_t': [3576079, 328546, 287267, 326328, 4602586, 4487023], + 'absl::uint128': [2099066, 301186, 282908, 277284, 4290212, 4247367], + }, + }, + { + 'sign': 'u128', 'os': 'linux', 'arch': 'ARM64', 'title': 'GCC 13 - ARM64', + 'data': { + 'unsigned __int128': [4077924, 137276, 155498, 218009, 2254781, 2274294], + 'uint128_t': [2335044, 151553, 133470, 233811, 1819447, 1743274], + 'boost::mp::uint128_t': [5360167, 184406, 186793, 324341, 2211225, 2324356], + 'absl::uint128': [4184235, 151276, 149111, 293431, 2152312, 2381378], + }, + }, + { + 'sign': 'u128', 'os': 'linux', 'arch': 's390x', 'title': 'GCC 13 - s390x', + 'data': { + 'unsigned __int128': [7293935, 636224, 572225, 1040424, 4191637, 4156643], + 'uint128_t': [6198402, 707436, 350035, 741789, 2593472, 2133029], + 'boost::mp::uint128_t': [8182815, 611849, 595266, 899957, 4106663, 4398856], + 'absl::uint128': [13820009, 1530136, 1211168, 1843000, 4883553, 5011442], + }, + }, + { + 'sign': 'u128', 'os': 'linux', 'arch': 'ppc64le', 'title': 'GCC 14 - ppc64le', + 'data': { + 'unsigned __int128': [5242604, 221776, 222894, 194494, 4821119, 4955570], + 'uint128_t': [4450958, 193063, 175259, 192929, 4896360, 4273487], + 'boost::mp::uint128_t': [5704848, 847504, 786659, 795187, 5344637, 5407877], + }, + }, + { + 'sign': 'u128', 'os': 'linux', 'arch': 'x86', 'title': 'GCC 16 - x86_32', + 'data': { + 'uint128_t': [9545542, 686648, 618456, 859253, 8271920, 
9932867], + 'boost::mp::uint128_t': [8582001, 7261481, 7968678, 6746697, 15931092, 10242720], + }, + }, + { + 'sign': 'u128', 'os': 'linux', 'arch': 'ARM32', 'title': 'GCC 14 - ARM32', + 'data': { + 'uint128_t': [5286033, 454715, 487190, 1471479, 19868087, 20332627], + 'boost::mp::uint128_t': [4538707, 5543856, 6465126, 8246098, 32820805, 27238658], + }, + }, + # ---------------------------- unsigned, Windows ---------------------------- + { + 'sign': 'u128', 'os': 'windows', 'arch': 'x64', 'title': 'MSVC 14.5 - x64', + 'data': { + 'std::_Unsigned128': [2055229, 152603, 150576, 131223, 1476783, 1421066], + 'uint128_t': [1714007, 116444, 116367, 123694, 1489919, 1411521], + 'boost::mp::uint128_t': [2490543, 2596037, 2901567, 3300491, 4898388, 3793762], + }, + }, + { + 'sign': 'u128', 'os': 'windows', 'arch': 'ARM64', 'title': 'MSVC 14.5 - ARM64', + 'data': { + 'std::_Unsigned128': [945196, 37403, 33927, 74384, 992963, 1087702], + 'uint128_t': [405891, 40039, 38887, 46406, 790846, 861121], + 'boost::mp::uint128_t': [1306884, 1351728, 1594845, 1281286, 2035065, 1702396], + }, + }, + { + 'sign': 'u128', 'os': 'windows', 'arch': 'x86', 'title': 'MSVC 14.5 - x86_32', + 'data': { + 'std::_Unsigned128': [4806287, 254275, 1322877, 2327500, 5596877, 4616488], + 'uint128_t': [3940703, 202421, 207351, 2312040, 5629510, 5696116], + 'boost::mp::uint128_t': [2624013, 2961566, 3703369, 4375417, 6756883, 6409969], + }, + }, + # ----------------------------- unsigned, macOS ----------------------------- + { + 'sign': 'u128', 'os': 'macos', 'arch': 'ARM64', 'title': 'Clang 22 - ARM64', + 'data': { + 'unsigned __int128': [134425, 20754, 20552, 20264, 685358, 733080], + 'uint128_t': [134742, 18389, 18573, 20150, 740877, 699666], + 'boost::mp::uint128_t': [133107, 20653, 20590, 20181, 913877, 951657], + 'absl::uint128': [135182, 20929, 20439, 20228, 718985, 719500], + }, + }, + # ------------------------------ signed, Linux ------------------------------ + { + 'sign': 'i128', 'os': 
'linux', 'arch': 'x64', 'title': 'GCC 16 - x64', + 'data': { + '`__int128`': [2232997, 244246, 220957, 433431, 4462364, 4803576], + 'int128_t': [1970941, 292081, 196953, 321168, 4983165, 5257406], + 'boost::mp::int128_t': [5478483, 650160, 1625774, 1595688, 4992819, 4988844], + 'absl::int128': [1944089, 227720, 315611, 304069, 4986970, 5081814], + }, + }, + { + 'sign': 'i128', 'os': 'linux', 'arch': 'ARM64', 'title': 'GCC 13 - ARM64', + 'data': { + '`__int128`': [4115337, 194461, 151441, 334847, 2403064, 2235322], + 'int128_t': [2169531, 196244, 97565, 232518, 1848517, 2159401], + 'boost::mp::int128_t': [5914108, 543680, 1161677, 904461, 2493904, 2535438], + 'absl::int128': [3725321, 195216, 192729, 240980, 2431322, 2321638], + }, + }, + { + 'sign': 'i128', 'os': 'linux', 'arch': 's390x', 'title': 'GCC 13 - s390x', + 'data': { + '`__int128`': [5171094, 625328, 667538, 904480, 3758577, 4218409], + 'int128_t': [5069329, 785936, 356865, 729911, 2211087, 2330114], + 'boost::mp::int128_t': [7457296, 1286888, 2555881, 1562062, 3095993, 3684163], + 'absl::int128': [5343843, 670826, 741947, 786829, 3940264, 3849849], + }, + }, + { + 'sign': 'i128', 'os': 'linux', 'arch': 'ppc64le', 'title': 'GCC 14 - ppc64le', + 'data': { + '`__int128`': [4538094, 221708, 222629, 193315, 5607581, 5623562], + 'int128_t': [5796198, 191841, 174273, 191785, 4669820, 4750314], + 'boost::mp::int128_t': [13907323, 1177034, 1861166, 878393, 5616217, 5641480], + }, + }, + { + 'sign': 'i128', 'os': 'linux', 'arch': 'x86', 'title': 'GCC 16 - x86_32', + 'data': { + 'int128_t': [10310201, 786499, 907051, 855780, 10254664, 10851123], + 'boost::mp::int128_t': [14160000, 7379646, 7890190, 10826565, 24702433, 17348307], + }, + }, + { + 'sign': 'i128', 'os': 'linux', 'arch': 'ARM32', 'title': 'GCC 14 - ARM32', + 'data': { + 'int128_t': [6149439, 457850, 488321, 1793874, 17738614, 18064819], + 'boost::mp::int128_t': [6432579, 5669571, 7464427, 11410321, 38956122, 30144743], + }, + }, + # 
----------------------------- signed, Windows ----------------------------- + { + 'sign': 'i128', 'os': 'windows', 'arch': 'x64', 'title': 'MSVC 14.5 - x64', + 'data': { + 'std::_Signed128': [1879694, 141120, 157649, 266740, 1387560, 1616895], + 'int128_t': [1894168, 143877, 156965, 138754, 1752869, 1908345], + 'boost::mp::int128_t': [5198915, 2846799, 3027203, 4080611, 6924406, 6397442], + }, + }, + { + 'sign': 'i128', 'os': 'windows', 'arch': 'ARM64', 'title': 'MSVC 14.3 - ARM64', + 'data': { + 'std::_Signed128': [991273, 34519, 34184, 126490, 1128432, 1427629], + 'int128_t': [391918, 48953, 36278, 36781, 1107571, 1310481], + 'boost::mp::int128_t': [2551137, 1243326, 1387708, 1632232, 2472959, 2926904], + }, + }, + { + 'sign': 'i128', 'os': 'windows', 'arch': 'x86', 'title': 'MSVC 14.5 - x86_32', + 'data': { + 'std::_Signed128': [3832024, 232554, 1198377, 2921104, 7174578, 5528639], + 'int128_t': [3823023, 197092, 145823, 428925, 7189000, 7028725], + 'boost::mp::int128_t': [5568151, 3488510, 4011233, 6219931, 9748526, 9205892], + }, + }, + # ------------------------------ signed, macOS ------------------------------ + { + 'sign': 'i128', 'os': 'macos', 'arch': 'ARM64', 'title': 'Clang 22 - ARM64', + 'data': { + '`__int128`': [135259, 20399, 20156, 20654, 668004, 664356], + 'int128_t': [134127, 18575, 18983, 20860, 659823, 662282], + 'boost::mp::int128_t': [340037, 169575, 168041, 69443, 976248, 1026487], + 'absl::int128': [136845, 20429, 20875, 20651, 660963, 665474], + }, + }, +] -""" -# x64 MSVC -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'std::_Unsigned128': [2060556, 261475, 178724, 146063, 1332838, 1465138], - 'uint128_t': [1921174, 106545, 124181, 136115, 1360295, 1471169], - 'boost::mp::uint128_t': [3009890, 2710279, 3059187, 3495634, 4852899, 3926336] -} -""" -""" -# ARM64 macOS -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 
'unsigned __int128': [131902, 20613, 20484, 20160, 686521, 777084], - 'uint128_t': [133564, 17912, 18237, 20580, 699201, 724648], - 'boost::mp::uint128_t': [134182, 40176, 40311, 43285, 945928, 953117], - 'absl::uint128': [132366, 20178, 20207, 20049, 672398, 734229] -} -""" -""" -# x64 macOS -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'unsigned __int128': [688225, 104921, 129150, 120363, 2333812, 2621949], - 'uint128_t': [712352, 124992, 102302, 119652, 1981469, 2219481], - 'boost::mp::uint128_t': [689146, 137819, 153484, 164100, 2784139, 2736682] -} -""" +# Bar colors by speed rank within an operation: green best, yellow second, red rest. +RANK_COLORS = {1: '#90EE90', 2: '#FFFFE0'} +SLOW_COLOR = '#FFB6C1' -# Linux x64 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'unsigned __int128': [785130, 90260, 91143, 111803, 1058435, 1003366], - 'uint128_t': [765065, 85758, 91449, 90069, 901516, 830830], - 'boost::mp::uint128_t': [1363581, 89958, 91224, 113559, 1040071, 1001701], - 'absl::uint128': [766205, 89255, 89716, 89660, 1044710, 978533] +# Baseline candidates in priority order; first one present in a dataset wins. 
+BASELINE_PRIORITY = { + 'u128': ['unsigned __int128', 'std::_Unsigned128', 'boost::mp::uint128_t'], + 'i128': ['`__int128`', '__int128', 'std::_Signed128', 'boost::mp::int128_t'], } -""" -# Linux ARM64 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'unsigned __int128': [3427201, 194968, 193067, 263187, 2338258, 2260200], - 'uint128_t': [2078586, 159662, 161903, 201333, 2247175, 2097760], - 'boost::mp::uint128_t': [5026689, 587373, 330052, 972009, 2190856, 2227961], - 'absl::uint128': [3753922, 194070, 140777, 244420, 2223032, 2186750] -} -""" -""" -# Linux S390x -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'unsigned __int128': [6803419, 546801, 590011, 891753, 3827125, 4925696], - 'uint128_t': [6280326, 618774, 359100, 1192196, 3201674, 3360251], - 'boost::mp::uint128_t': [7965082, 621572, 691515, 944289, 3997037, 5144403], - 'absl::uint128': [10515929, 1744226, 1527622, 1839038, 4913142, 5422155] -} -""" -""" -# Linux ppc64le -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'unsigned __int128': [5242604, 221776, 222894, 194494, 4821119, 4955570], - 'uint128_t': [4450958, 193063, 175259, 192929, 4896360, 4273487], - 'boost::mp::uint128_t': [5704848, 847504, 786659, 795187, 5344637, 5407877] -} -""" -df = pd.DataFrame(data) - -# Function to determine color based on ranking -def get_colors_by_rank(row): - values = row[1:].values - ranks = np.argsort(values) + 1 - colors = [] - for rank in ranks: - if rank == 1: - colors.append('#90EE90') # Light Green - Best - elif rank == 2: - colors.append('#FFFFE0') # Light Yellow - Second - else: - colors.append('#FFB6C1') # Light Red - Third - return colors - -# Create figure with subplots -fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) - -# Prepare data -operations = df['Operation'] -x = np.arange(len(operations)) -width = 0.25 - 
-# Get implementation names -implementations = df.columns[1:] - -# Plot 1: Regular scale bar chart with color coding -for i, (idx, row) in enumerate(df.iterrows()): - colors = get_colors_by_rank(row) - for j, impl in enumerate(implementations): - ax1.bar(x[i] + (j-1)*width, row[impl], width, - color=colors[j], edgecolor='black', linewidth=0.5, - label=impl if i == 0 else "") - -ax1.set_xlabel('Operations', fontsize=12) -ax1.set_ylabel('Time (nanoseconds)', fontsize=12) -ax1.set_title('GCC 14 - x64 Benchmark Results', fontsize=14, fontweight='bold') -ax1.set_xticks(x) -ax1.set_xticklabels(operations, rotation=45, ha='right') -ax1.legend(loc='upper left') -ax1.grid(axis='y', alpha=0.3) - -# Add value labels on bars -for i, (idx, row) in enumerate(df.iterrows()): - for j, impl in enumerate(implementations): - ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}', - ha='center', va='bottom', fontsize=8, rotation=90) - -# Plot 2: Log scale for better visualization -for i, impl in enumerate(implementations): - bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5) - - # Color each bar based on its rank within operation - for j, bar in enumerate(bars): - operation_values = df.iloc[j, 1:].values - rank = np.argsort(operation_values).tolist().index(i) + 1 - if rank == 1: - bar.set_facecolor('#90EE90') - elif rank == 2: - bar.set_facecolor('#FFFFE0') - else: - bar.set_facecolor('#FFB6C1') - -ax2.set_xlabel('Operations', fontsize=12) -ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12) -ax2.set_title('GCC 14 - x64 Benchmark Results (Log Scale)', fontsize=14, fontweight='bold') -ax2.set_yscale('log') -ax2.set_xticks(x) -ax2.set_xticklabels(operations, rotation=45, ha='right') -ax2.legend(loc='upper left') -ax2.grid(axis='y', alpha=0.3, which='both') - -plt.tight_layout() -plt.savefig('x64_benchmarks.png', dpi=300, bbox_inches='tight') -plt.show() - -# Create a normalized performance chart -fig3, ax3 = 
plt.subplots(figsize=(10, 6)) - -# Normalize data relative to unsigned __int128 -normalized_df = df.copy() -for col in implementations: - normalized_df[col] = df[col] / df['unsigned __int128'] - -# Plot normalized bars -for i, impl in enumerate(implementations): - if impl == 'unsigned __int128': - continue # Skip since it's always 1.0 - bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width, - label=impl, edgecolor='black', linewidth=0.5) - - # Add value labels - for j, bar in enumerate(bars): - height = bar.get_height() - ax3.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.2f}x', ha='center', va='bottom', fontsize=9) - -# Add reference line at 1.0 -ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='unsigned __int128 baseline') - -ax3.set_xlabel('Operations', fontsize=12) -ax3.set_ylabel('Relative Performance (vs unsigned __int128)', fontsize=12) -ax3.set_title('Relative Performance Comparison - x64', fontsize=14, fontweight='bold') -ax3.set_xticks(x) -ax3.set_xticklabels(operations, rotation=45, ha='right') -ax3.legend() -ax3.grid(axis='y', alpha=0.3) - -# Add interpretation text -ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes, - fontsize=10, verticalalignment='top', style='italic') - -plt.tight_layout() -plt.savefig('x64_relative_performance.png', dpi=300, bbox_inches='tight') -plt.show() - -# Generate summary statistics -print("\nPerformance Summary (x64):") -print("-" * 50) -for impl in implementations: - if impl == 'unsigned __int128': - continue - avg_ratio = normalized_df[impl].mean() - print(f"{impl}: {avg_ratio:.2f}x average vs unsigned __int128") - -print("\nBest performer by operation:") -print("-" * 50) -for i, op in enumerate(operations): - row_data = df.iloc[i, 1:] - best_impl = row_data.idxmin() - best_time = row_data.min() - print(f"{op}: {best_impl} ({best_time:,} ns)") +# Pick the column every other implementation is compared against. 
+def detect_baseline(impls, sign): + for candidate in BASELINE_PRIORITY[sign]: + if candidate in impls: + return candidate + return impls[0] + + +# 1-based speed rank per implementation for one operation row (1 == fastest). +def speed_ranks(values): + return np.argsort(np.argsort(values)) + 1 + + +def color_for_rank(rank): + return RANK_COLORS.get(rank, SLOW_COLOR) + + +# Build the two-panel benchmark figure (linear + log) and save it. +def save_benchmark_chart(df, impls, x, width, title, path): + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) + + # Speed rank (1 == fastest) of each implementation, per operation row. + rank_by_op = [speed_ranks(df.iloc[op][impls].values) for op in range(len(df))] + + # Linear panel: one rank-colored bar per implementation within each operation. + for op_idx, (_, row) in enumerate(df.iterrows()): + ranks = rank_by_op[op_idx] + for j, impl in enumerate(impls): + ax1.bar(x[op_idx] + (j - 1) * width, row[impl], width, + color=color_for_rank(ranks[j]), edgecolor='black', linewidth=0.5, + label=impl if op_idx == 0 else "") + ax1.text(x[op_idx] + (j - 1) * width, row[impl], f'{row[impl]:,}', + ha='center', va='bottom', fontsize=8, rotation=90) + + ax1.set_xlabel('Operations', fontsize=12) + ax1.set_ylabel('Time (microseconds)', fontsize=12) + ax1.set_title(f'{title} Benchmark Results', fontsize=14, fontweight='bold') + ax1.set_xticks(x) + ax1.set_xticklabels(OPERATIONS, rotation=45, ha='right') + ax1.legend(loc='upper left') + ax1.grid(axis='y', alpha=0.3) + + # Log panel: same bars and rank colors, log y-axis for the wide dynamic range. + # Draw once per implementation (carries the legend label), then recolor each + # bar by its rank so the legend entry stays attached. 
+ for j, impl in enumerate(impls): + bars = ax2.bar(x + (j - 1) * width, df[impl], width, label=impl, + edgecolor='black', linewidth=0.5) + for op_idx, bar in enumerate(bars): + bar.set_facecolor(color_for_rank(rank_by_op[op_idx][j])) + + ax2.set_xlabel('Operations', fontsize=12) + ax2.set_ylabel('Time (microseconds) - Log Scale', fontsize=12) + ax2.set_title(f'{title} Benchmark Results (Log Scale)', fontsize=14, fontweight='bold') + ax2.set_yscale('log') + ax2.set_xticks(x) + ax2.set_xticklabels(OPERATIONS, rotation=45, ha='right') + ax2.legend(loc='upper left') + ax2.grid(axis='y', alpha=0.3, which='both') + + fig.tight_layout() + fig.savefig(path, dpi=300, bbox_inches='tight') + plt.close(fig) + + +# Build the relative-performance figure (everything normalized to baseline) and save it. +def save_relative_chart(df, impls, x, width, title, baseline, path): + fig, ax = plt.subplots(figsize=(10, 6)) + + normalized = df[impls].div(df[baseline], axis=0) + for i, impl in enumerate(impls): + if impl == baseline: + continue + bars = ax.bar(x + (i - 1.5) * width, normalized[impl], width, + label=impl, edgecolor='black', linewidth=0.5) + for bar in bars: + height = bar.get_height() + ax.text(bar.get_x() + bar.get_width() / 2., height, + f'{height:.2f}x', ha='center', va='bottom', fontsize=9) + + # Headroom above the tallest bar so its value label and the "lower is better" + # note in the top-left corner never collide with the bars. 
+ plotted = [impl for impl in impls if impl != baseline] + tallest = float(normalized[plotted].to_numpy().max()) + ax.set_ylim(top=max(tallest * 1.20, 1.12)) + + ax.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, + label=f'{baseline} baseline') + ax.set_xlabel('Operations', fontsize=12) + ax.set_ylabel(f'Relative Performance (vs {baseline})', fontsize=12) + ax.set_title(f'Relative Performance Comparison - {title}', fontsize=14, fontweight='bold') + ax.set_xticks(x) + ax.set_xticklabels(OPERATIONS, rotation=45, ha='right') + ax.legend() + ax.grid(axis='y', alpha=0.3) + ax.text(0.02, 0.98, 'Lower is better', transform=ax.transAxes, + fontsize=10, verticalalignment='top', style='italic') + + fig.tight_layout() + fig.savefig(path, dpi=300, bbox_inches='tight') + plt.close(fig) + + +# Render and save both charts for a single dataset; return the two output paths. +def render_dataset(entry, images_dir): + impls = list(entry['data'].keys()) + df = pd.DataFrame({'Operation': OPERATIONS, **entry['data']}) + baseline = detect_baseline(impls, entry['sign']) + + x = np.arange(len(OPERATIONS)) + width = 0.25 + + out_dir = os.path.join(images_dir, f"{entry['sign']}_graphs", entry['os']) + os.makedirs(out_dir, exist_ok=True) + bench_path = os.path.join(out_dir, f"{entry['arch']}_benchmarks.png") + rel_path = os.path.join(out_dir, f"{entry['arch']}_relative_performance.png") + + save_benchmark_chart(df, impls, x, width, entry['title'], bench_path) + save_relative_chart(df, impls, x, width, entry['title'], baseline, rel_path) + return bench_path, rel_path + + +def main(): + script_dir = os.path.dirname(os.path.abspath(__file__)) + images_dir = os.path.join(script_dir, 'modules', 'ROOT', 'images') + + written = 0 + for entry in DATASETS: + bench_path, rel_path = render_dataset(entry, images_dir) + for path in (bench_path, rel_path): + print(f"wrote {os.path.relpath(path, script_dir)}") + written += 1 + + print(f"\nDone: {written} images across {len(DATASETS)} platforms.") + + 
+if __name__ == '__main__': + main() diff --git a/doc/plots_32bit.py b/doc/plots_32bit.py deleted file mode 100644 index 4e98830e..00000000 --- a/doc/plots_32bit.py +++ /dev/null @@ -1,153 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -""" -# Linux x86_32 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'uint128_t': [9000979, 898718, 778881, 1778273, 8496503, 9081442], - 'boost::mp::uint128_t': [8722814, 9912175, 9773677, 8678420, 18133965, 11257837] -} -""" -# Linux ARM32 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'uint128_t': [5286033, 454715, 487190, 1471479, 19868087, 20332627], - 'boost::mp::uint128_t': [4538707, 5543856, 6465126, 8246098, 32820805, 27238658] -} - -df = pd.DataFrame(data) - -# Function to determine color based on ranking -def get_colors_by_rank(row): - values = row[1:].values - ranks = np.argsort(values) + 1 - colors = [] - for rank in ranks: - if rank == 1: - colors.append('#90EE90') # Light Green - Best - elif rank == 2: - colors.append('#FFFFE0') # Light Yellow - Second - else: - colors.append('#FFB6C1') # Light Red - Third - return colors - -# Create figure with subplots -fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) - -# Prepare data -operations = df['Operation'] -x = np.arange(len(operations)) -width = 0.25 - -# Get implementation names -implementations = df.columns[1:] - -# Plot 1: Regular scale bar chart with color coding -for i, (idx, row) in enumerate(df.iterrows()): - colors = get_colors_by_rank(row) - for j, impl in enumerate(implementations): - ax1.bar(x[i] + (j-1)*width, row[impl], width, - color=colors[j], edgecolor='black', linewidth=0.5, - label=impl if i == 0 else "") - -ax1.set_xlabel('Operations', fontsize=12) -ax1.set_ylabel('Time (nanoseconds)', fontsize=12) -ax1.set_title('GCC 14 - ARM32 Benchmark Results', fontsize=14, fontweight='bold') 
-ax1.set_xticks(x) -ax1.set_xticklabels(operations, rotation=45, ha='right') -ax1.legend(loc='upper left') -ax1.grid(axis='y', alpha=0.3) - -# Add value labels on bars -for i, (idx, row) in enumerate(df.iterrows()): - for j, impl in enumerate(implementations): - ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}', - ha='center', va='bottom', fontsize=8, rotation=90) - -# Plot 2: Log scale for better visualization -for i, impl in enumerate(implementations): - bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5) - - # Color each bar based on its rank within operation - for j, bar in enumerate(bars): - operation_values = df.iloc[j, 1:].values - rank = np.argsort(operation_values).tolist().index(i) + 1 - if rank == 1: - bar.set_facecolor('#90EE90') - elif rank == 2: - bar.set_facecolor('#FFFFE0') - else: - bar.set_facecolor('#FFB6C1') - -ax2.set_xlabel('Operations', fontsize=12) -ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12) -ax2.set_title('GCC 14 - ARM32 Benchmark Results (Log Scale)', fontsize=14, fontweight='bold') -ax2.set_yscale('log') -ax2.set_xticks(x) -ax2.set_xticklabels(operations, rotation=45, ha='right') -ax2.legend(loc='upper left') -ax2.grid(axis='y', alpha=0.3, which='both') - -plt.tight_layout() -plt.savefig('ARM32_benchmarks.png', dpi=300, bbox_inches='tight') -plt.show() - -# Create a normalized performance chart -fig3, ax3 = plt.subplots(figsize=(10, 6)) - -# Normalize data relative to boost::mp::uint128_t -normalized_df = df.copy() -for col in implementations: - normalized_df[col] = df[col] / df['boost::mp::uint128_t'] - -# Plot normalized bars -for i, impl in enumerate(implementations): - if impl == 'boost::mp::uint128_t': - continue # Skip since it's always 1.0 - bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width, - label=impl, edgecolor='black', linewidth=0.5) - - # Add value labels - for j, bar in enumerate(bars): - height = bar.get_height() - ax3.text(bar.get_x() + 
bar.get_width()/2., height, - f'{height:.2f}x', ha='center', va='bottom', fontsize=9) - -# Add reference line at 1.0 -ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='boost::mp::uint128_t baseline') - -ax3.set_xlabel('Operations', fontsize=12) -ax3.set_ylabel('Relative Performance (vs boost::mp::uint128_t)', fontsize=12) -ax3.set_title('Relative Performance Comparison - ARM3232', fontsize=14, fontweight='bold') -ax3.set_xticks(x) -ax3.set_xticklabels(operations, rotation=45, ha='right') -ax3.legend() -ax3.grid(axis='y', alpha=0.3) - -# Add interpretation text -ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes, - fontsize=10, verticalalignment='top', style='italic') - -plt.tight_layout() -plt.savefig('ARM32_relative_performance.png', dpi=300, bbox_inches='tight') -plt.show() - -# Generate summary statistics -print("\nPerformance Summary (x64):") -print("-" * 50) -for impl in implementations: - if impl == 'unsigned __int128': - continue - avg_ratio = normalized_df[impl].mean() - print(f"{impl}: {avg_ratio:.2f}x average vs unsigned __int128") - -print("\nBest performer by operation:") -print("-" * 50) -for i, op in enumerate(operations): - row_data = df.iloc[i, 1:] - best_impl = row_data.idxmin() - best_time = row_data.min() - print(f"{op}: {best_impl} ({best_time:,} ns)") - diff --git a/doc/signed_plots.py b/doc/signed_plots.py deleted file mode 100644 index d34e14fb..00000000 --- a/doc/signed_plots.py +++ /dev/null @@ -1,223 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -""" -# Linux x64 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - '`__int128`': [879535, 92165, 92514, 115727, 1234838, 1193529], - 'int128_t': [748787, 92441, 88390, 90897, 1352795, 1256687], - 'boost::mp::int128_t': [2210502, 283528, 668953, 312723, 1320695, 1287093], - 'absl::int128': [741269, 92323, 90394, 89558, 1200439, 1293439], -} -""" -""" -# Linux ARM64 -data = { 
- 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - '`__int128`': [3495621, 191514, 131380, 236071, 2412757, 2501357], - 'int128_t': [2279914, 133319, 193984, 234594, 2434752, 2171828], - 'boost::mp::int128_t': [5910287, 566860, 1066509, 864526, 2508755, 2571959], - 'absl::int128': [3749448, 164848, 193467, 237676, 2484139, 2158203] -} - -""" - -# Linux s390x -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - '`__int128`': [14099505, 1151086, 1223119, 1904542, 8768877, 8661233], - 'int128_t': [12588237, 1374984, 753561, 2060986, 7080113, 7180650], - 'boost::mp::int128_t': [21074294, 3303931, 4224613, 3034387, 7306287, 8801605], - 'absl::int128': [13972778, 1195725, 1295929, 1733150, 7968543, 8175497], -} - -""" -# Linux ppc64le -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - '`__int128`': [4538094, 221708, 222629, 193315, 5607581, 5623562], - 'int128_t': [5796198, 191841, 174273, 191785, 4669820, 4750314], - 'boost::mp::int128_t': [13907323, 1177034, 1861166, 878393, 5616217, 5641480] -} -""" -""" -# macos x64 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - '`__int128`': [1628142, 224648, 212849, 432205, 3924951, 3042060], - 'int128_t': [1748005, 180393, 131062, 407829, 2409106, 2423738], - 'boost::mp::int128_t': [4318109, 925013, 1876834, 651209, 3719183, 4443402] -} -""" -""" -# macos ARM -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - '`__int128`': [133275, 20203, 20203, 21496, 662767, 719179], - 'int128_t': [131953, 17797, 17832, 20202, 682891, 692509], - 'boost::mp::int128_t': [340555, 169909, 172497, 78269, 969277, 1026090], - 'absl::int128': [133509, 20208, 22199, 20364, 663602, 717897] -} -""" -""" -# MSVC 14.3 - ARM64 -data = { - 'Operation': ['Comparisons', 
'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'std::_Signed128': [911829, 33233, 33411, 117586, 1127267, 1287100], - 'int128_t': [368104, 34001, 34130, 56324, 1500725, 1548073], - 'boost::mp::int128_t': [2376802, 121700, 1488822, 1564799, 2808293, 2997474] -} -""" -""" -# MSVC 14.3 - x64 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'std::_Signed128': [2186843, 186771, 193660, 402806, 1612873, 1637135], - 'int128_t': [2142626, 184598, 186335, 117413, 2369701, 2218627], - 'boost::mp::int128_t': [4854983, 2645943, 2925784, 3887479, 6437280, 6236026] -} -""" -""" -# MSVC 14.3 - x86 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'std::_Signed128': [3495288, 199936, 1089785, 2653505, 7267297, 5779771], - 'int128_t': [3520950, 212116, 210354, 2595285, 5516460, 5842785], - 'boost::mp::int128_t': [7877534, 3477656, 4108539, 7030276, 10229356, 9069360] -} -""" -df = pd.DataFrame(data) - -# Function to determine color based on ranking -def get_colors_by_rank(row): - values = row[1:].values - ranks = np.argsort(values) + 1 - colors = [] - for rank in ranks: - if rank == 1: - colors.append('#90EE90') # Light Green - Best - elif rank == 2: - colors.append('#FFFFE0') # Light Yellow - Second - else: - colors.append('#FFB6C1') # Light Red - Third - return colors - -# Create figure with subplots -fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) - -# Prepare data -operations = df['Operation'] -x = np.arange(len(operations)) -width = 0.25 - -# Get implementation names -implementations = df.columns[1:] - -# Plot 1: Regular scale bar chart with color coding -for i, (idx, row) in enumerate(df.iterrows()): - colors = get_colors_by_rank(row) - for j, impl in enumerate(implementations): - ax1.bar(x[i] + (j-1)*width, row[impl], width, - color=colors[j], edgecolor='black', linewidth=0.5, - label=impl if i == 0 else "") - 
-ax1.set_xlabel('Operations', fontsize=12) -ax1.set_ylabel('Time (nanoseconds)', fontsize=12) -ax1.set_title('GCC 13 - s390x Benchmark Results', fontsize=14, fontweight='bold') -ax1.set_xticks(x) -ax1.set_xticklabels(operations, rotation=45, ha='right') -ax1.legend(loc='upper left') -ax1.grid(axis='y', alpha=0.3) - -# Add value labels on bars -for i, (idx, row) in enumerate(df.iterrows()): - for j, impl in enumerate(implementations): - ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}', - ha='center', va='bottom', fontsize=8, rotation=90) - -# Plot 2: Log scale for better visualization -for i, impl in enumerate(implementations): - bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5) - - # Color each bar based on its rank within operation - for j, bar in enumerate(bars): - operation_values = df.iloc[j, 1:].values - rank = np.argsort(operation_values).tolist().index(i) + 1 - if rank == 1: - bar.set_facecolor('#90EE90') - elif rank == 2: - bar.set_facecolor('#FFFFE0') - else: - bar.set_facecolor('#FFB6C1') - -ax2.set_xlabel('Operations', fontsize=12) -ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12) -ax2.set_title('GCC 13 - s390x Benchmark Results (Log Scale)', fontsize=14, fontweight='bold') -ax2.set_yscale('log') -ax2.set_xticks(x) -ax2.set_xticklabels(operations, rotation=45, ha='right') -ax2.legend(loc='upper left') -ax2.grid(axis='y', alpha=0.3, which='both') - -plt.tight_layout() -plt.savefig('s390x_benchmarks.png', dpi=300, bbox_inches='tight') -plt.show() - -# Create a normalized performance chart -fig3, ax3 = plt.subplots(figsize=(10, 6)) - -# Normalize data relative to __int128 -normalized_df = df.copy() -for col in implementations: - normalized_df[col] = df[col] / df['`__int128`'] - -# Plot normalized bars -for i, impl in enumerate(implementations): - if impl == '`__int128`': - continue # Skip since it's always 1.0 - bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width, - label=impl, 
edgecolor='black', linewidth=0.5) - - # Add value labels - for j, bar in enumerate(bars): - height = bar.get_height() - ax3.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.2f}x', ha='center', va='bottom', fontsize=9) - -# Add reference line at 1.0 -ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='`__int128` baseline') - -ax3.set_xlabel('Operations', fontsize=12) -ax3.set_ylabel('Relative Performance (vs __int128)', fontsize=12) -ax3.set_title('Relative Performance Comparison - s390x', fontsize=14, fontweight='bold') -ax3.set_xticks(x) -ax3.set_xticklabels(operations, rotation=45, ha='right') -ax3.legend() -ax3.grid(axis='y', alpha=0.3) - -# Add interpretation text -ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes, - fontsize=10, verticalalignment='top', style='italic') - -plt.tight_layout() -plt.savefig('s390x_relative_performance.png', dpi=300, bbox_inches='tight') -plt.show() - -# Generate summary statistics -print("\nPerformance Summary (x64):") -print("-" * 50) -for impl in implementations: - if impl == '__int128': - continue - avg_ratio = normalized_df[impl].mean() - print(f"{impl}: {avg_ratio:.2f}x average vs __int128") - -print("\nBest performer by operation:") -print("-" * 50) -for i, op in enumerate(operations): - row_data = df.iloc[i, 1:] - best_impl = row_data.idxmin() - best_time = row_data.min() - print(f"{op}: {best_impl} ({best_time:,} ns)") - diff --git a/doc/signed_plots_32bit.py b/doc/signed_plots_32bit.py deleted file mode 100644 index 73dbbdd2..00000000 --- a/doc/signed_plots_32bit.py +++ /dev/null @@ -1,154 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -""" -# Linux x86_32 -data = { - 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'int128_t': [9530060, 785799, 778881, 1148024, 10337258, 10438037], - 'boost::mp::int128_t': [12168353, 7777469, 8214089, 9477355, 22857709, 14848256] -} -""" - -# Linux ARM32 -data = { 
- 'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'], - 'int128_t': [6149439, 457850, 488321, 1793874, 17738614, 18064819], - 'boost::mp::int128_t': [6432579, 5669571, 7464427, 11410321, 38956122, 30144743] -} - -df = pd.DataFrame(data) - -# Function to determine color based on ranking -def get_colors_by_rank(row): - values = row[1:].values - ranks = np.argsort(values) + 1 - colors = [] - for rank in ranks: - if rank == 1: - colors.append('#90EE90') # Light Green - Best - elif rank == 2: - colors.append('#FFFFE0') # Light Yellow - Second - else: - colors.append('#FFB6C1') # Light Red - Third - return colors - -# Create figure with subplots -fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10)) - -# Prepare data -operations = df['Operation'] -x = np.arange(len(operations)) -width = 0.25 - -# Get implementation names -implementations = df.columns[1:] - -# Plot 1: Regular scale bar chart with color coding -for i, (idx, row) in enumerate(df.iterrows()): - colors = get_colors_by_rank(row) - for j, impl in enumerate(implementations): - ax1.bar(x[i] + (j-1)*width, row[impl], width, - color=colors[j], edgecolor='black', linewidth=0.5, - label=impl if i == 0 else "") - -ax1.set_xlabel('Operations', fontsize=12) -ax1.set_ylabel('Time (nanoseconds)', fontsize=12) -ax1.set_title('GCC 14 - ARM32 Benchmark Results', fontsize=14, fontweight='bold') -ax1.set_xticks(x) -ax1.set_xticklabels(operations, rotation=45, ha='right') -ax1.legend(loc='upper left') -ax1.grid(axis='y', alpha=0.3) - -# Add value labels on bars -for i, (idx, row) in enumerate(df.iterrows()): - for j, impl in enumerate(implementations): - ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}', - ha='center', va='bottom', fontsize=8, rotation=90) - -# Plot 2: Log scale for better visualization -for i, impl in enumerate(implementations): - bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5) - - # Color each bar based on its 
rank within operation - for j, bar in enumerate(bars): - operation_values = df.iloc[j, 1:].values - rank = np.argsort(operation_values).tolist().index(i) + 1 - if rank == 1: - bar.set_facecolor('#90EE90') - elif rank == 2: - bar.set_facecolor('#FFFFE0') - else: - bar.set_facecolor('#FFB6C1') - -ax2.set_xlabel('Operations', fontsize=12) -ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12) -ax2.set_title('GCC 14 - ARM32 Benchmark Results (Log Scale)', fontsize=14, fontweight='bold') -ax2.set_yscale('log') -ax2.set_xticks(x) -ax2.set_xticklabels(operations, rotation=45, ha='right') -ax2.legend(loc='upper left') -ax2.grid(axis='y', alpha=0.3, which='both') - -plt.tight_layout() -plt.savefig('ARM32_benchmarks.png', dpi=300, bbox_inches='tight') -plt.show() - -# Create a normalized performance chart -fig3, ax3 = plt.subplots(figsize=(10, 6)) - -# Normalize data relative to boost::mp::int128_t -normalized_df = df.copy() -for col in implementations: - normalized_df[col] = df[col] / df['boost::mp::int128_t'] - -# Plot normalized bars -for i, impl in enumerate(implementations): - if impl == 'boost::mp::int128_t': - continue # Skip since it's always 1.0 - bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width, - label=impl, edgecolor='black', linewidth=0.5) - - # Add value labels - for j, bar in enumerate(bars): - height = bar.get_height() - ax3.text(bar.get_x() + bar.get_width()/2., height, - f'{height:.2f}x', ha='center', va='bottom', fontsize=9) - -# Add reference line at 1.0 -ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='boost::mp::int128_t baseline') - -ax3.set_xlabel('Operations', fontsize=12) -ax3.set_ylabel('Relative Performance (vs boost::mp::int128_t)', fontsize=12) -ax3.set_title('Relative Performance Comparison - ARM32', fontsize=14, fontweight='bold') -ax3.set_xticks(x) -ax3.set_xticklabels(operations, rotation=45, ha='right') -ax3.legend() -ax3.grid(axis='y', alpha=0.3) - -# Add interpretation text -ax3.text(0.02, 0.98, 'Lower 
is better', transform=ax3.transAxes, - fontsize=10, verticalalignment='top', style='italic') - -plt.tight_layout() -plt.savefig('ARM32_relative_performance.png', dpi=300, bbox_inches='tight') -plt.show() - -# Generate summary statistics -print("\nPerformance Summary (x64):") -print("-" * 50) -for impl in implementations: - if impl == '__int128': - continue - avg_ratio = normalized_df[impl].mean() - print(f"{impl}: {avg_ratio:.2f}x average vs __int128") - -print("\nBest performer by operation:") -print("-" * 50) -for i, op in enumerate(operations): - row_data = df.iloc[i, 1:] - best_impl = row_data.idxmin() - best_time = row_data.min() - print(f"{op}: {best_impl} ({best_time:,} ns)") - diff --git a/include/boost/int128/cstdlib.hpp b/include/boost/int128/cstdlib.hpp index 2839ac39..b90eda39 100644 --- a/include/boost/int128/cstdlib.hpp +++ b/include/boost/int128/cstdlib.hpp @@ -61,15 +61,6 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr i128div_t div(const int12 return i128div_t{0, 0}; } - #if defined(BOOST_INT128_HAS_INT128) - - const auto builtin_x {static_cast(x)}; - const auto builtin_y {static_cast(y)}; - return i128div_t{static_cast(builtin_x / builtin_y), - static_cast(builtin_x % builtin_y)}; - - #else - const auto abs_lhs {static_cast(abs(x))}; const auto abs_rhs {static_cast(abs(y))}; @@ -78,19 +69,29 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr i128div_t div(const int12 return {0, x}; } - const auto unsigned_res {div(abs_lhs, abs_rhs)}; - const auto negative_quot {(x.high < 0) != (y.high < 0)}; const auto negative_rem {x.high < 0}; + #if defined(BOOST_INT128_HAS_INT128) + + if (abs_rhs.high != 0) + { + const auto builtin_x {static_cast(x)}; + const auto builtin_y {static_cast(y)}; + return i128div_t{static_cast(builtin_x / builtin_y), + static_cast(builtin_x % builtin_y)}; + } + + #endif + + const auto unsigned_res {div(abs_lhs, abs_rhs)}; + i128div_t res {static_cast(unsigned_res.quot), static_cast(unsigned_res.rem)}; res.quot = 
negative_quot ? -res.quot : res.quot; res.rem = negative_rem ? -res.rem : res.rem; return res; - - #endif } } // namespace int128 diff --git a/include/boost/int128/detail/common_div.hpp b/include/boost/int128/detail/common_div.hpp index 6ffe4bac..0237ead5 100644 --- a/include/boost/int128/detail/common_div.hpp +++ b/include/boost/int128/detail/common_div.hpp @@ -7,6 +7,7 @@ #include #include +#include #ifndef BOOST_INT128_BUILD_MODULE @@ -67,6 +68,242 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void half_word_div( quotient.low |= (remainder / rhs) & UINT32_MAX; } +// Portable 128-bit by 64-bit unsigned division producing a 64-bit quotient and remainder. +// This is the classic Hacker's Delight divlu (two 32-bit "digit" steps over 64-bit words). +// Precondition: u1 < d so the quotient is guaranteed to fit in 64 bits. It is constexpr-safe +// and serves as the fallback for udiv_2by1 on every target without a hardware 128/64 divide. +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t divlu(std::uint64_t u1, std::uint64_t u0, std::uint64_t d, std::uint64_t& r) noexcept +{ + constexpr std::uint64_t b {UINT64_C(1) << 32U}; // Number base (2^32) + + BOOST_INT128_ASSUME(u1 < d); // LCOV_EXCL_LINE + + // D.1: normalize so that the divisor's most significant bit is set + const auto s {countl_zero(d)}; + d <<= s; + + const auto vn1 {d >> 32U}; + const auto vn0 {d & UINT32_MAX}; + + // Shift the dividend left by s. The (64 - s) shift is undefined when s == 0, so guard it. + const auto un32 {s == 0 ? 
u1 : ((u1 << s) | (u0 >> (64 - s)))}; + const auto un10 {u0 << s}; + + const auto un1 {un10 >> 32U}; + const auto un0 {un10 & UINT32_MAX}; + + // First quotient digit + auto q1 {un32 / vn1}; + auto rhat {un32 - (q1 * vn1)}; + + while (q1 >= b || (q1 * vn0) > ((b * rhat) + un1)) + { + --q1; + rhat += vn1; + if (rhat >= b) + { + break; + } + } + + const auto un21 {(un32 * b) + un1 - (q1 * d)}; + + // Second quotient digit + auto q0 {un21 / vn1}; + rhat = un21 - (q0 * vn1); + + while (q0 >= b || (q0 * vn0) > ((b * rhat) + un0)) + { + --q0; + rhat += vn1; + if (rhat >= b) + { + break; + } + } + + // The remainder is shifted back down by the normalization amount + r = ((un21 * b) + un0 - (q0 * d)) >> s; + return (q1 * b) + q0; +} + +#if defined(BOOST_INT128_HAS_X86_64_DIVQ) + +// Inline asm cannot appear in a constexpr function body before C++20, so the x86-64 DIV +// instruction is wrapped in a non-constexpr helper that udiv_2by1 only calls at runtime. +BOOST_INT128_FORCE_INLINE std::uint64_t udiv_2by1_divq(const std::uint64_t u1, const std::uint64_t u0, const std::uint64_t d, std::uint64_t& r) noexcept +{ + std::uint64_t q {}; + __asm__("divq %[d]" : "=a"(q), "=d"(r) : [d] "r"(d), "a"(u0), "d"(u1) : "cc"); + return q; +} + +#endif // BOOST_INT128_HAS_X86_64_DIVQ + +// Divides the 128-bit value (u1:u0) by d, returning a 64-bit quotient and the true remainder. +// Precondition: u1 < d. Mirrors common_mul.hpp::umul: a hardware instruction at runtime where +// one exists, and the portable divlu in constexpr evaluation and everywhere else. 
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t udiv_2by1(const std::uint64_t u1, const std::uint64_t u0, const std::uint64_t d, std::uint64_t& r) noexcept +{ + BOOST_INT128_ASSUME(u1 < d); // LCOV_EXCL_LINE + + #if (defined(BOOST_INT128_HAS_X86_64_DIVQ) || (defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) + + if (!BOOST_INT128_IS_CONSTANT_EVALUATED(u1)) + { + #if defined(BOOST_INT128_HAS_X86_64_DIVQ) + + return udiv_2by1_divq(u1, u0, d, r); + + #else + + return _udiv128(u1, u0, d, &r); + + #endif + } + + #endif + + return divlu(u1, u0, d, r); +} + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable : 4127) // Pre c++17 the if constexpr remainder part will hit this +#endif + +// Divides the 128-bit value (uh:ul) by the 128-bit divisor (vh:vl) where vh != 0. Because the +// divisor is >= 2^64 the quotient is guaranteed to fit in a single 64-bit word, which is +// returned. When need_remainder is true the 128-bit remainder is written to (rem_hi:rem_lo). +// +// This is one normalized quotient digit (Knuth Algorithm D specialized to a 2-word divisor). +// The top-limb estimate qhat (reusing udiv_2by1, a hardware divq on x86-64) is bounded by +// Knuth Theorem B to q <= qhat <= q + 2; the D3 refinement against d0 tightens it to q <= qhat +// <= q + 1, and the conditional add-back then corrects the remaining off-by-one. 
+template +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t div3by2(const std::uint64_t uh, const std::uint64_t ul, + const std::uint64_t vh, const std::uint64_t vl, std::uint64_t& rem_hi, std::uint64_t& rem_lo) noexcept +{ + BOOST_INT128_ASSUME(vh != 0); // LCOV_EXCL_LINE + + // D.1: normalize so the divisor's most significant bit is set + const auto s {countl_zero(vh)}; + const auto cs {64 - s}; + + std::uint64_t d1 {}; + std::uint64_t d0 {}; + std::uint64_t u2 {}; + std::uint64_t u1 {}; + std::uint64_t u0 {}; + + if (s == 0) + { + d1 = vh; + d0 = vl; + u2 = 0; + u1 = uh; + u0 = ul; + } + else + { + d1 = (vh << s) | (vl >> cs); + d0 = vl << s; + u2 = uh >> cs; + u1 = (uh << s) | (ul >> cs); + u0 = ul << s; + } + + BOOST_INT128_ASSUME(u2 <= d1); // LCOV_EXCL_LINE + + // D.3: estimate the single quotient digit qhat = floor((u2:u1) / d1), clamped to 2^64 - 1. + // rhat is the remainder of that estimate. + std::uint64_t qhat {}; + std::uint64_t rhat {}; + bool rhat_overflow {false}; + if (u2 < d1) + { + qhat = udiv_2by1(u2, u1, d1, rhat); + } + else + { + // u2 == d1: floor((u2:u1)/d1) clamps to 2^64 - 1, leaving rhat == u1 + d1 (may carry). + qhat = UINT64_MAX; + rhat = u1 + d1; + rhat_overflow = rhat < u1; + } + + std::uint64_t qd0_hi {}; + auto qd0_lo {umul(qhat, d0, qd0_hi)}; + + // Refine qhat against d0 (Knuth D3). The top-limb estimate alone can exceed the true quotient + // by up to 2; this brings it down to at most one too large, which the add-back below corrects. + // At most two iterations run, and only while the running remainder rhat stays below 2^64. + if (!rhat_overflow) + { + while (qd0_hi > rhat || (qd0_hi == rhat && qd0_lo > u0)) + { + --qhat; + rhat += d1; + const auto rhat_carry {rhat < d1}; + qd0_lo = umul(qhat, d0, qd0_hi); + if (rhat_carry) + { + break; + } + } + } + + // D.4: multiply and subtract (u2:u1:u0) - qhat * (d1:d0). qd0 already holds qhat * d0. 
+ std::uint64_t qd1_hi {}; + const auto qd1_lo {umul(qhat, d1, qd1_hi)}; + + const auto p0 {qd0_lo}; + const auto p1 {qd0_hi + qd1_lo}; + const auto p2 {qd1_hi + static_cast(p1 < qd0_hi)}; + + const auto r0 {u0 - p0}; + const auto borrow0 {static_cast(u0 < p0)}; + const auto t1 {u1 - p1}; + auto r1 {t1 - borrow0}; + const auto borrow1 {static_cast(u1 < p1) + static_cast(t1 < borrow0)}; + + // D.5/D.6: if the top limb borrowed, qhat was one too large. Correct it and add the divisor + // back into the remainder. The probability of this branch is small. + auto r0_final {r0}; + if (BOOST_INT128_UNLIKELY((u2 < p2) || ((u2 - p2) < borrow1))) + { + --qhat; // LCOV_EXCL_LINE + const auto sum0 {r0 + d0}; // LCOV_EXCL_LINE + r0_final = sum0; // LCOV_EXCL_LINE + r1 = r1 + d1 + static_cast(sum0 < r0); // LCOV_EXCL_LINE + } + + BOOST_INT128_IF_CONSTEXPR (need_remainder) + { + if (s == 0) + { + rem_hi = r1; + rem_lo = r0_final; + } + else + { + rem_lo = (r0_final >> s) | (r1 << cs); + rem_hi = r1 >> s; + } + } + else + { + static_cast(rem_hi); + static_cast(rem_lo); + } + + return qhat; +} + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif + namespace impl { #if defined(_MSC_VER) @@ -267,218 +504,57 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T from_words(const return {static_cast(high), low}; } -#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 - -template -BOOST_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T divisor, T& remainder) -{ - using high_word_type = decltype(T{}.high); - - // Skip normalization if divisor is already large enough - // use direct division and intrinsic - // This is only possible in the unsigned case - BOOST_INT128_IF_CONSTEXPR (!std::numeric_limits::is_signed) - { - constexpr auto divisor_lower_bound{UINT64_MAX >> 1}; - if (divisor.high >= divisor_lower_bound) - { - T quotient{}; - - quotient.low = static_cast(dividend.high / divisor.high); - - std::uint64_t product0_high{}; - auto 
product0_low{_umul128(quotient.low, divisor.low, &product0_high)}; - - std::uint64_t product1_high{}; - auto product1_low{_umul128(quotient.low, static_cast(divisor.high), &product1_high)}; - - T product{}; - product.low = product0_low; - auto carry{BOOST_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast(&product.high))}; - product1_high += static_cast(carry); - - if (product1_high > 0 || product > dividend) - { - --quotient.low; - - // Recalculate with adjusted quotient - product0_low = _umul128(quotient.low, divisor.low, &product0_high); - product1_low = _umul128(quotient.low, divisor.high, &product1_high); - - product.low = product0_low; - carry = BOOST_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast(&product.high)); - product1_high += static_cast(carry); - } - - BOOST_INT128_IF_CONSTEXPR(needs_mod) - { - auto borrow{BOOST_INT128_SUB_BORROW(0, dividend.low, product.low, &remainder.low)}; - BOOST_INT128_SUB_BORROW(borrow, dividend.high, product.high, reinterpret_cast(&remainder.high)); - } - - return quotient; - } - } - - const auto shift_amount {countl_zero(static_cast(divisor.high))}; - divisor <<= shift_amount; - - auto high_digit {static_cast(shift_amount == 0 ? 0 : dividend.high >> (64 - shift_amount))}; - dividend <<= shift_amount; - - // Initial quotient estimate - T quotient {}; - const bool high_digit_gte_divisor {high_digit >= static_cast(divisor.high)}; - quotient.high = high_digit_gte_divisor ? 1 : 0; - std::uint64_t remainder_estimate {}; - - quotient.low = _udiv128(high_digit_gte_divisor ? 
high_digit - divisor.high : high_digit, - dividend.high, divisor.high, &remainder_estimate); - - // Bounded correction loop with early exit - // Typically 2 is the most number of corrections we need since this is only for 2x2 division - // Other cases have been filtered out well before we've made it this far - int correction_steps {}; - constexpr int max_corrections {2}; - - while (correction_steps < max_corrections) - { - T product{}; - product.low = _umul128(quotient.low, divisor.low, reinterpret_cast(&product.high)); - if (product <= T{static_cast(remainder_estimate), dividend.low}) - { - break; - } - - --quotient.low; - const auto sum {remainder_estimate + divisor.high}; - if (remainder_estimate > sum) - { - break; - } - remainder_estimate = sum; - - correction_steps++; - } - - // Final verification and adjustment - std::uint64_t product0_high{}; - auto product_low {_umul128(quotient.low, divisor.low, &product0_high)}; - auto borrow {BOOST_INT128_SUB_BORROW(0, dividend.low, product_low, ÷nd.low)}; - - std::uint64_t product1_high{}; - product_low = _umul128(quotient.low, divisor.high, &product1_high); - product1_high += static_cast(BOOST_INT128_ADD_CARRY(0, product_low, product0_high, &product_low)); - - borrow = BOOST_INT128_SUB_BORROW(borrow, static_cast(dividend.high), product_low, reinterpret_cast(÷nd.high)); - borrow = BOOST_INT128_SUB_BORROW(borrow, high_digit, product1_high, &high_digit); - quotient.low -= static_cast(borrow); - - BOOST_INT128_IF_CONSTEXPR (needs_mod) - { - if (borrow) - { - auto carry { BOOST_INT128_ADD_CARRY(0, dividend.low, divisor.low, ÷nd.low) }; - BOOST_INT128_ADD_CARRY(carry, static_cast(dividend.high), static_cast(divisor.high), reinterpret_cast(÷nd.high)); - } - - dividend >>= shift_amount; - remainder = dividend; - } - - return quotient; -} - -#endif - } // namespace impl // We only need to take the time to process the remainder in the modulo case // In the division case it is a waste of cycles +// +// 128/64 -> 128-bit quotient 
(and optional 64-bit remainder) by two-step long division. +// The leading 64/64 yields the high quotient word and a remainder r < rhs, which satisfies +// the udiv_2by1 precondition for the low quotient word. This covers every rhs (including +// rhs <= UINT32_MAX) through the single hardware-or-portable udiv_2by1 primitive. template BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient) noexcept { - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) - - if (!BOOST_INT128_IS_CONSTANT_EVALUATED(lhs)) - { - using high_word_type = decltype(T{}.high); - - quotient.high = static_cast(static_cast(lhs.high) / rhs); - auto remainder {static_cast(lhs.high) % rhs}; - quotient.low = _udiv128(remainder, lhs.low, rhs, &remainder); - return; - } - - #endif - - if (rhs <= UINT32_MAX) - { - half_word_div(lhs, static_cast(rhs), quotient); - } - else - { - std::uint32_t u[4] {}; - std::uint32_t v[2] {}; - std::uint32_t q[4] {}; + using high_word_type = decltype(T{}.high); - const auto m {impl::to_words(lhs, u)}; - const auto n {impl::to_words(rhs, v)}; + BOOST_INT128_ASSUME(rhs != 0); // LCOV_EXCL_LINE - impl::knuth_divide(u, m, v, n, q); + const auto u_high {static_cast(lhs.high)}; - quotient = impl::from_words(q); - } + quotient.high = static_cast(u_high / rhs); + auto r {u_high % rhs}; + quotient.low = udiv_2by1(r, lhs.low, rhs, r); } template BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient, T& remainder) noexcept { - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) - - if (!BOOST_INT128_IS_CONSTANT_EVALUATED(lhs)) - { - using high_word_type = decltype(T{}.high); - - quotient.high = static_cast(static_cast(lhs.high) / rhs); - remainder.low = 
static_cast(lhs.high) % rhs; - quotient.low = _udiv128(remainder.low, lhs.low, rhs, &remainder.low); - return; - } - - #endif + using high_word_type = decltype(T{}.high); - if (rhs <= UINT32_MAX) - { - half_word_div(lhs, static_cast(rhs), quotient, remainder); - } - else - { - std::uint32_t u[4] {}; - std::uint32_t v[2] {}; - std::uint32_t q[4] {}; + BOOST_INT128_ASSUME(rhs != 0); // LCOV_EXCL_LINE - const auto m {impl::to_words(lhs, u)}; - const auto n {impl::to_words(rhs, v)}; + const auto u_high {static_cast(lhs.high)}; - impl::knuth_divide(u, m, v, n, q); + quotient.high = static_cast(u_high / rhs); + auto r {u_high % rhs}; + quotient.low = udiv_2by1(r, lhs.low, rhs, r); - quotient = impl::from_words(q); - remainder = impl::from_words(u); - } + remainder.high = static_cast(0); + remainder.low = r; } template BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient, T& remainder) noexcept { - half_word_div(lhs, rhs, quotient, remainder); + one_word_div(lhs, static_cast(rhs), quotient, remainder); } template BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient) noexcept { - half_word_div(lhs, rhs, quotient); + one_word_div(lhs, static_cast(rhs), quotient); } #ifdef _MSC_VER @@ -492,62 +568,33 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T { BOOST_INT128_ASSUME(divisor != static_cast(0)); - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 - - BOOST_INT128_IF_CONSTEXPR(!std::numeric_limits::is_signed) - { - if (!BOOST_INT128_IS_CONSTANT_EVALUATED(dividend)) - { - T remainder{}; - return impl::div_mod_msvc(dividend, divisor, remainder); - } - } - - #endif - - std::uint32_t u[4]{}; - std::uint32_t v[4]{}; - std::uint32_t q[4]{}; - - const auto m{ impl::to_words(dividend, u) }; - const auto n{ impl::to_words(divisor, v) }; + using high_word_type = 
decltype(T{}.high); - impl::knuth_divide(u, m, v, n, q); + std::uint64_t rem_hi {}; + std::uint64_t rem_lo {}; - return impl::from_words(q); + const auto q {div3by2(static_cast(dividend.high), dividend.low, + static_cast(divisor.high), divisor.low, rem_hi, rem_lo)}; + return T{static_cast(0), q}; } template BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor, T& remainder) noexcept { BOOST_INT128_ASSUME(divisor != static_cast(0)); - - #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 - - BOOST_INT128_IF_CONSTEXPR(!std::numeric_limits::is_signed) - { - if (!BOOST_INT128_IS_CONSTANT_EVALUATED(dividend)) - { - return impl::div_mod_msvc(dividend, divisor, remainder); - } - } - - #endif - - std::uint32_t u[4]{}; - std::uint32_t v[4]{}; - std::uint32_t q[4]{}; + using high_word_type = decltype(T{}.high); - const auto m{ impl::to_words(dividend, u) }; - const auto n{ impl::to_words(divisor, v) }; + std::uint64_t rem_hi {}; + std::uint64_t rem_lo {}; - impl::knuth_divide(u, m, v, n, q); + const auto q {div3by2(static_cast(dividend.high), dividend.low, + static_cast(divisor.high), divisor.low, rem_hi, rem_lo)}; - remainder = impl::from_words(u); + remainder = T{static_cast(rem_hi), rem_lo}; - return impl::from_words(q); + return T{static_cast(0), q}; } #ifdef _MSC_VER diff --git a/include/boost/int128/detail/common_mul.hpp b/include/boost/int128/detail/common_mul.hpp index be26c763..e0c1a8e1 100644 --- a/include/boost/int128/detail/common_mul.hpp +++ b/include/boost/int128/detail/common_mul.hpp @@ -10,7 +10,6 @@ #ifndef BOOST_INT128_BUILD_MODULE #include -#include #endif @@ -18,85 +17,89 @@ namespace boost { namespace int128 { namespace detail { -// See: The Art of Computer Programming Volume 2 (Semi-numerical algorithms) section 4.3.1 -// Algorithm M: Multiplication of Non-negative integers -template -BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType 
knuth_multiply(const std::uint32_t (&u)[u_size], - const std::uint32_t (&v)[v_size]) noexcept +// High 64 bits of the 64x64 -> 128 product, computed with four 32-bit partial products +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t umulh_generic(const std::uint64_t a, const std::uint64_t b) noexcept { - using high_word_type = decltype(ReturnType{}.high); - - std::uint32_t w[u_size + v_size] {}; + const std::uint64_t a_lo {a & UINT32_MAX}; + const std::uint64_t a_hi {a >> 32U}; + const std::uint64_t b_lo {b & UINT32_MAX}; + const std::uint64_t b_hi {b >> 32U}; - // M.1 - for (std::size_t j {}; j < v_size; ++j) - { - // M.2 - if (v[j] == 0) - { - w[j + u_size] = 0; - continue; - } - - // M.3 - std::uint64_t t {}; - for (std::size_t i {}; i < u_size; ++i) - { - // M.4 - t += static_cast(u[i]) * v[j] + w[i + j]; - w[i + j] = static_cast(t); - t >>= 32u; - } - - // M.5 - w[j + u_size] = static_cast(t); - } + const std::uint64_t lo_lo {a_lo * b_lo}; + const std::uint64_t hi_lo {a_hi * b_lo}; + const std::uint64_t lo_hi {a_lo * b_hi}; + const std::uint64_t hi_hi {a_hi * b_hi}; - const auto low {static_cast(w[0]) | (static_cast(w[1]) << 32)}; - const auto high {static_cast(w[2]) | (static_cast(w[3]) << 32)}; + const std::uint64_t cross {(lo_lo >> 32U) + (hi_lo & UINT32_MAX) + (lo_hi & UINT32_MAX)}; - return {static_cast(high), low}; + return hi_hi + (hi_lo >> 32U) + (lo_hi >> 32U) + (cross >> 32U); } -template -BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const T& x, std::uint32_t (&words)[4]) noexcept +// Full 64x64 -> 128 product +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t umul(const std::uint64_t a, const std::uint64_t b, std::uint64_t& hi) noexcept { #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION - if (!BOOST_INT128_IS_CONSTANT_EVALUATED(x)) + if (!BOOST_INT128_IS_CONSTANT_EVALUATED(a)) { - std::memcpy(&words, &x, sizeof(T)); - return; + #if defined(BOOST_INT128_HAS_INT128) + + const 
detail::builtin_u128 product {static_cast(a) * static_cast(b)}; + hi = static_cast(product >> 64U); + return static_cast(product); + + #elif defined(_M_AMD64) && !defined(__GNUC__) && !defined(__CUDA_ARCH__) + + return _umul128(a, b, &hi); + + #elif defined(_M_ARM64) && !defined(__CUDA_ARCH__) + + hi = __umulh(a, b); + return a * b; + + #endif } #endif - words[0] = static_cast(x.low & UINT32_MAX); // LCOV_EXCL_LINE - words[1] = static_cast(x.low >> 32); // LCOV_EXCL_LINE - words[2] = static_cast(static_cast(x.high) & UINT32_MAX); // LCOV_EXCL_LINE - words[3] = static_cast(static_cast(x.high) >> 32); // LCOV_EXCL_LINE + hi = umulh_generic(a, b); + return a * b; } +// Low 128 bits of a 128x128 product +template +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType low_word_mul(const T& lhs, const T& rhs) noexcept +{ + using high_word_type = decltype(ReturnType{}.high); + + std::uint64_t result_high {}; + const std::uint64_t result_low {umul(lhs.low, rhs.low, result_high)}; + + result_high += lhs.low * static_cast(rhs.high); + result_high += static_cast(lhs.high) * rhs.low; + + return ReturnType{static_cast(result_high), result_low}; +} -BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint64_t x, std::uint32_t (&words)[2]) noexcept +// Low 128 bits of a 128x64 product +template +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType low_word_mul(const T& lhs, const std::uint64_t rhs) noexcept { - #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION + using high_word_type = decltype(ReturnType{}.high); - if (!BOOST_INT128_IS_CONSTANT_EVALUATED(x)) - { - std::memcpy(&words, &x, sizeof(std::uint64_t)); - return; - } + std::uint64_t result_high {}; + const std::uint64_t result_low {umul(lhs.low, rhs, result_high)}; - #endif + result_high += static_cast(lhs.high) * rhs; - words[0] = static_cast(x & UINT32_MAX); // LCOV_EXCL_LINE - words[1] = static_cast(x >> 32); // LCOV_EXCL_LINE + return 
ReturnType{static_cast(result_high), result_low}; } -BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint32_t x, std::uint32_t (&words)[1]) noexcept +// Low 128 bits of a 128x32 product +template +BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType low_word_mul(const T& lhs, const std::uint32_t rhs) noexcept { - words[0] = x; + return low_word_mul(lhs, static_cast(rhs)); } } // namespace detail diff --git a/include/boost/int128/detail/config.hpp b/include/boost/int128/detail/config.hpp index 28ad2df9..4a93c2c3 100644 --- a/include/boost/int128/detail/config.hpp +++ b/include/boost/int128/detail/config.hpp @@ -169,6 +169,11 @@ using builtin_u128 = std::_Unsigned128; #endif // Platform macros +// Hardware 128-bit by 64-bit unsigned division via the x86-64 DIV instruction +#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) && !defined(__CUDA_ARCH__) +# define BOOST_INT128_HAS_X86_64_DIVQ +#endif + // The builtin is only constexpr from clang-7 or GCC-10 #ifdef __has_builtin # if __has_builtin(__builtin_sub_overflow) && ((defined(__clang__) && __clang_major__ >= 7) || (defined(__GNUC__) && __GNUC__ >= 10)) diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp index 9fd11a3e..898ae962 100644 --- a/include/boost/int128/detail/int128_imp.hpp +++ b/include/boost/int128/detail/int128_imp.hpp @@ -2012,66 +2012,14 @@ BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator-=(const Integer rhs namespace detail { -BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t signed_shift_left_32(const std::uint64_t low) noexcept -{ - return {static_cast(low >> 32), low << 32}; -} - -BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t library_mul(const int128_t lhs, const int128_t rhs) noexcept -{ - const auto a {lhs.low >> 32U}; - const auto b {lhs.low & UINT32_MAX}; - const auto c {rhs.low >> 32U}; - const 
auto d {rhs.low & UINT32_MAX}; - - int128_t result { static_cast(static_cast(lhs.high) * rhs.low + lhs.low * static_cast(rhs.high) + a * c), b * d }; - result += signed_shift_left_32(a * d) + signed_shift_left_32(b * c); - - return result; -} - BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint64_t rhs) noexcept { - const auto low_res{lhs.low * rhs}; - - const auto a_lo{lhs.low & UINT32_MAX}; - const auto a_high{lhs.low >> 32U}; - const auto b_lo{rhs & UINT32_MAX}; - const auto b_high{rhs >> 32U}; - - const auto lo_lo{a_lo * b_lo}; - const auto lo_hi{a_lo * b_high}; - const auto hi_lo{a_high * b_lo}; - const auto hi_hi{a_high * b_high}; - - const auto mid{(lo_lo >> 32U) + (lo_hi & UINT32_MAX) + (hi_lo & UINT32_MAX)}; - - const auto carry{hi_hi + (lo_hi >> 32) + (hi_lo >> 32) + (mid >> 32)}; - - // Compute the high word in the unsigned domain so that the multiplication - // and addition wrap modulo 2^64. - const auto high_res{static_cast(static_cast(lhs.high) * rhs + carry)}; - - return {high_res, low_res}; + return low_word_mul(lhs, rhs); } BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint32_t rhs) noexcept { - const auto low_res{lhs.low * rhs}; - - const auto a_lo{lhs.low & UINT32_MAX}; - const auto a_hi{lhs.low >> 32U}; - - const auto lo_lo{a_lo * rhs}; - const auto hi_lo{a_hi * rhs}; - - const auto mid{(lo_lo >> 32U) + (hi_lo & UINT32_MAX)}; - - const auto carry{(hi_lo >> 32U) + (mid >> 32U)}; - - const auto high_res{static_cast(static_cast(lhs.high) * rhs + carry)}; - - return {high_res, low_res}; + return low_word_mul(lhs, rhs); } #if defined(_M_AMD64) && !defined(__GNUC__) @@ -2096,7 +2044,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu if (BOOST_INT128_IS_CONSTANT_EVALUATED(lhs)) { - return library_mul(lhs, rhs); + return low_word_mul(lhs, rhs); } else { @@ -2126,7 +2074,7 @@ 
BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu # else - return library_mul(lhs, rhs); + return low_word_mul(lhs, rhs); # endif @@ -2138,34 +2086,16 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs)) { - return library_mul(lhs, rhs); // LCOV_EXCL_LINE + return low_word_mul(lhs, rhs); // LCOV_EXCL_LINE } else { return msvc_amd64_mul(lhs, rhs); } - #elif (defined(_M_IX86) || defined(_M_ARM) || defined(__arm__)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) - - if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs)) - { - return library_mul(lhs, rhs); // LCOV_EXCL_LINE - } - else - { - std::uint32_t lhs_words[4] {}; - std::uint32_t rhs_words[4] {}; - - // Since in all likelihood this equates to memcpy we don't need to convert to non-negative integers and back - to_words(lhs, lhs_words); - to_words(rhs, rhs_words); - - return knuth_multiply(lhs_words, rhs_words); - } - #else - return library_mul(lhs, rhs); + return low_word_mul(lhs, rhs); #endif } @@ -2276,21 +2206,16 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const { return {0,0}; } - #if defined(BOOST_INT128_HAS_INT128) - - return static_cast(static_cast(lhs) / static_cast(rhs)); - #else - - int128_t quotient {}; const auto negative_res {(lhs.high < 0) != (rhs.high < 0)}; - if (abs_rhs.high != 0) - { - quotient = detail::knuth_div(abs_lhs, abs_rhs); - } - else + // Narrow fast path: when the divisor magnitude fits in 64 bits, divide the magnitudes with + // the hardware-accelerated one_word_div and reapply the sign. This reuses the abs values + // computed above and beats native signed division (the out-of-line __divti3) for this case. 
+ if (abs_rhs.high == 0) { + int128_t quotient {}; + if (abs_lhs.high == 0) { quotient = {0, abs_lhs.low / abs_rhs.low}; @@ -2299,9 +2224,19 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const { detail::one_word_div(abs_lhs, abs_rhs.low, quotient); } + + return negative_res ? -quotient : quotient; } + #if defined(BOOST_INT128_HAS_INT128) + + return static_cast(static_cast(lhs) / static_cast(rhs)); + + #else + + const auto quotient {detail::knuth_div(abs_lhs, abs_rhs)}; return negative_res ? -quotient : quotient; + #endif } @@ -2538,23 +2473,15 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const { return lhs; } - #if defined(BOOST_INT128_HAS_INT128) - else - { - return static_cast(static_cast(lhs) % static_cast(rhs)); - } - #else - const auto is_neg{lhs < 0}; - - int128_t remainder {}; + const auto is_neg {lhs < 0}; - if (abs_rhs.high != 0) - { - detail::knuth_div(abs_lhs, abs_rhs, remainder); - } - else + // Narrow fast path: when the divisor magnitude fits in 64 bits, take the remainder of the + // magnitudes with the hardware-accelerated one_word_div and reapply the dividend's sign. + if (abs_rhs.high == 0) { + int128_t remainder {}; + if (abs_lhs.high == 0) { remainder = int128_t{0, abs_lhs.low % abs_rhs.low}; @@ -2562,11 +2489,20 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const else { int128_t quotient {}; - detail::one_word_div(abs_lhs, abs_rhs.low, quotient, remainder); } + + return is_neg ? -remainder : remainder; } + #if defined(BOOST_INT128_HAS_INT128) + + return static_cast(static_cast(lhs) % static_cast(rhs)); + + #else + + int128_t remainder {}; + detail::knuth_div(abs_lhs, abs_rhs, remainder); return is_neg ? 
-remainder : remainder; #endif diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp index 1788433d..e1b8d73d 100644 --- a/include/boost/int128/detail/uint128_imp.hpp +++ b/include/boost/int128/detail/uint128_imp.hpp @@ -2271,11 +2271,9 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_m # endif - #elif (defined(__s390x__) || defined(__s390x__)) && defined(__GNUC__) - # define BOOST_INT128_HIDE_MUL - - return static_cast(static_cast(lhs) * static_cast(rhs)); - + // s390x intentionally falls through to the synthetic low_word_mul below. Casting to builtin_u128 + // makes GCC reconstruct the value through a vector-unit stack round-trip that is several times + // slower, and the memcpy path is unsafe for the narrow (scalar rhs) overloads on big-endian. #elif ((defined(_M_AMD64) && !defined(__GNUC__)) || defined(_M_ARM64)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION) if (!BOOST_INT128_IS_CONSTANT_EVALUATED(lhs)) @@ -2288,18 +2286,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_m // We need to hide this if we use a non-const eval method above to avoid a litany of cross-platform warnings #ifndef BOOST_INT128_HIDE_MUL - constexpr std::size_t rhs_words_needed {std::is_same::value ? 1 : - std::is_same::value ? 2 : - std::is_same::value ? 
4 : 0}; - - static_assert(rhs_words_needed != 0, "Must be 32, 64 or 128 bit unsigned integer"); - - std::uint32_t lhs_words[4] {}; - std::uint32_t rhs_words[rhs_words_needed] {}; - to_words(lhs, lhs_words); - to_words(rhs, rhs_words); - - return knuth_multiply(lhs_words, rhs_words); + return low_word_mul(lhs, rhs); #else #undef BOOST_INT128_HIDE_MUL @@ -2506,31 +2493,30 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, cons { return {0, 0}; } - #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__) - else - { - return static_cast(static_cast(lhs) / static_cast(rhs)); - } - #else - else if (rhs.high != 0U) - { - return detail::knuth_div(lhs, rhs); - } - else + + // A divisor that fits in 64 bits is handled by the hardware-accelerated narrow path. This + // beats the native 128/128 divide for this common case on every platform (it avoids the + // out-of-line __udivti3 call on GCC/Clang and uses divq / _udiv128 directly where present). 
+ if (rhs.high == 0U) { if (lhs.high == 0U) { return {0, lhs.low / rhs.low}; } - else - { - uint128_t quotient {}; - - detail::one_word_div(lhs, rhs.low, quotient); - return quotient; - } + uint128_t quotient {}; + detail::one_word_div(lhs, rhs.low, quotient); + return quotient; } + + #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__) + + return static_cast(static_cast(lhs) / static_cast(rhs)); + + #else + + return detail::knuth_div(lhs, rhs); + #endif } @@ -2665,38 +2651,36 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, cons { return {0, 0}; } - else if (rhs > lhs) + if (rhs > lhs) { return lhs; } - #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__) - else - { - return static_cast(static_cast(lhs) % static_cast(rhs)); - } - #else - else if (rhs.high != 0U) - { - uint128_t remainder {}; - detail::knuth_div(lhs, rhs, remainder); - return remainder; - } - else + + // A divisor that fits in 64 bits is handled by the hardware-accelerated narrow path, which + // beats the native 128/128 divide for this common case on every platform. 
+ if (rhs.high == 0U) { if (lhs.high == 0U) { return {0, lhs.low % rhs.low}; } - else - { - uint128_t quotient {}; - uint128_t remainder {}; - - detail::one_word_div(lhs, rhs.low, quotient, remainder); - return remainder; - } + uint128_t quotient {}; + uint128_t remainder {}; + detail::one_word_div(lhs, rhs.low, quotient, remainder); + return remainder; } + + #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__) + + return static_cast(static_cast(lhs) % static_cast(rhs)); + + #else + + uint128_t remainder {}; + detail::knuth_div(lhs, rhs, remainder); + return remainder; + #endif } diff --git a/test/Jamfile b/test/Jamfile index cae51584..2be537df 100644 --- a/test/Jamfile +++ b/test/Jamfile @@ -44,6 +44,13 @@ project : requirements clang:on gcc:on + # The b2 sanitizer features only add -fsanitize flags; define the macros the + # tests use to skip checks that intentionally exercise UB (e.g. shift tests). + on:UBSAN=1 + norecover:UBSAN=1 + on:ASAN=1 + norecover:ASAN=1 + [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ] ; @@ -88,6 +95,7 @@ run test_format.cpp ; run test_fmt_format.cpp ; run test_div.cpp ; +run test_div_primitives.cpp ; run test_num_digits.cpp ; run test_spaceship_operator.cpp ; diff --git a/test/benchmark_i128.cpp b/test/benchmark_i128.cpp index ba0a4f0d..4433a217 100644 --- a/test/benchmark_i128.cpp +++ b/test/benchmark_i128.cpp @@ -319,6 +319,36 @@ BOOST_INT128_NO_INLINE void test_two_element_operation(const std::vector& dat std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n"; } +// Benchmarks the narrow division overloads (128-bit divided by a 64-bit or 32-bit value), +// which exercise the hardware-accelerated one_word_div path rather than the full 128/128 divide. 
+template +BOOST_INT128_NO_INLINE void test_narrow_division(const std::vector& data_vec, const char* operation, const char* type) +{ + const auto t1 = std::chrono::steady_clock::now(); + std::int64_t s = 0; // discard variable + + for (std::size_t k {}; k < K; ++k) + { + for (std::size_t i {}; i < data_vec.size() - 1U; ++i) + { + if (HalfWord) + { + const auto divisor = static_cast(data_vec[i + 1]) | 1U; + s += static_cast(data_vec[i] / divisor); + } + else + { + const auto divisor = static_cast(data_vec[i + 1]) | UINT64_C(1); + s += static_cast(data_vec[i] / divisor); + } + } + } + + const auto t2 = std::chrono::steady_clock::now(); + + std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n"; +} + std::vector generate_shift_vector() { std::random_device rd; @@ -473,6 +503,32 @@ int main() #endif std::cerr << std::endl; + + #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128) + test_narrow_division(builtin_vector, "div64", "Builtin"); + #endif + + test_narrow_division(library_vector, "div64", "Library"); + test_narrow_division(mp_vector, "div64", "mp::i128"); + + #ifdef BOOST_INT128_BENCHMARK_ABSL + test_narrow_division(absl_vector, "div64", "absl::i128"); + #endif + + std::cerr << std::endl; + + #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128) + test_narrow_division(builtin_vector, "div32", "Builtin"); + #endif + + test_narrow_division(library_vector, "div32", "Library"); + test_narrow_division(mp_vector, "div32", "mp::i128"); + + #ifdef BOOST_INT128_BENCHMARK_ABSL + test_narrow_division(absl_vector, "div32", "absl::i128"); + #endif + + std::cerr << std::endl; } // Single word operations { diff --git a/test/benchmark_u128.cpp b/test/benchmark_u128.cpp index a8a88996..767a1bb5 100644 --- a/test/benchmark_u128.cpp +++ b/test/benchmark_u128.cpp @@ -337,6 +337,36 @@ BOOST_INT128_NO_INLINE void 
test_two_element_operation(const std::vector& dat std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n"; } +// Benchmarks the narrow division overloads (128-bit divided by a 64-bit or 32-bit value), +// which exercise the hardware-accelerated one_word_div path rather than the full 128/128 divide. +template +BOOST_INT128_NO_INLINE void test_narrow_division(const std::vector& data_vec, const char* operation, const char* type) +{ + const auto t1 = std::chrono::steady_clock::now(); + std::uint64_t s = 0; // discard variable + + for (std::size_t k {}; k < K; ++k) + { + for (std::size_t i {}; i < data_vec.size() - 1U; ++i) + { + if (HalfWord) + { + const auto divisor = static_cast(data_vec[i + 1]) | 1U; + s += static_cast(data_vec[i] / divisor); + } + else + { + const auto divisor = static_cast(data_vec[i + 1]) | UINT64_C(1); + s += static_cast(data_vec[i] / divisor); + } + } + } + + const auto t2 = std::chrono::steady_clock::now(); + + std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n"; +} + template BOOST_INT128_NO_INLINE void test_gcd(const std::vector& data_vec, const char* type) { @@ -586,6 +616,32 @@ int main() std::cerr << std::endl; + #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128) + test_narrow_division(builtin_vector, "div64", "Builtin"); + #endif + + test_narrow_division(library_vector, "div64", "Library"); + test_narrow_division(mp_vector, "div64", "mp::u128"); + + #ifdef BOOST_INT128_BENCHMARK_ABSL + test_narrow_division(absl_vector, "div64", "absl::u128"); + #endif + + std::cerr << std::endl; + + #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128) + test_narrow_division(builtin_vector, "div32", "Builtin"); + #endif + + test_narrow_division(library_vector, "div32", "Library"); + test_narrow_division(mp_vector, 
"div32", "mp::u128"); + + #ifdef BOOST_INT128_BENCHMARK_ABSL + test_narrow_division(absl_vector, "div32", "absl::u128"); + #endif + + std::cerr << std::endl; + #if (defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128)) && defined(BOOST_INT128_BENCHMARK_BUILTIN_GCD) //test_gcd(builtin_vector, "Builtin"); #endif diff --git a/test/test_ckd.cpp b/test/test_ckd.cpp index 35b2a84c..e79343cc 100644 --- a/test/test_ckd.cpp +++ b/test/test_ckd.cpp @@ -122,13 +122,13 @@ bool ref_std_mul_overflow(const A a, const B b, R* r) noexcept } template -void check_op(const T2 a, const T3 b, Ref ref_overflow, Ckd ckd_overflow) +void check_op(const T2 lhs, const T3 rhs, Ref ref_overflow, Ckd ckd_overflow) { T1 expected {}; - const bool expected_overflow {ref_overflow(a, b, &expected)}; + const bool expected_overflow {ref_overflow(lhs, rhs, &expected)}; T1 got {}; - const bool got_overflow {ckd_overflow(&got, a, b)}; + const bool got_overflow {ckd_overflow(&got, lhs, rhs)}; BOOST_TEST_EQ(got_overflow, expected_overflow); BOOST_TEST(got == expected); @@ -479,7 +479,7 @@ void test_mul_edges() // constexpr usability for all three operations. 
// -#if defined(__GNUC__) && __GNUC__ == 7 && !defined(__clang__) && !defined(__SIZEOF_INT128__) +#if defined(__GNUC__) && __GNUC__ <= 7 && !defined(__clang__) && !defined(__SIZEOF_INT128__) # define BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128 #endif @@ -515,12 +515,18 @@ constexpr int mul_value() return r; } -#ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128 constexpr bool mul_overflows_i128_min() { int128_t r {0}; return ckd_mul(&r, (std::numeric_limits::min)(), int128_t{-1}); } + +#ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128 + +// MSVC 14.1 warns of integral overflow +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4307) #endif void test_constexpr() @@ -530,11 +536,15 @@ void test_constexpr() static_assert(mul_overflows_int_max(), "INT_MAX * 2 overflows int"); static_assert(sub_value() == 2, "5 - 3 == 2"); static_assert(mul_value() == 42, "6 * 7 == 42"); -#ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128 static_assert(mul_overflows_i128_min(), "INT128_MIN * -1 overflows int128_t"); -#endif } +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +#endif + int main() { test_standard_oracle(); @@ -542,7 +552,10 @@ int main() test_add_edges(); test_sub_edges(); test_mul_edges(); + + #ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128 test_constexpr(); + #endif return boost::report_errors(); } diff --git a/test/test_consteval_funcs.cpp b/test/test_consteval_funcs.cpp index 02f0b760..64f1e79d 100644 --- a/test/test_consteval_funcs.cpp +++ b/test/test_consteval_funcs.cpp @@ -4,6 +4,12 @@ #include +// Only warns on MSVC 14.1 +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4307) +#endif + #if defined(__cpp_consteval) && __cpp_consteval >= 201811L # define BOOST_INT128_CONSTEVAL consteval #else diff --git a/test/test_div_primitives.cpp b/test/test_div_primitives.cpp new file mode 100644 index 00000000..694f0fef --- /dev/null +++ b/test/test_div_primitives.cpp @@ -0,0 +1,266 @@ +// Copyright 2025 Matt Borland +// Distributed under the Boost Software 
License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt + +// Validates the low-level division building blocks in detail/common_div.hpp: +// * udiv_2by1 / divlu : 128/64 -> 64-bit quotient + remainder +// * div3by2 : 128/128 (divisor >= 2^64) -> single 64-bit quotient + 128-bit remainder +// +// div3by2 is cross-checked against the independent 32-bit-limb Knuth Algorithm D +// (impl::knuth_divide), which is a completely separate implementation, so this check is valid +// on every platform. Where a native 128-bit integer exists it is also used as an oracle. + +#include +#include +#include +#include +#include +#include +#include + +using namespace boost::int128; + +static std::mt19937_64 rng(0xC0FFEEULL); +static std::uniform_int_distribution dist(0, UINT64_MAX); + +// Independent oracle for 128/128 division with divisor >= 2^64, using the 32-bit-limb +// Knuth Algorithm D that the library retains. Returns quotient and remainder. +static void knuth_oracle(const std::uint64_t uh, const std::uint64_t ul, + const std::uint64_t vh, const std::uint64_t vl, + uint128_t& quot, uint128_t& rem) +{ + const uint128_t u_val {uh, ul}; + const uint128_t v_val {vh, vl}; + + if (u_val < v_val) + { + quot = uint128_t{UINT64_C(0)}; + rem = u_val; + return; + } + + std::uint32_t u[4] {}; + std::uint32_t v[4] {}; + std::uint32_t q[4] {}; + + const auto m {detail::impl::to_words(u_val, u)}; + const auto n {detail::impl::to_words(v_val, v)}; + + detail::impl::knuth_divide(u, m, v, n, q); + + quot = detail::impl::from_words(q); + rem = detail::impl::from_words(u); +} + +static void check_div3by2(const std::uint64_t uh, const std::uint64_t ul, + const std::uint64_t vh, const std::uint64_t vl) +{ + std::uint64_t rem_hi {}; + std::uint64_t rem_lo {}; + const auto q {detail::div3by2(uh, ul, vh, vl, rem_hi, rem_lo)}; + + uint128_t expected_q {}; + uint128_t expected_r {}; + knuth_oracle(uh, ul, vh, vl, expected_q, expected_r); + + // The quotient always fits in 64 bits when the divisor 
is >= 2^64 + BOOST_TEST_EQ(expected_q.high, UINT64_C(0)); + BOOST_TEST_EQ(q, expected_q.low); + BOOST_TEST_EQ(uint128_t(rem_hi, rem_lo), expected_r); +} + +static void test_div3by2_random() +{ + constexpr int iters {2000000}; + for (int i {}; i < iters; ++i) + { + const auto uh {dist(rng)}; + const auto ul {dist(rng)}; + auto vh {dist(rng)}; + const auto vl {dist(rng)}; + + if (vh == 0) + { + vh = 1; // divisor must be >= 2^64 for div3by2 + } + + check_div3by2(uh, ul, vh, vl); + } +} + +static void test_div3by2_edges() +{ + const std::uint64_t test_words[] { + UINT64_C(0), UINT64_C(1), UINT64_C(2), UINT64_C(3), + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0x8000000000000000), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x0123456789ABCDEF), + UINT64_C(0xFFFFFFFF), UINT64_C(0x100000000) + }; + + for (const auto vh : test_words) + { + if (vh == 0) + { + continue; // div3by2 requires vh != 0 + } + + for (const auto vl : test_words) + { + for (const auto uh : test_words) + { + for (const auto ul : test_words) + { + check_div3by2(uh, ul, vh, vl); + } + } + } + } + + // abs(INT128_MIN) == 2^127 as a dividend, divided by a range of >= 2^64 divisors + for (const auto vl : test_words) + { + check_div3by2(UINT64_C(0x8000000000000000), UINT64_C(0), UINT64_C(0x8000000000000001), vl); + check_div3by2(UINT64_C(0x8000000000000000), UINT64_C(0), UINT64_C(0xFFFFFFFFFFFFFFFF), vl); + } +} + +#if defined(BOOST_INT128_HAS_INT128) + +// Construct dividends of the exact form V*q + offset to stress the correction / add-back path, +// where the single-digit quotient estimate is most likely to be one too large. 
+static void test_div3by2_boundary() +{ + const std::uint64_t div_hi[] { + UINT64_C(1), UINT64_C(0x8000000000000000), UINT64_C(0xFFFFFFFFFFFFFFFF), + UINT64_C(0x0123456789ABCDEF), UINT64_C(0x00000000FFFFFFFF) + }; + const std::uint64_t div_lo[] { + UINT64_C(0), UINT64_C(1), UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xDEADBEEFCAFEBABE) + }; + const std::uint64_t quotients[] { + UINT64_C(1), UINT64_C(2), UINT64_C(7), UINT64_C(0xFFFFFFFF), + UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x8000000000000000), UINT64_C(0x123456789) + }; + + for (const auto vh : div_hi) + { + for (const auto vl : div_lo) + { + const detail::builtin_u128 v_val {(static_cast(vh) << 64) | vl}; + + for (const auto q : quotients) + { + // offsets just below the divisor are where the maximum remainder lives + const detail::builtin_u128 offsets[] { + detail::builtin_u128{0}, detail::builtin_u128{1}, v_val - 1, v_val >> 1 + }; + + for (const auto off : offsets) + { + const detail::builtin_u128 prod {v_val * q}; + + // skip combinations where V*q already overflows 128 bits + if (q != 0 && (prod / q) != v_val) + { + continue; + } + + const detail::builtin_u128 u_val {prod + off}; + if (u_val < prod) + { + continue; // offset pushed us past 2^128 + } + + const auto uh {static_cast(u_val >> 64)}; + const auto ul {static_cast(u_val)}; + + std::uint64_t rem_hi {}; + std::uint64_t rem_lo {}; + const auto got_q {detail::div3by2(uh, ul, vh, vl, rem_hi, rem_lo)}; + + BOOST_TEST_EQ(got_q, static_cast(u_val / v_val)); + BOOST_TEST_EQ(uint128_t(rem_hi, rem_lo), static_cast(u_val % v_val)); + } + } + } + } +} + +static void check_2by1(const std::uint64_t u1, const std::uint64_t u0, const std::uint64_t d) +{ + const detail::builtin_u128 full {(static_cast(u1) << 64) | u0}; + const auto expected_q {static_cast(full / d)}; + const auto expected_r {static_cast(full % d)}; + + std::uint64_t r {}; + const auto q {detail::udiv_2by1(u1, u0, d, r)}; + BOOST_TEST_EQ(q, expected_q); + BOOST_TEST_EQ(r, expected_r); + + // divlu is 
the portable fallback that udiv_2by1 uses off x86-64 / MSVC; test it directly too + std::uint64_t r2 {}; + const auto q2 {detail::divlu(u1, u0, d, r2)}; + BOOST_TEST_EQ(q2, expected_q); + BOOST_TEST_EQ(r2, expected_r); +} + +static void test_udiv_2by1_random() +{ + constexpr int iters {2000000}; + for (int i {}; i < iters; ++i) + { + const auto u0 {dist(rng)}; + auto d {dist(rng)}; + if (d == 0) + { + d = 1; + } + const auto u1 {dist(rng) % d}; // precondition: u1 < d + + check_2by1(u1, u0, d); + } +} + +static void test_udiv_2by1_edges() +{ + const std::uint64_t divisors[] { + UINT64_C(1), UINT64_C(2), UINT64_C(3), UINT64_C(10), + UINT64_C(0xFFFFFFFF), UINT64_C(0x100000000), UINT64_C(0x80000000), + UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0x8000000000000000), UINT64_C(0xFFFFFFFFFFFFFFFF) + }; + const std::uint64_t lows[] { + UINT64_C(0), UINT64_C(1), UINT64_C(0x80000000), + UINT64_C(0xFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF) + }; + + for (const auto d : divisors) + { + for (const auto u0 : lows) + { + check_2by1(UINT64_C(0), u0, d); // u1 == 0 + check_2by1(d - 1, u0, d); // u1 == d - 1 (maximal) + if (d > 1) + { + check_2by1(d / 2, u0, d); + } + } + } +} + +#endif // BOOST_INT128_HAS_INT128 + +int main() +{ + test_div3by2_random(); + test_div3by2_edges(); + + #if defined(BOOST_INT128_HAS_INT128) + + test_div3by2_boundary(); + test_udiv_2by1_random(); + test_udiv_2by1_edges(); + + #endif + + return boost::report_errors(); +} diff --git a/test/test_i128.cpp b/test/test_i128.cpp index 254b497e..5dd982ff 100644 --- a/test/test_i128.cpp +++ b/test/test_i128.cpp @@ -77,7 +77,16 @@ IntType get_root_max() template IntType get_root_min() { - return static_cast(std::sqrt(std::numeric_limits::min())); + // numeric_limits::min() is negative for signed IntType, so sqrt() would be + // NaN and the cast UB; use the negative of the positive root instead. 
+ BOOST_INT128_IF_CONSTEXPR (std::is_signed::value) + { + return static_cast(-get_root_max()); + } + else + { + return static_cast(0); + } } #include diff --git a/test/test_u128.cpp b/test/test_u128.cpp index 092bacdd..925f2c73 100644 --- a/test/test_u128.cpp +++ b/test/test_u128.cpp @@ -67,7 +67,16 @@ T get_root_max() template T get_root_min() { - return static_cast(std::sqrt(std::numeric_limits::min())); + // numeric_limits::min() is negative for signed T, so sqrt() would be NaN + // and the cast UB; use the negative of the positive root instead. + BOOST_INT128_IF_CONSTEXPR (std::is_signed::value) + { + return static_cast(-get_root_max()); + } + else + { + return static_cast(0); + } } #include diff --git a/test/test_u128_no_sign_conv.cpp b/test/test_u128_no_sign_conv.cpp index 55d08a31..729161ac 100644 --- a/test/test_u128_no_sign_conv.cpp +++ b/test/test_u128_no_sign_conv.cpp @@ -66,7 +66,16 @@ T get_root_max() template T get_root_min() { - return static_cast(std::sqrt(std::numeric_limits::min())); + // numeric_limits::min() is negative for signed T, so sqrt() would be NaN + // and the cast UB; use the negative of the positive root instead. + BOOST_INT128_IF_CONSTEXPR (std::is_signed::value) + { + return static_cast(-get_root_max()); + } + else + { + return static_cast(0); + } } #include diff --git a/test/test_x64_msvc_div.cpp b/test/test_x64_msvc_div.cpp index 1d3f1e73..40cbb70c 100644 --- a/test/test_x64_msvc_div.cpp +++ b/test/test_x64_msvc_div.cpp @@ -2,93 +2,118 @@ // Distributed under the Boost Software License, Version 1.0. // https://www.boost.org/LICENSE_1_0.txt +// On MSVC x64 the division building blocks use the hardware intrinsics _udiv128 (via udiv_2by1) +// and _umul128 (via umul, inside div3by2). This validates that intrinsic path against +// intrinsic-free references on the same inputs: the portable Hacker's Delight divlu, and the +// 32-bit-limb Knuth Algorithm D. 
Those references are in turn checked against a native 128-bit +// integer on the platforms that have one (see test_div_primitives.cpp), so agreement here pins +// down the MSVC intrinsic wiring specifically. + #include +#include #include #include #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 +using boost::int128::uint128_t; + static std::mt19937_64 rng{42}; -static constexpr std::size_t N{1024U}; +static constexpr std::size_t N{4096U}; static std::uniform_int_distribution dist{UINT64_C(0), UINT64_MAX}; -static std::uniform_int_distribution dist32{UINT32_C(0), UINT32_MAX}; -void test_two_words() +// Independent reference for 128/128 (divisor >= 2^64): the 32-bit-limb Knuth Algorithm D, which +// uses no 64-bit-divide or multiply intrinsics. +static void knuth_oracle(const std::uint64_t uh, const std::uint64_t ul, + const std::uint64_t vh, const std::uint64_t vl, + uint128_t& quot, uint128_t& rem) { - for (std::size_t i{}; i < N; ++i) - { - boost::int128::uint128_t lhs{dist(rng), dist(rng)}; - boost::int128::uint128_t rhs{dist(rng), dist(rng)}; + const uint128_t u_val{uh, ul}; + const uint128_t v_val{vh, vl}; - // Guarantee lhs is greater than rhs - if (lhs < rhs) - { - std::swap(lhs, rhs); - } + if (u_val < v_val) + { + quot = uint128_t{UINT64_C(0)}; + rem = u_val; + return; + } + std::uint32_t u[4]{}; + std::uint32_t v[4]{}; + std::uint32_t q[4]{}; - boost::int128::uint128_t remainder{}; - const auto quotient{boost::int128::detail::impl::div_mod_msvc(lhs, rhs, remainder)}; + const auto m{boost::int128::detail::impl::to_words(u_val, u)}; + const auto n{boost::int128::detail::impl::to_words(v_val, v)}; - boost::int128::uint128_t knuth_remainder{}; - const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)}; + boost::int128::detail::impl::knuth_divide(u, m, v, n, q); - BOOST_TEST_EQ(remainder, knuth_remainder); - BOOST_TEST_EQ(quotient, knuth_quotient); - } + quot = 
boost::int128::detail::impl::from_words(q); + rem = boost::int128::detail::impl::from_words(u); } -void test_four_by_three() +// _udiv128 (udiv_2by1) versus the portable divlu, for 128/64 -> 64. +void test_udiv_2by1() { for (std::size_t i{}; i < N; ++i) { - boost::int128::uint128_t lhs{dist(rng), dist(rng)}; - boost::int128::uint128_t rhs{dist32(rng), dist(rng)}; + const auto u0{dist(rng)}; + auto d{dist(rng)}; + if (d == 0) + { + d = 1; + } + const auto u1{dist(rng) % d}; // precondition u1 < d - boost::int128::uint128_t remainder{}; - const auto quotient{boost::int128::detail::impl::div_mod_msvc(lhs, rhs, remainder)}; + std::uint64_t r_intrin{}; + const auto q_intrin{boost::int128::detail::udiv_2by1(u1, u0, d, r_intrin)}; - boost::int128::uint128_t knuth_remainder{}; - const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)}; + std::uint64_t r_soft{}; + const auto q_soft{boost::int128::detail::divlu(u1, u0, d, r_soft)}; - BOOST_TEST_EQ(remainder, knuth_remainder); - BOOST_TEST_EQ(quotient, knuth_quotient); + BOOST_TEST_EQ(q_intrin, q_soft); + BOOST_TEST_EQ(r_intrin, r_soft); } +} - // The biggest gap we can have between 2 word unsigned values - { - constexpr auto lhs{(std::numeric_limits::max)()}; - constexpr boost::int128::uint128_t rhs{1,0}; +static void check_div3by2(const std::uint64_t uh, const std::uint64_t ul, + const std::uint64_t vh, const std::uint64_t vl) +{ + std::uint64_t rh{}; + std::uint64_t rl{}; + const auto q{boost::int128::detail::div3by2(uh, ul, vh, vl, rh, rl)}; - boost::int128::uint128_t remainder{}; - const auto quotient{boost::int128::detail::impl::div_mod_msvc(lhs, rhs, remainder)}; + uint128_t expected_q{}; + uint128_t expected_r{}; + knuth_oracle(uh, ul, vh, vl, expected_q, expected_r); - boost::int128::uint128_t knuth_remainder{}; - const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)}; + BOOST_TEST_EQ(expected_q.high, UINT64_C(0)); + BOOST_TEST_EQ(q, expected_q.low); + 
BOOST_TEST_EQ(uint128_t(rh, rl), expected_r); +} - BOOST_TEST_EQ(remainder, knuth_remainder); - BOOST_TEST_EQ(quotient, knuth_quotient); - } - // And again for signed +// _udiv128 + _umul128 (div3by2) versus the 32-bit-limb Knuth reference, for 128/128 -> 64. +void test_div3by2() +{ + for (std::size_t i{}; i < N; ++i) { - constexpr auto lhs{static_cast((std::numeric_limits::max)())}; - constexpr boost::int128::uint128_t rhs{1,0}; - - boost::int128::uint128_t remainder{}; - const auto quotient{boost::int128::detail::impl::div_mod_msvc(lhs, rhs, remainder)}; - - boost::int128::uint128_t knuth_remainder{}; - const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)}; - - BOOST_TEST_EQ(remainder, knuth_remainder); - BOOST_TEST_EQ(quotient, knuth_quotient); + auto vh{dist(rng)}; + if (vh == 0) + { + vh = 1; // div3by2 requires divisor >= 2^64 + } + check_div3by2(dist(rng), dist(rng), vh, dist(rng)); } + + // The widest gap between two-word unsigned values, and the signed-max case the original test + // exercised, both dividing by exactly 2^64. + check_div3by2(UINT64_MAX, UINT64_MAX, UINT64_C(1), UINT64_C(0)); + check_div3by2(UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_MAX, UINT64_C(1), UINT64_C(0)); } int main() { - test_two_words(); - test_four_by_three(); + test_udiv_2by1(); + test_div3by2(); return boost::report_errors(); } @@ -100,4 +125,4 @@ int main() return 0; } -#endif +#endif