diff --git a/.drone.jsonnet b/.drone.jsonnet
index d7b0c56f..6e443018 100644
--- a/.drone.jsonnet
+++ b/.drone.jsonnet
@@ -24,9 +24,8 @@ local linux_pipeline(name, image, environment, packages = "", sources = [], arch
         os: "linux",
         arch: arch
     },
-    clone:
-    {
-        retries: 5,
+    "clone": {
+       "retries": 5
     },
     steps:
     [
@@ -38,7 +37,9 @@ local linux_pipeline(name, image, environment, packages = "", sources = [], arch
             commands:
             [
                 'set -e',
-                'wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add -',
+                'echo $DRONE_STAGE_MACHINE',
+                'uname -a',
+                'curl -sSL --retry 5 https://apt.llvm.org/llvm-snapshot.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/llvm-snapshot.gpg',
             ] +
             (if sources != [] then [ ('apt-add-repository "' + source + '"') for source in sources ] else []) +
             (if packages != "" then [ 'apt-get update', 'apt-get -y install ' + packages ] else []) +
@@ -268,6 +269,34 @@ local windows_pipeline(name, image, environment, arch = "amd64") =
         "g++-14-multilib",
     ),
 
+    linux_pipeline(
+        "Linux 26.04 GCC 15 32",
+        "cppalliance/droneubuntu2604:1",
+        { TOOLSET: 'gcc', COMPILER: 'g++-15', CXXSTD: '03,11,14,17,20,23', ADDRMD: '32', CXXFLAGS: "-fexcess-precision=fast" },
+        "g++-15-multilib",
+    ),
+
+    linux_pipeline(
+        "Linux 26.04 GCC 15 64",
+        "cppalliance/droneubuntu2604:1",
+        { TOOLSET: 'gcc', COMPILER: 'g++-15', CXXSTD: '03,11,14,17,20,23', ADDRMD: '64', CXXFLAGS: "-fexcess-precision=fast" },
+        "g++-15-multilib",
+    ),
+
+    linux_pipeline(
+        "Linux 26.04 GCC 16 32",
+        "cppalliance/droneubuntu2604:1",
+        { TOOLSET: 'gcc', COMPILER: 'g++-16', CXXSTD: '03,11,14,17,20,23', ADDRMD: '32', CXXFLAGS: "-fexcess-precision=fast" },
+        "g++-16-multilib",
+    ),
+
+    linux_pipeline(
+        "Linux 26.04 GCC 16 64",
+        "cppalliance/droneubuntu2604:1",
+        { TOOLSET: 'gcc', COMPILER: 'g++-16', CXXSTD: '03,11,14,17,20,23', ADDRMD: '64', CXXFLAGS: "-fexcess-precision=fast" },
+        "g++-16-multilib libabsl-dev",
+    ),
+
     linux_pipeline(
         "Linux 18.04 Clang 5.0",
         "cppalliance/droneubuntu1804:1",
@@ -387,17 +416,27 @@ local windows_pipeline(name, image, environment, arch = "amd64") =
     ),
 
     linux_pipeline(
-        "Linux 24.04 Clang 20 ASAN",
+        "Linux 24.04 Clang 21",
         "cppalliance/droneubuntu2404:1",
-        { TOOLSET: 'clang', COMPILER: 'clang++-20', CXXSTD: '03,11,14,17,20,23,2c' } + asan,
-        "clang-20",
-        ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-20 main"],
+        { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' },
+        "clang-21",
+        ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"],
+    ),
+
+    linux_pipeline(
+        "Linux 24.04 Clang 21 UBSAN",
+        "cppalliance/droneubuntu2404:1",
+        { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' } + ubsan,
+        "clang-21",
+        ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"],
     ),
 
-    macos_pipeline(
-        "MacOS Xcode 14.3.1",
-        { TOOLSET: 'clang', COMPILER: 'clang++', CXXSTD: '03,11,14,17,20,2b' } + asan,
-        xcode_version = "14.3.1", osx_version = "sonoma", arch = "arm64",
+    linux_pipeline(
+        "Linux 24.04 Clang 21 ASAN",
+        "cppalliance/droneubuntu2404:1",
+        { TOOLSET: 'clang', COMPILER: 'clang++-21', CXXSTD: '17,20,2b' } + asan,
+        "clang-21",
+        ["deb http://apt.llvm.org/noble/ llvm-toolchain-noble-21 main"],
     ),
 
     windows_pipeline(
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png
index dc6a7617..c3bcc0cb 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png
index 83c8c583..f415d0f3 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM32_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png
index 24535136..9763188a 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png
index 53de90ef..a0850e12 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/ARM64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png
index fcda4936..be27885c 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png
index d3882063..9bc0fa08 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/ppc64le_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png
index 4c6e4a0b..a4acb9e8 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/s390x_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png
index 6e4afd1f..48b13255 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/s390x_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png
index 04006510..a617985c 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/x64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png
index b0c804dc..ff8ea077 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/x64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png
index 21d31774..ba00ac99 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/linux/x86_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png
index 426e25e2..070f0ec9 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/linux/x86_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png
index 7c489191..a87c426d 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png
index 623bb465..19eb0bd6 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/macos/ARM64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/macos/x64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/macos/x64_benchmarks.png
deleted file mode 100644
index e377a803..00000000
Binary files a/doc/modules/ROOT/images/i128_graphs/macos/x64_benchmarks.png and /dev/null differ
diff --git a/doc/modules/ROOT/images/i128_graphs/macos/x64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/macos/x64_relative_performance.png
deleted file mode 100644
index f33de2ca..00000000
Binary files a/doc/modules/ROOT/images/i128_graphs/macos/x64_relative_performance.png and /dev/null differ
diff --git a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png
index 136cbe9e..1771b3af 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png
index 02fb8df2..28156d8b 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/windows/ARM64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png
index 811ed34b..d12d4ad9 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/windows/x64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png
index 6d8d4b7b..44cab7da 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/windows/x64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png b/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png
index f267154e..f6061bff 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png and b/doc/modules/ROOT/images/i128_graphs/windows/x86_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png b/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png
index aadf4d5f..7b05c54e 100644
Binary files a/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png and b/doc/modules/ROOT/images/i128_graphs/windows/x86_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png
index 35b756ef..656c35ad 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png
index ab77fb2b..43f03412 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM32_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png
index 7144e238..60353279 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png
index 706a4de8..c86d7035 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/ARM64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png
index ad886c80..a2142d97 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png
index b3b87b3b..6950202b 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/ppc64le_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png
index e99ab249..1f8be96e 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/s390x_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png
index 08ed1a3c..db03e704 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/s390x_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png
index 5d6194f9..9dfc748e 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/x64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png
index ed9cbc71..eb200f34 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/x64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png
index d3567a5e..93b48c97 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/linux/x86_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png
index 3780492b..5b498e0e 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/linux/x86_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png
index 989c040f..756dc31a 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png
index 15f49776..36047908 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/macos/ARM64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/macos/x64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/macos/x64_benchmarks.png
deleted file mode 100644
index fd5c43e1..00000000
Binary files a/doc/modules/ROOT/images/u128_graphs/macos/x64_benchmarks.png and /dev/null differ
diff --git a/doc/modules/ROOT/images/u128_graphs/macos/x64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/macos/x64_relative_performance.png
deleted file mode 100644
index f8123403..00000000
Binary files a/doc/modules/ROOT/images/u128_graphs/macos/x64_relative_performance.png and /dev/null differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/ARM64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_benchmarks.png
new file mode 100644
index 00000000..0ccdcf58
Binary files /dev/null and b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/ARM64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_relative_performance.png
new file mode 100644
index 00000000..75ef018b
Binary files /dev/null and b/doc/modules/ROOT/images/u128_graphs/windows/ARM64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/arm64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/arm64_benchmarks.png
deleted file mode 100644
index 7f6b0ff7..00000000
Binary files a/doc/modules/ROOT/images/u128_graphs/windows/arm64_benchmarks.png and /dev/null differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/arm64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/arm64_relative_performance.png
deleted file mode 100644
index 3338a211..00000000
Binary files a/doc/modules/ROOT/images/u128_graphs/windows/arm64_relative_performance.png and /dev/null differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png
index 9c6fba5b..aa3d9c30 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/windows/x64_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png
index 514ce6f3..5dc1c090 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/windows/x64_relative_performance.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png b/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png
index 1841898b..038ff287 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png and b/doc/modules/ROOT/images/u128_graphs/windows/x86_benchmarks.png differ
diff --git a/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png b/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png
index 9352658a..2446939c 100644
Binary files a/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png and b/doc/modules/ROOT/images/u128_graphs/windows/x86_relative_performance.png differ
diff --git a/doc/modules/ROOT/pages/i128_benchmarks.adoc b/doc/modules/ROOT/pages/i128_benchmarks.adoc
index 915e1be5..af9b1f3c 100644
--- a/doc/modules/ROOT/pages/i128_benchmarks.adoc
+++ b/doc/modules/ROOT/pages/i128_benchmarks.adoc
@@ -24,12 +24,12 @@ On MSVC platforms we use as reference `std::_Signed128` from the header `<__msvc
 |===
 | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128`
 
-| Comparisons | 879535 | 748787 | 2210502 | 741269
-| Addition | 92165 | 92441 | 283528 | 92323
-| Subtraction | 92514 | 88390 | 668953 | 90394
-| Multiplication | 115727 | 90897 | 312723 | 89558
-| Division | 1234838 | 1352795 | 1320695 | 1200439
-| Modulo | 1193529 | 1256687 | 1287093 | 1293439
+| Comparisons | 2232997 | 1970941 | 5478483 | 1944089
+| Addition | 244246 | 292081 | 650160 | 227720
+| Subtraction | 220957 | 196953 | 1625774 | 315611
+| Multiplication | 433431 | 321168 | 1595688 | 304069
+| Division | 4462364 | 4983165 | 4992819 | 4986970
+| Modulo | 4803576 | 5257406 | 4988844 | 5081814
 |===
 
 ////
@@ -44,12 +44,12 @@ image::i128_graphs/linux/x64_relative_performance.png[x64 Relative Performance,
 |===
 | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128`
 
-| Comparisons | 3495621 | 2279914 | 5910287 | 3749448
-| Addition | 191514 | 133319 | 566860 | 164848
-| Subtraction | 131380 | 193984 | 1066509 | 193467
-| Multiplication | 236071 | 234594 | 864526 | 237676
-| Division | 2412757 | 2434752 | 2508755 | 2484139
-| Modulo | 2501357 | 2171828 | 2571959 | 2158203
+| Comparisons | 4115337 | 2169531 | 5914108 | 3725321
+| Addition | 194461 | 196244 | 543680 | 195216
+| Subtraction | 151441 | 97565 | 1161677 | 192729
+| Multiplication | 334847 | 232518 | 904461 | 240980
+| Division | 2403064 | 1848517 | 2493904 | 2431322
+| Modulo | 2235322 | 2159401 | 2535438 | 2321638
 |===
 
 ////
@@ -64,12 +64,12 @@ image::i128_graphs/linux/ARM64_relative_performance.png[ARM64 Relative Performan
 |===
 | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128`
 
-| Comparisons | 14099505 | 12588237 | 21074294 | 13972778
-| Addition | 1151086 | 1374984 | 3303931 | 1195725
-| Subtraction | 1223119 | 753561 | 4224613 | 1295929
-| Multiplication | 1904542 | 2060986 | 3034387 | 1733150
-| Division | 8768877 | 7080113 | 7306287 | 7968543
-| Modulo | 8661233 | 7180650 | 8801605 | 8175497
+| Comparisons | 5171094 | 5069329 | 7457296 | 5343843
+| Addition | 625328 | 785936 | 1286888 | 670826
+| Subtraction | 667538 | 356865 | 2555881 | 741947
+| Multiplication | 904480 | 729911 | 1562062 | 786829
+| Division | 3758577 | 2211087 | 3095993 | 3940264
+| Modulo | 4218409 | 2330114 | 3684163 | 3849849
 |===
 
 ////
@@ -107,12 +107,12 @@ NOTE: This platform has no hardware type so we compare relative to `boost::mp::i
 |===
 | Operation | `int128_t` | `boost::mp::int128_t`
 
-| Comparisons | 9530060 | 12168353
-| Addition | 785799 | 7777469
-| Subtraction  | 778881 | 8214089
-| Multiplication  | 1148024 | 9477355
-| Division  | 10337258 | 22857709
-| Modulo | 10438037 | 14848256
+| Comparisons | 10310201 | 14160000
+| Addition | 786499 | 7379646
+| Subtraction  | 907051 | 7890190
+| Multiplication  | 855780 | 10826565
+| Division  | 10254664 | 24702433
+| Modulo | 10851123 | 17348307
 |===
 
 ////
@@ -152,12 +152,12 @@ image::i128_graphs/linux/ARM32_relative_performance.png[ARM32 Relative Performan
 |===
 | Operation | `std::_Signed128` | `int128_t` | `boost::mp::int128_t`
 
-| Comparisons | 2186843 | 2142626 | 4854983
-| Addition | 186771 | 184598 | 2645943
-| Subtraction | 193660 | 186335 | 2925784
-| Multiplication | 402806 | 117413 | 3887479
-| Division | 1612873 | 2369701 | 6437280
-| Modulo | 1637135 | 2218627 | 6236026
+| Comparisons | 1879694 | 1894168 | 5198915
+| Addition | 141120 | 143877 | 2846799
+| Subtraction | 157649 | 156965 | 3027203
+| Multiplication | 266740 | 138754 | 4080611
+| Division | 1387560 | 1752869 | 6924406
+| Modulo | 1616895 | 1908345 | 6397442
 |===
 ////
 image::i128_graphs/windows/x64_benchmarks.png[x64 Benchmark Results, width=100%]
@@ -171,12 +171,12 @@ image::i128_graphs/windows/x64_relative_performance.png[x64 Relative Performance
 |===
 | Operation | `std::_Signed128` | `int128_t` | `boost::mp::int128_t`
 
-| Comparisons | 911829 | 368104 | 2376802
-| Addition | 33233 | 34001 | 121700
-| Subtraction | 33411 | 34130 | 1488822
-| Multiplication | 117586 | 56324 | 1564799
-| Division | 1127267 | 1500725 | 2808293
-| Modulo | 1287100 | 1548073 | 2997474
+| Comparisons | 991273 | 391918 | 2551137
+| Addition | 34519 | 48953 | 1243326
+| Subtraction | 34184 | 36278 | 1387708
+| Multiplication | 126490 | 36781 | 1632232
+| Division | 1128432 | 1107571 | 2472959
+| Modulo | 1427629 | 1310481 | 2926904
 |===
 ////
 image::i128_graphs/windows/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100%]
@@ -190,12 +190,12 @@ image::i128_graphs/windows/ARM64_relative_performance.png[ARM64 Relative Perform
 |===
 | Operation | `std::_Signed128` | `int128_t` | `boost::mp::int128_t`
 
-| Comparisons | 3187340 | 3046252 | 4269507
-| Addition | 185960 | 189165 | 2488618
-| Subtraction | 979025 | 192609 | 2783600
-| Multiplication | 1896082 | 3569921 | 4908622
-| Division | 5566403 | 4348306 | 6835035
-| Modulo | 4697289 | 4793845 | 6476032
+| Comparisons | 3832024 | 3823023 | 5568151
+| Addition | 232554 | 197092 | 3488510
+| Subtraction | 1198377 | 145823 | 4011233
+| Multiplication | 2921104 | 428925 | 6219931
+| Division | 7174578 | 7189000 | 9748526
+| Modulo | 5528639 | 7028725 | 9205892
 |===
 ////
 image::i128_graphs/windows/x86_benchmarks.png[x86_32 Benchmark Results, width=100%]
@@ -212,12 +212,12 @@ image::i128_graphs/windows/x86_relative_performance.png[x86_32 Relative Performa
 |===
 | Operation | `__int128` | `int128_t` | `boost::mp::int128_t` | `absl::int128`
 
-| Comparisons | 133275 | 131953 | 340555 | 133509
-| Addition | 20203 | 17797 | 169909 | 20208
-| Subtraction | 20203 | 17832 | 172497 | 22199
-| Multiplication | 21496 | 20202 | 78269 | 20364
-| Division | 662767 | 682891 | 969277 | 663602
-| Modulo | 719179 | 692509 | 1026090 | 717897
+| Comparisons | 135259 | 134127 | 340037 | 136845
+| Addition | 20399 | 18575 | 169575 | 20429
+| Subtraction | 20156 | 18983 | 168041 | 20875
+| Multiplication | 20654 | 20860 | 69443 | 20651
+| Division | 668004 | 659823 | 976248 | 660963
+| Modulo | 664356 | 662282 | 1026487 | 665474
 |===
 
 ////
@@ -225,23 +225,3 @@ image::i128_graphs/macos/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100
 ////
 
 image::i128_graphs/macos/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%]
-
-=== x86_64
-
-[cols="1,1,1,1"]
-|===
-| Operation | `__int128` | `int128_t` | `boost::mp::int128_t`
-
-| Comparisons | 1628142 | 1748005 | 4318109
-| Addition | 224648 | 180393 | 925013
-| Subtraction | 212849 | 131062 | 1876834
-| Multiplication | 432205 | 407829 | 651209
-| Division | 3924951 | 2409106 | 3719183
-| Modulo | 3042060 | 2423738 | 4443402
-|===
-
-////
-image::i128_graphs/macos/x64_benchmarks.png[x64 Benchmark Results, width=100%]
-////
-
-image::i128_graphs/macos/x64_relative_performance.png[x64 Relative Performance, width=100%]
diff --git a/doc/modules/ROOT/pages/u128_benchmarks.adoc b/doc/modules/ROOT/pages/u128_benchmarks.adoc
index 88f9a03b..ff2e0089 100644
--- a/doc/modules/ROOT/pages/u128_benchmarks.adoc
+++ b/doc/modules/ROOT/pages/u128_benchmarks.adoc
@@ -24,12 +24,12 @@ On MSVC platforms we use as reference `std::_Unsigned128` from the header `<__ms
 |===
 | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128`
 
-| Comparisons | 785130 | 765065 | 1363581 | 766205
-| Addition | 90260 | 85758 | 89958 | 89255
-| Subtraction | 91143 | 91449 | 91224 | 89716
-| Multiplication | 111803 | 90069 | 113559 | 89660
-| Division | 1058435 | 901516 | 1040071 | 1044710
-| Modulo | 1003366 | 830830 | 1001701 | 978533
+| Comparisons | 2555576 | 2404372 | 3576079 | 2099066
+| Addition | 242772 | 241336 | 328546 | 301186
+| Subtraction | 372481 | 260064 | 287267 | 282908
+| Multiplication | 356366 | 312736 | 326328 | 277284
+| Division | 4481403 | 4498211 | 4602586 | 4290212
+| Modulo | 3965562 | 4506879 | 4487023 | 4247367
 |===
 
 ////
@@ -44,12 +44,12 @@ image::u128_graphs/linux/x64_relative_performance.png[x64 Relative Performance,
 |===
 | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128`
 
-| Comparisons | 3427201 | 2078586 | 5026689 | 3753922
-| Addition | 194968 | 159662 | 587373 | 194070
-| Subtraction | 193067 | 161903 | 330052 | 140777
-| Multiplication | 263187 | 201333 | 972009 | 244420
-| Division | 2338258 | 2247175 | 2190856 | 2223032
-| Modulo | 2260200 | 2097760 | 2227961 | 2186750
+| Comparisons | 4077924 | 2335044 | 5360167 | 4184235
+| Addition | 137276 | 151553 | 184406 | 151276
+| Subtraction | 155498 | 133470 | 186793 | 149111
+| Multiplication | 218009 | 233811 | 324341 | 293431
+| Division | 2254781 | 1819447 | 2211225 | 2152312
+| Modulo | 2274294 | 1743274 | 2324356 | 2381378
 |===
 
 ////
@@ -64,12 +64,12 @@ image::u128_graphs/linux/ARM64_relative_performance.png[ARM64 Relative Performan
 |===
 | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128`
 
-| Comparisons | 6803419 | 6280326 | 7965082 | 10515929
-| Addition | 546801 | 618774 | 621572 | 1744226
-| Subtraction | 590011 | 359100 | 691515 | 1527622
-| Multiplication | 891753 | 1192196 | 944289 | 1839038
-| Division | 3827125 | 3201674 | 3997037 | 4913142
-| Modulo | 4925696 | 3360251 | 5144403 | 5422155
+| Comparisons | 7293935 | 6198402 | 8182815 | 13820009
+| Addition | 636224 | 707436 | 611849 | 1530136
+| Subtraction | 572225 | 350035 | 595266 | 1211168
+| Multiplication | 1040424 | 741789 | 899957 | 1843000
+| Division | 4191637 | 2593472 | 4106663 | 4883553
+| Modulo | 4156643 | 2133029 | 4398856 | 5011442
 |===
 
 ////
@@ -107,12 +107,12 @@ NOTE: This platform has no hardware type so we compare relative to `boost::mp::u
 |===
 | Operation | `uint128_t` | `boost::mp::uint128_t`
 
-| Comparisons | 9000979 | 8722814
-| Addition | 898718 | 9912175
-| Subtraction  | 778881 | 9773677
-| Multiplication  | 1778273 | 8678420
-| Division  | 8496503 | 18133965
-| Modulo | 9081442 | 11257837
+| Comparisons | 9545542 | 8582001
+| Addition | 686648 | 7261481
+| Subtraction  | 618456 | 7968678
+| Multiplication  | 859253 | 6746697
+| Division  | 8271920 | 15931092
+| Modulo | 9932867 | 10242720
 |===
 
 ////
@@ -152,12 +152,12 @@ image::u128_graphs/linux/ARM32_relative_performance.png[ARM32 Relative Performan
 |===
 | Operation | `std::_Unsigned128` | `uint128_t` | `boost::mp::uint128_t`
 
-| Comparisons | 2060556 | 1921174 | 3009890
-| Addition | 261475 | 106545 | 2710279
-| Subtraction | 178724 | 124181 | 3059187
-| Multiplication | 146063 | 136115 | 3495634
-| Division | 1332838 | 1360295 | 4852899
-| Modulo | 1465138 | 1471169 | 3926336
+| Comparisons | 2055229 | 1714007 | 2490543
+| Addition | 152603 | 116444 | 2596037
+| Subtraction | 150576 | 116367 | 2901567
+| Multiplication | 131223 | 123694 | 3300491
+| Division | 1476783 | 1489919 | 4898388
+| Modulo | 1421066 | 1411521 | 3793762
 |===
 ////
 image::u128_graphs/windows/x64_benchmarks.png[x64 Benchmark Results, width=100%]
@@ -171,18 +171,18 @@ image::u128_graphs/windows/x64_relative_performance.png[x64 Relative Performance
 |===
 | Operation | `std::_Unsigned128` | `uint128_t` | `boost::mp::uint128_t`
 
-| Comparisons | 3424403 | 2062167 | 5026689
-| Addition | 123659 | 133084 | 587373
-| Subtraction | 171721 | 99453 | 330052
-| Multiplication | 329287 | 283443 | 972009
-| Division | 2044821 | 1825020 | 2190856
-| Modulo | 2176318 | 1897933 | 2227961
+| Comparisons | 945196 | 405891 | 1306884
+| Addition | 37403 | 40039 | 1351728
+| Subtraction | 33927 | 38887 | 1594845
+| Multiplication | 74384 | 46406 | 1281286
+| Division | 992963 | 790846 | 2035065
+| Modulo | 1087702 | 861121 | 1702396
 |===
 ////
-image::u128_graphs/windows/arm64_benchmarks.png[ARM64 Benchmark Results, width=100%]
+image::u128_graphs/windows/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100%]
 ////
 
-image::u128_graphs/windows/arm64_relative_performance.png[ARM64 Relative Performance, width=100%]
+image::u128_graphs/windows/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%]
 
 === x86_32
 
@@ -190,12 +190,12 @@ image::u128_graphs/windows/arm64_relative_performance.png[ARM64 Relative Perform
 |===
 | Operation | `std::_Unsigned128` | `uint128_t` | `boost::mp::uint128_t`
 
-| Comparisons | 4215438 | 3883846 | 2852442
-| Addition | 199945 | 208436 | 3242910
-| Subtraction | 1206168 | 210874 | 3851129
-| Multiplication | 2282869 | 2680359 | 5378001
-| Division | 5516964 | 4328917 | 6948267
-| Modulo | 4551146 | 4330152 | 6294325
+| Comparisons | 4806287 | 3940703 | 2624013
+| Addition | 254275 | 202421 | 2961566
+| Subtraction | 1322877 | 207351 | 3703369
+| Multiplication | 2327500 | 2312040 | 4375417
+| Division | 5596877 | 5629510 | 6756883
+| Modulo | 4616488 | 5696116 | 6409969
 |===
 ////
 image::u128_graphs/windows/x86_benchmarks.png[x86_32 Benchmark Results, width=100%]
@@ -212,12 +212,12 @@ image::u128_graphs/windows/x86_relative_performance.png[x86_32 Relative Performa
 |===
 | Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t` | `absl::uint128`
 
-| Comparisons | 131902 | 133564 | 134182 | 132366
-| Addition | 20613 | 17912 | 40176 | 20178
-| Subtraction | 20484 | 18237 | 40311 | 20207
-| Multiplication | 20160 | 20580 | 43285 | 20049
-| Division | 686521 | 699201 | 945928 | 672398
-| Modulo | 777084 | 724648 | 953117 | 734229
+| Comparisons | 134425 | 134742 | 133107 | 135182
+| Addition | 20754 | 18389 | 20653 | 20929
+| Subtraction | 20552 | 18573 | 20590 | 20439
+| Multiplication | 20264 | 20150 | 20181 | 20228
+| Division | 685358 | 740877 | 913877 | 718985
+| Modulo | 733080 | 699666 | 951657 | 719500
 |===
 
 ////
@@ -225,23 +225,3 @@ image::u128_graphs/macos/ARM64_benchmarks.png[ARM64 Benchmark Results, width=100
 ////
 
 image::u128_graphs/macos/ARM64_relative_performance.png[ARM64 Relative Performance, width=100%]
-
-=== x86_64
-
-[cols="1,1,1,1"]
-|===
-| Operation | `unsigned __int128` | `uint128_t` | `boost::mp::uint128_t`
-
-| Comparisons | 688225 | 712352 | 689146
-| Addition | 104921 | 124992 | 137819
-| Subtraction | 129150 | 102302 | 153484
-| Multiplication | 120363 | 119652 | 164100
-| Division | 2333812 | 1981469 | 2784139
-| Modulo | 2621949 | 2219481 | 2736682
-|===
-
-////
-image::u128_graphs/macos/x64_benchmarks.png[x64 Benchmark Results, width=100%]
-////
-
-image::u128_graphs/macos/x64_relative_performance.png[x64 Relative Performance, width=100%]
diff --git a/doc/plots.py b/doc/plots.py
index f1150102..5196c0ab 100644
--- a/doc/plots.py
+++ b/doc/plots.py
@@ -1,225 +1,356 @@
+#!/usr/bin/env python3
+"""Generate every Boost.Int128 benchmark graph and write it straight into the
+documentation images tree.
+
+Each entry produces two PNGs whose names match the image:: directives in the
+.adoc pages:
+
+    modules/ROOT/images/<sign>_graphs/<os>/<arch>_benchmarks.png
+    modules/ROOT/images/<sign>_graphs/<os>/<arch>_relative_performance.png
+
+To refresh a platform's numbers, edit its 'data' block here and re-run; the
+right file is overwritten automatically.
+"""
+
+import os
+
+import matplotlib
+matplotlib.use('Agg')  # headless backend: write files, never open a window
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-"""
-# ARM64 MSVC
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'std::_Unsigned128': [878929, 32788, 33627, 68120, 925583, 1104772],
-    'uint128_t': [259725, 33723, 36799, 35334, 1020148, 1143344],
-    'boost::mp::uint128_t': [1246502, 1437452, 1648131, 1459418, 2216648, 2089105]
-}
-"""
+# Operation order shared by every dataset (matches the x-axis of all charts).
+OPERATIONS = ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo']
 
-"""
-# x86 MSVC
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'std::_Unsigned128': [4215438, 199945, 1206168, 2282869, 5516964, 4551146],
-    'uint128_t': [3883846, 208436, 210874, 2680359, 4328917, 4330152],
-    'boost::mp::uint128_t': [2852442, 3242910, 3851129, 5378001, 6948267, 6294325]
-}
-"""
+# One entry per published graph. Fields:
+#   sign  : 'u128' (unsigned) or 'i128' (signed)  -> selects the *_graphs folder
+#   os    : 'linux' | 'macos' | 'windows'         -> selects the sub-folder
+#   arch  : file stem used by the .adoc image:: directives (casing must match the
+#           image:: targets exactly; ARM stems are always upper-case, e.g. 'ARM64')
+#   title : chart heading prefix, e.g. 'GCC 14 - x64'
+#   data  : implementation -> timings in microseconds, in OPERATIONS order
+# The relative chart's baseline is chosen automatically from BASELINE_PRIORITY:
+# native __int128, else MSVC's std::_Unsigned128/_Signed128, else Boost.Multiprecision.
+DATASETS = [
+    # ----------------------------- unsigned, Linux -----------------------------
+    {
+        'sign': 'u128', 'os': 'linux', 'arch': 'x64', 'title': 'GCC 16 - x64',
+        'data': {
+            'unsigned __int128': [2555576, 242772, 372481, 356366, 4481403, 3965562],
+            'uint128_t': [2404372, 241336, 260064, 312736, 4498211, 4506879],
+            'boost::mp::uint128_t': [3576079, 328546, 287267, 326328, 4602586, 4487023],
+            'absl::uint128': [2099066, 301186, 282908, 277284, 4290212, 4247367],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'linux', 'arch': 'ARM64', 'title': 'GCC 13 - ARM64',
+        'data': {
+            'unsigned __int128': [4077924, 137276, 155498, 218009, 2254781, 2274294],
+            'uint128_t': [2335044, 151553, 133470, 233811, 1819447, 1743274],
+            'boost::mp::uint128_t': [5360167, 184406, 186793, 324341, 2211225, 2324356],
+            'absl::uint128': [4184235, 151276, 149111, 293431, 2152312, 2381378],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'linux', 'arch': 's390x', 'title': 'GCC 13 - s390x',
+        'data': {
+            'unsigned __int128': [7293935, 636224, 572225, 1040424, 4191637, 4156643],
+            'uint128_t': [6198402, 707436, 350035, 741789, 2593472, 2133029],
+            'boost::mp::uint128_t': [8182815, 611849, 595266, 899957, 4106663, 4398856],
+            'absl::uint128': [13820009, 1530136, 1211168, 1843000, 4883553, 5011442],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'linux', 'arch': 'ppc64le', 'title': 'GCC 14 - ppc64le',
+        'data': {
+            'unsigned __int128': [5242604, 221776, 222894, 194494, 4821119, 4955570],
+            'uint128_t': [4450958, 193063, 175259, 192929, 4896360, 4273487],
+            'boost::mp::uint128_t': [5704848, 847504, 786659, 795187, 5344637, 5407877],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'linux', 'arch': 'x86', 'title': 'GCC 16 - x86_32',
+        'data': {
+            'uint128_t': [9545542, 686648, 618456, 859253, 8271920, 9932867],
+            'boost::mp::uint128_t': [8582001, 7261481, 7968678, 6746697, 15931092, 10242720],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'linux', 'arch': 'ARM32', 'title': 'GCC 14 - ARM32',
+        'data': {
+            'uint128_t': [5286033, 454715, 487190, 1471479, 19868087, 20332627],
+            'boost::mp::uint128_t': [4538707, 5543856, 6465126, 8246098, 32820805, 27238658],
+        },
+    },
+    # ---------------------------- unsigned, Windows ----------------------------
+    {
+        'sign': 'u128', 'os': 'windows', 'arch': 'x64', 'title': 'MSVC 14.5 - x64',
+        'data': {
+            'std::_Unsigned128': [2055229, 152603, 150576, 131223, 1476783, 1421066],
+            'uint128_t': [1714007, 116444, 116367, 123694, 1489919, 1411521],
+            'boost::mp::uint128_t': [2490543, 2596037, 2901567, 3300491, 4898388, 3793762],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'windows', 'arch': 'ARM64', 'title': 'MSVC 14.5 - ARM64',
+        'data': {
+            'std::_Unsigned128': [945196, 37403, 33927, 74384, 992963, 1087702],
+            'uint128_t': [405891, 40039, 38887, 46406, 790846, 861121],
+            'boost::mp::uint128_t': [1306884, 1351728, 1594845, 1281286, 2035065, 1702396],
+        },
+    },
+    {
+        'sign': 'u128', 'os': 'windows', 'arch': 'x86', 'title': 'MSVC 14.5 - x86_32',
+        'data': {
+            'std::_Unsigned128': [4806287, 254275, 1322877, 2327500, 5596877, 4616488],
+            'uint128_t': [3940703, 202421, 207351, 2312040, 5629510, 5696116],
+            'boost::mp::uint128_t': [2624013, 2961566, 3703369, 4375417, 6756883, 6409969],
+        },
+    },
+    # ----------------------------- unsigned, macOS -----------------------------
+    {
+        'sign': 'u128', 'os': 'macos', 'arch': 'ARM64', 'title': 'Clang 22 - ARM64',
+        'data': {
+            'unsigned __int128': [134425, 20754, 20552, 20264, 685358, 733080],
+            'uint128_t': [134742, 18389, 18573, 20150, 740877, 699666],
+            'boost::mp::uint128_t': [133107, 20653, 20590, 20181, 913877, 951657],
+            'absl::uint128': [135182, 20929, 20439, 20228, 718985, 719500],
+        },
+    },
+    # ------------------------------ signed, Linux ------------------------------
+    {
+        'sign': 'i128', 'os': 'linux', 'arch': 'x64', 'title': 'GCC 16 - x64',
+        'data': {
+            '`__int128`': [2232997, 244246, 220957, 433431, 4462364, 4803576],
+            'int128_t': [1970941, 292081, 196953, 321168, 4983165, 5257406],
+            'boost::mp::int128_t': [5478483, 650160, 1625774, 1595688, 4992819, 4988844],
+            'absl::int128': [1944089, 227720, 315611, 304069, 4986970, 5081814],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'linux', 'arch': 'ARM64', 'title': 'GCC 13 - ARM64',
+        'data': {
+            '`__int128`': [4115337, 194461, 151441, 334847, 2403064, 2235322],
+            'int128_t': [2169531, 196244, 97565, 232518, 1848517, 2159401],
+            'boost::mp::int128_t': [5914108, 543680, 1161677, 904461, 2493904, 2535438],
+            'absl::int128': [3725321, 195216, 192729, 240980, 2431322, 2321638],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'linux', 'arch': 's390x', 'title': 'GCC 13 - s390x',
+        'data': {
+            '`__int128`': [5171094, 625328, 667538, 904480, 3758577, 4218409],
+            'int128_t': [5069329, 785936, 356865, 729911, 2211087, 2330114],
+            'boost::mp::int128_t': [7457296, 1286888, 2555881, 1562062, 3095993, 3684163],
+            'absl::int128': [5343843, 670826, 741947, 786829, 3940264, 3849849],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'linux', 'arch': 'ppc64le', 'title': 'GCC 14 - ppc64le',
+        'data': {
+            '`__int128`': [4538094, 221708, 222629, 193315, 5607581, 5623562],
+            'int128_t': [5796198, 191841, 174273, 191785, 4669820, 4750314],
+            'boost::mp::int128_t': [13907323, 1177034, 1861166, 878393, 5616217, 5641480],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'linux', 'arch': 'x86', 'title': 'GCC 16 - x86_32',
+        'data': {
+            'int128_t': [10310201, 786499, 907051, 855780, 10254664, 10851123],
+            'boost::mp::int128_t': [14160000, 7379646, 7890190, 10826565, 24702433, 17348307],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'linux', 'arch': 'ARM32', 'title': 'GCC 14 - ARM32',
+        'data': {
+            'int128_t': [6149439, 457850, 488321, 1793874, 17738614, 18064819],
+            'boost::mp::int128_t': [6432579, 5669571, 7464427, 11410321, 38956122, 30144743],
+        },
+    },
+    # ----------------------------- signed, Windows -----------------------------
+    {
+        'sign': 'i128', 'os': 'windows', 'arch': 'x64', 'title': 'MSVC 14.5 - x64',
+        'data': {
+            'std::_Signed128': [1879694, 141120, 157649, 266740, 1387560, 1616895],
+            'int128_t': [1894168, 143877, 156965, 138754, 1752869, 1908345],
+            'boost::mp::int128_t': [5198915, 2846799, 3027203, 4080611, 6924406, 6397442],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'windows', 'arch': 'ARM64', 'title': 'MSVC 14.3 - ARM64',
+        'data': {
+            'std::_Signed128': [991273, 34519, 34184, 126490, 1128432, 1427629],
+            'int128_t': [391918, 48953, 36278, 36781, 1107571, 1310481],
+            'boost::mp::int128_t': [2551137, 1243326, 1387708, 1632232, 2472959, 2926904],
+        },
+    },
+    {
+        'sign': 'i128', 'os': 'windows', 'arch': 'x86', 'title': 'MSVC 14.5 - x86_32',
+        'data': {
+            'std::_Signed128': [3832024, 232554, 1198377, 2921104, 7174578, 5528639],
+            'int128_t': [3823023, 197092, 145823, 428925, 7189000, 7028725],
+            'boost::mp::int128_t': [5568151, 3488510, 4011233, 6219931, 9748526, 9205892],
+        },
+    },
+    # ------------------------------ signed, macOS ------------------------------
+    {
+        'sign': 'i128', 'os': 'macos', 'arch': 'ARM64', 'title': 'Clang 22 - ARM64',
+        'data': {
+            '`__int128`': [135259, 20399, 20156, 20654, 668004, 664356],
+            'int128_t': [134127, 18575, 18983, 20860, 659823, 662282],
+            'boost::mp::int128_t': [340037, 169575, 168041, 69443, 976248, 1026487],
+            'absl::int128': [136845, 20429, 20875, 20651, 660963, 665474],
+        },
+    },
+]
 
-"""
-# x64 MSVC
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'std::_Unsigned128': [2060556, 261475, 178724, 146063, 1332838, 1465138],
-    'uint128_t': [1921174, 106545, 124181, 136115, 1360295, 1471169],
-    'boost::mp::uint128_t': [3009890, 2710279, 3059187, 3495634, 4852899, 3926336]
-}
-"""
-"""
-# ARM64 macOS
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'unsigned __int128': [131902, 20613, 20484, 20160, 686521, 777084],
-    'uint128_t': [133564, 17912, 18237, 20580, 699201, 724648],
-    'boost::mp::uint128_t': [134182, 40176, 40311, 43285, 945928, 953117],
-    'absl::uint128': [132366, 20178, 20207, 20049, 672398, 734229]
-}
-"""
-"""
-# x64 macOS
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'unsigned __int128': [688225, 104921, 129150, 120363, 2333812, 2621949],
-    'uint128_t': [712352, 124992, 102302, 119652, 1981469, 2219481],
-    'boost::mp::uint128_t': [689146, 137819, 153484, 164100, 2784139, 2736682]
-}
-"""
+# Bar fill by speed rank within one operation: rank 1 light green, rank 2 light yellow.
+RANK_COLORS = {1: '#90EE90', 2: '#FFFFE0'}
+SLOW_COLOR = '#FFB6C1'  # fallback fill for every rank that has no RANK_COLORS entry
 
-# Linux x64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'unsigned __int128': [785130, 90260, 91143, 111803, 1058435, 1003366],
-    'uint128_t': [765065, 85758, 91449, 90069, 901516, 830830],
-    'boost::mp::uint128_t': [1363581, 89958, 91224, 113559, 1040071, 1001701],
-    'absl::uint128': [766205, 89255, 89716, 89660, 1044710, 978533]
+# Baseline candidates by priority; first present wins ('`__int128`' is spelled as in DATASETS).
+BASELINE_PRIORITY = {
+    'u128': ['unsigned __int128', 'std::_Unsigned128', 'boost::mp::uint128_t'],
+    'i128': ['`__int128`', '__int128', 'std::_Signed128', 'boost::mp::int128_t'],
 }
 
-"""
-# Linux ARM64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'unsigned __int128': [3427201, 194968, 193067, 263187, 2338258, 2260200],
-    'uint128_t': [2078586, 159662, 161903, 201333, 2247175, 2097760],
-    'boost::mp::uint128_t': [5026689, 587373, 330052, 972009, 2190856, 2227961],
-    'absl::uint128': [3753922, 194070, 140777, 244420, 2223032, 2186750]
-}
-"""
-"""
-# Linux S390x
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'unsigned __int128': [6803419, 546801, 590011, 891753, 3827125, 4925696],
-    'uint128_t': [6280326, 618774, 359100, 1192196, 3201674, 3360251],
-    'boost::mp::uint128_t': [7965082, 621572, 691515, 944289, 3997037, 5144403],
-    'absl::uint128': [10515929, 1744226, 1527622, 1839038, 4913142, 5422155]
-}
-"""
-"""
-# Linux ppc64le
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'unsigned __int128': [5242604, 221776, 222894, 194494, 4821119, 4955570],
-    'uint128_t': [4450958, 193063, 175259, 192929, 4896360, 4273487],
-    'boost::mp::uint128_t': [5704848, 847504, 786659, 795187, 5344637, 5407877]
-}
-"""
-df = pd.DataFrame(data)
-
-# Function to determine color based on ranking
-def get_colors_by_rank(row):
-    values = row[1:].values
-    ranks = np.argsort(values) + 1
-    colors = []
-    for rank in ranks:
-        if rank == 1:
-            colors.append('#90EE90')  # Light Green - Best
-        elif rank == 2:
-            colors.append('#FFFFE0')  # Light Yellow - Second
-        else:
-            colors.append('#FFB6C1')  # Light Red - Third
-    return colors
-
-# Create figure with subplots
-fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
-
-# Prepare data
-operations = df['Operation']
-x = np.arange(len(operations))
-width = 0.25
-
-# Get implementation names
-implementations = df.columns[1:]
-
-# Plot 1: Regular scale bar chart with color coding
-for i, (idx, row) in enumerate(df.iterrows()):
-    colors = get_colors_by_rank(row)
-    for j, impl in enumerate(implementations):
-        ax1.bar(x[i] + (j-1)*width, row[impl], width,
-                color=colors[j], edgecolor='black', linewidth=0.5,
-                label=impl if i == 0 else "")
-
-ax1.set_xlabel('Operations', fontsize=12)
-ax1.set_ylabel('Time (nanoseconds)', fontsize=12)
-ax1.set_title('GCC 14 - x64 Benchmark Results', fontsize=14, fontweight='bold')
-ax1.set_xticks(x)
-ax1.set_xticklabels(operations, rotation=45, ha='right')
-ax1.legend(loc='upper left')
-ax1.grid(axis='y', alpha=0.3)
-
-# Add value labels on bars
-for i, (idx, row) in enumerate(df.iterrows()):
-    for j, impl in enumerate(implementations):
-        ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}',
-                 ha='center', va='bottom', fontsize=8, rotation=90)
-
-# Plot 2: Log scale for better visualization
-for i, impl in enumerate(implementations):
-    bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5)
-
-    # Color each bar based on its rank within operation
-    for j, bar in enumerate(bars):
-        operation_values = df.iloc[j, 1:].values
-        rank = np.argsort(operation_values).tolist().index(i) + 1
-        if rank == 1:
-            bar.set_facecolor('#90EE90')
-        elif rank == 2:
-            bar.set_facecolor('#FFFFE0')
-        else:
-            bar.set_facecolor('#FFB6C1')
-
-ax2.set_xlabel('Operations', fontsize=12)
-ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12)
-ax2.set_title('GCC 14 - x64 Benchmark Results (Log Scale)', fontsize=14, fontweight='bold')
-ax2.set_yscale('log')
-ax2.set_xticks(x)
-ax2.set_xticklabels(operations, rotation=45, ha='right')
-ax2.legend(loc='upper left')
-ax2.grid(axis='y', alpha=0.3, which='both')
-
-plt.tight_layout()
-plt.savefig('x64_benchmarks.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Create a normalized performance chart
-fig3, ax3 = plt.subplots(figsize=(10, 6))
-
-# Normalize data relative to unsigned __int128
-normalized_df = df.copy()
-for col in implementations:
-    normalized_df[col] = df[col] / df['unsigned __int128']
-
-# Plot normalized bars
-for i, impl in enumerate(implementations):
-    if impl == 'unsigned __int128':
-        continue  # Skip since it's always 1.0
-    bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width,
-                   label=impl, edgecolor='black', linewidth=0.5)
-
-    # Add value labels
-    for j, bar in enumerate(bars):
-        height = bar.get_height()
-        ax3.text(bar.get_x() + bar.get_width()/2., height,
-                 f'{height:.2f}x', ha='center', va='bottom', fontsize=9)
-
-# Add reference line at 1.0
-ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='unsigned __int128 baseline')
-
-ax3.set_xlabel('Operations', fontsize=12)
-ax3.set_ylabel('Relative Performance (vs unsigned __int128)', fontsize=12)
-ax3.set_title('Relative Performance Comparison - x64', fontsize=14, fontweight='bold')
-ax3.set_xticks(x)
-ax3.set_xticklabels(operations, rotation=45, ha='right')
-ax3.legend()
-ax3.grid(axis='y', alpha=0.3)
-
-# Add interpretation text
-ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes,
-         fontsize=10, verticalalignment='top', style='italic')
-
-plt.tight_layout()
-plt.savefig('x64_relative_performance.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Generate summary statistics
-print("\nPerformance Summary (x64):")
-print("-" * 50)
-for impl in implementations:
-    if impl == 'unsigned __int128':
-        continue
-    avg_ratio = normalized_df[impl].mean()
-    print(f"{impl}: {avg_ratio:.2f}x average vs unsigned __int128")
-
-print("\nBest performer by operation:")
-print("-" * 50)
-for i, op in enumerate(operations):
-    row_data = df.iloc[i, 1:]
-    best_impl = row_data.idxmin()
-    best_time = row_data.min()
-    print(f"{op}: {best_impl} ({best_time:,} ns)")
 
+# Baseline column for the relative chart: the first BASELINE_PRIORITY[sign]
+# entry that this dataset actually contains, falling back to its first column.
+def detect_baseline(impls, sign):
+    present = [name for name in BASELINE_PRIORITY[sign] if name in impls]
+    # No preferred name available -> compare against the first implementation.
+    return present[0] if present else impls[0]
+
+
+# Rank the timings of one operation row: 1 for the smallest (fastest) value upward.
+def speed_ranks(values):
+    return 1 + np.argsort(values).argsort()
+
+
+def color_for_rank(rank):
+    return RANK_COLORS[rank] if rank in RANK_COLORS else SLOW_COLOR
+
+
+# Save the absolute-timing figure: a linear panel stacked above a log-scale panel.
+def save_benchmark_chart(df, impls, x, width, title, path):
+    fig, (linear_ax, log_ax) = plt.subplots(2, 1, figsize=(12, 10))
+
+    # rank_by_op[op][j] is the 1-based speed rank of impls[j] for operation op.
+    rank_by_op = [speed_ranks(df.iloc[op][impls].values) for op in range(len(df))]
+
+    # Linear panel: bars are added one at a time, already carrying their rank color.
+    for op_idx, (_, row) in enumerate(df.iterrows()):
+        for j, (impl, rank) in enumerate(zip(impls, rank_by_op[op_idx])):
+            center, value = x[op_idx] + (j - 1) * width, row[impl]
+            linear_ax.bar(center, value, width, color=color_for_rank(rank),
+                          edgecolor='black', linewidth=0.5,
+                          label="" if op_idx else impl)  # legend: first group only
+            linear_ax.text(center, value, f'{value:,}',
+                           ha='center', va='bottom', fontsize=8, rotation=90)
+
+    linear_ax.set_xlabel('Operations', fontsize=12)
+    linear_ax.set_ylabel('Time (microseconds)', fontsize=12)
+    linear_ax.set_title(f'{title} Benchmark Results', fontsize=14, fontweight='bold')
+    linear_ax.set_xticks(x)
+    linear_ax.set_xticklabels(OPERATIONS, rotation=45, ha='right')
+    linear_ax.legend(loc='upper left')
+    linear_ax.grid(axis='y', alpha=0.3)
+
+    # Log panel: the same data on a log y-axis to cope with the wide spread of
+    # timings. Each implementation is one bar container (which carries the legend
+    # label); its bars are then repainted with their per-operation rank color.
+    for j, impl in enumerate(impls):
+        container = log_ax.bar(x + (j - 1) * width, df[impl], width, label=impl,
+                               edgecolor='black', linewidth=0.5)
+        for ranks, patch in zip(rank_by_op, container):
+            patch.set_facecolor(color_for_rank(ranks[j]))
+
+    log_ax.set_xlabel('Operations', fontsize=12)
+    log_ax.set_ylabel('Time (microseconds) - Log Scale', fontsize=12)
+    log_ax.set_title(f'{title} Benchmark Results (Log Scale)', fontsize=14, fontweight='bold')
+    log_ax.set_yscale('log')
+    log_ax.set_xticks(x)
+    log_ax.set_xticklabels(OPERATIONS, rotation=45, ha='right')
+    log_ax.legend(loc='upper left')
+    log_ax.grid(axis='y', alpha=0.3, which='both')
+
+    fig.tight_layout()
+    fig.savefig(path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+
+
+# Save the relative chart: each implementation's timings divided by the baseline's.
+def save_relative_chart(df, impls, x, width, title, baseline, path):
+    fig, ax = plt.subplots(figsize=(10, 6))
+
+    ratios = df[impls].div(df[baseline], axis=0)
+    # The baseline itself is shown as the y=1.0 reference line rather than as bars.
+    compared = [(slot, impl) for slot, impl in enumerate(impls) if impl != baseline]
+    for slot, impl in compared:
+        rects = ax.bar(x + (slot - 1.5) * width, ratios[impl], width,
+                       label=impl, edgecolor='black', linewidth=0.5)
+        for rect in rects:
+            top = rect.get_height()
+            ax.text(rect.get_x() + rect.get_width() / 2., top,
+                    f'{top:.2f}x', ha='center', va='bottom', fontsize=9)
+
+    # Leave room above the tallest bar so neither its value label nor the
+    # "Lower is better" note in the top-left corner overlaps the bars.
+    plotted = [impl for _, impl in compared]
+    tallest = float(ratios[plotted].to_numpy().max())
+    ax.set_ylim(top=max(tallest * 1.20, 1.12))
+
+    ax.axhline(y=1.0, color='red', linestyle='--', alpha=0.5,
+               label=f'{baseline} baseline')
+    ax.set_xlabel('Operations', fontsize=12)
+    ax.set_ylabel(f'Relative Performance (vs {baseline})', fontsize=12)
+    ax.set_title(f'Relative Performance Comparison - {title}', fontsize=14, fontweight='bold')
+    ax.set_xticks(x)
+    ax.set_xticklabels(OPERATIONS, rotation=45, ha='right')
+    ax.legend()
+    ax.grid(axis='y', alpha=0.3)
+    ax.text(0.02, 0.98, 'Lower is better', transform=ax.transAxes,
+            fontsize=10, verticalalignment='top', style='italic')
+
+    fig.tight_layout()
+    fig.savefig(path, dpi=300, bbox_inches='tight')
+    plt.close(fig)
+
+
+# Draw one dataset: build its DataFrame, pick the baseline, and write the benchmark
+# and relative-performance PNGs; returns (benchmark path, relative path).
+def render_dataset(entry, images_dir):
+    timings = entry['data']
+    impls = list(timings)
+    df = pd.DataFrame({'Operation': OPERATIONS, **timings})
+    baseline = detect_baseline(impls, entry['sign'])
+    x, width = np.arange(len(OPERATIONS)), 0.25
+
+    # Output lands in <images_dir>/<sign>_graphs/<os>/, created if it is missing.
+    out_dir = os.path.join(images_dir, f"{entry['sign']}_graphs", entry['os'])
+    os.makedirs(out_dir, exist_ok=True)
+    bench_path = os.path.join(out_dir, f"{entry['arch']}_benchmarks.png")
+    rel_path = os.path.join(out_dir, f"{entry['arch']}_relative_performance.png")
+    save_benchmark_chart(df, impls, x, width, entry['title'], bench_path)
+    save_relative_chart(df, impls, x, width, entry['title'], baseline, rel_path)
+    return bench_path, rel_path
+
+
+def main():
+    """Render every DATASETS entry into modules/ROOT/images beside this script."""
+    here = os.path.dirname(os.path.abspath(__file__))
+    images_dir = os.path.join(here, 'modules', 'ROOT', 'images')
+
+    emitted = []
+    for dataset in DATASETS:
+        for path in render_dataset(dataset, images_dir):
+            print(f"wrote {os.path.relpath(path, here)}")
+            emitted.append(path)
+
+    print(f"\nDone: {len(emitted)} images across {len(DATASETS)} platforms.")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/plots_32bit.py b/doc/plots_32bit.py
deleted file mode 100644
index 4e98830e..00000000
--- a/doc/plots_32bit.py
+++ /dev/null
@@ -1,153 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-"""
-# Linux x86_32
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'uint128_t': [9000979, 898718, 778881, 1778273, 8496503, 9081442],
-    'boost::mp::uint128_t': [8722814, 9912175, 9773677, 8678420, 18133965, 11257837]
-}
-"""
-# Linux ARM32
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'uint128_t': [5286033, 454715, 487190, 1471479, 19868087, 20332627],
-    'boost::mp::uint128_t': [4538707, 5543856, 6465126, 8246098, 32820805, 27238658]
-}
-
-df = pd.DataFrame(data)
-
-# Function to determine color based on ranking
-def get_colors_by_rank(row):
-    values = row[1:].values
-    ranks = np.argsort(values) + 1
-    colors = []
-    for rank in ranks:
-        if rank == 1:
-            colors.append('#90EE90')  # Light Green - Best
-        elif rank == 2:
-            colors.append('#FFFFE0')  # Light Yellow - Second
-        else:
-            colors.append('#FFB6C1')  # Light Red - Third
-    return colors
-
-# Create figure with subplots
-fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
-
-# Prepare data
-operations = df['Operation']
-x = np.arange(len(operations))
-width = 0.25
-
-# Get implementation names
-implementations = df.columns[1:]
-
-# Plot 1: Regular scale bar chart with color coding
-for i, (idx, row) in enumerate(df.iterrows()):
-    colors = get_colors_by_rank(row)
-    for j, impl in enumerate(implementations):
-        ax1.bar(x[i] + (j-1)*width, row[impl], width,
-                color=colors[j], edgecolor='black', linewidth=0.5,
-                label=impl if i == 0 else "")
-
-ax1.set_xlabel('Operations', fontsize=12)
-ax1.set_ylabel('Time (nanoseconds)', fontsize=12)
-ax1.set_title('GCC 14 - ARM32 Benchmark Results', fontsize=14, fontweight='bold')
-ax1.set_xticks(x)
-ax1.set_xticklabels(operations, rotation=45, ha='right')
-ax1.legend(loc='upper left')
-ax1.grid(axis='y', alpha=0.3)
-
-# Add value labels on bars
-for i, (idx, row) in enumerate(df.iterrows()):
-    for j, impl in enumerate(implementations):
-        ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}',
-                 ha='center', va='bottom', fontsize=8, rotation=90)
-
-# Plot 2: Log scale for better visualization
-for i, impl in enumerate(implementations):
-    bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5)
-
-    # Color each bar based on its rank within operation
-    for j, bar in enumerate(bars):
-        operation_values = df.iloc[j, 1:].values
-        rank = np.argsort(operation_values).tolist().index(i) + 1
-        if rank == 1:
-            bar.set_facecolor('#90EE90')
-        elif rank == 2:
-            bar.set_facecolor('#FFFFE0')
-        else:
-            bar.set_facecolor('#FFB6C1')
-
-ax2.set_xlabel('Operations', fontsize=12)
-ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12)
-ax2.set_title('GCC 14 - ARM32 Benchmark Results (Log Scale)', fontsize=14, fontweight='bold')
-ax2.set_yscale('log')
-ax2.set_xticks(x)
-ax2.set_xticklabels(operations, rotation=45, ha='right')
-ax2.legend(loc='upper left')
-ax2.grid(axis='y', alpha=0.3, which='both')
-
-plt.tight_layout()
-plt.savefig('ARM32_benchmarks.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Create a normalized performance chart
-fig3, ax3 = plt.subplots(figsize=(10, 6))
-
-# Normalize data relative to boost::mp::uint128_t
-normalized_df = df.copy()
-for col in implementations:
-    normalized_df[col] = df[col] / df['boost::mp::uint128_t']
-
-# Plot normalized bars
-for i, impl in enumerate(implementations):
-    if impl == 'boost::mp::uint128_t':
-        continue  # Skip since it's always 1.0
-    bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width,
-                   label=impl, edgecolor='black', linewidth=0.5)
-
-    # Add value labels
-    for j, bar in enumerate(bars):
-        height = bar.get_height()
-        ax3.text(bar.get_x() + bar.get_width()/2., height,
-                 f'{height:.2f}x', ha='center', va='bottom', fontsize=9)
-
-# Add reference line at 1.0
-ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='boost::mp::uint128_t baseline')
-
-ax3.set_xlabel('Operations', fontsize=12)
-ax3.set_ylabel('Relative Performance (vs boost::mp::uint128_t)', fontsize=12)
-ax3.set_title('Relative Performance Comparison - ARM3232', fontsize=14, fontweight='bold')
-ax3.set_xticks(x)
-ax3.set_xticklabels(operations, rotation=45, ha='right')
-ax3.legend()
-ax3.grid(axis='y', alpha=0.3)
-
-# Add interpretation text
-ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes,
-         fontsize=10, verticalalignment='top', style='italic')
-
-plt.tight_layout()
-plt.savefig('ARM32_relative_performance.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Generate summary statistics
-print("\nPerformance Summary (x64):")
-print("-" * 50)
-for impl in implementations:
-    if impl == 'unsigned __int128':
-        continue
-    avg_ratio = normalized_df[impl].mean()
-    print(f"{impl}: {avg_ratio:.2f}x average vs unsigned __int128")
-
-print("\nBest performer by operation:")
-print("-" * 50)
-for i, op in enumerate(operations):
-    row_data = df.iloc[i, 1:]
-    best_impl = row_data.idxmin()
-    best_time = row_data.min()
-    print(f"{op}: {best_impl} ({best_time:,} ns)")
-
diff --git a/doc/signed_plots.py b/doc/signed_plots.py
deleted file mode 100644
index d34e14fb..00000000
--- a/doc/signed_plots.py
+++ /dev/null
@@ -1,223 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-"""
-# Linux x64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    '`__int128`': [879535, 92165, 92514, 115727, 1234838, 1193529],
-    'int128_t': [748787, 92441, 88390, 90897, 1352795, 1256687],
-    'boost::mp::int128_t': [2210502, 283528, 668953, 312723, 1320695, 1287093],
-    'absl::int128': [741269, 92323, 90394, 89558, 1200439, 1293439],
-}
-"""
-"""
-# Linux ARM64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    '`__int128`': [3495621, 191514, 131380, 236071, 2412757, 2501357],
-    'int128_t': [2279914, 133319, 193984, 234594, 2434752, 2171828],
-    'boost::mp::int128_t': [5910287, 566860, 1066509, 864526, 2508755, 2571959],
-    'absl::int128': [3749448, 164848, 193467, 237676, 2484139, 2158203]
-}
-
-"""
-
-# Linux s390x
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    '`__int128`': [14099505, 1151086, 1223119, 1904542, 8768877, 8661233],
-    'int128_t': [12588237, 1374984, 753561, 2060986, 7080113, 7180650],
-    'boost::mp::int128_t': [21074294, 3303931, 4224613, 3034387, 7306287, 8801605],
-    'absl::int128': [13972778, 1195725, 1295929, 1733150, 7968543, 8175497],
-}
-
-"""
-# Linux ppc64le
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    '`__int128`': [4538094, 221708, 222629, 193315, 5607581, 5623562],
-    'int128_t': [5796198, 191841, 174273, 191785, 4669820, 4750314],
-    'boost::mp::int128_t': [13907323, 1177034, 1861166, 878393, 5616217, 5641480]
-}
-"""
-"""
-# macos x64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    '`__int128`': [1628142, 224648, 212849, 432205, 3924951, 3042060],
-    'int128_t': [1748005, 180393, 131062, 407829, 2409106, 2423738],
-    'boost::mp::int128_t': [4318109, 925013, 1876834, 651209, 3719183, 4443402]
-}
-"""
-"""
-# macos ARM
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    '`__int128`': [133275, 20203, 20203, 21496, 662767, 719179],
-    'int128_t': [131953, 17797, 17832, 20202, 682891, 692509],
-    'boost::mp::int128_t': [340555, 169909, 172497, 78269, 969277, 1026090],
-    'absl::int128': [133509, 20208, 22199, 20364, 663602, 717897]
-}
-"""
-"""
-# MSVC 14.3 - ARM64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'std::_Signed128': [911829, 33233, 33411, 117586, 1127267, 1287100],
-    'int128_t': [368104, 34001, 34130, 56324, 1500725, 1548073],
-    'boost::mp::int128_t': [2376802, 121700, 1488822, 1564799, 2808293, 2997474]
-}
-"""
-"""
-# MSVC 14.3 - x64
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'std::_Signed128': [2186843, 186771, 193660, 402806, 1612873, 1637135],
-    'int128_t': [2142626, 184598, 186335, 117413, 2369701, 2218627],
-    'boost::mp::int128_t': [4854983, 2645943, 2925784, 3887479, 6437280, 6236026]
-}
-"""
-"""
-# MSVC 14.3 - x86
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'std::_Signed128': [3495288, 199936, 1089785, 2653505, 7267297, 5779771],
-    'int128_t': [3520950, 212116, 210354, 2595285, 5516460, 5842785],
-    'boost::mp::int128_t': [7877534, 3477656, 4108539, 7030276, 10229356, 9069360]
-}
-"""
-df = pd.DataFrame(data)
-
-# Function to determine color based on ranking
-def get_colors_by_rank(row):
-    values = row[1:].values
-    ranks = np.argsort(values) + 1
-    colors = []
-    for rank in ranks:
-        if rank == 1:
-            colors.append('#90EE90')  # Light Green - Best
-        elif rank == 2:
-            colors.append('#FFFFE0')  # Light Yellow - Second
-        else:
-            colors.append('#FFB6C1')  # Light Red - Third
-    return colors
-
-# Create figure with subplots
-fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
-
-# Prepare data
-operations = df['Operation']
-x = np.arange(len(operations))
-width = 0.25
-
-# Get implementation names
-implementations = df.columns[1:]
-
-# Plot 1: Regular scale bar chart with color coding
-for i, (idx, row) in enumerate(df.iterrows()):
-    colors = get_colors_by_rank(row)
-    for j, impl in enumerate(implementations):
-        ax1.bar(x[i] + (j-1)*width, row[impl], width,
-                color=colors[j], edgecolor='black', linewidth=0.5,
-                label=impl if i == 0 else "")
-
-ax1.set_xlabel('Operations', fontsize=12)
-ax1.set_ylabel('Time (nanoseconds)', fontsize=12)
-ax1.set_title('GCC 13 - s390x Benchmark Results', fontsize=14, fontweight='bold')
-ax1.set_xticks(x)
-ax1.set_xticklabels(operations, rotation=45, ha='right')
-ax1.legend(loc='upper left')
-ax1.grid(axis='y', alpha=0.3)
-
-# Add value labels on bars
-for i, (idx, row) in enumerate(df.iterrows()):
-    for j, impl in enumerate(implementations):
-        ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}',
-                 ha='center', va='bottom', fontsize=8, rotation=90)
-
-# Plot 2: Log scale for better visualization
-for i, impl in enumerate(implementations):
-    bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5)
-
-    # Color each bar based on its rank within operation
-    for j, bar in enumerate(bars):
-        operation_values = df.iloc[j, 1:].values
-        rank = np.argsort(operation_values).tolist().index(i) + 1
-        if rank == 1:
-            bar.set_facecolor('#90EE90')
-        elif rank == 2:
-            bar.set_facecolor('#FFFFE0')
-        else:
-            bar.set_facecolor('#FFB6C1')
-
-ax2.set_xlabel('Operations', fontsize=12)
-ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12)
-ax2.set_title('GCC 13 - s390x Benchmark Results (Log Scale)', fontsize=14, fontweight='bold')
-ax2.set_yscale('log')
-ax2.set_xticks(x)
-ax2.set_xticklabels(operations, rotation=45, ha='right')
-ax2.legend(loc='upper left')
-ax2.grid(axis='y', alpha=0.3, which='both')
-
-plt.tight_layout()
-plt.savefig('s390x_benchmarks.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Create a normalized performance chart
-fig3, ax3 = plt.subplots(figsize=(10, 6))
-
-# Normalize data relative to __int128
-normalized_df = df.copy()
-for col in implementations:
-    normalized_df[col] = df[col] / df['`__int128`']
-
-# Plot normalized bars
-for i, impl in enumerate(implementations):
-    if impl == '`__int128`':
-        continue  # Skip since it's always 1.0
-    bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width,
-                   label=impl, edgecolor='black', linewidth=0.5)
-
-    # Add value labels
-    for j, bar in enumerate(bars):
-        height = bar.get_height()
-        ax3.text(bar.get_x() + bar.get_width()/2., height,
-                 f'{height:.2f}x', ha='center', va='bottom', fontsize=9)
-
-# Add reference line at 1.0
-ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='`__int128` baseline')
-
-ax3.set_xlabel('Operations', fontsize=12)
-ax3.set_ylabel('Relative Performance (vs __int128)', fontsize=12)
-ax3.set_title('Relative Performance Comparison - s390x', fontsize=14, fontweight='bold')
-ax3.set_xticks(x)
-ax3.set_xticklabels(operations, rotation=45, ha='right')
-ax3.legend()
-ax3.grid(axis='y', alpha=0.3)
-
-# Add interpretation text
-ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes,
-         fontsize=10, verticalalignment='top', style='italic')
-
-plt.tight_layout()
-plt.savefig('s390x_relative_performance.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Generate summary statistics
-print("\nPerformance Summary (x64):")
-print("-" * 50)
-for impl in implementations:
-    if impl == '__int128':
-        continue
-    avg_ratio = normalized_df[impl].mean()
-    print(f"{impl}: {avg_ratio:.2f}x average vs __int128")
-
-print("\nBest performer by operation:")
-print("-" * 50)
-for i, op in enumerate(operations):
-    row_data = df.iloc[i, 1:]
-    best_impl = row_data.idxmin()
-    best_time = row_data.min()
-    print(f"{op}: {best_impl} ({best_time:,} ns)")
-
diff --git a/doc/signed_plots_32bit.py b/doc/signed_plots_32bit.py
deleted file mode 100644
index 73dbbdd2..00000000
--- a/doc/signed_plots_32bit.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-
-"""
-# Linux x86_32
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'int128_t': [9530060, 785799, 778881, 1148024, 10337258, 10438037],
-    'boost::mp::int128_t': [12168353, 7777469, 8214089, 9477355, 22857709, 14848256]
-}
-"""
-
-# Linux ARM32
-data = {
-    'Operation': ['Comparisons', 'Addition', 'Subtraction', 'Multiplication', 'Division', 'Modulo'],
-    'int128_t': [6149439, 457850, 488321, 1793874, 17738614, 18064819],
-    'boost::mp::int128_t': [6432579, 5669571, 7464427, 11410321, 38956122, 30144743]
-}
-
-df = pd.DataFrame(data)
-
-# Function to determine color based on ranking
-def get_colors_by_rank(row):
-    values = row[1:].values
-    ranks = np.argsort(values) + 1
-    colors = []
-    for rank in ranks:
-        if rank == 1:
-            colors.append('#90EE90')  # Light Green - Best
-        elif rank == 2:
-            colors.append('#FFFFE0')  # Light Yellow - Second
-        else:
-            colors.append('#FFB6C1')  # Light Red - Third
-    return colors
-
-# Create figure with subplots
-fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
-
-# Prepare data
-operations = df['Operation']
-x = np.arange(len(operations))
-width = 0.25
-
-# Get implementation names
-implementations = df.columns[1:]
-
-# Plot 1: Regular scale bar chart with color coding
-for i, (idx, row) in enumerate(df.iterrows()):
-    colors = get_colors_by_rank(row)
-    for j, impl in enumerate(implementations):
-        ax1.bar(x[i] + (j-1)*width, row[impl], width,
-                color=colors[j], edgecolor='black', linewidth=0.5,
-                label=impl if i == 0 else "")
-
-ax1.set_xlabel('Operations', fontsize=12)
-ax1.set_ylabel('Time (nanoseconds)', fontsize=12)
-ax1.set_title('GCC 14 - ARM32 Benchmark Results', fontsize=14, fontweight='bold')
-ax1.set_xticks(x)
-ax1.set_xticklabels(operations, rotation=45, ha='right')
-ax1.legend(loc='upper left')
-ax1.grid(axis='y', alpha=0.3)
-
-# Add value labels on bars
-for i, (idx, row) in enumerate(df.iterrows()):
-    for j, impl in enumerate(implementations):
-        ax1.text(x[i] + (j-1)*width, row[impl], f'{row[impl]:,}',
-                 ha='center', va='bottom', fontsize=8, rotation=90)
-
-# Plot 2: Log scale for better visualization
-for i, impl in enumerate(implementations):
-    bars = ax2.bar(x + (i-1)*width, df[impl], width, label=impl, edgecolor='black', linewidth=0.5)
-
-    # Color each bar based on its rank within operation
-    for j, bar in enumerate(bars):
-        operation_values = df.iloc[j, 1:].values
-        rank = np.argsort(operation_values).tolist().index(i) + 1
-        if rank == 1:
-            bar.set_facecolor('#90EE90')
-        elif rank == 2:
-            bar.set_facecolor('#FFFFE0')
-        else:
-            bar.set_facecolor('#FFB6C1')
-
-ax2.set_xlabel('Operations', fontsize=12)
-ax2.set_ylabel('Time (nanoseconds) - Log Scale', fontsize=12)
-ax2.set_title('GCC 14 - ARM32 Benchmark Results (Log Scale)', fontsize=14, fontweight='bold')
-ax2.set_yscale('log')
-ax2.set_xticks(x)
-ax2.set_xticklabels(operations, rotation=45, ha='right')
-ax2.legend(loc='upper left')
-ax2.grid(axis='y', alpha=0.3, which='both')
-
-plt.tight_layout()
-plt.savefig('ARM32_benchmarks.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Create a normalized performance chart
-fig3, ax3 = plt.subplots(figsize=(10, 6))
-
-# Normalize data relative to boost::mp::int128_t
-normalized_df = df.copy()
-for col in implementations:
-    normalized_df[col] = df[col] / df['boost::mp::int128_t']
-
-# Plot normalized bars
-for i, impl in enumerate(implementations):
-    if impl == 'boost::mp::int128_t':
-        continue  # Skip since it's always 1.0
-    bars = ax3.bar(x + (i-1.5)*width, normalized_df[impl], width,
-                   label=impl, edgecolor='black', linewidth=0.5)
-
-    # Add value labels
-    for j, bar in enumerate(bars):
-        height = bar.get_height()
-        ax3.text(bar.get_x() + bar.get_width()/2., height,
-                 f'{height:.2f}x', ha='center', va='bottom', fontsize=9)
-
-# Add reference line at 1.0
-ax3.axhline(y=1.0, color='red', linestyle='--', alpha=0.5, label='boost::mp::int128_t baseline')
-
-ax3.set_xlabel('Operations', fontsize=12)
-ax3.set_ylabel('Relative Performance (vs boost::mp::int128_t)', fontsize=12)
-ax3.set_title('Relative Performance Comparison - ARM32', fontsize=14, fontweight='bold')
-ax3.set_xticks(x)
-ax3.set_xticklabels(operations, rotation=45, ha='right')
-ax3.legend()
-ax3.grid(axis='y', alpha=0.3)
-
-# Add interpretation text
-ax3.text(0.02, 0.98, 'Lower is better', transform=ax3.transAxes,
-         fontsize=10, verticalalignment='top', style='italic')
-
-plt.tight_layout()
-plt.savefig('ARM32_relative_performance.png', dpi=300, bbox_inches='tight')
-plt.show()
-
-# Generate summary statistics
-print("\nPerformance Summary (x64):")
-print("-" * 50)
-for impl in implementations:
-    if impl == '__int128':
-        continue
-    avg_ratio = normalized_df[impl].mean()
-    print(f"{impl}: {avg_ratio:.2f}x average vs __int128")
-
-print("\nBest performer by operation:")
-print("-" * 50)
-for i, op in enumerate(operations):
-    row_data = df.iloc[i, 1:]
-    best_impl = row_data.idxmin()
-    best_time = row_data.min()
-    print(f"{op}: {best_impl} ({best_time:,} ns)")
-
diff --git a/include/boost/int128/cstdlib.hpp b/include/boost/int128/cstdlib.hpp
index 2839ac39..b90eda39 100644
--- a/include/boost/int128/cstdlib.hpp
+++ b/include/boost/int128/cstdlib.hpp
@@ -61,15 +61,6 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr i128div_t div(const int12
         return i128div_t{0, 0};
     }
 
-    #if defined(BOOST_INT128_HAS_INT128)
-
-    const auto builtin_x {static_cast<detail::builtin_i128>(x)};
-    const auto builtin_y {static_cast<detail::builtin_i128>(y)};
-    return i128div_t{static_cast<int128_t>(builtin_x / builtin_y),
-                     static_cast<int128_t>(builtin_x % builtin_y)};
-
-    #else
-
     const auto abs_lhs {static_cast<uint128_t>(abs(x))};
     const auto abs_rhs {static_cast<uint128_t>(abs(y))};
 
@@ -78,19 +69,29 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr i128div_t div(const int12
         return {0, x};
     }
 
-    const auto unsigned_res {div(abs_lhs, abs_rhs)};
-
     const auto negative_quot {(x.high < 0) != (y.high < 0)};
     const auto negative_rem {x.high < 0};
 
+    #if defined(BOOST_INT128_HAS_INT128)
+
+    if (abs_rhs.high != 0)
+    {
+        const auto builtin_x {static_cast<detail::builtin_i128>(x)};
+        const auto builtin_y {static_cast<detail::builtin_i128>(y)};
+        return i128div_t{static_cast<int128_t>(builtin_x / builtin_y),
+                         static_cast<int128_t>(builtin_x % builtin_y)};
+    }
+
+    #endif
+
+    const auto unsigned_res {div(abs_lhs, abs_rhs)};
+
     i128div_t res {static_cast<int128_t>(unsigned_res.quot), static_cast<int128_t>(unsigned_res.rem)};
 
     res.quot = negative_quot ? -res.quot : res.quot;
     res.rem = negative_rem ? -res.rem : res.rem;
 
     return res;
-
-    #endif
 }
 
 } // namespace int128
diff --git a/include/boost/int128/detail/common_div.hpp b/include/boost/int128/detail/common_div.hpp
index 6ffe4bac..0237ead5 100644
--- a/include/boost/int128/detail/common_div.hpp
+++ b/include/boost/int128/detail/common_div.hpp
@@ -7,6 +7,7 @@
 
 #include <boost/int128/detail/config.hpp>
 #include <boost/int128/detail/clz.hpp>
+#include <boost/int128/detail/common_mul.hpp>
 
 #ifndef BOOST_INT128_BUILD_MODULE
 
@@ -67,6 +68,242 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void half_word_div(
     quotient.low |= (remainder / rhs) & UINT32_MAX;
 }
 
+// Portable 128-bit by 64-bit unsigned division: divides (u1:u0) by d, returns the 64-bit quotient
+// and stores the remainder in r. This is the classic Hacker's Delight divlu (two 32-bit "digit"
+// steps over 64-bit words). Precondition: u1 < d so the quotient is guaranteed to fit in 64 bits.
+// It is constexpr-safe and is the path udiv_2by1 takes whenever it does not use a hardware divide.
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t divlu(std::uint64_t u1, std::uint64_t u0, std::uint64_t d, std::uint64_t& r) noexcept
+{
+    constexpr std::uint64_t b {UINT64_C(1) << 32U}; // Number base (2^32)
+
+    BOOST_INT128_ASSUME(u1 < d); // LCOV_EXCL_LINE
+
+    // D.1: normalize so that the divisor's most significant bit is set
+    const auto s {countl_zero(d)};
+    d <<= s; // d is a by-value parameter, so the caller's divisor is not modified
+
+    const auto vn1 {d >> 32U}; // high 32-bit digit of the normalized divisor
+    const auto vn0 {d & UINT32_MAX}; // low 32-bit digit of the normalized divisor
+
+    // Shift the dividend left by s. The (64 - s) shift is undefined when s == 0, so guard it.
+    const auto un32 {s == 0 ? u1 : ((u1 << s) | (u0 >> (64 - s)))}; // high word of the shifted dividend
+    const auto un10 {u0 << s}; // low word of the shifted dividend
+
+    const auto un1 {un10 >> 32U}; // high 32-bit digit of un10
+    const auto un0 {un10 & UINT32_MAX}; // low 32-bit digit of un10
+
+    // First quotient digit
+    auto q1 {un32 / vn1};
+    auto rhat {un32 - (q1 * vn1)}; // remainder that goes with the q1 estimate
+
+    while (q1 >= b || (q1 * vn0) > ((b * rhat) + un1)) // estimate is too large: step it down
+    {
+        --q1;
+        rhat += vn1;
+        if (rhat >= b) // rhat no longer fits in one 32-bit digit (b * rhat would overflow 64 bits)
+        {
+            break;
+        }
+    }
+
+    const auto un21 {(un32 * b) + un1 - (q1 * d)}; // partial remainder after the first digit (wrapping 64-bit arithmetic)
+
+    // Second quotient digit
+    auto q0 {un21 / vn1};
+    rhat = un21 - (q0 * vn1);
+
+    while (q0 >= b || (q0 * vn0) > ((b * rhat) + un0)) // same correction as for the first digit
+    {
+        --q0;
+        rhat += vn1;
+        if (rhat >= b) // rhat no longer fits in one 32-bit digit (b * rhat would overflow 64 bits)
+        {
+            break;
+        }
+    }
+
+    // The remainder is shifted back down by the normalization amount
+    r = ((un21 * b) + un0 - (q0 * d)) >> s;
+    return (q1 * b) + q0; // combine the two 32-bit quotient digits
+}
+
+#if defined(BOOST_INT128_HAS_X86_64_DIVQ)
+
+// Runtime-only x86-64 helper: a single DIV of (u1:u0) by d. Inline asm cannot appear in a constexpr function body
+// before C++20, so the instruction is wrapped in this non-constexpr helper that udiv_2by1 only calls at runtime.
+BOOST_INT128_FORCE_INLINE std::uint64_t udiv_2by1_divq(const std::uint64_t u1, const std::uint64_t u0, const std::uint64_t d, std::uint64_t& r) noexcept
+{
+    std::uint64_t q {};
+    __asm__("divq %[d]" : "=a"(q), "=d"(r) : [d] "r"(d), "a"(u0), "d"(u1) : "cc"); // RDX:RAX / d: quotient -> RAX, remainder -> RDX; DIV faults unless u1 < d
+    return q;
+}
+
+#endif // BOOST_INT128_HAS_X86_64_DIVQ
+
+// Divides the 128-bit value (u1:u0) by d, returning the 64-bit quotient and storing the remainder in r.
+// Precondition: u1 < d. Mirrors common_mul.hpp::umul: a hardware instruction at runtime where
+// one exists, and the portable divlu in constexpr evaluation and everywhere else.
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t udiv_2by1(const std::uint64_t u1, const std::uint64_t u0, const std::uint64_t d, std::uint64_t& r) noexcept
+{
+    BOOST_INT128_ASSUME(u1 < d); // LCOV_EXCL_LINE
+
+    #if (defined(BOOST_INT128_HAS_X86_64_DIVQ) || (defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
+
+    if (!BOOST_INT128_IS_CONSTANT_EVALUATED(u1)) // the hardware paths are taken outside constant evaluation only
+    {
+        #if defined(BOOST_INT128_HAS_X86_64_DIVQ)
+
+        return udiv_2by1_divq(u1, u0, d, r); // x86-64 divq through the inline-asm helper
+
+        #else
+
+        return _udiv128(u1, u0, d, &r); // only compiled when the _M_AMD64 / _MSC_VER >= 1920 test above selected this block
+
+        #endif
+    }
+
+    #endif
+
+    return divlu(u1, u0, d, r); // constant evaluation, or no hardware path compiled in
+}
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable : 4127) // Before C++17 the BOOST_INT128_IF_CONSTEXPR (need_remainder) test below trips C4127 (constant conditional)
+#endif
+
+// Divides the 128-bit value (uh:ul) by the 128-bit divisor (vh:vl) where vh != 0. Because the
+// divisor is >= 2^64 the quotient is guaranteed to fit in a single 64-bit word, which is
+// returned. When need_remainder is true the 128-bit remainder is written to (rem_hi:rem_lo);
+// when it is false rem_hi and rem_lo are not written.
+// This is one normalized quotient digit (Knuth Algorithm D specialized to a 2-word divisor).
+// The top-limb estimate qhat (reusing udiv_2by1, a hardware divq on x86-64) is bounded by
+// Knuth Theorem B to q <= qhat <= q + 2; the D3 refinement against d0 tightens it to q <= qhat
+// <= q + 1, and the conditional add-back then corrects the remaining off-by-one.
+template <bool need_remainder>
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t div3by2(const std::uint64_t uh, const std::uint64_t ul,
+    const std::uint64_t vh, const std::uint64_t vl, std::uint64_t& rem_hi, std::uint64_t& rem_lo) noexcept
+{
+    BOOST_INT128_ASSUME(vh != 0); // LCOV_EXCL_LINE
+
+    // D.1: normalize so the divisor's most significant bit is set
+    const auto s {countl_zero(vh)};
+    const auto cs {64 - s}; // complementary shift; only used on the s != 0 paths below
+
+    std::uint64_t d1 {}; // normalized divisor, high limb
+    std::uint64_t d0 {}; // normalized divisor, low limb
+    std::uint64_t u2 {}; // shifted dividend, top limb (bits shifted out of uh)
+    std::uint64_t u1 {}; // shifted dividend, middle limb
+    std::uint64_t u0 {}; // shifted dividend, low limb
+
+    if (s == 0) // already normalized; shifting by cs == 64 would be undefined
+    {
+        d1 = vh;
+        d0 = vl;
+        u2 = 0;
+        u1 = uh;
+        u0 = ul;
+    }
+    else
+    {
+        d1 = (vh << s) | (vl >> cs);
+        d0 = vl << s;
+        u2 = uh >> cs;
+        u1 = (uh << s) | (ul >> cs);
+        u0 = ul << s;
+    }
+
+    BOOST_INT128_ASSUME(u2 <= d1); // LCOV_EXCL_LINE
+
+    // D.3: estimate the single quotient digit qhat = floor((u2:u1) / d1), clamped to 2^64 - 1.
+    // rhat is the remainder of that estimate.
+    std::uint64_t qhat {};
+    std::uint64_t rhat {};
+    bool rhat_overflow {false};
+    if (u2 < d1) // udiv_2by1 requires its high word to be below the divisor
+    {
+        qhat = udiv_2by1(u2, u1, d1, rhat);
+    }
+    else
+    {
+        // u2 == d1: floor((u2:u1)/d1) clamps to 2^64 - 1, leaving rhat == u1 + d1 (may carry). NOTE(review): u2 < 2^s <= 2^63 <= d1 after normalization, so this looks unreachable - confirm.
+        qhat = UINT64_MAX;
+        rhat = u1 + d1;
+        rhat_overflow = rhat < u1; // unsigned wrap-around means the true rhat is >= 2^64
+    }
+
+    std::uint64_t qd0_hi {};
+    auto qd0_lo {umul(qhat, d0, qd0_hi)}; // (qd0_hi:qd0_lo) = qhat * d0
+
+    // Refine qhat against d0 (Knuth D3). The top-limb estimate alone can exceed the true quotient
+    // by up to 2; this brings it down to at most one too large, which the add-back below corrects.
+    // At most two iterations run, and only while the running remainder rhat stays below 2^64.
+    if (!rhat_overflow)
+    {
+        while (qd0_hi > rhat || (qd0_hi == rhat && qd0_lo > u0)) // i.e. qhat * d0 > (rhat:u0)
+        {
+            --qhat;
+            rhat += d1;
+            const auto rhat_carry {rhat < d1}; // the addition wrapped: true rhat is now >= 2^64
+            qd0_lo = umul(qhat, d0, qd0_hi); // keep qd0 in step with the decremented qhat for D.4
+            if (rhat_carry)
+            {
+                break;
+            }
+        }
+    }
+
+    // D.4: multiply and subtract (u2:u1:u0) - qhat * (d1:d0). qd0 already holds qhat * d0.
+    std::uint64_t qd1_hi {};
+    const auto qd1_lo {umul(qhat, d1, qd1_hi)}; // (qd1_hi:qd1_lo) = qhat * d1
+
+    const auto p0 {qd0_lo}; // low limb of qhat * (d1:d0)
+    const auto p1 {qd0_hi + qd1_lo}; // middle limb; a wrap here is carried into p2
+    const auto p2 {qd1_hi + static_cast<std::uint64_t>(p1 < qd0_hi)}; // high limb plus the carry out of p1
+
+    const auto r0 {u0 - p0};
+    const auto borrow0 {static_cast<std::uint64_t>(u0 < p0)}; // borrow out of the low limb
+    const auto t1 {u1 - p1};
+    auto r1 {t1 - borrow0};
+    const auto borrow1 {static_cast<std::uint64_t>(u1 < p1) + static_cast<std::uint64_t>(t1 < borrow0)}; // borrow out of the middle limb
+
+    // D.5/D.6: if the top limb borrowed, qhat was one too large. Correct it and add the divisor back into the remainder.
+    // The probability of this branch is small. NOTE(review): the D3 loop already compares against the full (d1:d0), so it looks unreachable - confirm.
+    auto r0_final {r0};
+    if (BOOST_INT128_UNLIKELY((u2 < p2) || ((u2 - p2) < borrow1)))
+    {
+        --qhat;                                                  // LCOV_EXCL_LINE
+        const auto sum0 {r0 + d0};                               // LCOV_EXCL_LINE
+        r0_final = sum0;                                         // LCOV_EXCL_LINE
+        r1 = r1 + d1 + static_cast<std::uint64_t>(sum0 < r0);    // LCOV_EXCL_LINE
+    }
+
+    BOOST_INT128_IF_CONSTEXPR (need_remainder)
+    {
+        if (s == 0) // no normalization shift to undo (and r1 << cs would shift by 64)
+        {
+            rem_hi = r1;
+            rem_lo = r0_final;
+        }
+        else
+        {
+            rem_lo = (r0_final >> s) | (r1 << cs); // undo the normalization shift
+            rem_hi = r1 >> s;
+        }
+    }
+    else
+    {
+        static_cast<void>(rem_hi); // outputs are deliberately not written on this path
+        static_cast<void>(rem_lo);
+    }
+
+    return qhat;
+}
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
+
 namespace impl {
 
 #if defined(_MSC_VER)
@@ -267,218 +504,57 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T from_words(const
     return {static_cast<high_word_type>(high), low};
 }
 
-#if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
-
-template <bool needs_mod, typename T>
-BOOST_INT128_HOST_DEVICE constexpr T div_mod_msvc(T dividend, T divisor, T& remainder)
-{
-    using high_word_type = decltype(T{}.high);
-
-    // Skip normalization if divisor is already large enough
-    // use direct division and intrinsic
-    // This is only possible in the unsigned case
-    BOOST_INT128_IF_CONSTEXPR (!std::numeric_limits<T>::is_signed)
-    {
-        constexpr auto divisor_lower_bound{UINT64_MAX >> 1};
-        if (divisor.high >= divisor_lower_bound)
-        {
-            T quotient{};
-
-            quotient.low = static_cast<std::uint64_t>(dividend.high / divisor.high);
-
-            std::uint64_t product0_high{};
-            auto product0_low{_umul128(quotient.low, divisor.low, &product0_high)};
-
-            std::uint64_t product1_high{};
-            auto product1_low{_umul128(quotient.low, static_cast<std::uint64_t>(divisor.high), &product1_high)};
-
-            T product{};
-            product.low = product0_low;
-            auto carry{BOOST_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast<std::uint64_t*>(&product.high))};
-            product1_high += static_cast<std::uint64_t>(carry);
-
-            if (product1_high > 0 || product > dividend)
-            {
-                --quotient.low;
-
-                // Recalculate with adjusted quotient
-                product0_low = _umul128(quotient.low, divisor.low, &product0_high);
-                product1_low = _umul128(quotient.low, divisor.high, &product1_high);
-
-                product.low = product0_low;
-                carry = BOOST_INT128_ADD_CARRY(0, product0_high, product1_low, reinterpret_cast<std::uint64_t*>(&product.high));
-                product1_high += static_cast<std::uint64_t>(carry);
-            }
-
-            BOOST_INT128_IF_CONSTEXPR(needs_mod)
-            {
-                auto borrow{BOOST_INT128_SUB_BORROW(0, dividend.low, product.low, &remainder.low)};
-                BOOST_INT128_SUB_BORROW(borrow, dividend.high, product.high, reinterpret_cast<std::uint64_t*>(&remainder.high));
-            }
-
-            return quotient;
-        }
-    }
-
-    const auto shift_amount {countl_zero(static_cast<std::uint64_t>(divisor.high))};
-    divisor <<= shift_amount;
-
-    auto high_digit {static_cast<std::uint64_t>(shift_amount == 0 ? 0 : dividend.high >> (64 - shift_amount))};
-    dividend <<= shift_amount;
-
-    // Initial quotient estimate
-    T quotient {};
-    const bool high_digit_gte_divisor {high_digit >= static_cast<std::uint64_t>(divisor.high)};
-    quotient.high = high_digit_gte_divisor ? 1 : 0;
-    std::uint64_t remainder_estimate {};
-
-    quotient.low = _udiv128(high_digit_gte_divisor ? high_digit - divisor.high : high_digit,
-                            dividend.high, divisor.high, &remainder_estimate);
-
-    // Bounded correction loop with early exit
-    // Typically 2 is the most number of corrections we need since this is only for 2x2 division
-    // Other cases have been filtered out well before we've made it this far
-    int correction_steps {};
-    constexpr int max_corrections {2};
-
-    while (correction_steps < max_corrections)
-    {
-        T product{};
-        product.low = _umul128(quotient.low, divisor.low, reinterpret_cast<std::uint64_t*>(&product.high));
-        if (product <= T{static_cast<high_word_type>(remainder_estimate), dividend.low})
-        {
-            break;
-        }
-
-        --quotient.low;
-        const auto sum {remainder_estimate + divisor.high};
-        if (remainder_estimate > sum)
-        {
-            break;
-        }
-        remainder_estimate = sum;
-
-        correction_steps++;
-    }
-
-    // Final verification and adjustment
-    std::uint64_t product0_high{};
-    auto product_low {_umul128(quotient.low, divisor.low, &product0_high)};
-    auto borrow {BOOST_INT128_SUB_BORROW(0, dividend.low, product_low, &dividend.low)};
-
-    std::uint64_t product1_high{};
-    product_low = _umul128(quotient.low, divisor.high, &product1_high);
-    product1_high += static_cast<std::uint64_t>(BOOST_INT128_ADD_CARRY(0, product_low, product0_high, &product_low));
-
-    borrow = BOOST_INT128_SUB_BORROW(borrow, static_cast<std::uint64_t>(dividend.high), product_low, reinterpret_cast<std::uint64_t*>(&dividend.high));
-    borrow = BOOST_INT128_SUB_BORROW(borrow, high_digit, product1_high, &high_digit);
-    quotient.low -= static_cast<std::uint64_t>(borrow);
-
-    BOOST_INT128_IF_CONSTEXPR (needs_mod)
-    {
-        if (borrow)
-        {
-            auto carry { BOOST_INT128_ADD_CARRY(0, dividend.low, divisor.low, &dividend.low) };
-            BOOST_INT128_ADD_CARRY(carry, static_cast<std::uint64_t>(dividend.high), static_cast<std::uint64_t>(divisor.high), reinterpret_cast<std::uint64_t*>(&dividend.high));
-        }
-
-        dividend >>= shift_amount;
-        remainder = dividend;
-    }
-
-    return quotient;
-}
-
-#endif
-
 } // namespace impl
 
 // We only need to take the time to process the remainder in the modulo case
 // In the division case it is a waste of cycles
+//
+// 128/64 -> 128-bit quotient (and optional 64-bit remainder) by two-step long division.
+// The leading 64/64 yields the high quotient word and a remainder r < rhs, which satisfies
+// the udiv_2by1 precondition for the low quotient word. This covers every rhs (including
+// rhs <= UINT32_MAX) through the single hardware-or-portable udiv_2by1 primitive.
 
 template <typename T>
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient) noexcept
 {
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
-
-    if (!BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
-    {
-        using high_word_type = decltype(T{}.high);
-
-        quotient.high = static_cast<high_word_type>(static_cast<std::uint64_t>(lhs.high) / rhs);
-        auto remainder {static_cast<std::uint64_t>(lhs.high) % rhs};
-        quotient.low = _udiv128(remainder, lhs.low, rhs, &remainder);
-        return;
-    }
-
-    #endif
-
-    if (rhs <= UINT32_MAX)
-    {
-        half_word_div(lhs, static_cast<std::uint32_t>(rhs), quotient);
-    }
-    else
-    {
-        std::uint32_t u[4] {};
-        std::uint32_t v[2] {};
-        std::uint32_t q[4] {};
+    using high_word_type = decltype(T{}.high);
 
-        const auto m {impl::to_words(lhs, u)};
-        const auto n {impl::to_words(rhs, v)};
+    BOOST_INT128_ASSUME(rhs != 0); // LCOV_EXCL_LINE
 
-        impl::knuth_divide<false>(u, m, v, n, q);
+    const auto u_high {static_cast<std::uint64_t>(lhs.high)}; // high word viewed as an unsigned 64-bit value
 
-        quotient = impl::from_words<T>(q);
-    }
+    quotient.high = static_cast<high_word_type>(u_high / rhs); // high quotient word: plain 64/64 division
+    auto r {u_high % rhs}; // r < rhs, which is the udiv_2by1 precondition
+    quotient.low = udiv_2by1(r, lhs.low, rhs, r); // low quotient word; the remainder written back to r is not used here
 }
 
 template <typename T>
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint64_t rhs, T& quotient, T& remainder) noexcept
 {
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920 && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
-
-    if (!BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
-    {
-        using high_word_type = decltype(T{}.high);
-
-        quotient.high = static_cast<high_word_type>(static_cast<std::uint64_t>(lhs.high) / rhs);
-        remainder.low = static_cast<std::uint64_t>(lhs.high) % rhs;
-        quotient.low = _udiv128(remainder.low, lhs.low, rhs, &remainder.low);
-        return;
-    }
-
-    #endif
+    using high_word_type = decltype(T{}.high);
 
-    if (rhs <= UINT32_MAX)
-    {
-        half_word_div(lhs, static_cast<std::uint32_t>(rhs), quotient, remainder);
-    }
-    else
-    {
-        std::uint32_t u[4] {};
-        std::uint32_t v[2] {};
-        std::uint32_t q[4] {};
+    BOOST_INT128_ASSUME(rhs != 0); // LCOV_EXCL_LINE
 
-        const auto m {impl::to_words(lhs, u)};
-        const auto n {impl::to_words(rhs, v)};
+    const auto u_high {static_cast<std::uint64_t>(lhs.high)}; // high word viewed as an unsigned 64-bit value
 
-        impl::knuth_divide<true>(u, m, v, n, q);
+    quotient.high = static_cast<high_word_type>(u_high / rhs); // high quotient word: plain 64/64 division
+    auto r {u_high % rhs}; // r < rhs, which is the udiv_2by1 precondition
+    quotient.low = udiv_2by1(r, lhs.low, rhs, r); // low quotient word; r now receives the final remainder
 
-        quotient = impl::from_words<T>(q);
-        remainder = impl::from_words<T>(u);
-    }
+    remainder.high = static_cast<high_word_type>(0); // the remainder is below the 64-bit rhs, so the high word is zero
+    remainder.low = r;
 }
 
 template <typename T>
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient, T& remainder) noexcept
 {
-    half_word_div(lhs, rhs, quotient, remainder);
+    one_word_div(lhs, static_cast<std::uint64_t>(rhs), quotient, remainder); // widen and forward to the std::uint64_t overload
 }
 
 template <typename T>
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void one_word_div(const T& lhs, const std::uint32_t rhs, T& quotient) noexcept
 {
-    half_word_div(lhs, rhs, quotient);
+    one_word_div(lhs, static_cast<std::uint64_t>(rhs), quotient); // widen and forward to the std::uint64_t overload
 }
 
 #ifdef _MSC_VER
@@ -492,62 +568,33 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T
 {
     BOOST_INT128_ASSUME(divisor != static_cast<T>(0));
 
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
-
-    BOOST_INT128_IF_CONSTEXPR(!std::numeric_limits<T>::is_signed)
-    {
-        if (!BOOST_INT128_IS_CONSTANT_EVALUATED(dividend))
-        {
-            T remainder{};
-            return impl::div_mod_msvc<false>(dividend, divisor, remainder);
-        }
-    }
-
-    #endif
-
-    std::uint32_t u[4]{};
-    std::uint32_t v[4]{};
-    std::uint32_t q[4]{};
-
-    const auto m{ impl::to_words(dividend, u) };
-    const auto n{ impl::to_words(divisor, v) };
+    using high_word_type = decltype(T{}.high);
 
-    impl::knuth_divide<false>(u, m, v, n, q);
+    std::uint64_t rem_hi {};
+    std::uint64_t rem_lo {};
 
-    return impl::from_words<T>(q);
+    const auto q {div3by2<false>(static_cast<std::uint64_t>(dividend.high), dividend.low,
+                                 static_cast<std::uint64_t>(divisor.high), divisor.low, rem_hi, rem_lo)};
 
+    return T{static_cast<high_word_type>(0), q};
 }
 
 template <typename T>
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr T knuth_div(const T& dividend, const T& divisor, T& remainder) noexcept
 {
     BOOST_INT128_ASSUME(divisor != static_cast<T>(0));
-    
-    #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
-
-    BOOST_INT128_IF_CONSTEXPR(!std::numeric_limits<T>::is_signed)
-    {
-        if (!BOOST_INT128_IS_CONSTANT_EVALUATED(dividend))
-        {
-            return impl::div_mod_msvc<true>(dividend, divisor, remainder);
-        }
-    }
-
 
-    #endif
-
-    std::uint32_t u[4]{};
-    std::uint32_t v[4]{};
-    std::uint32_t q[4]{};
+    using high_word_type = decltype(T{}.high);
 
-    const auto m{ impl::to_words(dividend, u) };
-    const auto n{ impl::to_words(divisor, v) };
+    std::uint64_t rem_hi {};
+    std::uint64_t rem_lo {};
 
-    impl::knuth_divide<true>(u, m, v, n, q);
+    const auto q {div3by2<true>(static_cast<std::uint64_t>(dividend.high), dividend.low,
+                                static_cast<std::uint64_t>(divisor.high), divisor.low, rem_hi, rem_lo)};
 
-    remainder = impl::from_words<T>(u);
+    remainder = T{static_cast<high_word_type>(rem_hi), rem_lo};
 
-    return impl::from_words<T>(q);
+    return T{static_cast<high_word_type>(0), q};
 }
 
 #ifdef _MSC_VER
diff --git a/include/boost/int128/detail/common_mul.hpp b/include/boost/int128/detail/common_mul.hpp
index be26c763..e0c1a8e1 100644
--- a/include/boost/int128/detail/common_mul.hpp
+++ b/include/boost/int128/detail/common_mul.hpp
@@ -10,7 +10,6 @@
 #ifndef BOOST_INT128_BUILD_MODULE
 
 #include <cstdint>
-#include <cstring>
 
 #endif
 
@@ -18,85 +17,89 @@ namespace boost {
 namespace int128 {
 namespace detail {
 
-// See: The Art of Computer Programming Volume 2 (Semi-numerical algorithms) section 4.3.1
-// Algorithm M: Multiplication of Non-negative integers
-template <typename ReturnType, std::size_t u_size, std::size_t v_size>
-BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType knuth_multiply(const std::uint32_t (&u)[u_size],
-                                                              const std::uint32_t (&v)[v_size]) noexcept
+// High 64 bits of the 64x64 -> 128 product, computed with four 32-bit partial products
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t umulh_generic(const std::uint64_t a, const std::uint64_t b) noexcept
 {
-    using high_word_type = decltype(ReturnType{}.high);
-
-    std::uint32_t w[u_size + v_size] {};
+    const std::uint64_t a_lo {a & UINT32_MAX};
+    const std::uint64_t a_hi {a >> 32U};
+    const std::uint64_t b_lo {b & UINT32_MAX};
+    const std::uint64_t b_hi {b >> 32U};
 
-    // M.1
-    for (std::size_t j {}; j < v_size; ++j)
-    {
-        // M.2
-        if (v[j] == 0)
-        {
-            w[j + u_size] = 0;
-            continue;
-        }
-
-        // M.3
-        std::uint64_t t {};
-        for (std::size_t i {}; i < u_size; ++i)
-        {
-            // M.4
-            t += static_cast<std::uint64_t>(u[i]) * v[j] + w[i + j];
-            w[i + j] = static_cast<std::uint32_t>(t);
-            t >>= 32u;
-        }
-
-        // M.5
-        w[j + u_size] = static_cast<std::uint32_t>(t);
-    }
+    const std::uint64_t lo_lo {a_lo * b_lo};
+    const std::uint64_t hi_lo {a_hi * b_lo};
+    const std::uint64_t lo_hi {a_lo * b_hi};
+    const std::uint64_t hi_hi {a_hi * b_hi};
 
-    const auto low {static_cast<std::uint64_t>(w[0]) | (static_cast<std::uint64_t>(w[1]) << 32)};
-    const auto high {static_cast<std::uint64_t>(w[2]) | (static_cast<std::uint64_t>(w[3]) << 32)};
+    const std::uint64_t cross {(lo_lo >> 32U) + (hi_lo & UINT32_MAX) + (lo_hi & UINT32_MAX)};
 
-    return {static_cast<high_word_type>(high), low};
+    return hi_hi + (hi_lo >> 32U) + (lo_hi >> 32U) + (cross >> 32U);
 }
 
-template <typename T>
-BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const T& x, std::uint32_t (&words)[4]) noexcept
+// Full 64x64 -> 128 product
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr std::uint64_t umul(const std::uint64_t a, const std::uint64_t b, std::uint64_t& hi) noexcept
 {
     #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
 
-    if (!BOOST_INT128_IS_CONSTANT_EVALUATED(x))
+    if (!BOOST_INT128_IS_CONSTANT_EVALUATED(a))
     {
-        std::memcpy(&words, &x, sizeof(T));
-        return;
+        #if defined(BOOST_INT128_HAS_INT128)
+
+        const detail::builtin_u128 product {static_cast<detail::builtin_u128>(a) * static_cast<detail::builtin_u128>(b)};
+        hi = static_cast<std::uint64_t>(product >> 64U);
+        return static_cast<std::uint64_t>(product);
+
+        #elif defined(_M_AMD64) && !defined(__GNUC__) && !defined(__CUDA_ARCH__)
+
+        return _umul128(a, b, &hi);
+
+        #elif defined(_M_ARM64) && !defined(__CUDA_ARCH__)
+
+        hi = __umulh(a, b);
+        return a * b;
+
+        #endif
     }
 
     #endif
 
-    words[0] = static_cast<std::uint32_t>(x.low & UINT32_MAX);                                  // LCOV_EXCL_LINE
-    words[1] = static_cast<std::uint32_t>(x.low >> 32);                                         // LCOV_EXCL_LINE
-    words[2] = static_cast<std::uint32_t>(static_cast<std::uint64_t>(x.high) & UINT32_MAX);     // LCOV_EXCL_LINE
-    words[3] = static_cast<std::uint32_t>(static_cast<std::uint64_t>(x.high) >> 32);            // LCOV_EXCL_LINE
+    hi = umulh_generic(a, b);
+    return a * b;
 }
 
+// Low 128 bits of a 128x128 product
+template <typename ReturnType, typename T>
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType low_word_mul(const T& lhs, const T& rhs) noexcept
+{
+    using high_word_type = decltype(ReturnType{}.high);
+
+    std::uint64_t result_high {};
+    const std::uint64_t result_low {umul(lhs.low, rhs.low, result_high)};
+
+    result_high += lhs.low * static_cast<std::uint64_t>(rhs.high);
+    result_high += static_cast<std::uint64_t>(lhs.high) * rhs.low;
+
+    return ReturnType{static_cast<high_word_type>(result_high), result_low};
+}
 
-BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint64_t x, std::uint32_t (&words)[2]) noexcept
+// Low 128 bits of a 128x64 product
+template <typename ReturnType, typename T>
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType low_word_mul(const T& lhs, const std::uint64_t rhs) noexcept
 {
-    #ifndef BOOST_INT128_NO_CONSTEVAL_DETECTION
+    using high_word_type = decltype(ReturnType{}.high);
 
-    if (!BOOST_INT128_IS_CONSTANT_EVALUATED(x))
-    {
-        std::memcpy(&words, &x, sizeof(std::uint64_t));
-        return;
-    }
+    std::uint64_t result_high {};
+    const std::uint64_t result_low {umul(lhs.low, rhs, result_high)};
 
-    #endif
+    result_high += static_cast<std::uint64_t>(lhs.high) * rhs;
 
-    words[0] = static_cast<std::uint32_t>(x & UINT32_MAX);  // LCOV_EXCL_LINE
-    words[1] = static_cast<std::uint32_t>(x >> 32);         // LCOV_EXCL_LINE
+    return ReturnType{static_cast<high_word_type>(result_high), result_low};
 }
 
-BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr void to_words(const std::uint32_t x, std::uint32_t (&words)[1]) noexcept
+// Low 128 bits of a 128x32 product
+template <typename ReturnType, typename T>
+BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr ReturnType low_word_mul(const T& lhs, const std::uint32_t rhs) noexcept
 {
-    words[0] = x;
+    return low_word_mul<ReturnType>(lhs, static_cast<std::uint64_t>(rhs));
 }
 
 } // namespace detail
diff --git a/include/boost/int128/detail/config.hpp b/include/boost/int128/detail/config.hpp
index 28ad2df9..4a93c2c3 100644
--- a/include/boost/int128/detail/config.hpp
+++ b/include/boost/int128/detail/config.hpp
@@ -169,6 +169,11 @@ using builtin_u128 = std::_Unsigned128;
 
 #endif // Platform macros
 
+// Hardware 128-bit by 64-bit unsigned division via the x86-64 DIV instruction
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) && !defined(__CUDA_ARCH__)
+#  define BOOST_INT128_HAS_X86_64_DIVQ
+#endif
+
 // The builtin is only constexpr from clang-7 or GCC-10
 #ifdef __has_builtin
 #  if __has_builtin(__builtin_sub_overflow) && ((defined(__clang__) && __clang_major__ >= 7) || (defined(__GNUC__) && __GNUC__ >= 10))
diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp
index 9fd11a3e..898ae962 100644
--- a/include/boost/int128/detail/int128_imp.hpp
+++ b/include/boost/int128/detail/int128_imp.hpp
@@ -2012,66 +2012,14 @@ BOOST_INT128_HOST_DEVICE inline int128_t& int128_t::operator-=(const Integer rhs
 
 namespace detail {
 
-BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t signed_shift_left_32(const std::uint64_t low) noexcept
-{
-    return {static_cast<std::int64_t>(low >> 32), low << 32};
-}
-
-BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t library_mul(const int128_t lhs, const int128_t rhs) noexcept
-{
-    const auto a {lhs.low >> 32U};
-    const auto b {lhs.low & UINT32_MAX};
-    const auto c {rhs.low >> 32U};
-    const auto d {rhs.low & UINT32_MAX};
-
-    int128_t result { static_cast<std::int64_t>(static_cast<std::uint64_t>(lhs.high) * rhs.low + lhs.low * static_cast<std::uint64_t>(rhs.high) + a * c), b * d };
-    result += signed_shift_left_32(a * d) + signed_shift_left_32(b * c);
-
-    return result;
-}
-
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint64_t rhs) noexcept
 {
-    const auto low_res{lhs.low * rhs};
-
-    const auto a_lo{lhs.low & UINT32_MAX};
-    const auto a_high{lhs.low >> 32U};
-    const auto b_lo{rhs & UINT32_MAX};
-    const auto b_high{rhs >> 32U};
-
-    const auto lo_lo{a_lo * b_lo};
-    const auto lo_hi{a_lo * b_high};
-    const auto hi_lo{a_high * b_lo};
-    const auto hi_hi{a_high * b_high};
-
-    const auto mid{(lo_lo >> 32U) + (lo_hi & UINT32_MAX) + (hi_lo & UINT32_MAX)};
-
-    const auto carry{hi_hi + (lo_hi >> 32) + (hi_lo >> 32) + (mid >> 32)};
-
-    // Compute the high word in the unsigned domain so that the multiplication
-    // and addition wrap modulo 2^64.
-    const auto high_res{static_cast<std::int64_t>(static_cast<std::uint64_t>(lhs.high) * rhs + carry)};
-
-    return {high_res, low_res};
+    return low_word_mul<int128_t>(lhs, rhs);
 }
 
 BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mul(const int128_t lhs, const std::uint32_t rhs) noexcept
 {
-    const auto low_res{lhs.low * rhs};
-
-    const auto a_lo{lhs.low & UINT32_MAX};
-    const auto a_hi{lhs.low >> 32U};
-
-    const auto lo_lo{a_lo * rhs};
-    const auto hi_lo{a_hi * rhs};
-
-    const auto mid{(lo_lo >> 32U) + (hi_lo & UINT32_MAX)};
-
-    const auto carry{(hi_lo >> 32U) + (mid >> 32U)};
-
-    const auto high_res{static_cast<std::int64_t>(static_cast<std::uint64_t>(lhs.high) * rhs + carry)};
-
-    return {high_res, low_res};
+    return low_word_mul<int128_t>(lhs, rhs);
 }
 
 #if defined(_M_AMD64) && !defined(__GNUC__)
@@ -2096,7 +2044,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu
 
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
     {
-        return library_mul(lhs, rhs);
+        return low_word_mul<int128_t>(lhs, rhs);
     }
     else
     {
@@ -2126,7 +2074,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu
 
     #  else
 
-    return library_mul(lhs, rhs);
+    return low_word_mul<int128_t>(lhs, rhs);
 
     #  endif
 
@@ -2138,34 +2086,16 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr int128_t default_mu
 
     if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs))
     {
-        return library_mul(lhs, rhs); // LCOV_EXCL_LINE
+        return low_word_mul<int128_t>(lhs, rhs); // LCOV_EXCL_LINE
     }
     else
     {
         return msvc_amd64_mul(lhs, rhs);
     }
 
-    #elif (defined(_M_IX86) || defined(_M_ARM) || defined(__arm__)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
-
-    if (BOOST_INT128_IS_CONSTANT_EVALUATED(rhs))
-    {
-        return library_mul(lhs, rhs); // LCOV_EXCL_LINE
-    }
-    else
-    {
-        std::uint32_t lhs_words[4] {};
-        std::uint32_t rhs_words[4] {};
-
-        // Since in all likelihood this equates to memcpy we don't need to convert to non-negative integers and back
-        to_words(lhs, lhs_words);
-        to_words(rhs, rhs_words);
-
-        return knuth_multiply<int128_t>(lhs_words, rhs_words);
-    }
-
     #else
 
-    return library_mul(lhs, rhs);
+    return low_word_mul<int128_t>(lhs, rhs);
 
     #endif
 }
@@ -2276,21 +2206,16 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const
     {
         return {0,0};
     }
-    #if defined(BOOST_INT128_HAS_INT128)
-
-    return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) / static_cast<detail::builtin_i128>(rhs));
 
-    #else
-
-    int128_t quotient {};
     const auto negative_res {(lhs.high < 0) != (rhs.high < 0)};
 
-    if (abs_rhs.high != 0)
-    {
-        quotient = detail::knuth_div(abs_lhs, abs_rhs);
-    }
-    else
+    // Narrow fast path: when the divisor magnitude fits in 64 bits, divide the magnitudes with
+    // the hardware-accelerated one_word_div and reapply the sign. This reuses the abs values
+    // computed above and beats native signed division (the out-of-line __divti3) for this case.
+    if (abs_rhs.high == 0)
     {
+        int128_t quotient {};
+
         if (abs_lhs.high == 0)
         {
             quotient = {0, abs_lhs.low / abs_rhs.low};
@@ -2299,9 +2224,19 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t operator/(const
         {
             detail::one_word_div(abs_lhs, abs_rhs.low, quotient);
         }
+
+        return negative_res ? -quotient : quotient;
     }
 
+    #if defined(BOOST_INT128_HAS_INT128)
+
+    return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) / static_cast<detail::builtin_i128>(rhs));
+
+    #else
+
+    const auto quotient {detail::knuth_div(abs_lhs, abs_rhs)};
     return negative_res ? -quotient : quotient;
+
     #endif
 }
 
@@ -2538,23 +2473,15 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const
     {
         return lhs;
     }
-    #if defined(BOOST_INT128_HAS_INT128)
-    else
-    {
-        return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) % static_cast<detail::builtin_i128>(rhs));
-    }
-    #else
 
-    const auto is_neg{lhs < 0};
-    
-    int128_t remainder {};
+    const auto is_neg {lhs < 0};
 
-    if (abs_rhs.high != 0)
-    {
-        detail::knuth_div(abs_lhs, abs_rhs, remainder);
-    }
-    else
+    // Narrow fast path: when the divisor magnitude fits in 64 bits, take the remainder of the
+    // magnitudes with the hardware-accelerated one_word_div and reapply the dividend's sign.
+    if (abs_rhs.high == 0)
     {
+        int128_t remainder {};
+
         if (abs_lhs.high == 0)
         {
             remainder = int128_t{0, abs_lhs.low % abs_rhs.low};
@@ -2562,11 +2489,20 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t operator%(const int128_t lhs, const
         else
         {
             int128_t quotient {};
-
             detail::one_word_div(abs_lhs, abs_rhs.low, quotient, remainder);
         }
+
+        return is_neg ? -remainder : remainder;
     }
 
+    #if defined(BOOST_INT128_HAS_INT128)
+
+    return static_cast<int128_t>(static_cast<detail::builtin_i128>(lhs) % static_cast<detail::builtin_i128>(rhs));
+
+    #else
+
+    int128_t remainder {};
+    detail::knuth_div(abs_lhs, abs_rhs, remainder);
     return is_neg ? -remainder : remainder;
 
     #endif
diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp
index 1788433d..e1b8d73d 100644
--- a/include/boost/int128/detail/uint128_imp.hpp
+++ b/include/boost/int128/detail/uint128_imp.hpp
@@ -2271,11 +2271,9 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_m
 
     #  endif
 
-    #elif (defined(__s390x__) || defined(__s390x__)) && defined(__GNUC__)
-    #  define BOOST_INT128_HIDE_MUL
-
-        return static_cast<uint128_t>(static_cast<builtin_u128>(lhs) * static_cast<builtin_u128>(rhs));
-
+    // s390x intentionally falls through to the synthetic low_word_mul below. Casting to builtin_u128
+    // makes GCC reconstruct the value through a vector-unit stack round-trip that is several times
+    // slower, and the memcpy path is unsafe for the narrow (scalar rhs) overloads on big-endian.
     #elif ((defined(_M_AMD64) && !defined(__GNUC__)) || defined(_M_ARM64)) && !defined(BOOST_INT128_NO_CONSTEVAL_DETECTION)
 
     if (!BOOST_INT128_IS_CONSTANT_EVALUATED(lhs))
@@ -2288,18 +2286,7 @@ BOOST_INT128_HOST_DEVICE BOOST_INT128_FORCE_INLINE constexpr uint128_t default_m
     // We need to hide this if we use a non-const eval method above to avoid a litany of cross-platform warnings
     #ifndef BOOST_INT128_HIDE_MUL
 
-    constexpr std::size_t rhs_words_needed {std::is_same<UnsignedInteger, std::uint32_t>::value ? 1 :
-                                            std::is_same<UnsignedInteger, std::uint64_t>::value ? 2 :
-                                            std::is_same<UnsignedInteger, uint128_t>::value ? 4 : 0};
-
-    static_assert(rhs_words_needed != 0, "Must be 32, 64 or 128 bit unsigned integer");
-
-    std::uint32_t lhs_words[4] {};
-    std::uint32_t rhs_words[rhs_words_needed] {};
-    to_words(lhs, lhs_words);
-    to_words(rhs, rhs_words);
-
-    return knuth_multiply<uint128_t>(lhs_words, rhs_words);
+    return low_word_mul<uint128_t>(lhs, rhs);
 
     #else
     #undef BOOST_INT128_HIDE_MUL
@@ -2506,31 +2493,30 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t operator/(const uint128_t lhs, cons
     {
         return {0, 0};
     }
-    #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__)
-    else
-    {
-        return static_cast<uint128_t>(static_cast<detail::builtin_u128>(lhs) / static_cast<detail::builtin_u128>(rhs));
-    }
-    #else
-    else if (rhs.high != 0U)
-    {
-        return detail::knuth_div(lhs, rhs);
-    }
-    else
+
+    // A divisor that fits in 64 bits is handled by the hardware-accelerated narrow path. This
+    // beats the native 128/128 divide for this common case on every platform (it avoids the
+    // out-of-line __udivti3 call on GCC/Clang and uses divq / _udiv128 directly where present).
+    if (rhs.high == 0U)
     {
         if (lhs.high == 0U)
         {
             return {0, lhs.low / rhs.low};
         }
-        else
-        {
-            uint128_t quotient {};
-
-            detail::one_word_div(lhs, rhs.low, quotient);
 
-            return quotient;
-        }
+        uint128_t quotient {};
+        detail::one_word_div(lhs, rhs.low, quotient);
+        return quotient;
     }
+
+    #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__)
+
+    return static_cast<uint128_t>(static_cast<detail::builtin_u128>(lhs) / static_cast<detail::builtin_u128>(rhs));
+
+    #else
+
+    return detail::knuth_div(lhs, rhs);
+
     #endif
 }
 
@@ -2665,38 +2651,36 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t operator%(const uint128_t lhs, cons
     {
         return {0, 0};
     }
-    else if (rhs > lhs)
+    if (rhs > lhs)
     {
         return lhs;
     }
-    #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__)
-    else
-    {
-        return static_cast<uint128_t>(static_cast<detail::builtin_u128>(lhs) % static_cast<detail::builtin_u128>(rhs));
-    }
-    #else
-    else if (rhs.high != 0U)
-    {
-        uint128_t remainder {};
-        detail::knuth_div(lhs, rhs, remainder);
-        return remainder;
-    }
-    else
+
+    // A divisor that fits in 64 bits is handled by the hardware-accelerated narrow path, which
+    // beats the native 128/128 divide for this common case on every platform.
+    if (rhs.high == 0U)
     {
         if (lhs.high == 0U)
         {
             return {0, lhs.low % rhs.low};
         }
-        else
-        {
-            uint128_t quotient {};
-            uint128_t remainder {};
-
-            detail::one_word_div(lhs, rhs.low, quotient, remainder);
 
-            return remainder;
-        }
+        uint128_t quotient {};
+        uint128_t remainder {};
+        detail::one_word_div(lhs, rhs.low, quotient, remainder);
+        return remainder;
     }
+
+    #if defined(BOOST_INT128_HAS_INT128) && !defined(__s390__) && !defined(__s390x__)
+
+    return static_cast<uint128_t>(static_cast<detail::builtin_u128>(lhs) % static_cast<detail::builtin_u128>(rhs));
+
+    #else
+
+    uint128_t remainder {};
+    detail::knuth_div(lhs, rhs, remainder);
+    return remainder;
+
     #endif
 }
 
diff --git a/test/Jamfile b/test/Jamfile
index cae51584..2be537df 100644
--- a/test/Jamfile
+++ b/test/Jamfile
@@ -44,6 +44,13 @@ project : requirements
     <toolset>clang:<warnings-as-errors>on
     <toolset>gcc:<warnings-as-errors>on
 
+    # The b2 sanitizer features only add -fsanitize flags; define the macros the
+    # tests use to skip checks that intentionally exercise UB (e.g. shift tests).
+    <undefined-sanitizer>on:<define>UBSAN=1
+    <undefined-sanitizer>norecover:<define>UBSAN=1
+    <address-sanitizer>on:<define>ASAN=1
+    <address-sanitizer>norecover:<define>ASAN=1
+
   [ requires cxx14_decltype_auto cxx14_generic_lambdas cxx14_return_type_deduction cxx14_variable_templates cxx14_constexpr ]
   ;
 
@@ -88,6 +95,7 @@ run test_format.cpp ;
 run test_fmt_format.cpp ;
 
 run test_div.cpp ;
+run test_div_primitives.cpp ;
 
 run test_num_digits.cpp ;
 run test_spaceship_operator.cpp ;
diff --git a/test/benchmark_i128.cpp b/test/benchmark_i128.cpp
index ba0a4f0d..4433a217 100644
--- a/test/benchmark_i128.cpp
+++ b/test/benchmark_i128.cpp
@@ -319,6 +319,36 @@ BOOST_INT128_NO_INLINE void test_two_element_operation(const std::vector<T>& dat
     std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n";
 }
 
+// Benchmarks the narrow division overloads (128-bit divided by a 64-bit or 32-bit value), which exercise the
+// hardware-accelerated one_word_div path rather than the full 128/128 divide. HalfWord selects the 32-bit divisor; operation/type only label the output line.
+template <bool HalfWord, typename T>
+BOOST_INT128_NO_INLINE void test_narrow_division(const std::vector<T>& data_vec, const char* operation, const char* type)
+{
+    const auto t1 = std::chrono::steady_clock::now();
+    std::uint64_t s = 0; // discard variable; unsigned so the running sum wraps instead of hitting signed-overflow UB
+
+    for (std::size_t k {}; k < K; ++k)
+    {
+        for (std::size_t i {}; i + 1U < data_vec.size(); ++i) // "i + 1U <" form cannot wrap when data_vec is empty
+        {
+            if (HalfWord)
+            {
+                const auto divisor = static_cast<std::uint32_t>(data_vec[i + 1]) | 1U; // low bit forced on: never zero
+                s += static_cast<std::uint64_t>(static_cast<std::int64_t>(data_vec[i] / divisor));
+            }
+            else
+            {
+                const auto divisor = static_cast<std::uint64_t>(data_vec[i + 1]) | UINT64_C(1); // low bit forced on: never zero
+                s += static_cast<std::uint64_t>(static_cast<std::int64_t>(data_vec[i] / divisor));
+            }
+        }
+    }
+
+    const auto t2 = std::chrono::steady_clock::now();
+
+    std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << static_cast<std::int64_t>(s) << ")\n";
+}
+
 std::vector<int> generate_shift_vector()
 {
     std::random_device rd;
@@ -473,6 +503,32 @@ int main()
         #endif
 
         std::cerr << std::endl;
+
+        #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128)
+        test_narrow_division<false>(builtin_vector, "div64", "Builtin");
+        #endif
+
+        test_narrow_division<false>(library_vector, "div64", "Library");
+        test_narrow_division<false>(mp_vector, "div64", "mp::i128");
+
+        #ifdef BOOST_INT128_BENCHMARK_ABSL
+        test_narrow_division<false>(absl_vector, "div64", "absl::i128");
+        #endif
+
+        std::cerr << std::endl;
+
+        #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128)
+        test_narrow_division<true>(builtin_vector, "div32", "Builtin");
+        #endif
+
+        test_narrow_division<true>(library_vector, "div32", "Library");
+        test_narrow_division<true>(mp_vector, "div32", "mp::i128");
+
+        #ifdef BOOST_INT128_BENCHMARK_ABSL
+        test_narrow_division<true>(absl_vector, "div32", "absl::i128");
+        #endif
+
+        std::cerr << std::endl;
     }
     // Single word operations
     {
diff --git a/test/benchmark_u128.cpp b/test/benchmark_u128.cpp
index a8a88996..767a1bb5 100644
--- a/test/benchmark_u128.cpp
+++ b/test/benchmark_u128.cpp
@@ -337,6 +337,36 @@ BOOST_INT128_NO_INLINE void test_two_element_operation(const std::vector<T>& dat
     std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n";
 }
 
+// Benchmarks the narrow division overloads (128-bit divided by a 64-bit or 32-bit value), which exercise the
+// hardware-accelerated one_word_div path rather than the full 128/128 divide. HalfWord selects the 32-bit divisor; operation/type only label the output line.
+template <bool HalfWord, typename T>
+BOOST_INT128_NO_INLINE void test_narrow_division(const std::vector<T>& data_vec, const char* operation, const char* type)
+{
+    const auto t1 = std::chrono::steady_clock::now();
+    std::uint64_t s = 0; // discard variable: keeps the optimizer from removing the divisions
+
+    for (std::size_t k {}; k < K; ++k)
+    {
+        for (std::size_t i {}; i + 1U < data_vec.size(); ++i) // "i + 1U <" form cannot wrap when data_vec is empty
+        {
+            if (HalfWord)
+            {
+                const auto divisor = static_cast<std::uint32_t>(data_vec[i + 1]) | 1U; // low bit forced on: never zero
+                s += static_cast<std::uint64_t>(data_vec[i] / divisor);
+            }
+            else
+            {
+                const auto divisor = static_cast<std::uint64_t>(data_vec[i + 1]) | UINT64_C(1); // low bit forced on: never zero
+                s += static_cast<std::uint64_t>(data_vec[i] / divisor);
+            }
+        }
+    }
+
+    const auto t2 = std::chrono::steady_clock::now();
+
+    std::cerr << operation << "<" << std::left << std::setw(11) << type << ">: " << std::setw( 10 ) << ( t2 - t1 ) / 1us << " us (s=" << s << ")\n";
+}
+
 template <typename T>
 BOOST_INT128_NO_INLINE void test_gcd(const std::vector<T>& data_vec, const char* type)
 {
@@ -586,6 +616,32 @@ int main()
 
         std::cerr << std::endl;
 
+        #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128)
+        test_narrow_division<false>(builtin_vector, "div64", "Builtin");
+        #endif
+
+        test_narrow_division<false>(library_vector, "div64", "Library");
+        test_narrow_division<false>(mp_vector, "div64", "mp::u128");
+
+        #ifdef BOOST_INT128_BENCHMARK_ABSL
+        test_narrow_division<false>(absl_vector, "div64", "absl::u128");
+        #endif
+
+        std::cerr << std::endl;
+
+        #if defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128)
+        test_narrow_division<true>(builtin_vector, "div32", "Builtin");
+        #endif
+
+        test_narrow_division<true>(library_vector, "div32", "Library");
+        test_narrow_division<true>(mp_vector, "div32", "mp::u128");
+
+        #ifdef BOOST_INT128_BENCHMARK_ABSL
+        test_narrow_division<true>(absl_vector, "div32", "absl::u128");
+        #endif
+
+        std::cerr << std::endl;
+
         #if (defined(BOOST_INT128_HAS_INT128) || defined(BOOST_INT128_HAS_MSVC_INTERNAL_I128)) && defined(BOOST_INT128_BENCHMARK_BUILTIN_GCD)
         //test_gcd(builtin_vector, "Builtin");
         #endif
diff --git a/test/test_ckd.cpp b/test/test_ckd.cpp
index 35b2a84c..e79343cc 100644
--- a/test/test_ckd.cpp
+++ b/test/test_ckd.cpp
@@ -122,13 +122,13 @@ bool ref_std_mul_overflow(const A a, const B b, R* r) noexcept
 }
 
 template <typename T1, typename T2, typename T3, typename Ref, typename Ckd>
-void check_op(const T2 a, const T3 b, Ref ref_overflow, Ckd ckd_overflow)
+void check_op(const T2 lhs, const T3 rhs, Ref ref_overflow, Ckd ckd_overflow)
 {
     T1 expected {};
-    const bool expected_overflow {ref_overflow(a, b, &expected)};
+    const bool expected_overflow {ref_overflow(lhs, rhs, &expected)};
 
     T1 got {};
-    const bool got_overflow {ckd_overflow(&got, a, b)};
+    const bool got_overflow {ckd_overflow(&got, lhs, rhs)};
 
     BOOST_TEST_EQ(got_overflow, expected_overflow);
     BOOST_TEST(got == expected);
@@ -479,7 +479,7 @@ void test_mul_edges()
 // constexpr usability for all three operations.
 //
 
-#if defined(__GNUC__) && __GNUC__ == 7 && !defined(__clang__) && !defined(__SIZEOF_INT128__)
+#if defined(__GNUC__) && __GNUC__ <= 7 && !defined(__clang__) && !defined(__SIZEOF_INT128__)
 #  define BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128
 #endif
 
@@ -515,12 +515,18 @@ constexpr int mul_value()
     return r;
 }
 
-#ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128
 constexpr bool mul_overflows_i128_min()
 {
     int128_t r {0};
     return ckd_mul(&r, (std::numeric_limits<int128_t>::min)(), int128_t{-1});
 }
+
+#ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128
+
+// MSVC 14.1 raises C4307 (integral constant overflow) in test_constexpr below
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable: 4307)
 #endif
 
 void test_constexpr()
@@ -530,11 +536,15 @@ void test_constexpr()
     static_assert(mul_overflows_int_max(),  "INT_MAX * 2 overflows int");
     static_assert(sub_value() == 2,         "5 - 3 == 2");
     static_assert(mul_value() == 42,        "6 * 7 == 42");
-#ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128
     static_assert(mul_overflows_i128_min(), "INT128_MIN * -1 overflows int128_t");
-#endif
 }
 
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+#endif
+
 int main()
 {
     test_standard_oracle();
@@ -542,7 +552,10 @@ int main()
     test_add_edges();
     test_sub_edges();
     test_mul_edges();
+
+    #ifndef BOOST_INT128_TEST_CKD_NO_CONSTEXPR_128
     test_constexpr();
+    #endif
 
     return boost::report_errors();
 }
diff --git a/test/test_consteval_funcs.cpp b/test/test_consteval_funcs.cpp
index 02f0b760..64f1e79d 100644
--- a/test/test_consteval_funcs.cpp
+++ b/test/test_consteval_funcs.cpp
@@ -4,6 +4,12 @@
 
 #include <boost/int128.hpp>
 
+// Suppress C4307 (integral constant overflow); only MSVC 14.1 emits it here
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable:4307)
+#endif
+
 #if defined(__cpp_consteval) && __cpp_consteval >= 201811L
 #  define BOOST_INT128_CONSTEVAL consteval
 #else
diff --git a/test/test_div_primitives.cpp b/test/test_div_primitives.cpp
new file mode 100644
index 00000000..694f0fef
--- /dev/null
+++ b/test/test_div_primitives.cpp
@@ -0,0 +1,266 @@
+// Copyright 2025 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+
+// Validates the low-level division building blocks in detail/common_div.hpp:
+//   * udiv_2by1 / divlu : 128/64 -> 64-bit quotient + remainder
+//   * div3by2           : 128/128 (divisor >= 2^64) -> single 64-bit quotient + 128-bit remainder
+//
+// div3by2 is cross-checked against the independent 32-bit-limb Knuth Algorithm D
+// (impl::knuth_divide), which is a completely separate implementation, so this check is valid
+// on every platform. Where a native 128-bit integer exists it is also used as an oracle.
+
+#include <boost/int128/int128.hpp>
+#include <boost/int128/cstdlib.hpp>
+#include <boost/int128/iostream.hpp>
+#include <boost/int128/detail/common_div.hpp>
+#include <boost/core/lightweight_test.hpp>
+#include <random>
+#include <cstdint>
+
+using namespace boost::int128;
+
+static std::mt19937_64 rng(0xC0FFEEULL);
+static std::uniform_int_distribution<std::uint64_t> dist(0, UINT64_MAX);
+
+// Independent oracle for 128/128 division with divisor >= 2^64, using the 32-bit-limb
+// Knuth Algorithm D that the library retains. Returns quotient and remainder.
+static void knuth_oracle(const std::uint64_t uh, const std::uint64_t ul,
+                         const std::uint64_t vh, const std::uint64_t vl,
+                         uint128_t& quot, uint128_t& rem)
+{
+    const uint128_t u_val {uh, ul};
+    const uint128_t v_val {vh, vl};
+
+    if (u_val < v_val)
+    {
+        quot = uint128_t{UINT64_C(0)};
+        rem = u_val;
+        return;
+    }
+
+    std::uint32_t u[4] {};
+    std::uint32_t v[4] {};
+    std::uint32_t q[4] {};
+
+    const auto m {detail::impl::to_words(u_val, u)};
+    const auto n {detail::impl::to_words(v_val, v)};
+
+    detail::impl::knuth_divide<true>(u, m, v, n, q);
+
+    quot = detail::impl::from_words<uint128_t>(q);
+    rem = detail::impl::from_words<uint128_t>(u);
+}
+
+static void check_div3by2(const std::uint64_t uh, const std::uint64_t ul,
+                          const std::uint64_t vh, const std::uint64_t vl)
+{
+    std::uint64_t rem_hi {};
+    std::uint64_t rem_lo {};
+    const auto q {detail::div3by2<true>(uh, ul, vh, vl, rem_hi, rem_lo)};
+
+    uint128_t expected_q {};
+    uint128_t expected_r {};
+    knuth_oracle(uh, ul, vh, vl, expected_q, expected_r);
+
+    // The quotient always fits in 64 bits when the divisor is >= 2^64
+    BOOST_TEST_EQ(expected_q.high, UINT64_C(0));
+    BOOST_TEST_EQ(q, expected_q.low);
+    BOOST_TEST_EQ(uint128_t(rem_hi, rem_lo), expected_r);
+}
+
+static void test_div3by2_random()
+{
+    constexpr int iters {2000000};
+    for (int i {}; i < iters; ++i)
+    {
+        const auto uh {dist(rng)};
+        const auto ul {dist(rng)};
+        auto vh {dist(rng)};
+        const auto vl {dist(rng)};
+
+        if (vh == 0)
+        {
+            vh = 1; // divisor must be >= 2^64 for div3by2
+        }
+
+        check_div3by2(uh, ul, vh, vl);
+    }
+}
+
+static void test_div3by2_edges()
+{
+    const std::uint64_t test_words[] {
+        UINT64_C(0), UINT64_C(1), UINT64_C(2), UINT64_C(3),
+        UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0x8000000000000000),
+        UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x0123456789ABCDEF),
+        UINT64_C(0xFFFFFFFF), UINT64_C(0x100000000)
+    };
+
+    for (const auto vh : test_words)
+    {
+        if (vh == 0)
+        {
+            continue; // div3by2 requires vh != 0
+        }
+
+        for (const auto vl : test_words)
+        {
+            for (const auto uh : test_words)
+            {
+                for (const auto ul : test_words)
+                {
+                    check_div3by2(uh, ul, vh, vl);
+                }
+            }
+        }
+    }
+
+    // abs(INT128_MIN) == 2^127 as a dividend, divided by a range of >= 2^64 divisors
+    for (const auto vl : test_words)
+    {
+        check_div3by2(UINT64_C(0x8000000000000000), UINT64_C(0), UINT64_C(0x8000000000000001), vl);
+        check_div3by2(UINT64_C(0x8000000000000000), UINT64_C(0), UINT64_C(0xFFFFFFFFFFFFFFFF), vl);
+    }
+}
+
+#if defined(BOOST_INT128_HAS_INT128)
+
+// Construct dividends of the exact form V*q + offset to stress the correction / add-back path,
+// where the single-digit quotient estimate is most likely to be one too large.
+static void test_div3by2_boundary()
+{
+    const std::uint64_t div_hi[] {
+        UINT64_C(1), UINT64_C(0x8000000000000000), UINT64_C(0xFFFFFFFFFFFFFFFF),
+        UINT64_C(0x0123456789ABCDEF), UINT64_C(0x00000000FFFFFFFF)
+    };
+    const std::uint64_t div_lo[] {
+        UINT64_C(0), UINT64_C(1), UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0xDEADBEEFCAFEBABE)
+    };
+    const std::uint64_t quotients[] {
+        UINT64_C(1), UINT64_C(2), UINT64_C(7), UINT64_C(0xFFFFFFFF),
+        UINT64_C(0xFFFFFFFFFFFFFFFF), UINT64_C(0x8000000000000000), UINT64_C(0x123456789)
+    };
+
+    for (const auto vh : div_hi)
+    {
+        for (const auto vl : div_lo)
+        {
+            const detail::builtin_u128 v_val {(static_cast<detail::builtin_u128>(vh) << 64) | vl};
+
+            for (const auto q : quotients)
+            {
+                const detail::builtin_u128 prod {v_val * q};
+
+                // V*q is independent of the offset: skip the pair once if it overflows 128 bits
+                if (q != 0 && (prod / q) != v_val)
+                {
+                    continue;
+                }
+
+                // offsets 0, 1, V - 1 (the largest possible remainder) and V / 2
+                const detail::builtin_u128 offsets[] {
+                    detail::builtin_u128{0}, detail::builtin_u128{1}, v_val - 1, v_val >> 1
+                };
+
+                for (const auto off : offsets)
+                {
+                    const detail::builtin_u128 u_val {prod + off};
+                    if (u_val < prod)
+                    {
+                        continue; // offset pushed us past 2^128
+                    }
+
+                    const auto uh {static_cast<std::uint64_t>(u_val >> 64)};
+                    const auto ul {static_cast<std::uint64_t>(u_val)};
+
+                    std::uint64_t rem_hi {};
+                    std::uint64_t rem_lo {};
+                    const auto got_q {detail::div3by2<true>(uh, ul, vh, vl, rem_hi, rem_lo)};
+
+                    BOOST_TEST_EQ(got_q, static_cast<std::uint64_t>(u_val / v_val));
+                    BOOST_TEST_EQ(uint128_t(rem_hi, rem_lo), static_cast<uint128_t>(u_val % v_val));
+                }
+            }
+        }
+    }
+}
+
+static void check_2by1(const std::uint64_t u1, const std::uint64_t u0, const std::uint64_t d)
+{
+    const detail::builtin_u128 full {(static_cast<detail::builtin_u128>(u1) << 64) | u0};
+    const auto expected_q {static_cast<std::uint64_t>(full / d)};
+    const auto expected_r {static_cast<std::uint64_t>(full % d)};
+
+    std::uint64_t r {};
+    const auto q {detail::udiv_2by1(u1, u0, d, r)};
+    BOOST_TEST_EQ(q, expected_q);
+    BOOST_TEST_EQ(r, expected_r);
+
+    // divlu is the portable fallback udiv_2by1 uses when not on x86-64 / MSVC; test it directly too
+    std::uint64_t r2 {};
+    const auto q2 {detail::divlu(u1, u0, d, r2)};
+    BOOST_TEST_EQ(q2, expected_q);
+    BOOST_TEST_EQ(r2, expected_r);
+}
+
+static void test_udiv_2by1_random()
+{
+    constexpr int iters {2000000};
+    for (int i {}; i < iters; ++i)
+    {
+        const auto u0 {dist(rng)};
+        auto d {dist(rng)};
+        if (d == 0)
+        {
+            d = 1;
+        }
+        const auto u1 {dist(rng) % d}; // precondition: u1 < d
+
+        check_2by1(u1, u0, d);
+    }
+}
+
+static void test_udiv_2by1_edges()
+{
+    const std::uint64_t divisors[] {
+        UINT64_C(1), UINT64_C(2), UINT64_C(3), UINT64_C(10),
+        UINT64_C(0xFFFFFFFF), UINT64_C(0x100000000), UINT64_C(0x80000000),
+        UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_C(0x8000000000000000), UINT64_C(0xFFFFFFFFFFFFFFFF)
+    };
+    const std::uint64_t lows[] {
+        UINT64_C(0), UINT64_C(1), UINT64_C(0x80000000),
+        UINT64_C(0xFFFFFFFF), UINT64_C(0xFFFFFFFFFFFFFFFF)
+    };
+
+    for (const auto d : divisors)
+    {
+        for (const auto u0 : lows)
+        {
+            check_2by1(UINT64_C(0), u0, d);        // u1 == 0
+            check_2by1(d - 1, u0, d);              // u1 == d - 1 (maximal)
+            if (d > 1)
+            {
+                check_2by1(d / 2, u0, d);
+            }
+        }
+    }
+}
+
+#endif // BOOST_INT128_HAS_INT128
+
+int main()
+{
+    test_div3by2_random();
+    test_div3by2_edges();
+
+    #if defined(BOOST_INT128_HAS_INT128)
+
+    test_div3by2_boundary();
+    test_udiv_2by1_random();
+    test_udiv_2by1_edges();
+
+    #endif
+
+    return boost::report_errors();
+}
diff --git a/test/test_i128.cpp b/test/test_i128.cpp
index 254b497e..5dd982ff 100644
--- a/test/test_i128.cpp
+++ b/test/test_i128.cpp
@@ -77,7 +77,16 @@ IntType get_root_max()
 template <typename IntType>
 IntType get_root_min()
 {
-    return static_cast<IntType>(std::sqrt(std::numeric_limits<IntType>::min()));
+    // numeric_limits<IntType>::min() is negative for signed IntType, so sqrt() would be
+    // NaN and the cast UB; use the negative of the positive root instead.
+    BOOST_INT128_IF_CONSTEXPR (std::is_signed<IntType>::value)
+    {
+        return static_cast<IntType>(-get_root_max<IntType>());
+    }
+    else
+    {
+        return static_cast<IntType>(0);
+    }
 }
 
 #include <boost/random/uniform_int_distribution.hpp>
diff --git a/test/test_u128.cpp b/test/test_u128.cpp
index 092bacdd..925f2c73 100644
--- a/test/test_u128.cpp
+++ b/test/test_u128.cpp
@@ -67,7 +67,16 @@ T get_root_max()
 template <typename T>
 T get_root_min()
 {
-    return static_cast<T>(std::sqrt(std::numeric_limits<T>::min()));
+    // numeric_limits<T>::min() is negative for signed T, so sqrt() would be NaN
+    // and the cast UB; use the negative of the positive root instead.
+    BOOST_INT128_IF_CONSTEXPR (std::is_signed<T>::value)
+    {
+        return static_cast<T>(-get_root_max<T>());
+    }
+    else
+    {
+        return static_cast<T>(0);
+    }
 }
 
 #include <boost/random/uniform_int_distribution.hpp>
diff --git a/test/test_u128_no_sign_conv.cpp b/test/test_u128_no_sign_conv.cpp
index 55d08a31..729161ac 100644
--- a/test/test_u128_no_sign_conv.cpp
+++ b/test/test_u128_no_sign_conv.cpp
@@ -66,7 +66,16 @@ T get_root_max()
 template <typename T>
 T get_root_min()
 {
-    return static_cast<T>(std::sqrt(std::numeric_limits<T>::min()));
+    // numeric_limits<T>::min() is negative for signed T, so sqrt() would be NaN
+    // and the cast UB; use the negative of the positive root instead.
+    BOOST_INT128_IF_CONSTEXPR (std::is_signed<T>::value)
+    {
+        return static_cast<T>(-get_root_max<T>());
+    }
+    else
+    {
+        return static_cast<T>(0);
+    }
 }
 
 #include <boost/random/uniform_int_distribution.hpp>
diff --git a/test/test_x64_msvc_div.cpp b/test/test_x64_msvc_div.cpp
index 1d3f1e73..40cbb70c 100644
--- a/test/test_x64_msvc_div.cpp
+++ b/test/test_x64_msvc_div.cpp
@@ -2,93 +2,118 @@
 // Distributed under the Boost Software License, Version 1.0.
 // https://www.boost.org/LICENSE_1_0.txt
 
+// On MSVC x64 the division building blocks use the hardware intrinsics _udiv128 (via udiv_2by1)
+// and _umul128 (via umul, inside div3by2). This validates that intrinsic path against
+// intrinsic-free references on the same inputs: the portable Hacker's Delight divlu, and the
+// 32-bit-limb Knuth Algorithm D. Where a native 128-bit integer exists, divlu is checked against
+// it directly and Knuth indirectly through div3by2 (see test_div_primitives.cpp), so agreement
+// here pins down the MSVC intrinsic wiring specifically.
+
 #include <boost/int128.hpp>
+#include <boost/int128/detail/common_div.hpp>
 #include <boost/core/lightweight_test.hpp>
 #include <random>
 
 #if defined(_M_AMD64) && !defined(__GNUC__) && !defined(__clang__) && _MSC_VER >= 1920
 
+using boost::int128::uint128_t;
+
 static std::mt19937_64 rng{42};
-static constexpr std::size_t N{1024U};
+static constexpr std::size_t N{4096U};
 static std::uniform_int_distribution<std::uint64_t> dist{UINT64_C(0), UINT64_MAX};
-static std::uniform_int_distribution<std::uint32_t> dist32{UINT32_C(0), UINT32_MAX};
 
-void test_two_words()
+// Independent reference for 128/128 (divisor >= 2^64): the 32-bit-limb Knuth Algorithm D, which
+// uses no 64-bit-divide or multiply intrinsics.
+static void knuth_oracle(const std::uint64_t uh, const std::uint64_t ul,
+                         const std::uint64_t vh, const std::uint64_t vl,
+                         uint128_t& quot, uint128_t& rem)
 {
-    for (std::size_t i{}; i < N; ++i)
-    {
-        boost::int128::uint128_t lhs{dist(rng), dist(rng)};
-        boost::int128::uint128_t rhs{dist(rng), dist(rng)};
+    const uint128_t u_val{uh, ul};
+    const uint128_t v_val{vh, vl};
 
-        // Guarantee lhs is greater than rhs
-        if (lhs < rhs)
-        {
-            std::swap(lhs, rhs);
-        }
+    if (u_val < v_val)
+    {
+        quot = uint128_t{UINT64_C(0)};
+        rem = u_val;
+        return;
+    }
 
+    std::uint32_t u[4]{};
+    std::uint32_t v[4]{};
+    std::uint32_t q[4]{};
 
-        boost::int128::uint128_t remainder{};
-        const auto quotient{boost::int128::detail::impl::div_mod_msvc<true>(lhs, rhs, remainder)};
+    const auto m{boost::int128::detail::impl::to_words(u_val, u)};
+    const auto n{boost::int128::detail::impl::to_words(v_val, v)};
 
-        boost::int128::uint128_t knuth_remainder{};
-        const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)};
+    boost::int128::detail::impl::knuth_divide<true>(u, m, v, n, q);
 
-        BOOST_TEST_EQ(remainder, knuth_remainder);
-        BOOST_TEST_EQ(quotient, knuth_quotient);
-    }
+    quot = boost::int128::detail::impl::from_words<uint128_t>(q);
+    rem = boost::int128::detail::impl::from_words<uint128_t>(u);
 }
 
-void test_four_by_three()
+// _udiv128 (udiv_2by1) versus the portable divlu, for 128/64 -> 64.
+void test_udiv_2by1()
 {
     for (std::size_t i{}; i < N; ++i)
     {
-        boost::int128::uint128_t lhs{dist(rng), dist(rng)};
-        boost::int128::uint128_t rhs{dist32(rng), dist(rng)};
+        const auto u0{dist(rng)};
+        auto d{dist(rng)};
+        if (d == 0)
+        {
+            d = 1;
+        }
+        const auto u1{dist(rng) % d}; // precondition u1 < d
 
-        boost::int128::uint128_t remainder{};
-        const auto quotient{boost::int128::detail::impl::div_mod_msvc<true>(lhs, rhs, remainder)};
+        std::uint64_t r_intrin{};
+        const auto q_intrin{boost::int128::detail::udiv_2by1(u1, u0, d, r_intrin)};
 
-        boost::int128::uint128_t knuth_remainder{};
-        const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)};
+        std::uint64_t r_soft{};
+        const auto q_soft{boost::int128::detail::divlu(u1, u0, d, r_soft)};
 
-        BOOST_TEST_EQ(remainder, knuth_remainder);
-        BOOST_TEST_EQ(quotient, knuth_quotient);
+        BOOST_TEST_EQ(q_intrin, q_soft);
+        BOOST_TEST_EQ(r_intrin, r_soft);
     }
+}
 
-    // The biggest gap we can have between 2 word unsigned values
-    {
-        constexpr auto lhs{(std::numeric_limits<boost::int128::uint128_t>::max)()};
-        constexpr boost::int128::uint128_t rhs{1,0};
+static void check_div3by2(const std::uint64_t uh, const std::uint64_t ul,
+                          const std::uint64_t vh, const std::uint64_t vl)
+{
+    std::uint64_t rh{};
+    std::uint64_t rl{};
+    const auto q{boost::int128::detail::div3by2<true>(uh, ul, vh, vl, rh, rl)};
 
-        boost::int128::uint128_t remainder{};
-        const auto quotient{boost::int128::detail::impl::div_mod_msvc<true>(lhs, rhs, remainder)};
+    uint128_t expected_q{};
+    uint128_t expected_r{};
+    knuth_oracle(uh, ul, vh, vl, expected_q, expected_r);
 
-        boost::int128::uint128_t knuth_remainder{};
-        const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)};
+    BOOST_TEST_EQ(expected_q.high, UINT64_C(0));
+    BOOST_TEST_EQ(q, expected_q.low);
+    BOOST_TEST_EQ(uint128_t(rh, rl), expected_r);
+}
 
-        BOOST_TEST_EQ(remainder, knuth_remainder);
-        BOOST_TEST_EQ(quotient, knuth_quotient);
-    }
-    // And again for signed
+// _udiv128 + _umul128 (div3by2) versus the 32-bit-limb Knuth reference, for 128/128 -> 64.
+void test_div3by2()
+{
+    for (std::size_t i{}; i < N; ++i)
     {
-        constexpr auto lhs{static_cast<boost::int128::uint128_t>((std::numeric_limits<boost::int128::int128_t>::max)())};
-        constexpr boost::int128::uint128_t rhs{1,0};
-
-        boost::int128::uint128_t remainder{};
-        const auto quotient{boost::int128::detail::impl::div_mod_msvc<true>(lhs, rhs, remainder)};
-
-        boost::int128::uint128_t knuth_remainder{};
-        const auto knuth_quotient{boost::int128::detail::knuth_div(lhs, rhs, knuth_remainder)};
-
-        BOOST_TEST_EQ(remainder, knuth_remainder);
-        BOOST_TEST_EQ(quotient, knuth_quotient);
+        auto vh{dist(rng)};
+        if (vh == 0)
+        {
+            vh = 1; // div3by2 requires divisor >= 2^64
+        }
+        check_div3by2(dist(rng), dist(rng), vh, dist(rng));
     }
+
+    // The widest gap between two-word unsigned values, and the signed-max case the original test
+    // exercised, both dividing by exactly 2^64.
+    check_div3by2(UINT64_MAX, UINT64_MAX, UINT64_C(1), UINT64_C(0));
+    check_div3by2(UINT64_C(0x7FFFFFFFFFFFFFFF), UINT64_MAX, UINT64_C(1), UINT64_C(0));
 }
 
 int main()
 {
-    test_two_words();
-    test_four_by_three();
+    test_udiv_2by1();
+    test_div3by2();
 
     return boost::report_errors();
 }
@@ -100,4 +125,4 @@ int main()
     return 0;
 }
 
-#endif 
+#endif