Merge #7352: perf: optimize division to uint32 to make re-target-pow calculation much faster

PastaPastaPasta · PastaPastaPasta · commit 68d8fa3f45c0 · 2026-06-13T01:32:54.000-05:00
e8435d3 test: add regression test for faster division of base_uint<256> to uint32_t (Konstantin Akimov) de1e56a perf: optimize division to uint32 to make re-target-pow calculation much faster (Konstantin Akimov) Pull request description: ## Issue being fixed or feature implemented Dash retargets difficulty every block via DarkGravityWave (24-block loop), so every header processed during sync runs several 256-bit divisions. Bitcoin only retargets once per 2016 blocks. Every division in DGW is by a small integer (nCountBlocks+1) * nTargetTimespan (3600). But the code routes through the only available overload, `operator/=(const base_uint&)`, which does bit-by-bit long division: ~224 iterations, each doing a full-width `div >>= 1` During header sync up to 35% of CPU is wasted on this division on release build (accordingly `perf`): ``` 20.36% d-msghand dash-qt [.] base_uint<256u>::operator>>=(unsigned int) 15.00% d-msghand dash-qt [.] base_uint<256u>::operator/=(base_uint<256u> const&) ``` ## What was done? Implemented optimization for division to unsigned values that could fit inside uint32_t. ## How Has This Been Tested? Got GUIX build from changes in this PR and develop: ``` [develop] 5af9f57/src/qt/dash-qt -datadir=/tmp/dd -reindex real 7m42.152s user 7m0.396s sys 0m3.328s [PR] 8608a3c3bbcd/src/qt/dash-qt -datadir=/tmp/dd -reindex real 4m17.294s user 2m55.474s sys 0m2.731s ``` Perf stats are updated as expected: ``` - 1.61% 0.06% d-msghand dash-qt [.] base_uint<256u>::operator/=(base_uint<256u> const&) - 1.55% base_uint<256u>::operator/=(base_uint<256u> const&) 0.92% base_uint<256u>::operator>>=(unsigned int) + 1.48% 0.57% d-msghand dash-qt [.] base_uint<256u>::operator>>=(unsigned int) ``` So total header sync just a half of the time. Please note, that header sync will be slightly slower due to #7320 so overall win is smaller. ## Breaking Changes N/A ## Checklist: - [x] I have performed a self-review of my own code - [x] I have commented my code, particularly in hard-to-understand areas - [x] I have added or updated relevant unit/integration/functional/e2e tests - [ ] I have made corresponding changes to the documentation - [x] I have assigned this pull request to a milestone _(for repository code-owners and collaborators only)_ ACKs for top commit: UdjinM6: utACK e8435d3 Tree-SHA512: 797c446cbfad6a1eecb98bee893d530e88db7576d0d15faa36c0d9ccb6d3872313c29f977194ba42cfdedf837a7227018dc750fa87b1f376106831204b08f650
diff --git a/src/arith_uint256.cpp b/src/arith_uint256.cpp
@@ -5,6 +5,7 @@
 
 #include <arith_uint256.h>
 
+#include <limits>
 #include <uint256.h>
 #include <crypto/common.h>
 
@@ -72,6 +73,23 @@ base_uint<BITS>& base_uint<BITS>::operator*=(const base_uint& b)
     return *this;
 }
 
+template <unsigned int BITS>
+base_uint<BITS>& base_uint<BITS>::operator/=(uint64_t b32)
+{
+    if (b32 == 0)
+        throw uint_error("Division by zero");
+    if (b32 > std::numeric_limits<uint32_t>::max()) {
+        return *this /= base_uint(b32);
+    }
+    uint64_t rem = 0;
+    for (int i = WIDTH - 1; i >= 0; --i) {
+        uint64_t cur = (rem << 32) | pn[i];
+        pn[i] = static_cast<uint32_t>(cur / b32);
+        rem = cur % b32;
+    }
+    return *this;
+}
+
 template <unsigned int BITS>
 base_uint<BITS>& base_uint<BITS>::operator/=(const base_uint& b)
 {
diff --git a/src/arith_uint256.h b/src/arith_uint256.h
@@ -158,6 +158,7 @@ class base_uint
 
     base_uint& operator*=(uint32_t b32);
     base_uint& operator*=(const base_uint& b);
+    base_uint& operator/=(uint64_t b32);
     base_uint& operator/=(const base_uint& b);
 
     base_uint& operator++()
@@ -207,6 +208,7 @@ class base_uint
     friend inline base_uint operator>>(const base_uint& a, int shift) { return base_uint(a) >>= shift; }
     friend inline base_uint operator<<(const base_uint& a, int shift) { return base_uint(a) <<= shift; }
     friend inline base_uint operator*(const base_uint& a, uint32_t b) { return base_uint(a) *= b; }
+    friend inline base_uint operator/(const base_uint& a, uint64_t b) { return base_uint(a) /= b; }
     friend inline bool operator==(const base_uint& a, const base_uint& b) { return memcmp(a.pn, b.pn, sizeof(a.pn)) == 0; }
     friend inline bool operator!=(const base_uint& a, const base_uint& b) { return memcmp(a.pn, b.pn, sizeof(a.pn)) != 0; }
     friend inline bool operator>(const base_uint& a, const base_uint& b) { return a.CompareTo(b) > 0; }
diff --git a/src/test/arith_uint256_tests.cpp b/src/test/arith_uint256_tests.cpp
@@ -361,6 +361,19 @@ BOOST_AUTO_TEST_CASE( divide )
     BOOST_CHECK(R2L / MaxL == ZeroL);
     BOOST_CHECK(MaxL / R2L == 1);
     BOOST_CHECK_THROW(R2L / ZeroL, uint_error);
+
+    // The integer-divisor overload must be bit-identical to dividing by the same
+    // value widened to arith_uint256, for both the fast word-wise path
+    // (<= uint32 max) and the arith_uint256 fallback above it.
+    for (uint64_t d : {uint64_t{1}, uint64_t{2}, uint64_t{24}, uint64_t{3600},
+                       uint64_t{std::numeric_limits<uint32_t>::max()},
+                       uint64_t{std::numeric_limits<uint32_t>::max()} + 1,
+                       uint64_t{0xFEDCBA9876543210}}) {
+        BOOST_CHECK(R1L / d == R1L / arith_uint256(d));
+        BOOST_CHECK(R2L / d == R2L / arith_uint256(d));
+        BOOST_CHECK(MaxL / d == MaxL / arith_uint256(d));
+    }
+    BOOST_CHECK_THROW(R1L / uint64_t{0}, uint_error);
 }