diff --git a/src/primitives.rs b/src/primitives.rs
index 756fadceb..3d2d1828a 100644
--- a/src/primitives.rs
+++ b/src/primitives.rs
@@ -1,15 +1,5 @@
 use crate::{Choice, WideWord, Word};
 
-/// Adds wide numbers represented by pairs of (least significant word, most significant word)
-/// and returns the result in the same format `(lo, hi)`.
-#[inline(always)]
-#[allow(clippy::cast_possible_truncation)]
-pub(crate) const fn addhilo(x_lo: Word, x_hi: Word, y_lo: Word, y_hi: Word) -> (Word, Word) {
-    let res = (((x_hi as WideWord) << Word::BITS) | (x_lo as WideWord))
-        + (((y_hi as WideWord) << Word::BITS) | (y_lo as WideWord));
-    (res as Word, (res >> Word::BITS) as Word)
-}
-
 /// Computes `lhs + rhs + carry`, returning the result along with the new carry (0, 1, or 2).
 #[inline(always)]
 #[allow(clippy::cast_possible_truncation)]
diff --git a/src/uint/div_limb.rs b/src/uint/div_limb.rs
index d2f6c356f..88d8de540 100644
--- a/src/uint/div_limb.rs
+++ b/src/uint/div_limb.rs
@@ -3,14 +3,14 @@
 //! (DOI: 10.1109/TC.2010.143, <https://gmplib.org/~tege/division-paper.pdf>).
 
 use crate::{
-    Choice, CtSelect, Limb, NonZero, Uint, WideWord, Word,
-    primitives::{addhilo, widening_mul},
-    word,
+    Choice, CtSelect, Limb, NonZero, Uint, WideWord, Word, primitives::widening_mul, word,
 };
 
 cpubits::cpubits! {
     32 => {
         /// Calculates the reciprocal of the given 32-bit divisor with the highmost bit set.
+        ///
+        /// This method corresponds to Algorithm 3
         pub const fn reciprocal(d: Word) -> Word {
             debug_assert!(d >= (1 << (Word::BITS - 1)));
 
@@ -34,7 +34,7 @@ cpubits::cpubits! {
 
             // The paper has `(v2 + 1) * d / 2^32` (there's another 2^32, but it's accounted for later).
             // If `v2 == 2^32-1` this should give `d`, but we can't achieve this in our wrapping arithmetic.
-            // Hence the `ct_select()`.
+            // Hence the `word::select()`.
             let x = v2.wrapping_add(1);
             let (_lo, hi) = widening_mul(x, d);
             let hi = word::select(d, hi, Choice::from_u32_nz(x));
@@ -44,6 +44,8 @@ cpubits::cpubits! {
     }
     64 => {
         /// Calculates the reciprocal of the given 64-bit divisor with the highmost bit set.
+        ///
+        /// This method corresponds to Algorithm 2
         pub const fn reciprocal(d: Word) -> Word {
             debug_assert!(d >= (1 << (Word::BITS - 1)));
 
@@ -64,7 +66,7 @@ cpubits::cpubits! {
 
             // The paper has `(v3 + 1) * d / 2^64` (there's another 2^64, but it's accounted for later).
             // If `v3 == 2^64-1` this should give `d`, but we can't achieve this in our wrapping arithmetic.
-            // Hence the `ct_select()`.
+            // Hence the `word::select()`.
             let x = v3.wrapping_add(1);
             let (_lo, hi) = widening_mul(x, d);
             let hi = word::select(d, hi, word::choice_from_nz(x));
@@ -104,15 +106,18 @@ const fn short_div(mut dividend: u32, dividend_bits: u32, divisor: u32, divisor_
 
 /// Calculate the quotient and the remainder of the division of a wide word
 /// (supplied as high and low words) by `d`, with a precalculated reciprocal `v`.
+///
+/// This method corresponds to Algorithm 4
 #[inline(always)]
-pub(crate) const fn div2by1(u1: Word, u0: Word, reciprocal: &Reciprocal) -> (Word, Word) {
+pub(crate) const fn div2by1(u0: Word, u1: Word, reciprocal: &Reciprocal) -> (Word, Word) {
     let d = reciprocal.divisor_normalized;
+    let v = reciprocal.reciprocal;
 
-    debug_assert!(d >= (1 << (Word::BITS - 1)));
-    debug_assert!(u1 < d);
+    debug_assert!(d >= (1 << (Word::BITS - 1)), "divisor top bit unset");
+    debug_assert!(u1 < d, "dividend >= divisor");
 
-    let (q0, q1) = widening_mul(reciprocal.reciprocal, u1);
-    let (q0, q1) = addhilo(q0, q1, u0, u1);
+    let q = (v as WideWord * u1 as WideWord) + word::join(u0, u1);
+    let (q0, q1) = word::split_wide(q);
     let q1 = q1.wrapping_add(1);
     let r = u0.wrapping_sub(q1.wrapping_mul(d));
 
@@ -136,48 +141,44 @@ pub(crate) const fn div2by1(u1: Word, u0: Word, reciprocal: &Reciprocal) -> (Wor
 /// calculates `q` such that `q - 1 <= floor(u / v) <= q`.
 /// In place of `v1` takes its reciprocal, and assumes that `v` was already pre-shifted
 /// so that v1 has its most significant bit set (that is, the reciprocal's `shift` is 0).
+///
+// This method corresponds to Algorithm 5
 #[inline(always)]
+#[allow(clippy::cast_possible_truncation)]
 pub(crate) const fn div3by2(
-    u2: Word,
-    u1: Word,
-    u0: Word,
-    v1_reciprocal: &Reciprocal,
-    v0: Word,
-) -> Word {
-    debug_assert!(v1_reciprocal.shift == 0);
-    debug_assert!(u2 <= v1_reciprocal.divisor_normalized);
-
-    // This method corresponds to Algorithm Q:
-    // https://janmr.com/blog/2014/04/basic-multiple-precision-long-division/
-
-    let q_maxed = word::choice_from_eq(u2, v1_reciprocal.divisor_normalized);
-    let (mut quo, rem) = div2by1(word::select(u2, 0, q_maxed), u1, v1_reciprocal);
+    (u0, u1, u2): (Word, Word, Word),
+    (d0, d1): (Word, Word),
+    v: Word,
+) -> (Word, WideWord) {
+    let d = word::join(d0, d1);
+    let u_hi = word::join(u1, u2);
+
+    debug_assert!(d >= (1 << (WideWord::BITS - 1)), "divisor top bit unset");
+    debug_assert!(u_hi <= d, "dividend > divisor");
+
+    let q = (v as WideWord * u2 as WideWord) + u_hi;
+    let q1w = q >> Word::BITS;
+    let r1 = u1.wrapping_sub((q1w as Word).wrapping_mul(d1));
+    let t = d0 as WideWord * q1w;
+    let r = word::join(u0, r1).wrapping_sub(t).wrapping_sub(d);
+
+    let r1_ge_q0 = word::choice_from_le(q as Word, (r >> Word::BITS) as Word);
+    let q1 = q1w as Word;
+    let q1 = word::select(q1.wrapping_add(1), q1, r1_ge_q0);
+    let r = word::select_wide(r, r.wrapping_add(d), r1_ge_q0);
+
+    let r_ge_d = word::choice_from_wide_le(d, r);
+    let q1 = word::select(q1, q1.wrapping_add(1), r_ge_d);
+    let r = word::select_wide(r, r.wrapping_sub(d), r_ge_d);
+
     // When the leading dividend word equals the leading divisor word, cap the quotient
-    // at Word::MAX and set the remainder to the sum of the top dividend words.
-    quo = word::select(quo, Word::MAX, q_maxed);
-    let mut rem = word::select_wide(
-        rem as WideWord,
-        (u2 as WideWord) + (u1 as WideWord),
-        q_maxed,
-    );
-
-    let mut i = 0;
-    while i < 2 {
-        let qy = (quo as WideWord) * (v0 as WideWord);
-        let rx = (rem << Word::BITS) | (u0 as WideWord);
-        // If r < b and q*y[-2] > r*x[-1], then set q = q - 1 and r = r + v1
-        let done =
-            word::choice_from_nz((rem >> Word::BITS) as Word).or(word::choice_from_wide_le(qy, rx));
-        quo = word::select(quo.wrapping_sub(1), quo, done);
-        rem = word::select_wide(
-            rem + (v1_reciprocal.divisor_normalized as WideWord),
-            rem,
-            done,
-        );
-        i += 1;
-    }
+    // at WideWord::MAX and update the remainder. This differs from the original algorithm
+    // but is required for multi-word division.
+    let maxed = word::choice_from_wide_eq(u_hi, d);
+    let q1 = word::select(q1, Word::MAX, maxed);
+    let r = word::select_wide(r, d.saturating_add(u0 as WideWord), maxed);
 
-    quo
+    (q1, r)
 }
 
 /// A pre-calculated reciprocal for division by a single limb.
@@ -229,6 +230,35 @@ impl Reciprocal {
     pub const fn shift(&self) -> u32 {
         self.shift
     }
+
+    /// Adjusted reciprocal for 3x2 division
+    ///
+    /// This method corresponds to Algorithm 6
+    #[must_use]
+    pub const fn reciprocal_3by2(&self, d0: Word, d1: Word) -> Word {
+        debug_assert!(self.shift == 0 && d1 == self.divisor_normalized);
+
+        let v = self.reciprocal;
+        let p = d1.wrapping_mul(v).wrapping_add(d0);
+
+        let p_lt_d0 = word::choice_from_lt(p, d0);
+        let v = word::select(v, v.wrapping_sub(1), p_lt_d0);
+
+        let p_ge_d1 = word::choice_from_le(d1, p).and(p_lt_d0);
+        let v = word::select(v, v.wrapping_sub(1), p_ge_d1);
+        let p = word::select(p, p.wrapping_sub(d1), p_ge_d1);
+        let p = word::select(p, p.wrapping_sub(d1), p_lt_d0);
+
+        let (t0, t1) = widening_mul(v, d0);
+        let p = p.wrapping_add(t1);
+
+        let p_lt_t1 = word::choice_from_lt(p, t1);
+        let v = word::select(v, v.wrapping_sub(1), p_lt_t1);
+        let d = word::join(d0, d1);
+        let t0p = word::join(t0, p);
+        let t0p_ge_d = word::choice_from_wide_le(d, t0p).and(p_lt_t1);
+        word::select(v, v.wrapping_sub(1), t0p_ge_d)
+    }
 }
 
 impl CtSelect for Reciprocal {
@@ -265,7 +295,7 @@ pub(crate) const fn rem_limb_with_reciprocal<const L: usize>(
     let mut j = L;
     while j > 0 {
         j -= 1;
-        let (_, rj) = div2by1(r, u_shifted.as_limbs()[j].0, reciprocal);
+        let (_, rj) = div2by1(u_shifted.as_limbs()[j].0, r, reciprocal);
         r = rj;
     }
     Limb(r >> reciprocal.shift)
@@ -281,8 +311,50 @@ pub(crate) const fn mul_rem(a: Limb, b: Limb, d: NonZero<Limb>) -> Limb {
 
 #[cfg(test)]
 mod tests {
-    use super::{Reciprocal, div2by1};
-    use crate::{Limb, NonZero, Word};
+    use super::{Reciprocal, div2by1, reciprocal};
+    use crate::{Limb, NonZero, Uint, WideWord, Word, word};
+
+    #[test]
+    fn reciprocal_valid() {
+        #![allow(clippy::integer_division_remainder_used, reason = "test")]
+        fn test(d: Word) {
+            let v = reciprocal(d);
+
+            // the reciprocal must be equal to floor((β^2 - 1) / d) - β
+            // v = floor((β^2 - 1) / d) - β = floor((β - 1 - d)*β + β - 1>/d)
+            let expected = WideWord::MAX / WideWord::from(d) - WideWord::from(Word::MAX) - 1;
+            assert_eq!(WideWord::from(v), expected);
+        }
+
+        test(Word::MAX);
+        test(1 << (Word::BITS - 1));
+        test((1 << (Word::BITS - 1)) | 1);
+    }
+
+    #[test]
+    fn reciprocal_3by2_valid() {
+        fn test(d: WideWord) {
+            let (d0, d1) = word::split_wide(d);
+            let v0 = Reciprocal::new(NonZero::<Limb>::new_unwrap(Limb(d1)));
+            let v = v0.reciprocal_3by2(d0, d1);
+
+            // the reciprocal must be equal to v = floor((β^3 − 1)/d) − β
+            // (β^3 − βd - 1)/d - 1 < v <= (β^3 − βd - 1)/d
+            // β^3 − βd - 1 - d < v*d <= β^3 − βd - 1
+            // β^3-1 - d < (v+β)d <= β^3-1
+            let actual = (Uint::<3>::from_word(v)
+                + Uint::<3>::ZERO.set_bit_vartime(Word::BITS, true))
+            .checked_mul(&Uint::<3>::from_wide_word(d))
+            .expect("overflow");
+            let min = Uint::<3>::MAX - Uint::<3>::from_wide_word(d);
+            assert!(actual > min, "{actual} <= {min}");
+        }
+
+        test(WideWord::MAX);
+        test(1 << (WideWord::BITS - 1));
+        test((1 << (WideWord::BITS - 1)) | 1);
+    }
+
     #[test]
     fn div2by1_overflow() {
         // A regression test for a situation when in div2by1() an operation (`q1 + 1`)
@@ -290,7 +362,7 @@ mod tests {
         // still overflows because we're calculating the results for both branches.
         let r = Reciprocal::new(NonZero::new(Limb(Word::MAX - 1)).unwrap());
         assert_eq!(
-            div2by1(Word::MAX - 2, Word::MAX - 63, &r),
+            div2by1(Word::MAX - 63, Word::MAX - 2, &r),
             (Word::MAX, Word::MAX - 65)
         );
     }
diff --git a/src/uint/ref_type/div.rs b/src/uint/ref_type/div.rs
index f32e8a34b..628241df3 100644
--- a/src/uint/ref_type/div.rs
+++ b/src/uint/ref_type/div.rs
@@ -234,7 +234,7 @@ impl UintRef {
         // but this can only be the case if `limb_div` is falsy, in which case we discard
         // the result anyway, so we conditionally set `x_hi` to zero for this branch.
         let x_hi_adjusted = Limb::select(Limb::ZERO, x_hi, limb_div);
-        let (quo2, rem2) = div2by1(x_hi_adjusted.0, x.limbs[0].0, &reciprocal);
+        let (quo2, rem2) = div2by1(x.limbs[0].0, x_hi_adjusted.0, &reciprocal);
 
         // Adjust the quotient for single limb division
         x.limbs[0] = Limb::select(x.limbs[0], Limb(quo2), limb_div);
@@ -293,14 +293,15 @@ impl UintRef {
         let mut i;
         let mut carry;
 
+        // Compute the adjusted reciprocal
+        let v = reciprocal.reciprocal_3by2(y.limbs[ysize - 2].0, y.limbs[ysize - 1].0);
+
         while xi > 0 {
             // Divide high dividend words by the high divisor word to estimate the quotient word
-            let mut quo = div3by2(
-                x_hi.0,
-                x_xi.0,
-                x.limbs[xi - 1].0,
-                &reciprocal,
-                y.limbs[ysize - 2].0,
+            let (mut quo, _) = div3by2(
+                (x.limbs[xi - 1].0, x_xi.0, x_hi.0),
+                (y.limbs[ysize - 2].0, y.limbs[ysize - 1].0),
+                v,
             );
 
             // This loop is a no-op once xi is smaller than the number of words in the divisor
@@ -415,7 +416,7 @@ impl UintRef {
         // but this can only be the case if `limb_div` is falsy, in which case we discard
         // the result anyway, so we conditionally set `x_hi` to zero for this branch.
         let x_hi_adjusted = Limb::select(Limb::ZERO, x_hi, limb_div);
-        let (_, rem2) = div2by1(x_hi_adjusted.0, x.limbs[0].0, &reciprocal);
+        let (_, rem2) = div2by1(x.limbs[0].0, x_hi_adjusted.0, &reciprocal);
 
         // Copy out the low limb of the remainder
         y.limbs[0] = Limb::select(x.limbs[0], Limb(rem2), limb_div);
@@ -465,17 +466,18 @@ impl UintRef {
         let mut i;
         let mut carry;
 
+        // Compute the adjusted reciprocal
+        let v = reciprocal.reciprocal_3by2(y.limbs[ysize - 2].0, y.limbs[ysize - 1].0);
+
         // We proceed similarly to `div_rem_large_shifted()` applied to the high half of
         // the dividend, fetching the limbs from the lower part as we go.
 
         while xi > 0 {
             // Divide high dividend words by the high divisor word to estimate the quotient word
-            let mut quo = div3by2(
-                x_hi.0,
-                x_xi.0,
-                x.limbs[xi - 1].0,
-                &reciprocal,
-                y.limbs[ysize - 2].0,
+            let (mut quo, _) = div3by2(
+                (x.limbs[xi - 1].0, x_xi.0, x_hi.0),
+                (y.limbs[ysize - 2].0, y.limbs[ysize - 1].0),
+                v,
             );
 
             // This loop is a no-op once xi is smaller than the number of words in the divisor
@@ -563,7 +565,7 @@ impl UintRef {
         let mut j = self.limbs.len();
         while j > 0 {
             j -= 1;
-            (self.limbs[j].0, hi.0) = div2by1(hi.0, self.limbs[j].0, reciprocal);
+            (self.limbs[j].0, hi.0) = div2by1(self.limbs[j].0, hi.0, reciprocal);
         }
         hi.shr(reciprocal.shift())
     }
@@ -594,9 +596,9 @@ impl UintRef {
             j -= 1;
             lo = self.limbs[j].0 << lshift;
             lo |= word::select(0, self.limbs[j - 1].0 >> rshift, nz);
-            (_, hi) = div2by1(hi, lo, reciprocal);
+            (_, hi) = div2by1(lo, hi, reciprocal);
         }
-        (_, hi) = div2by1(hi, self.limbs[0].0 << lshift, reciprocal);
+        (_, hi) = div2by1(self.limbs[0].0 << lshift, hi, reciprocal);
         Limb(hi >> lshift)
     }
 }
diff --git a/src/word.rs b/src/word.rs
index 30e7ecba3..a0e472cde 100644
--- a/src/word.rs
+++ b/src/word.rs
@@ -54,6 +54,7 @@ cpubits::cpubits! {
         pub(crate) const fn choice_from_wide_le(x: WideWord, y: WideWord) -> Choice {
             Choice::from_u128_le(x, y)
         }
+
     }
 }
 
@@ -63,6 +64,12 @@ pub(crate) const fn choice_from_eq(x: Word, y: Word) -> Choice {
     choice_from_nz(x ^ y).not()
 }
 
+/// Returns the truthy value if `x == y`, and the falsy value otherwise.
+#[inline]
+pub(crate) const fn choice_from_wide_eq(x: WideWord, y: WideWord) -> Choice {
+    choice_from_wide_nz(x ^ y).not()
+}
+
 /// Returns the truthy value if `x > y`, and the falsy value otherwise.
 #[inline]
 pub(crate) const fn choice_from_gt(x: Word, y: Word) -> Choice {
@@ -94,6 +101,12 @@ pub(crate) const fn choice_from_nz(value: Word) -> Choice {
     choice_from_lsb((value | value.wrapping_neg()) >> (Word::BITS - 1))
 }
 
+/// Returns the truthy value if `value != 0`, and the falsy value otherwise.
+#[inline]
+pub(crate) const fn choice_from_wide_nz(value: WideWord) -> Choice {
+    choice_from_lsb(((value | value.wrapping_neg()) >> (WideWord::BITS - 1)) as Word)
+}
+
 /// Return `b` if `self` is truthy, otherwise return `a`.
 #[inline]
 pub(crate) const fn select(a: Word, b: Word, choice: Choice) -> Word {
@@ -126,6 +139,17 @@ pub(crate) const fn choice_to_wide_mask(choice: Choice) -> WideWord {
     (choice.to_u8_vartime() as WideWord).wrapping_neg()
 }
 
+#[inline(always)]
+pub(crate) const fn join(lo: Word, hi: Word) -> WideWord {
+    ((hi as WideWord) << Word::BITS) | (lo as WideWord)
+}
+
+#[inline(always)]
+#[allow(clippy::cast_possible_truncation)]
+pub(crate) const fn split_wide(wide: WideWord) -> (Word, Word) {
+    (wide as Word, (wide >> Word::BITS) as Word)
+}
+
 #[cfg(test)]
 mod tests {
     use super::{Choice, WideWord, Word};
@@ -151,6 +175,14 @@ mod tests {
         assert!(!super::choice_from_wide_le(6, 5).to_bool());
     }
 
+    #[test]
+    fn join_split_wide() {
+        let a: Word = 1;
+        let b: Word = 2;
+        assert_eq!(super::split_wide(super::join(a, b)), (a, b));
+        assert_eq!(super::split_wide(super::join(b, a)), (b, a));
+    }
+
     #[test]
     fn select() {
         let a: Word = 1;