libm: Use funnel shifts to speed up u256 shifting

tgross35 · tgross35 · commit 7fc84f41db96 · 2026-03-22T20:58:18.000-05:00
Switch to an algorithm using word-sized operations on an array to do
coarse shifts, then funnel shifts for the bit shifts. The result is
quite close to what LLVM generates when using native `u256` types.
diff --git a/libm/src/math/support/big.rs b/libm/src/math/support/big.rs
@@ -6,8 +6,11 @@ mod tests;
 use core::{fmt, ops};
 
 use super::{DInt, HInt, Int, MinInt};
+use crate::support::Word;
 
 const U128_LO_MASK: u128 = u64::MAX as u128;
+const U128_WORDS: usize = (u128::BITS / Word::BITS) as usize;
+const U256_WORDS: usize = U128_WORDS * 2;
 
 /// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
@@ -31,6 +34,29 @@ impl u256 {
             hi: self.hi as i128,
         }
     }
+
+    /// Split into `U256_WORDS` words, with the least significant word first.
+    fn to_words(self) -> [Word; U256_WORDS] {
+        // The result with 64-bit words will be: [lo.lo(), lo.hi(), hi.lo(), hi.hi()].
+        let mut ret: [Word; _] = [0; U256_WORDS];
+        for i in 0..U128_WORDS {
+            let shift = i as u32 * Word::BITS; // bit offset of word `i` within its 128-bit limb
+            ret[i] = (self.lo >> shift) as Word; // the cast keeps only the low `Word::BITS` bits
+            ret[i + U128_WORDS] = (self.hi >> shift) as Word;
+        }
+        ret
+    }
+
+    /// Perform the opposite of [`Self::to_words`]; `words` is least significant word first.
+    fn from_words(words: [Word; U256_WORDS]) -> Self {
+        let mut ret = u256::ZERO;
+        for i in 0..U128_WORDS {
+            let shift = i as u32 * Word::BITS; // bit offset of word `i` within its 128-bit limb
+            ret.lo |= (words[i] as u128) << shift; // first `U128_WORDS` words form the low limb
+            ret.hi |= (words[i + U128_WORDS] as u128) << shift;
+        }
+        ret
+    }
 }
 
 /// A 256-bit signed integer represented as two 128-bit native-endian limbs.
@@ -58,6 +84,26 @@ impl i256 {
             hi: self.hi as u128,
         }
     }
+
+    fn to_words(self) -> [Word; U256_WORDS] {
+        let mut ret: [Word; _] = [0; U256_WORDS]; // least significant word first
+        for i in 0..U128_WORDS {
+            let shift = i as u32 * Word::BITS; // bit offset of word `i` within its 128-bit limb
+            ret[i] = (self.lo >> shift) as Word;
+            ret[i + U128_WORDS] = (self.hi >> shift) as Word; // cast drops sign-extension bits
+        }
+        ret
+    }
+
+    fn from_words(words: [Word; U256_WORDS]) -> Self {
+        let mut ret = u256::ZERO; // assemble as unsigned, reinterpret as signed at the end
+        for i in 0..U128_WORDS {
+            let shift = i as u32 * Word::BITS; // bit offset of word `i` within its 128-bit limb
+            ret.lo |= (words[i] as u128) << shift;
+            ret.hi |= (words[i + U128_WORDS] as u128) << shift;
+        }
+        ret.signed()
+    }
 }
 
 impl MinInt for u256 {
@@ -130,59 +176,95 @@ macro_rules! impl_common {
             }
         }
 
-        impl ops::Shl<u32> for $ty {
+        impl ops::Shr<u32> for $ty {
             type Output = Self;
 
-            fn shl(mut self, rhs: u32) -> Self::Output {
-                debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
-
-                let half_bits = Self::BITS / 2;
-                let low_mask = half_bits - 1;
-                let s = rhs & low_mask;
-
-                let lo = self.lo;
-                let hi = self.hi;
+            fn shr(self, rhs: u32) -> Self::Output {
+                debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
 
-                self.lo = lo << s;
+                // Set up an array with the input in the low half, zeros in the upper half
+                let mut words: [Word; _] = [0; U256_WORDS * 2];
+                words[..U256_WORDS].copy_from_slice(&self.to_words());
+
+                if <$ty>::SIGNED {
+                    // For i256, branchlessly set the upper words to all ones if the input
+                    // is negative.
+                    let top_word = words[U256_WORDS - 1].cast_signed() >> (Word::BITS - 1);
+                    for x in &mut words[U256_WORDS..] {
+                        *x = top_word.cast_unsigned();
+                    }
+                }
 
-                if rhs & half_bits == 0 {
-                    self.hi = (lo >> (low_mask ^ s) >> 1) as _;
-                    self.hi |= hi << s;
-                } else {
-                    self.hi = self.lo as _;
-                    self.lo = 0;
+                let shift = rhs & 255; // limit to 255 in cases of overflow
+                let word_shift = (shift / Word::BITS) as usize;
+                let bit_shift = shift % Word::BITS;
+
+                let mut ret: [Word; _] = [0; U256_WORDS];
+
+                // Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
+                // these loops get unrolled.
+                for i in 0..U256_WORDS {
+                    if i < (U256_WORDS - 1) {
+                        let hi = words[word_shift + i + 1];
+                        let lo = words[word_shift + i];
+
+                        ret[i] = <Word as HInt>::funnel_shr(hi, lo, bit_shift);
+                    } else if <$ty>::SIGNED {
+                        // The upper word doesn't get any sign bits via a funnel shift, so we need
+                        // an arithmetic shift to preserve sign.
+                        let mut x = words[word_shift + i].cast_signed();
+                        x >>= bit_shift;
+                        ret[i] = x.cast_unsigned();
+                    } else {
+                        ret[i] = words[word_shift + i] >> bit_shift;
+                    }
                 }
-                self
+
+                <$ty>::from_words(ret)
             }
         }
+    };
+}
 
-        impl ops::Shr<u32> for $ty {
-            type Output = Self;
+impl ops::Shl<u32> for u256 {
+    type Output = Self;
 
-            fn shr(mut self, rhs: u32) -> Self::Output {
-                debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
+    fn shl(self, rhs: u32) -> Self::Output {
+        debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
 
-                let half_bits = Self::BITS / 2;
-                let low_mask = half_bits - 1;
-                let s = rhs & low_mask;
+        // Set up an array with the input in the upper half, zeros in the lower half
+        let mut words: [Word; _] = [0; U256_WORDS * 2];
+        words[U256_WORDS..].copy_from_slice(&self.to_words());
 
-                let lo = self.lo;
-                let hi = self.hi;
+        let shift = rhs & 255; // limit to 255 in cases of overflow
+        let word_shift = U256_WORDS - (shift / Word::BITS) as usize;
+        let bit_shift = shift % Word::BITS;
 
-                self.hi = hi >> s;
+        let mut ret: [Word; _] = [0; U256_WORDS];
 
-                #[allow(unused_comparisons)]
-                if rhs & half_bits == 0 {
-                    self.lo = (hi << (low_mask ^ s) << 1) as _;
-                    self.lo |= lo >> s;
-                } else {
-                    self.lo = self.hi as _;
-                    self.hi = if hi < 0 { !0 } else { 0 };
-                }
-                self
+        // Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
+        // these loops get unrolled.
+        for i in 0..U256_WORDS {
+            if i == 0 {
+                ret[i] = words[word_shift + i] << bit_shift;
+            } else {
+                let hi = words[word_shift + i];
+                let lo = words[word_shift + i - 1];
+
+                ret[i] = <Word as HInt>::funnel_shl(hi, lo, bit_shift);
             }
         }
-    };
+
+        u256::from_words(ret)
+    }
+}
+
+impl ops::Shl<u32> for i256 {
+    type Output = Self;
+
+    fn shl(self, rhs: u32) -> Self::Output {
+        (self.unsigned() << rhs).signed() // reuse the `u256` shift on the same bit pattern
+    }
+}
 }
 
 impl_common!(i256);
diff --git a/libm/src/math/support/int_traits.rs b/libm/src/math/support/int_traits.rs
@@ -344,6 +344,24 @@ pub trait HInt: Int {
     fn zero_widen_mul(self, rhs: Self) -> Self::D;
     /// Widening multiplication. This cannot overflow.
     fn widen_mul(self, rhs: Self) -> Self::D;
+
+    // FIXME(msrv): Use funnel shifts from `core` as a trait on `Int` when available.
+
+    /// Concatenate `self` (high) and `right` (low), shift left by `shift`, return the upper half.
+    fn funnel_shl(self, right: Self, shift: u32) -> Self {
+        assert!(!Self::SIGNED, "unsupported for signed integers");
+        assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
+        let n = Self::D::from_lo_hi(right, self);
+        (n << shift).hi()
+    }
+
+    /// Concatenate `self` (high) and `right` (low), shift right by `shift`, return the lower half.
+    fn funnel_shr(self, right: Self, shift: u32) -> Self {
+        assert!(!Self::SIGNED, "unsupported for signed integers");
+        assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
+        let n = Self::D::from_lo_hi(right, self);
+        (n >> shift).lo()
+    }
 }
 
 macro_rules! impl_d_int {
diff --git a/libm/src/math/support/mod.rs b/libm/src/math/support/mod.rs
@@ -32,6 +32,21 @@ pub use hex_float::{hf32, hf64};
 pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
 pub use modular::linear_mul_reduction;
 
+cfg_if! {
+    if #[cfg(target_pointer_width = "16")] {
+        /// Fixed-width integer the same size as `usize` (exists for using `DInt`/`HInt`).
+        pub type Word = u16;
+    } else if #[cfg(target_pointer_width = "32")] {
+        /// Fixed-width integer the same size as `usize` (exists for using `DInt`/`HInt`).
+        pub type Word = u32;
+    } else if #[cfg(target_pointer_width = "64")] {
+        /// Fixed-width integer the same size as `usize` (exists for using `DInt`/`HInt`).
+        pub type Word = u64;
+    } else {
+        compile_error!("unsupported pointer width");
+    }
+}
+
 /// Hint to the compiler that the current path is cold.
 pub fn cold_path() {
     #[cfg(intrinsics_enabled)]
@@ -70,3 +85,14 @@ pub unsafe fn unchecked_div_isize(x: isize, y: isize) -> isize {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn word_size() {
+        assert_eq!(size_of::<Word>(), size_of::<usize>()); // `Word` stands in for `usize`
+        assert_eq!(align_of::<Word>(), align_of::<usize>());
+    }
+}