Skip to content

Commit a35316d

Browse files
committed
libm: Use funnel shifts to speed up u256 shifting
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.
1 parent abd9905 commit a35316d

3 files changed

Lines changed: 96 additions & 20 deletions

File tree

compiler-builtins/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
#![feature(cfg_target_has_atomic)]
77
#![feature(compiler_builtins)]
88
#![feature(core_intrinsics)]
9+
#![feature(funnel_shifts)]
910
#![feature(linkage)]
10-
#![feature(repr_simd)]
1111
#![feature(macro_metavar_expr_concat)]
12+
#![feature(repr_simd)]
1213
#![feature(rustc_attrs)]
1314
#![cfg_attr(f16_enabled, feature(f16))]
1415
#![cfg_attr(f128_enabled, feature(f128))]

libm/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#![no_std]
33
#![cfg_attr(intrinsics_enabled, allow(internal_features))]
44
#![cfg_attr(intrinsics_enabled, feature(core_intrinsics))]
5+
#![cfg_attr(intrinsics_enabled, feature(funnel_shifts))]
56
#![cfg_attr(
67
all(intrinsics_enabled, target_family = "wasm"),
78
feature(wasm_numeric_instr)

libm/src/math/support/big.rs

Lines changed: 93 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ use core::ops;
88
use super::{DInt, HInt, Int, MinInt};
99

1010
const U128_LO_MASK: u128 = u64::MAX as u128;
11+
const U128_WORDS: usize = (u128::BITS / usize::BITS) as usize;
12+
const U256_WORDS: usize = U128_WORDS * 2;
1113

1214
/// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
1315
#[allow(non_camel_case_types)]
@@ -31,6 +33,26 @@ impl u256 {
3133
hi: self.hi as i128,
3234
}
3335
}
36+
37+
fn to_words(self) -> [usize; U256_WORDS] {
38+
let mut ret = [0usize; U256_WORDS];
39+
for i in 0..U128_WORDS {
40+
let shift = i as u32 * usize::BITS;
41+
ret[i] = (self.lo >> shift) as usize;
42+
ret[i + U128_WORDS] = (self.hi >> shift) as usize;
43+
}
44+
ret
45+
}
46+
47+
fn from_words(words: [usize; U256_WORDS]) -> Self {
48+
let mut ret = u256::ZERO;
49+
for i in 0..U128_WORDS {
50+
let shift = i as u32 * usize::BITS;
51+
ret.lo |= (words[i] as u128) << shift;
52+
ret.hi |= (words[i + U128_WORDS] as u128) << shift;
53+
}
54+
ret
55+
}
3456
}
3557

3658
/// A 256-bit signed integer represented as two 128-bit native-endian limbs.
@@ -156,34 +178,86 @@ macro_rules! impl_common {
156178
self
157179
}
158180
}
181+
};
182+
}
159183

160-
impl ops::Shr<u32> for $ty {
161-
type Output = Self;
184+
impl ops::Shr<u32> for i256 {
185+
type Output = Self;
162186

163-
fn shr(mut self, rhs: u32) -> Self::Output {
164-
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
187+
fn shr(mut self, rhs: u32) -> Self::Output {
188+
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
165189

166-
let half_bits = Self::BITS / 2;
167-
let low_mask = half_bits - 1;
168-
let s = rhs & low_mask;
190+
let half_bits = Self::BITS / 2;
191+
let low_mask = half_bits - 1;
192+
let s = rhs & low_mask;
169193

170-
let lo = self.lo;
171-
let hi = self.hi;
194+
let lo = self.lo;
195+
let hi = self.hi;
172196

173-
self.hi = hi >> s;
197+
self.hi = hi >> s;
174198

175-
#[allow(unused_comparisons)]
176-
if rhs & half_bits == 0 {
177-
self.lo = (hi << (low_mask ^ s) << 1) as _;
178-
self.lo |= lo >> s;
179-
} else {
180-
self.lo = self.hi as _;
181-
self.hi = if hi < 0 { !0 } else { 0 };
199+
#[allow(unused_comparisons)]
200+
if rhs & half_bits == 0 {
201+
self.lo = (hi << (low_mask ^ s) << 1) as _;
202+
self.lo |= lo >> s;
203+
} else {
204+
self.lo = self.hi as _;
205+
self.hi = if hi < 0 { !0 } else { 0 };
206+
}
207+
self
208+
}
209+
}
210+
211+
impl ops::Shr<u32> for u256 {
212+
type Output = Self;
213+
214+
fn shr(self, rhs: u32) -> Self::Output {
215+
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
216+
217+
// Set up an array with the input in the low half, zeros in the upper half
218+
let mut words = [0usize; U256_WORDS * 2];
219+
words[..U256_WORDS].copy_from_slice(&self.to_words());
220+
221+
let shift = rhs & 255; // limit to 255 in cases of overflow
222+
let word_shift = (shift / usize::BITS) as usize;
223+
let bit_shift = shift % usize::BITS;
224+
225+
let mut ret = [0usize; U256_WORDS];
226+
227+
// Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
228+
// these loops get unrolled.
229+
cfg_if! {
230+
if #[cfg(intrinsics_enabled)] {
231+
// Use funnel shifts if available to handle the two-word input, which
232+
// can be a single instruction (`shrd` on x86).
233+
for i in 0..U256_WORDS {
234+
if i < (U256_WORDS - 1) {
235+
let hi = words[word_shift + i + 1];
236+
let lo = words[word_shift + i];
237+
ret[i] = hi.funnel_shr(lo, bit_shift);
238+
} else {
239+
ret[i] = words[word_shift + i] >> bit_shift
240+
}
182241
}
183-
self
242+
} else {
243+
// Otherwise, perform the narrowing shift as a combined left and right shift.
244+
// This doesn't get optimized quite as well.
245+
for i in 0..U256_WORDS {
246+
ret[i] = words[word_shift + i] >> bit_shift;
247+
}
248+
249+
if bit_shift != 0 {
250+
let lshift = usize::BITS - bit_shift as u32;
251+
for i in 0..(U256_WORDS - 1) {
252+
ret[i] |= words[word_shift + i + 1] << lshift;
253+
}
254+
}
255+
184256
}
185257
}
186-
};
258+
259+
u256::from_words(ret)
260+
}
187261
}
188262

189263
impl_common!(i256);

0 commit comments

Comments (0)