Skip to content

Commit fb548b7

Browse files
committed
libm: Use funnel shifts to speed up u256 shifting
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.
1 parent eb39818 commit fb548b7

3 files changed

Lines changed: 154 additions & 38 deletions

File tree

libm/src/math/support/big.rs

Lines changed: 110 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ mod tests;
66
use core::{fmt, ops};
77

88
use super::{DInt, HInt, Int, MinInt};
9+
use crate::support::Word;
910

1011
const U128_LO_MASK: u128 = u64::MAX as u128;
12+
const U128_WORDS: usize = (u128::BITS / Word::BITS) as usize;
13+
const U256_WORDS: usize = U128_WORDS * 2;
1114

1215
/// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
1316
#[allow(non_camel_case_types)]
@@ -31,6 +34,29 @@ impl u256 {
3134
hi: self.hi as i128,
3235
}
3336
}
37+
38+
/// Split into words, with the least significant word first.
39+
fn to_words(self) -> [Word; U256_WORDS] {
40+
// The result with 64-bit words will be: [lo.lo(), lo.hi(), hi.lo(), hi.hi()].
41+
let mut ret: [Word; _] = [0; U256_WORDS];
42+
for i in 0..U128_WORDS {
43+
let shift = i as u32 * Word::BITS;
44+
ret[i] = (self.lo >> shift) as Word;
45+
ret[i + U128_WORDS] = (self.hi >> shift) as Word;
46+
}
47+
ret
48+
}
49+
50+
/// Perform the opposite of [`to_words`].
51+
fn from_words(words: [Word; U256_WORDS]) -> Self {
52+
let mut ret = u256::ZERO;
53+
for i in 0..U128_WORDS {
54+
let shift = i as u32 * usize::BITS;
55+
ret.lo |= (words[i] as u128) << shift;
56+
ret.hi |= (words[i + U128_WORDS] as u128) << shift;
57+
}
58+
ret
59+
}
3460
}
3561

3662
/// A 256-bit signed integer represented as two 128-bit native-endian limbs.
@@ -58,6 +84,16 @@ impl i256 {
5884
hi: self.hi as u128,
5985
}
6086
}
87+
88+
/// Split into words, with the least significant word first.
89+
fn to_words(self) -> [Word; U256_WORDS] {
90+
self.unsigned().to_words()
91+
}
92+
93+
/// Perform the opposite of [`to_words`].
94+
fn from_words(words: [Word; U256_WORDS]) -> Self {
95+
u256::from_words(words).signed()
96+
}
6197
}
6298

6399
impl MinInt for u256 {
@@ -130,59 +166,95 @@ macro_rules! impl_common {
130166
}
131167
}
132168

133-
impl ops::Shl<u32> for $ty {
169+
impl ops::Shr<u32> for $ty {
134170
type Output = Self;
135171

136-
fn shl(mut self, rhs: u32) -> Self::Output {
137-
debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
138-
139-
let half_bits = Self::BITS / 2;
140-
let low_mask = half_bits - 1;
141-
let s = rhs & low_mask;
142-
143-
let lo = self.lo;
144-
let hi = self.hi;
172+
fn shr(self, rhs: u32) -> Self::Output {
173+
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
145174

146-
self.lo = lo << s;
175+
// Set up an array with the input in the low half, zeros in the upper half
176+
let mut words: [Word; U256_WORDS * 2] = [0; _];
177+
words[..U256_WORDS].copy_from_slice(&self.to_words());
178+
179+
if <$ty>::SIGNED {
180+
// For i256, branchlessly set the upper words to all ones if the input
181+
// is negative.
182+
let top_word = words[U256_WORDS - 1].signed() >> (Word::BITS - 1);
183+
for x in &mut words[U256_WORDS..] {
184+
*x = top_word.unsigned();
185+
}
186+
}
147187

148-
if rhs & half_bits == 0 {
149-
self.hi = (lo >> (low_mask ^ s) >> 1) as _;
150-
self.hi |= hi << s;
151-
} else {
152-
self.hi = self.lo as _;
153-
self.lo = 0;
188+
let shift = rhs & 255; // limit to 255 in cases of overflow
189+
let word_shift = (shift / Word::BITS) as usize;
190+
let bit_shift = shift % Word::BITS;
191+
192+
let mut ret: [Word; U256_WORDS] = [0; _];
193+
194+
// Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
195+
// these loops get unrolled.
196+
for i in 0..U256_WORDS {
197+
if i < (U256_WORDS - 1) {
198+
let hi = words[word_shift + i + 1];
199+
let lo = words[word_shift + i];
200+
201+
ret[i] = <Word as HInt>::funnel_shr(hi, lo, bit_shift);
202+
} else if <$ty>::SIGNED {
203+
// The upper word doesn't get any sign bits via a funnel shift, so we need
204+
// an arithmetic shift to preserve sign.
205+
let mut x = words[word_shift + i].signed();
206+
x >>= bit_shift;
207+
ret[i] = x.unsigned();
208+
} else {
209+
ret[i] = words[word_shift + i] >> bit_shift;
210+
}
154211
}
155-
self
212+
213+
<$ty>::from_words(ret)
156214
}
157215
}
216+
};
217+
}
158218

159-
impl ops::Shr<u32> for $ty {
160-
type Output = Self;
219+
impl ops::Shl<u32> for u256 {
220+
type Output = Self;
161221

162-
fn shr(mut self, rhs: u32) -> Self::Output {
163-
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
222+
fn shl(self, rhs: u32) -> Self::Output {
223+
debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
164224

165-
let half_bits = Self::BITS / 2;
166-
let low_mask = half_bits - 1;
167-
let s = rhs & low_mask;
225+
// Set up an array with the input in the low half, zeros in the upper half
226+
let mut words: [Word; U256_WORDS * 2] = [0; _];
227+
words[U256_WORDS..].copy_from_slice(&self.to_words());
168228

169-
let lo = self.lo;
170-
let hi = self.hi;
229+
let shift = rhs & 255; // limit to 255 in cases of overflow
230+
let word_shift = U256_WORDS - (shift / Word::BITS) as usize;
231+
let bit_shift = shift % Word::BITS;
171232

172-
self.hi = hi >> s;
233+
let mut ret: [Word; U256_WORDS] = [0; _];
173234

174-
#[allow(unused_comparisons)]
175-
if rhs & half_bits == 0 {
176-
self.lo = (hi << (low_mask ^ s) << 1) as _;
177-
self.lo |= lo >> s;
178-
} else {
179-
self.lo = self.hi as _;
180-
self.hi = if hi < 0 { !0 } else { 0 };
181-
}
182-
self
235+
// Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
236+
// these loops get unrolled.
237+
for i in 0..U256_WORDS {
238+
if i == 0 {
239+
ret[i] = words[word_shift + i] << bit_shift;
240+
} else {
241+
let hi = words[word_shift + i];
242+
let lo = words[word_shift + i - 1];
243+
244+
ret[i] = <Word as HInt>::funnel_shl(hi, lo, bit_shift);
183245
}
184246
}
185-
};
247+
248+
u256::from_words(ret)
249+
}
250+
}
251+
252+
impl ops::Shl<u32> for i256 {
253+
type Output = Self;
254+
255+
fn shl(self, rhs: u32) -> Self::Output {
256+
(self.unsigned() << rhs).signed()
257+
}
186258
}
187259

188260
impl_common!(i256);

libm/src/math/support/int_traits.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,24 @@ pub trait HInt: Int {
344344
fn zero_widen_mul(self, rhs: Self) -> Self::D;
345345
/// Widening multiplication. This cannot overflow.
346346
fn widen_mul(self, rhs: Self) -> Self::D;
347+
348+
// FIXME(msrv): Use funnel shifts from `core` as a trait on `Int` when available.
349+
350+
/// Concatenate `self` and `right`, shift by `shift`, and return the upper half.
351+
fn funnel_shl(self, right: Self, shift: u32) -> Self {
352+
assert!(!Self::SIGNED, "unsupported for signed integers");
353+
assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
354+
let n = Self::D::from_lo_hi(right, self);
355+
(n << shift).hi()
356+
}
357+
358+
/// Concatenate `self` and `right`, shift by `shift`, and return the lower half.
359+
fn funnel_shr(self, right: Self, shift: u32) -> Self {
360+
assert!(!Self::SIGNED, "unsupported for signed integers");
361+
assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
362+
let n = Self::D::from_lo_hi(right, self);
363+
(n >> shift).lo()
364+
}
347365
}
348366

349367
macro_rules! impl_d_int {

libm/src/math/support/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,21 @@ pub use hex_float::{hf32, hf64};
3232
pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
3333
pub use modular::linear_mul_reduction;
3434

35+
/// Concrete sized integer compatible with `usize` (exists for using `DInt`/`HInt`).
#[cfg(target_pointer_width = "16")]
pub type Word = u16;

/// Concrete sized integer compatible with `usize` (exists for using `DInt`/`HInt`).
#[cfg(target_pointer_width = "32")]
pub type Word = u32;

/// Concrete sized integer compatible with `usize` (exists for using `DInt`/`HInt`).
#[cfg(target_pointer_width = "64")]
pub type Word = u64;

// Any other pointer width has no matching alias above, so fail the build explicitly.
#[cfg(not(any(
    target_pointer_width = "16",
    target_pointer_width = "32",
    target_pointer_width = "64"
)))]
compile_error!("unsupported pointer width");
49+
3550
/// Hint to the compiler that the current path is cold.
3651
pub fn cold_path() {
3752
#[cfg(intrinsics_enabled)]
@@ -70,3 +85,14 @@ pub unsafe fn unchecked_div_isize(x: isize, y: isize) -> isize {
7085
}
7186
}
7287
}
88+
89+
#[cfg(test)]
mod tests {
    use super::*;

    /// `Word` is selected by pointer width, so its size and alignment must match `usize`.
    #[test]
    fn word_size() {
        let (word_size, word_align) = (size_of::<Word>(), align_of::<Word>());
        assert_eq!(word_size, size_of::<usize>());
        assert_eq!(word_align, align_of::<usize>());
    }
}

0 commit comments

Comments
 (0)