Skip to content

Commit 7fc84f4

Browse files
committed
libm: Use funnel shifts to speed up u256 shifting
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.
1 parent eb39818 commit 7fc84f4

3 files changed

Lines changed: 164 additions & 38 deletions

File tree

libm/src/math/support/big.rs

Lines changed: 120 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ mod tests;
66
use core::{fmt, ops};
77

88
use super::{DInt, HInt, Int, MinInt};
9+
use crate::support::Word;
910

1011
const U128_LO_MASK: u128 = u64::MAX as u128;
12+
const U128_WORDS: usize = (u128::BITS / Word::BITS) as usize;
13+
const U256_WORDS: usize = U128_WORDS * 2;
1114

1215
/// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
1316
#[allow(non_camel_case_types)]
@@ -31,6 +34,29 @@ impl u256 {
3134
hi: self.hi as i128,
3235
}
3336
}
37+
38+
/// Split into words, with the least significant word first.
39+
fn to_words(self) -> [Word; U256_WORDS] {
40+
// The result with 64-bit words will be: [lo.lo(), lo.hi(), hi.lo(), hi.hi()].
41+
let mut ret: [Word; _] = [0; U256_WORDS];
42+
for i in 0..U128_WORDS {
43+
let shift = i as u32 * Word::BITS;
44+
ret[i] = (self.lo >> shift) as Word;
45+
ret[i + U128_WORDS] = (self.hi >> shift) as Word;
46+
}
47+
ret
48+
}
49+
50+
/// Perform the opposite of [`Self::to_words`]: reassemble from words, least significant word first.
51+
fn from_words(words: [Word; U256_WORDS]) -> Self {
52+
let mut ret = u256::ZERO;
53+
for i in 0..U128_WORDS {
54+
let shift = i as u32 * usize::BITS;
55+
ret.lo |= (words[i] as u128) << shift;
56+
ret.hi |= (words[i + U128_WORDS] as u128) << shift;
57+
}
58+
ret
59+
}
3460
}
3561

3662
/// A 256-bit signed integer represented as two 128-bit native-endian limbs.
@@ -58,6 +84,26 @@ impl i256 {
5884
hi: self.hi as u128,
5985
}
6086
}
87+
88+
fn to_words(self) -> [Word; U256_WORDS] {
89+
let mut ret: [Word; _] = [0; U256_WORDS];
90+
for i in 0..U128_WORDS {
91+
let shift = i as u32 * Word::BITS;
92+
ret[i] = (self.lo >> shift) as Word;
93+
ret[i + U128_WORDS] = (self.hi >> shift) as Word;
94+
}
95+
ret
96+
}
97+
98+
fn from_words(words: [Word; U256_WORDS]) -> Self {
99+
let mut ret = u256::ZERO;
100+
for i in 0..U128_WORDS {
101+
let shift = i as u32 * usize::BITS;
102+
ret.lo |= (words[i] as u128) << shift;
103+
ret.hi |= (words[i + U128_WORDS] as u128) << shift;
104+
}
105+
ret.signed()
106+
}
61107
}
62108

63109
impl MinInt for u256 {
@@ -130,59 +176,95 @@ macro_rules! impl_common {
130176
}
131177
}
132178

133-
impl ops::Shl<u32> for $ty {
179+
impl ops::Shr<u32> for $ty {
134180
type Output = Self;
135181

136-
fn shl(mut self, rhs: u32) -> Self::Output {
137-
debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
138-
139-
let half_bits = Self::BITS / 2;
140-
let low_mask = half_bits - 1;
141-
let s = rhs & low_mask;
142-
143-
let lo = self.lo;
144-
let hi = self.hi;
182+
fn shr(self, rhs: u32) -> Self::Output {
183+
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
145184

146-
self.lo = lo << s;
185+
// Set up an array with the input in the low half, zeros in the upper half
186+
let mut words: [Word; _] = [0; U256_WORDS * 2];
187+
words[..U256_WORDS].copy_from_slice(&self.to_words());
188+
189+
if <$ty>::SIGNED {
190+
// For i256, branchlessly set the upper words to all ones if the input
191+
// is negative.
192+
let top_word = words[U256_WORDS - 1].cast_signed() >> (Word::BITS - 1);
193+
for x in &mut words[U256_WORDS..] {
194+
*x = top_word.cast_unsigned();
195+
}
196+
}
147197

148-
if rhs & half_bits == 0 {
149-
self.hi = (lo >> (low_mask ^ s) >> 1) as _;
150-
self.hi |= hi << s;
151-
} else {
152-
self.hi = self.lo as _;
153-
self.lo = 0;
198+
let shift = rhs & 255; // limit to 255 in cases of overflow
199+
let word_shift = (shift / Word::BITS) as usize;
200+
let bit_shift = shift % Word::BITS;
201+
202+
let mut ret: [Word; _] = [0; U256_WORDS];
203+
204+
// Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
205+
// these loops get unrolled.
206+
for i in 0..U256_WORDS {
207+
if i < (U256_WORDS - 1) {
208+
let hi = words[word_shift + i + 1];
209+
let lo = words[word_shift + i];
210+
211+
ret[i] = <Word as HInt>::funnel_shr(hi, lo, bit_shift);
212+
} else if <$ty>::SIGNED {
213+
// The upper word doesn't get any sign bits via a funnel shift, so we need
214+
// an arithmetic shift to preserve sign.
215+
let mut x = words[word_shift + i].cast_signed();
216+
x >>= bit_shift;
217+
ret[i] = x.cast_unsigned();
218+
} else {
219+
ret[i] = words[word_shift + i] >> bit_shift;
220+
}
154221
}
155-
self
222+
223+
<$ty>::from_words(ret)
156224
}
157225
}
226+
};
227+
}
158228

159-
impl ops::Shr<u32> for $ty {
160-
type Output = Self;
229+
impl ops::Shl<u32> for u256 {
230+
type Output = Self;
161231

162-
fn shr(mut self, rhs: u32) -> Self::Output {
163-
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
232+
fn shl(self, rhs: u32) -> Self::Output {
233+
debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
164234

165-
let half_bits = Self::BITS / 2;
166-
let low_mask = half_bits - 1;
167-
let s = rhs & low_mask;
235+
// Set up an array with the input in the upper half, zeros in the lower half
236+
let mut words: [Word; _] = [0; U256_WORDS * 2];
237+
words[U256_WORDS..].copy_from_slice(&self.to_words());
168238

169-
let lo = self.lo;
170-
let hi = self.hi;
239+
let shift = rhs & 255; // limit to 255 in cases of overflow
240+
let word_shift = U256_WORDS - (shift / Word::BITS) as usize;
241+
let bit_shift = shift % Word::BITS;
171242

172-
self.hi = hi >> s;
243+
let mut ret: [Word; _] = [0; U256_WORDS];
173244

174-
#[allow(unused_comparisons)]
175-
if rhs & half_bits == 0 {
176-
self.lo = (hi << (low_mask ^ s) << 1) as _;
177-
self.lo |= lo >> s;
178-
} else {
179-
self.lo = self.hi as _;
180-
self.hi = if hi < 0 { !0 } else { 0 };
181-
}
182-
self
245+
// Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
246+
// these loops get unrolled.
247+
for i in 0..U256_WORDS {
248+
if i == 0 {
249+
ret[i] = words[word_shift + i] << bit_shift;
250+
} else {
251+
let hi = words[word_shift + i];
252+
let lo = words[word_shift + i - 1];
253+
254+
ret[i] = <Word as HInt>::funnel_shl(hi, lo, bit_shift);
183255
}
184256
}
185-
};
257+
258+
u256::from_words(ret)
259+
}
260+
}
261+
262+
impl ops::Shl<u32> for i256 {
263+
type Output = Self;
264+
265+
fn shl(self, rhs: u32) -> Self::Output {
266+
(self.unsigned() << rhs).signed()
267+
}
186268
}
187269

188270
impl_common!(i256);

libm/src/math/support/int_traits.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,24 @@ pub trait HInt: Int {
344344
fn zero_widen_mul(self, rhs: Self) -> Self::D;
345345
/// Widening multiplication. This cannot overflow.
346346
fn widen_mul(self, rhs: Self) -> Self::D;
347+
348+
// FIXME(msrv): Use funnel shifts from `core` as a trait on `Int` when available.
349+
350+
/// Concatenate `self` (upper half) and `right` (lower half), shift left by `shift`, and return the upper half.
351+
fn funnel_shl(self, right: Self, shift: u32) -> Self {
352+
assert!(!Self::SIGNED, "unsupported for signed integers");
353+
assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
354+
let n = Self::D::from_lo_hi(right, self);
355+
(n << shift).hi()
356+
}
357+
358+
/// Concatenate `self` (upper half) and `right` (lower half), shift right by `shift`, and return the lower half.
359+
fn funnel_shr(self, right: Self, shift: u32) -> Self {
360+
assert!(!Self::SIGNED, "unsupported for signed integers");
361+
assert!(shift < Self::BITS, "attempt to funnel shift with overflow");
362+
let n = Self::D::from_lo_hi(right, self);
363+
(n >> shift).lo()
364+
}
347365
}
348366

349367
macro_rules! impl_d_int {

libm/src/math/support/mod.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,21 @@ pub use hex_float::{hf32, hf64};
3232
pub use int_traits::{CastFrom, CastInto, DInt, HInt, Int, MinInt, NarrowingDiv};
3333
pub use modular::linear_mul_reduction;
3434

35+
cfg_if! {
36+
if #[cfg(target_pointer_width = "16")] {
37+
/// Concrete sized integer compatible with `usize` (exists for using `DInt`/`HInt`).
38+
pub type Word = u16;
39+
} else if #[cfg(target_pointer_width = "32")] {
40+
/// Concrete sized integer compatible with `usize` (exists for using `DInt`/`HInt`).
41+
pub type Word = u32;
42+
} else if #[cfg(target_pointer_width = "64")] {
43+
/// Concrete sized integer compatible with `usize` (exists for using `DInt`/`HInt`).
44+
pub type Word = u64;
45+
} else {
46+
compile_error!("unsupported pointer width");
47+
}
48+
}
49+
3550
/// Hint to the compiler that the current path is cold.
3651
pub fn cold_path() {
3752
#[cfg(intrinsics_enabled)]
@@ -70,3 +85,14 @@ pub unsafe fn unchecked_div_isize(x: isize, y: isize) -> isize {
7085
}
7186
}
7287
}
88+
89+
#[cfg(test)]
90+
mod tests {
91+
use super::*;
92+
93+
#[test]
94+
fn word_size() {
95+
assert_eq!(size_of::<Word>(), size_of::<usize>());
96+
assert_eq!(align_of::<Word>(), align_of::<usize>());
97+
}
98+
}

0 commit comments

Comments
 (0)