Skip to content

Commit a35316d

Browse files
committed
libm: Use funnel shifts to speed up u256 shifting
Switch to an algorithm using word-sized operations on an array to do coarse shifts, then funnel shifts for the bit shifts. The result is quite close to what LLVM generates when using native `u256` types.
1 parent abd9905 commit a35316d

3 files changed

Lines changed: 96 additions & 20 deletions

File tree

compiler-builtins/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
#![feature(cfg_target_has_atomic)]
77
#![feature(compiler_builtins)]
88
#![feature(core_intrinsics)]
9+
#![feature(funnel_shifts)]
910
#![feature(linkage)]
10-
#![feature(repr_simd)]
1111
#![feature(macro_metavar_expr_concat)]
12+
#![feature(repr_simd)]
1213
#![feature(rustc_attrs)]
1314
#![cfg_attr(f16_enabled, feature(f16))]
1415
#![cfg_attr(f128_enabled, feature(f128))]

libm/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#![no_std]
33
#![cfg_attr(intrinsics_enabled, allow(internal_features))]
44
#![cfg_attr(intrinsics_enabled, feature(core_intrinsics))]
5+
#![cfg_attr(intrinsics_enabled, feature(funnel_shifts))]
56
#![cfg_attr(
67
all(intrinsics_enabled, target_family = "wasm"),
78
feature(wasm_numeric_instr)

libm/src/math/support/big.rs

Lines changed: 93 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ use core::ops;
88
use super::{DInt, HInt, Int, MinInt};
99

1010
const U128_LO_MASK: u128 = u64::MAX as u128;
11+
const U128_WORDS: usize = (u128::BITS / usize::BITS) as usize;
12+
const U256_WORDS: usize = U128_WORDS * 2;
1113

1214
/// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
1315
#[allow(non_camel_case_types)]
@@ -31,6 +33,26 @@ impl u256 {
3133
hi: self.hi as i128,
3234
}
3335
}
36+
37+
fn to_words(self) -> [usize; U256_WORDS] {
38+
let mut ret = [0usize; U256_WORDS];
39+
for i in 0..U128_WORDS {
40+
let shift = i as u32 * usize::BITS;
41+
ret[i] = (self.lo >> shift) as usize;
42+
ret[i + U128_WORDS] = (self.hi >> shift) as usize;
43+
}
44+
ret
45+
}
46+
47+
fn from_words(words: [usize; U256_WORDS]) -> Self {
48+
let mut ret = u256::ZERO;
49+
for i in 0..U128_WORDS {
50+
let shift = i as u32 * usize::BITS;
51+
ret.lo |= (words[i] as u128) << shift;
52+
ret.hi |= (words[i + U128_WORDS] as u128) << shift;
53+
}
54+
ret
55+
}
3456
}
3557

3658
/// A 256-bit signed integer represented as two 128-bit native-endian limbs.
@@ -156,34 +178,86 @@ macro_rules! impl_common {
156178
self
157179
}
158180
}
181+
};
182+
}
159183

160-
impl ops::Shr<u32> for $ty {
161-
type Output = Self;
184+
impl ops::Shr<u32> for i256 {
185+
type Output = Self;
162186

163-
fn shr(mut self, rhs: u32) -> Self::Output {
164-
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
187+
fn shr(mut self, rhs: u32) -> Self::Output {
188+
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
165189

166-
let half_bits = Self::BITS / 2;
167-
let low_mask = half_bits - 1;
168-
let s = rhs & low_mask;
190+
let half_bits = Self::BITS / 2;
191+
let low_mask = half_bits - 1;
192+
let s = rhs & low_mask;
169193

170-
let lo = self.lo;
171-
let hi = self.hi;
194+
let lo = self.lo;
195+
let hi = self.hi;
172196

173-
self.hi = hi >> s;
197+
self.hi = hi >> s;
174198

175-
#[allow(unused_comparisons)]
176-
if rhs & half_bits == 0 {
177-
self.lo = (hi << (low_mask ^ s) << 1) as _;
178-
self.lo |= lo >> s;
179-
} else {
180-
self.lo = self.hi as _;
181-
self.hi = if hi < 0 { !0 } else { 0 };
199+
#[allow(unused_comparisons)]
200+
if rhs & half_bits == 0 {
201+
self.lo = (hi << (low_mask ^ s) << 1) as _;
202+
self.lo |= lo >> s;
203+
} else {
204+
self.lo = self.hi as _;
205+
self.hi = if hi < 0 { !0 } else { 0 };
206+
}
207+
self
208+
}
209+
}
210+
211+
impl ops::Shr<u32> for u256 {
212+
type Output = Self;
213+
214+
fn shr(self, rhs: u32) -> Self::Output {
215+
debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
216+
217+
// Set up an array with the input in the low half, zeros in the upper half
218+
let mut words = [0usize; U256_WORDS * 2];
219+
words[..U256_WORDS].copy_from_slice(&self.to_words());
220+
221+
let shift = rhs & 255; // limit to 255 in cases of overflow
222+
let word_shift = (shift / usize::BITS) as usize;
223+
let bit_shift = shift % usize::BITS;
224+
225+
let mut ret = [0usize; U256_WORDS];
226+
227+
// Each output word is a coarse (word-sized) shift plus a small bit shift. Note that
228+
// these loops get unrolled.
229+
cfg_if! {
230+
if #[cfg(intrinsics_enabled)] {
231+
// Use funnel shifts if available to handle the two-word input, which
232+
// can be a single instruction (`shrd` on x86).
233+
for i in 0..U256_WORDS {
234+
if i < (U256_WORDS - 1) {
235+
let hi = words[word_shift + i + 1];
236+
let lo = words[word_shift + i];
237+
ret[i] = hi.funnel_shr(lo, bit_shift);
238+
} else {
239+
ret[i] = words[word_shift + i] >> bit_shift
240+
}
182241
}
183-
self
242+
} else {
243+
// Otherwise, perform the narrowing shift as a combined left and right shift.
244+
// This doesn't get optimized quite as well.
245+
for i in 0..U256_WORDS {
246+
ret[i] = words[word_shift + i] >> bit_shift;
247+
}
248+
249+
if bit_shift != 0 {
250+
let lshift = usize::BITS - bit_shift as u32;
251+
for i in 0..(U256_WORDS - 1) {
252+
ret[i] |= words[word_shift + i + 1] << lshift;
253+
}
254+
}
255+
184256
}
185257
}
186-
};
258+
259+
u256::from_words(ret)
260+
}
187261
}
188262

189263
impl_common!(i256);

0 commit comments

Comments (0)