Skip to content

Commit 8ec4cf4

Browse files
committed
Replace zerocopy::transmute! with unsafe transmute
Code gen is identical and benchmarks unaffected.
1 parent 4ccd0c0 commit 8ec4cf4

1 file changed

Lines changed: 44 additions & 33 deletions

File tree

src/distr/integer.rs

Lines changed: 44 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -107,21 +107,50 @@ impl_nzint!(NonZeroI64, NonZeroI64::new);
107107
impl_nzint!(NonZeroI128, NonZeroI128::new);
108108

109109
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
110-
macro_rules! x86_intrinsic_impl {
111-
($meta:meta, $($intrinsic:ident),+) => {$(
112-
#[cfg($meta)]
113-
impl Distribution<$intrinsic> for StandardUniform {
114-
#[inline]
115-
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $intrinsic {
116-
// On proper hardware, this should compile to SIMD instructions
117-
// Verified on x86 Haswell with __m128i, __m256i
118-
let mut buf = [0_u8; core::mem::size_of::<$intrinsic>()];
119-
rng.fill_bytes(&mut buf);
120-
// x86 is little endian so no need for conversion
121-
zerocopy::transmute!(buf)
122-
}
123-
}
124-
)+};
110+
impl Distribution<__m128i> for StandardUniform {
111+
#[inline]
112+
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> __m128i {
113+
// NOTE: It's tempting to use the u128 impl here, but confusingly this
114+
// results in different code (return via rdx, r10 instead of rax, rdx
115+
// with u128 impl) and is much slower (+130 time). This version calls
116+
// impls::fill_bytes_via_next but performs well.
117+
118+
let mut buf = [0_u8; core::mem::size_of::<__m128i>()];
119+
rng.fill_bytes(&mut buf);
120+
// x86 is little endian so no need for conversion
121+
122+
// SAFETY: both source and result types are valid for all values.
123+
unsafe { core::mem::transmute(buf) }
124+
}
125+
}
126+
127+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
128+
impl Distribution<__m256i> for StandardUniform {
129+
#[inline]
130+
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> __m256i {
131+
let mut buf = [0_u8; core::mem::size_of::<__m256i>()];
132+
rng.fill_bytes(&mut buf);
133+
// x86 is little endian so no need for conversion
134+
135+
// SAFETY: both source and result types are valid for all values.
136+
unsafe { core::mem::transmute(buf) }
137+
}
138+
}
139+
140+
#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    feature = "simd_support"
))]
impl Distribution<__m512i> for StandardUniform {
    /// Produces a `__m512i` whose bytes are taken directly from
    /// `rng.fill_bytes`.
    ///
    /// A byte array exactly as large as `__m512i` is filled by the generator
    /// and then reinterpreted as the vector type.
    #[inline]
    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> __m512i {
        const WIDTH: usize = core::mem::size_of::<__m512i>();

        let mut bytes = [0_u8; WIDTH];
        rng.fill_bytes(&mut bytes);

        // x86 is little endian, so the bytes are used as-is without any
        // endianness conversion.

        // SAFETY: both source and result types are valid for all values, and
        // the array length is defined as the size of `__m512i`, so the two
        // types have identical size.
        unsafe { core::mem::transmute::<[u8; WIDTH], __m512i>(bytes) }
    }
}
126155

127156
#[cfg(feature = "simd_support")]
@@ -148,24 +177,6 @@ macro_rules! simd_impl {
148177
#[cfg(feature = "simd_support")]
149178
simd_impl!(u8, i8, u16, i16, u32, i32, u64, i64);
150179

151-
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
152-
x86_intrinsic_impl!(
153-
any(target_arch = "x86", target_arch = "x86_64"),
154-
__m128i,
155-
__m256i
156-
);
157-
#[cfg(all(
158-
any(target_arch = "x86", target_arch = "x86_64"),
159-
feature = "simd_support"
160-
))]
161-
x86_intrinsic_impl!(
162-
all(
163-
any(target_arch = "x86", target_arch = "x86_64"),
164-
feature = "simd_support"
165-
),
166-
__m512i
167-
);
168-
169180
#[cfg(test)]
170181
mod tests {
171182
use super::*;

0 commit comments

Comments
 (0)