Skip to content

Commit 65a1694

Browse files
authored
sha2: remove macros from sha256/x86_sha (#853)
Additionally, performs a minor refactoring.
1 parent bb10c3a commit 65a1694

3 files changed

Lines changed: 58 additions & 88 deletions

File tree

sha2/src/consts.rs

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
#![allow(dead_code)]
2-
31
pub(crate) type State256 = [u32; 8];
42
pub(crate) type State512 = [u64; 8];
53

@@ -68,25 +66,3 @@ pub(crate) const K64: [u64; 80] = [
6866
0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,
6967
0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817,
7068
];
71-
72-
/// Swapped round constants for SHA-256 family of digests
73-
pub(crate) static K32X4: [[u32; 4]; 16] = {
74-
let mut res = [[0u32; 4]; 16];
75-
let mut i = 0;
76-
while i < 16 {
77-
res[i] = [K32[4 * i + 3], K32[4 * i + 2], K32[4 * i + 1], K32[4 * i]];
78-
i += 1;
79-
}
80-
res
81-
};
82-
83-
/// Swapped round constants for SHA-512 family of digests
84-
pub(crate) const K64X2: [[u64; 2]; 40] = {
85-
let mut res = [[0u64; 2]; 40];
86-
let mut i = 0;
87-
while i < 16 {
88-
res[i] = [K64[4 * i + 1], K64[4 * i]];
89-
i += 1;
90-
}
91-
res
92-
};

sha2/src/sha256.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,9 @@ cfg_if::cfg_if! {
2020

2121
#[cfg(not(all(
2222
target_feature = "sha",
23-
target_feature = "sse2",
24-
target_feature = "ssse3",
2523
target_feature = "sse4.1",
2624
)))]
27-
compile_error!("x86-sha backend requires sha, sse2, ssse3, sse4.1 target features");
25+
compile_error!("x86-sha backend requires sha and sse4.1 target features");
2826

2927
fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
3028
// SAFETY: we checked above that the required target features are enabled
@@ -52,7 +50,7 @@ cfg_if::cfg_if! {
5250
cfg_if::cfg_if! {
5351
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
5452
mod x86_sha;
55-
cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");
53+
cpufeatures::new!(shani_cpuid, "sha", "sse4.1");
5654
} else if #[cfg(target_arch = "aarch64")] {
5755
mod aarch64_sha2;
5856
cpufeatures::new!(sha2_hwcap, "sha2");

sha2/src/sha256/x86_sha.rs

Lines changed: 56 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -10,47 +10,57 @@ use core::arch::x86::*;
1010
#[cfg(target_arch = "x86_64")]
1111
use core::arch::x86_64::*;
1212

13-
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
14-
unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
15-
let t1 = _mm_sha256msg1_epu32(v0, v1);
16-
let t2 = _mm_alignr_epi8(v3, v2, 4);
17-
let t3 = _mm_add_epi32(t1, t2);
18-
_mm_sha256msg2_epu32(t3, v3)
13+
#[target_feature(enable = "sha")]
14+
unsafe fn rounds4(r: usize, abef: &mut __m128i, cdgh: &mut __m128i, rest: __m128i) {
15+
use crate::consts::K32;
16+
let rk = _mm_set_epi32(
17+
K32[4 * r + 3] as i32,
18+
K32[4 * r + 2] as i32,
19+
K32[4 * r + 1] as i32,
20+
K32[4 * r] as i32,
21+
);
22+
let t1 = _mm_add_epi32(rest, rk);
23+
*cdgh = _mm_sha256rnds2_epu32(*cdgh, *abef, t1);
24+
let t2 = _mm_shuffle_epi32(t1, 0x0E);
25+
*abef = _mm_sha256rnds2_epu32(*abef, *cdgh, t2);
1926
}
2027

21-
macro_rules! rounds4 {
22-
($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
23-
let k = crate::consts::K32X4[$i];
24-
let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
25-
let t1 = _mm_add_epi32($rest, kv);
26-
$cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
27-
let t2 = _mm_shuffle_epi32(t1, 0x0E);
28-
$abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
29-
}};
28+
#[target_feature(enable = "sha,ssse3")]
29+
unsafe fn schedule_rounds16(
30+
r: usize,
31+
abef: &mut __m128i,
32+
cdgh: &mut __m128i,
33+
w: &mut [__m128i; 4],
34+
) {
35+
for i in 0..4 {
36+
let w0 = w[i];
37+
let w1 = w[(i + 1) % 4];
38+
let w2 = w[(i + 2) % 4];
39+
let w3 = w[(i + 3) % 4];
40+
41+
let t1 = _mm_sha256msg1_epu32(w0, w1);
42+
let t2 = _mm_alignr_epi8(w3, w2, 4);
43+
let t3 = _mm_add_epi32(t1, t2);
44+
45+
w[i] = _mm_sha256msg2_epu32(t3, w3);
46+
47+
rounds4(r + i, abef, cdgh, w[i]);
48+
}
3049
}
3150

32-
macro_rules! schedule_rounds4 {
33-
(
34-
$abef:ident, $cdgh:ident,
35-
$w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
36-
$i: expr
37-
) => {{
38-
$w4 = schedule($w0, $w1, $w2, $w3);
39-
rounds4!($abef, $cdgh, $w4, $i);
40-
}};
/// Loads one 64-byte block as four `__m128i` words, reversing the four
/// bytes of every 32-bit lane (big-endian message order -> native order).
#[target_feature(enable = "ssse3")]
unsafe fn read_block(block: &[u8; 64]) -> [__m128i; 4] {
    // Unaligned loads are used, so no alignment requirement on `block`.
    let src: *const __m128i = block.as_ptr().cast();
    // PSHUFB mask performing a per-lane 32-bit byte swap.
    let bswap32 = _mm_set_epi64x(0x0C0D_0E0F_0809_0A0B, 0x0405_0607_0001_0203);
    core::array::from_fn(|i| _mm_shuffle_epi8(_mm_loadu_si128(src.add(i)), bswap32))
}
4260

43-
// we use unaligned loads with `__m128i` pointers
44-
#[allow(clippy::cast_ptr_alignment)]
45-
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
61+
#[target_feature(enable = "sha,sse4.1")]
4662
pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
47-
#[allow(non_snake_case)]
48-
let MASK: __m128i = _mm_set_epi64x(
49-
0x0C0D_0E0F_0809_0A0Bu64 as i64,
50-
0x0405_0607_0001_0203u64 as i64,
51-
);
52-
53-
let state_ptr: *const __m128i = state.as_ptr().cast();
63+
let state_ptr: *mut __m128i = state.as_mut_ptr().cast();
5464
let dcba = _mm_loadu_si128(state_ptr.add(0));
5565
let hgfe = _mm_loadu_si128(state_ptr.add(1));
5666

@@ -63,29 +73,16 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
6373
let abef_save = abef;
6474
let cdgh_save = cdgh;
6575

66-
let block_ptr: *const __m128i = block.as_ptr().cast();
67-
let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(0)), MASK);
68-
let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(1)), MASK);
69-
let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(2)), MASK);
70-
let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(block_ptr.add(3)), MASK);
71-
let mut w4;
72-
73-
rounds4!(abef, cdgh, w0, 0);
74-
rounds4!(abef, cdgh, w1, 1);
75-
rounds4!(abef, cdgh, w2, 2);
76-
rounds4!(abef, cdgh, w3, 3);
77-
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
78-
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
79-
schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
80-
schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
81-
schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
82-
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
83-
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
84-
schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
85-
schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
86-
schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
87-
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
88-
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
76+
let mut w = read_block(block);
77+
78+
rounds4(0, &mut abef, &mut cdgh, w[0]);
79+
rounds4(1, &mut abef, &mut cdgh, w[1]);
80+
rounds4(2, &mut abef, &mut cdgh, w[2]);
81+
rounds4(3, &mut abef, &mut cdgh, w[3]);
82+
83+
schedule_rounds16(4, &mut abef, &mut cdgh, &mut w);
84+
schedule_rounds16(8, &mut abef, &mut cdgh, &mut w);
85+
schedule_rounds16(12, &mut abef, &mut cdgh, &mut w);
8986

9087
abef = _mm_add_epi32(abef, abef_save);
9188
cdgh = _mm_add_epi32(cdgh, cdgh_save);
@@ -96,7 +93,6 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
9693
let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
9794
let hgef = _mm_alignr_epi8(dchg, feba, 8);
9895

99-
let state_ptr_mut: *mut __m128i = state.as_mut_ptr().cast();
100-
_mm_storeu_si128(state_ptr_mut.add(0), dcba);
101-
_mm_storeu_si128(state_ptr_mut.add(1), hgef);
96+
_mm_storeu_si128(state_ptr.add(0), dcba);
97+
_mm_storeu_si128(state_ptr.add(1), hgef);
10298
}

0 commit comments

Comments (0)