@@ -10,47 +10,57 @@ use core::arch::x86::*;
1010#[ cfg( target_arch = "x86_64" ) ]
1111use core:: arch:: x86_64:: * ;
1212
13- #[ target_feature( enable = "sha,sse2,ssse3,sse4.1" ) ]
14- unsafe fn schedule ( v0 : __m128i , v1 : __m128i , v2 : __m128i , v3 : __m128i ) -> __m128i {
15- let t1 = _mm_sha256msg1_epu32 ( v0, v1) ;
16- let t2 = _mm_alignr_epi8 ( v3, v2, 4 ) ;
17- let t3 = _mm_add_epi32 ( t1, t2) ;
18- _mm_sha256msg2_epu32 ( t3, v3)
13+ #[ target_feature( enable = "sha" ) ]
14+ unsafe fn rounds4 ( r : usize , abef : & mut __m128i , cdgh : & mut __m128i , rest : __m128i ) {
15+ use crate :: consts:: K32 ;
16+ let rk = _mm_set_epi32 (
17+ K32 [ 4 * r + 3 ] as i32 ,
18+ K32 [ 4 * r + 2 ] as i32 ,
19+ K32 [ 4 * r + 1 ] as i32 ,
20+ K32 [ 4 * r] as i32 ,
21+ ) ;
22+ let t1 = _mm_add_epi32 ( rest, rk) ;
23+ * cdgh = _mm_sha256rnds2_epu32 ( * cdgh, * abef, t1) ;
24+ let t2 = _mm_shuffle_epi32 ( t1, 0x0E ) ;
25+ * abef = _mm_sha256rnds2_epu32 ( * abef, * cdgh, t2) ;
1926}
2027
21- macro_rules! rounds4 {
22- ( $abef: ident, $cdgh: ident, $rest: expr, $i: expr) => { {
23- let k = crate :: consts:: K32X4 [ $i] ;
24- let kv = _mm_set_epi32( k[ 0 ] as i32 , k[ 1 ] as i32 , k[ 2 ] as i32 , k[ 3 ] as i32 ) ;
25- let t1 = _mm_add_epi32( $rest, kv) ;
26- $cdgh = _mm_sha256rnds2_epu32( $cdgh, $abef, t1) ;
27- let t2 = _mm_shuffle_epi32( t1, 0x0E ) ;
28- $abef = _mm_sha256rnds2_epu32( $abef, $cdgh, t2) ;
29- } } ;
28+ #[ target_feature( enable = "sha,ssse3" ) ]
29+ unsafe fn schedule_rounds16 (
30+ r : usize ,
31+ abef : & mut __m128i ,
32+ cdgh : & mut __m128i ,
33+ w : & mut [ __m128i ; 4 ] ,
34+ ) {
35+ for i in 0 ..4 {
36+ let w0 = w[ i] ;
37+ let w1 = w[ ( i + 1 ) % 4 ] ;
38+ let w2 = w[ ( i + 2 ) % 4 ] ;
39+ let w3 = w[ ( i + 3 ) % 4 ] ;
40+
41+ let t1 = _mm_sha256msg1_epu32 ( w0, w1) ;
42+ let t2 = _mm_alignr_epi8 ( w3, w2, 4 ) ;
43+ let t3 = _mm_add_epi32 ( t1, t2) ;
44+
45+ w[ i] = _mm_sha256msg2_epu32 ( t3, w3) ;
46+
47+ rounds4 ( r + i, abef, cdgh, w[ i] ) ;
48+ }
3049}
3150
#[target_feature(enable = "ssse3")]
unsafe fn read_block(block: &[u8; 64]) -> [__m128i; 4] {
    // pshufb mask that reverses the bytes within each 32-bit lane, converting
    // the block's big-endian words into host lane order.
    let mask = _mm_set_epi64x(0x0C0D_0E0F_0809_0A0B, 0x0405_0607_0001_0203);
    let ptr = block.as_ptr().cast::<__m128i>();
    let mut words = [_mm_setzero_si128(); 4];
    for (i, word) in words.iter_mut().enumerate() {
        // Unaligned load: the block slice has no 16-byte alignment guarantee.
        *word = _mm_shuffle_epi8(_mm_loadu_si128(ptr.add(i)), mask);
    }
    words
}
4260
// NOTE(review): commit-diff excerpt. The lines between the two hunks (the
// per-block loop header and the derivation of the working abef/cdgh state
// from the loaded dcba/hgfe vectors) are NOT visible here, so this region is
// left byte-identical; only review comments are added.
43- // we use unaligned loads with `__m128i` pointers
44- #[ allow( clippy:: cast_ptr_alignment) ]
45- #[ target_feature( enable = "sha,sse2,ssse3,sse4.1" ) ]
61+ #[ target_feature( enable = "sha,sse4.1" ) ]
4662pub ( super ) unsafe fn compress ( state : & mut [ u32 ; 8 ] , blocks : & [ [ u8 ; 64 ] ] ) {
47- #[ allow( non_snake_case) ]
48- let MASK : __m128i = _mm_set_epi64x (
49- 0x0C0D_0E0F_0809_0A0Bu64 as i64 ,
50- 0x0405_0607_0001_0203u64 as i64 ,
51- ) ;
52-
53- let state_ptr: * const __m128i = state. as_ptr ( ) . cast ( ) ;
// New side reuses one *mut pointer for both the initial load and the final
// store (the old code built a second, mutable pointer for the store).
63+ let state_ptr: * mut __m128i = state. as_mut_ptr ( ) . cast ( ) ;
5464 let dcba = _mm_loadu_si128 ( state_ptr. add ( 0 ) ) ;
5565 let hgfe = _mm_loadu_si128 ( state_ptr. add ( 1 ) ) ;
5666
@@ -63,29 +73,16 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
6373 let abef_save = abef;
6474 let cdgh_save = cdgh;
6575
66- let block_ptr: * const __m128i = block. as_ptr ( ) . cast ( ) ;
67- let mut w0 = _mm_shuffle_epi8 ( _mm_loadu_si128 ( block_ptr. add ( 0 ) ) , MASK ) ;
68- let mut w1 = _mm_shuffle_epi8 ( _mm_loadu_si128 ( block_ptr. add ( 1 ) ) , MASK ) ;
69- let mut w2 = _mm_shuffle_epi8 ( _mm_loadu_si128 ( block_ptr. add ( 2 ) ) , MASK ) ;
70- let mut w3 = _mm_shuffle_epi8 ( _mm_loadu_si128 ( block_ptr. add ( 3 ) ) , MASK ) ;
71- let mut w4;
72-
73- rounds4 ! ( abef, cdgh, w0, 0 ) ;
74- rounds4 ! ( abef, cdgh, w1, 1 ) ;
75- rounds4 ! ( abef, cdgh, w2, 2 ) ;
76- rounds4 ! ( abef, cdgh, w3, 3 ) ;
77- schedule_rounds4 ! ( abef, cdgh, w0, w1, w2, w3, w4, 4 ) ;
78- schedule_rounds4 ! ( abef, cdgh, w1, w2, w3, w4, w0, 5 ) ;
79- schedule_rounds4 ! ( abef, cdgh, w2, w3, w4, w0, w1, 6 ) ;
80- schedule_rounds4 ! ( abef, cdgh, w3, w4, w0, w1, w2, 7 ) ;
81- schedule_rounds4 ! ( abef, cdgh, w4, w0, w1, w2, w3, 8 ) ;
82- schedule_rounds4 ! ( abef, cdgh, w0, w1, w2, w3, w4, 9 ) ;
83- schedule_rounds4 ! ( abef, cdgh, w1, w2, w3, w4, w0, 10 ) ;
84- schedule_rounds4 ! ( abef, cdgh, w2, w3, w4, w0, w1, 11 ) ;
85- schedule_rounds4 ! ( abef, cdgh, w3, w4, w0, w1, w2, 12 ) ;
86- schedule_rounds4 ! ( abef, cdgh, w4, w0, w1, w2, w3, 13 ) ;
87- schedule_rounds4 ! ( abef, cdgh, w0, w1, w2, w3, w4, 14 ) ;
88- schedule_rounds4 ! ( abef, cdgh, w1, w2, w3, w4, w0, 15 ) ;
// New side: the macro-based round sequence is replaced by helper calls —
// rounds4 for rounds 0..16 on the freshly loaded words, then three
// schedule_rounds16 calls covering rounds 16..64.
76+ let mut w = read_block ( block) ;
77+
78+ rounds4 ( 0 , & mut abef, & mut cdgh, w[ 0 ] ) ;
79+ rounds4 ( 1 , & mut abef, & mut cdgh, w[ 1 ] ) ;
80+ rounds4 ( 2 , & mut abef, & mut cdgh, w[ 2 ] ) ;
81+ rounds4 ( 3 , & mut abef, & mut cdgh, w[ 3 ] ) ;
82+
83+ schedule_rounds16 ( 4 , & mut abef, & mut cdgh, & mut w) ;
84+ schedule_rounds16 ( 8 , & mut abef, & mut cdgh, & mut w) ;
85+ schedule_rounds16 ( 12 , & mut abef, & mut cdgh, & mut w) ;
8986
9087 abef = _mm_add_epi32 ( abef, abef_save) ;
9188 cdgh = _mm_add_epi32 ( cdgh, cdgh_save) ;
@@ -96,7 +93,6 @@ pub(super) unsafe fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
9693 let dcba = _mm_blend_epi16 ( feba, dchg, 0xF0 ) ;
9794 let hgef = _mm_alignr_epi8 ( dchg, feba, 8 ) ;
9895
99- let state_ptr_mut: * mut __m128i = state. as_mut_ptr ( ) . cast ( ) ;
100- _mm_storeu_si128 ( state_ptr_mut. add ( 0 ) , dcba) ;
101- _mm_storeu_si128 ( state_ptr_mut. add ( 1 ) , hgef) ;
96+ _mm_storeu_si128 ( state_ptr. add ( 0 ) , dcba) ;
97+ _mm_storeu_si128 ( state_ptr. add ( 1 ) , hgef) ;
10298}
0 commit comments