2525
2626// deinterleave_even: inverse of the even-bit compaction. Inflates the even
2727// bit-plane stored in \e back into byte positions (odd bits garbage).
//
// Parameters:
//   e       - vector register holding the compacted even bits; expanded in place.
//   tmp     - scratch vector register; overwritten.
//   const4  - general-purpose register used as the scalar multiplier of the
//             second vqdmulh step; the call sites in this file load it with 4.
//   const16 - general-purpose register used as the scalar multiplier of the
//             first vqdmulh step; the call sites in this file load it with 16.
//
// The macro does not apply the 0x55 mask itself: on exit the odd bit positions
// of every byte still hold garbage.
// The comments on the instructions track the contents of one 32-bit element,
// most significant byte first.
// NOTE(review): the reason for swapping vshr.u8 for vqdmulh.s8 is not stated
// here - presumably instruction scheduling; confirm with the author.
28- .macro deinterleave_even e , tmp
28+ .macro deinterleave_even e , tmp , const4 , const16
2929 // | e [ 31 : 24 ] | e [ 23 : 16 ] | e [ 15 : 8 ] | e [ 7 : 0 ] |
3030 vsli.u32 \e , \e , # 8 // | e [ 23 : 16 ] | e [ 15 : 8 ] | e [ 7 : 0 ] | e [ 7 : 0 ] |
3131 vsli.u16 \e , \e , # 4 // | e [ 19 : 12 ] | e [ 11 : 8 , 11 : 8 ] | e [ 3 : 0 , 7 : 4 ] | e [ 3 : 0 , 3 : 0 ] |
3232 vsli.u8 \e , \e , # 1 // | e [ 18 : 12 , 12 ] | e [ 10 : 8 , 11 : 8 , 8 ] | e [ 2 : 0 , 7 : 4 , 4 ] | e [ 2 : 0 , 3 : 0 , 0 ] |
33- vshr.u8 \tmp , \e , # 3
33+ vqdmulh.s8 \tmp , \e , \const16 // with const16 == 16 : tmp = ( 2 * e * 16 ) >> 8 = e >> 3 per signed byte; only tmp [ 3 : 0 ] feeds the next vsli , so the sign extension in tmp [ 7 : 5 ] is harmless
3434 vsli.u8 \e , \tmp , # 4 // | e [ 17 : 14 , 14 : 12 , 12 ] | e [ 9 : 8 , 11 : 10 , 10 : 8 , 8 ] | e [ 1 : 0 , 7 : 6 , 6 : 4 , 4 ] | e [ 1 : 0 , 3 : 2 , 2 : 0 , 0 ] |
35- vshr.u8 \tmp , \e , # 5
35+ vqdmulh.s8 \tmp , \e , \const4 // with const4 == 4 : tmp = ( 2 * e * 4 ) >> 8 = e >> 5 per signed byte; only tmp [ 1 : 0 ] feeds the next vsli , so the sign extension in tmp [ 7 : 3 ] is harmless
3636 vsli.u8 \e , \tmp , # 6 // | e [ 16 : 15 , 15 : 14 , 14 : 12 , 12 ] | e [ 8 , 11 , 11 : 10 , 10 : 8 , 8 ] | e [ 0 , 7 , 7 : 6 , 6 : 4 , 4 ] | e [ 0 , 3 , 3 : 2 , 2 : 0 , 0 ] |
3737 // after 0x55 mask
3838 // | e [ 15 , 14 , 13 , 12 ] | e [ 11 , 10 , 9 , 8 ] | e [ 7 , 6 , 5 , 4 ] | e [ 3 , 2 , 1 , 0 ] |
3939.endm
4040
4141.balign 8
42- .macro from_bit_interleaving_x1 tmp
42+ .macro from_bit_interleaving_x1 tmp , const4 , const16
4343 // Input: q0 = [ e0 , o0 , e1 , o1 ]
4444 // Output: q0 = [ d0l , d0h , d1l , d1h ]
4545 // Clobbers: r0 , q1 , q2 , q3 , q4
5555 // construct an o vector
5656 vpsel q1 , q3 , q1 // q0.u16: [ o0l , o0h , o0h , o0l , o1l , o1h , o1l , o1h ]
5757 // expand vectors
58- deinterleave_even q0 , q2
59- deinterleave_even q1 , q2
58+ deinterleave_even q0 , q2 , \const4 , \const16
59+ deinterleave_even q1 , q2 , \const4 , \const16
6060 // Zero garbage bits
6161 mov \tmp , # 0x55
6262 vdup.u8 q2 , \tmp
@@ -90,12 +90,17 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_extract_bytes_asm)
9090 tmp .req r4
9191 nB .req lr
9292 off .req r5
93+ const4 .req r6
94+ const16 .req r7
9395 lane_offset_bytes .req r9
9496 mask .req r11
9597 // ---- Vector naming ----
9698 qd .req q0
9799 qs .req q1
98100
101+ mov const4 , # 4
102+ mov const16 , # 16
103+
99104 cmp length , # 0 // if len== 0 done
100105 beq keccak_f1600_x1_state_extract_bytes_asm_exit
101106
@@ -123,7 +128,7 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_extract_bytes_asm)
123128 // Load state for the partial lane
124129 vldrw.u32 qd , [ state ], # 16
125130 // Deinterleave to bytes
126- from_bit_interleaving_x1 tmp
131+ from_bit_interleaving_x1 tmp , const4 , const16
127132 // Predicated byte store of up to 16 bytes
128133 // calculate the predicates
129134 // mask = ( 1 << nB) - 1 over 8 - bit lanes , then shift by 'off' .
@@ -154,7 +159,7 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_start:
154159 // Load 16B (two lanes) from state and bump pointer
155160 vldrw.u32 qd , [ state ], # 16
156161 // Deinterleave to bytes
157- from_bit_interleaving_x1 tmp
162+ from_bit_interleaving_x1 tmp , const4 , const16
158163 // Store 16B of output bytes (post - increment by 16 )
159164 vstrw.u32 qd , [ dp ], # 16
160165
@@ -172,7 +177,7 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_end:
172177
173178 // Load next state lane , deinterleave , store tail
174179 vldrw.u32 qd , [ state ], # 16
175- from_bit_interleaving_x1 tmp
180+ from_bit_interleaving_x1 tmp , const4 , const16
176181 // Tail via predicated byte stores like prologue , but off= 0 (no base adjust)
177182 vctp. 8 length
178183 vpst
0 commit comments