// interleave_odds t, u: in-place sequence of shift-left (vshl into u) and
// shift-right-insert (vsri into t) steps that gathers the odd-numbered bits
// of t, as traced in the per-instruction comments below. u is scratch and is
// overwritten.
2828.macro interleave_odds t , u
2929 vshl.u8 \u , \t , # 2 // u = t [ 5 .. 0 ], 00
3030 vsri.u8 \t , \u , # 1 // t = t [ 7 ], u [ 6 .. 0 ] => t = t [ 7 ], t [ 5 .. 0 ], 0
31- vshl.u8 \u , \t , # 3 // stage 2 across nibbles
32- vsri.u8 \t , \u , # 2
33- vshl.u8 \u , \t , # 4 // stage 3 across bytes
34- vsri.u8 \t , \u , # 3
35- vshl.u16 \u , \t , # 8 // widen within halfwords
36- vsri.u8 \t , \u , # 4
37- vshl.u32 \u , \t , # 16 // widen within words
38- vsri.u16 \t , \u , # 8 // odd bits compacted ; per 32b lane: lo16=bytes0..3, hi16=4..7
31+ vshl.u8 \u , \t , # 3 // u = t [ 3 .. 0 ], 0000
32+ vsri.u8 \t , \u , # 2 // t = t [ 7 .. 6 ], u [ 5 .. 0 ] => t = t [ 7 ], t [ 5 ], t [ 3 .. 0 ], 00
33+ vshl.u8 \u , \t , # 4 // u = t [ 1 .. 0 ], 000000
34+ vsri.u8 \t , \u , # 3 // t = t [ 7 ], t [ 5 ], t [ 3 ], u [ 4 .. 0 ] => t = t [ 7 ], t [ 5 ], t [ 3 ], t [ 1 .. 0 ], 000
35+ // t16 = t [ 15 ], t [ 13 ], t [ 11 ], t [ 9 .. 8 ], 000 , t [ 7 ], t [ 5 ], t [ 3 ], t [ 1 .. 0 ], 000
36+ vshl.u16 \u , \t , # 8 // u16 = t [ 7 ], t [ 5 ], t [ 3 ], t [ 1 .. 0 ], 000
37+ vsri.u8 \t , \u , # 4 // t16 = t [ 15 , 13 , 11 , 9 , 7 , 5 , 3 , 1 ]
38+ vshl.u32 \u , \t , # 16 // u32 = t [ 15 , 13 , 11 , 9 , 7 , 5 , 3 , 1 ]
39+ vsri.u16 \t , \u , # 8 // t32 = t [ 31 , 29 , 27 , 25 , 23 , 21 , 19 , 17 , 15 , 13 , 11 , 9 , 7 , 5 , 3 , 1 ] (the destination is t ; u is only the shifted source)
3940.endm
4041
4142// interleave_evens: in-place SWAR bit permutation that compacts even-numbered
5556.endm
5657
5758.balign 8
58- .macro to_bit_interleaving_x1
59+ .macro to_bit_interleaving_x1 tmp
5960 // NOTE: This macro clobbers the tmp argument (r0 before tmp became a parameter), p0 , q0 , q1 , q2 , q3
6061 // Inputs on entry:
6162 // q0 = [ d0l , d0h , d1l , d1h ] (Two complete 64 - bit lanes in 32 - bit chunks)
6869 vrev64.u32 q2 , q1 // || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || e0h | e0l | e1h | e1l || X | X | X | X ||
6970 vsli.u32 q1 , q2 , # 16 // || d0l | d0h | d1l | d1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
7071 interleave_odds q0 , q3 // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
71- vrev64.u32 q0 , q3 // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
72+ vrev64.u32 q3 , q0 // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
7273 vsri.u32 q0 , q3 , # 16 // || X | o0 | X | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
73- mov r0 , # 0x0F0F
74- vmsr p0 , r0
74+ mov \tmp , # 0x0F0F // byte predicate for the vpsel below: set bits take q1 (e0 , e1), clear bits keep q0 (o0 , o1)
75+ vmsr p0 , \tmp
7576 vpsel q0 , q1 , q0 // || e0 | o0 | e1 | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
7677.endm
7778
@@ -145,7 +146,7 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_xor_bytes_asm)
145146
146147 // Bit interleave
147148 // NOTE: q2 , q3 , q4 are dead here and not preserved.
148- to_bit_interleaving_x1
149+ to_bit_interleaving_x1 tmp
149150
150151 vldrw.u32 qs , [ state ]
151152 veor qs , qs , qd
@@ -168,7 +169,7 @@ keccak_f1600_x1_state_xor_bytes_asm_main_loop_start:
168169 vldrw.u32 qd , [ dp ], # 16
169170 // Bit interleave
170171 // NOTE: q2 , q3 , q4 are dead here and not preserved.
171- to_bit_interleaving_x1
172+ to_bit_interleaving_x1 tmp
172173
173174 // XOR into state (stores post - increment state by 16 )
174175 vldrw.u32 qs , [ state ]
@@ -182,26 +183,20 @@ keccak_f1600_x1_state_xor_bytes_asm_main_loop_end:
182183 // TAIL: if length remaining < 16 , absorb it at offset_in_lane= 0
183184 // -------------------------------------------------------------------------
184185
185- // length &= 7
186+ // length &= 15
186187 // Placeholder: if r6 == 0 , done.
187- ands length , length , # 7
188+ ands length , length , # 15
188189 cmp length , # 0
189190 beq keccak_f1600_x1_state_xor_bytes_asm_exit
190191
191192 // Tail via predicated byte loads like prologue , but off= 0 (no base adjust)
192193 vctp. 8 length
193- vctp. 8 nB
194- vmrs mask , p0
195- // mask << offset
196- lsl mask , mask , off
197- vmsr p0 , mask
198- // now load the partial lanes
199194 vpst
200195 vldrbt.u8 qd , [ dp ], # 16
201196
202197 // Bit interleave
203198 // NOTE: q2 , q3 , q4 are dead here and not preserved.
204- to_bit_interleaving_x1
199+ to_bit_interleaving_x1 tmp
205200
206201 vldrw.u32 qs , [ state ]
207202 veor qs , qs , qd
0 commit comments