// -----------------------------------------------------------------------------
// Interleave macros
// -----------------------------------------------------------------------------
- // interleave_odds: in-place SWAR bit permutation that compacts the odd-numbered
- // bits of each byte/halfword/word in \t toward the upper half, preparing the
- // odd bit-plane. Uses vshl + vsri insertion per the semantics above.
- .macro interleave_odds t, u
-     vshl.u8 \u, \t, #2      // u = t[5..0],00
-     vsri.u8 \t, \u, #1      // t = t[7],u[6..0] => t = t[7],t[5..0],0
-     vshl.u8 \u, \t, #3      // u = t[3..0],0000
-     vsri.u8 \t, \u, #2      // t = t[7..6],u[5..0] => t = t[7],t[5],t[3..0],00
-     vshl.u8 \u, \t, #4      // u = t[1..0],000000
-     vsri.u8 \t, \u, #3      // t = t[7],t[5],t[3],u[4..0] => t = t[7],t[5],t[3],t[1..0],000
-     // t16 = t[15],t[13],t[11],t[9..8],000,t[7],t[5],t[3],t[1..0],000
-     vshl.u16 \u, \t, #8     // u16 = t[7],t[5],t[3],t[1..0],000
-     vsri.u8 \t, \u, #4      // t16 = t[15,13,11,9,7,5,3,1]
-     vshl.u32 \u, \t, #16    // u32 = t[15,13,11,9,7,5,3,1]
-     vsri.u16 \t, \u, #8     // t32 = t[31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1]
- .endm
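The permutation that interleave_odds implements is easiest to state in scalar terms: per 32-bit word, bit k of the result is bit 2k+1 of the input. A minimal C model of that mapping (mine, not from the source; it ignores where the SWAR version parks the compacted bits within the register):

    #include <stdint.h>

    /* Scalar model of the odd bit-plane extraction performed per
     * 32-bit word: bit k of the result is bit 2k+1 of t. */
    static uint32_t odd_plane(uint32_t t) {
        uint32_t r = 0;
        for (int k = 0; k < 16; k++) {
            r |= ((t >> (2 * k + 1)) & 1u) << k;
        }
        return r;
    }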

// interleave_evens: in-place SWAR bit permutation that compacts the even-numbered
// bits of each byte/halfword/word in \t toward the lower half, preparing the
// even bit-plane. Comments show the equivalent masks after each stage.
- .macro interleave_evens t, u
-     vshr.u8 \u, \t, #2             // stage 1 within bytes
+ .macro interleave_evens t, u, const8, const16, const32, const128, const32768
+     //vshr.u8 \u, \t, #2           // stage 1 within bytes
+     vqdmulh.s8 \u, \t, \const32    // shift right 2 = shift left 5 + 1, shift right 8; 1<<5 = 32
    vsli.u8 \t, \u, #1             // t = ((t >> 1) & 0x7E7E7E7E) | (t & 0x01010101)
-     vshr.u8 \u, \t, #3             // stage 2 within nibbles
+     //vshr.u8 \u, \t, #3           // stage 2 within nibbles
+     vqdmulh.s8 \u, \t, \const16    // shift right 3 = shift left 4 + 1, shift right 8; 1<<4 = 16
    vsli.u8 \t, \u, #2             // t = ((t >> 2) & 0x1C1C1C1C) | (t & 0x03030303)
-     vshr.u8 \u, \t, #4             // stage 3 across bytes
+     //vshr.u8 \u, \t, #4           // stage 3 across bytes
+     vqdmulh.s8 \u, \t, \const8     // shift right 4 = shift left 3 + 1, shift right 8; 1<<3 = 8
    vsli.u8 \t, \u, #3             // t = ((t >> 3) & 0x08080808) | (t & 0x07070707)
-     vshr.u16 \u, \t, #8            // widen within halfwords
+     //vshr.u16 \u, \t, #8          // widen within halfwords
+     vqdmulh.s16 \u, \t, \const128  // shift right 8 = shift left 7 + 1, shift right 16; 1<<7 = 128
    vsli.u8 \t, \u, #4             // t = ((t >> 4) & 0x00F000F0) | (t & 0x000F000F)
-     vshr.u32 \u, \t, #16           // widen within words
+     //vshr.u32 \u, \t, #16         // widen within words
+     vqdmulh.s32 \u, \t, \const32768 // shift right 16 = shift left 15 + 1, shift right 32; 1<<15 = 32768
    vsli.u16 \t, \u, #8            // t = ((t >> 8) & 0x0000FF00) | (t & 0x000000FF)
.endm
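The vqdmulh substitution works because VQDMULH.S8 computes sat8((2 * a * b) >> 8), so a multiplier of 1 << (7 - k) acts as an arithmetic shift right by k, and with these small multipliers saturation never triggers. A C sketch of that identity (mine, not from the source), checking the three byte-stage constants exhaustively:

    #include <assert.h>
    #include <stdint.h>

    /* VQDMULH.S8 model: saturating doubling multiply, high half,
     * i.e. sat8((2*a*b) >> 8). */
    static int8_t vqdmulh_s8(int8_t a, int8_t b) {
        int32_t p = (2 * (int32_t)a * (int32_t)b) >> 8;
        if (p > 127)  p = 127;
        if (p < -128) p = -128;
        return (int8_t)p;
    }

    int main(void) {
        for (int t = -128; t <= 127; t++) {
            /* multiplier 1 << (7 - k) == arithmetic shift right by k */
            assert(vqdmulh_s8((int8_t)t, 32) == (int8_t)(t >> 2));
            assert(vqdmulh_s8((int8_t)t, 16) == (int8_t)(t >> 3));
            assert(vqdmulh_s8((int8_t)t, 8)  == (int8_t)(t >> 4));
        }
        return 0;
    }

Unlike vshr.uN the shift is arithmetic, so sign bits propagate into the upper bits of \u; those bits land above the windows that the following vsli instructions keep, so the compacted even plane is unchanged.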

.balign 8
- .macro to_bit_interleaving_x1 tmp
+ .macro to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768
6049 // NOTE: This macro clobbers r0 , q0 , q1 , q2 , q3
6150 // Inputs on entry:
6251 // q0 = [ d0l , d0h , d1l , d1h ] (Two complete 64 - bit lanes in 32 - bit chunks)
6352 // Output on return:
    //   q0 = bit-interleaved lanes packed as (e0, o0, e1, o1)
    // Vectors:  || q0 || q1 || q2 || q3 ||
    // Elements: || d0l | d0h | d1l | d1h || X | X | X | X || X | X | X | X || X | X | X | X ||
-     vshl.u32 q1, q0, #0     // || d0l | d0h | d1l | d1h || d0l | d0h | d1l | d1h || X | X | X | X || X | X | X | X ||
-     interleave_evens q1, q2 // || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || X | X | X | X || X | X | X | X ||
+     vshl.u32 q1, q0, #1     // || d0l | d0h | d1l | d1h || d0l<<1 | d0h<<1 | d1l<<1 | d1h<<1 || X | X | X | X || X | X | X | X ||
+     interleave_evens q1, q2, \const8, \const16, \const32, \const128, \const32768
+                             // || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || X | X | X | X || X | X | X | X ||
    vrev64.u32 q2, q1       // || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || e0h | e0l | e1h | e1l || X | X | X | X ||
    vsli.u32 q1, q2, #16    // || d0l | d0h | d1l | d1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
-     interleave_odds q0, q3  // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
+     interleave_evens q0, q3, \const8, \const16, \const32, \const128, \const32768
+                             // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
    vrev64.u32 q3, q0       // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
-     vsri.u32 q0, q3, #16    // || X | o0 | X | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
+     vsli.u32 q0, q3, #16    // || X | o0 | X | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
    mov \tmp, #0x0F0F
    vmsr p0, \tmp
    vpsel q0, q1, q0        // || e0 | o0 | e1 | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
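Net effect: each 64-bit lane d is deinterleaved into a 32-bit even plane e and a 32-bit odd plane o, packed as (e0, o0, e1, o1). A scalar reference for the transform (my sketch, not from the source):

    #include <stdint.h>

    /* Reference model of the bit interleaving applied to each
     * 64-bit Keccak lane: even-indexed bits are gathered into one
     * 32-bit word and odd-indexed bits into another. */
    static void bit_interleave(uint64_t d, uint32_t *e, uint32_t *o) {
        uint32_t ev = 0, od = 0;
        for (int k = 0; k < 32; k++) {
            ev |= (uint32_t)((d >> (2 * k)) & 1u) << k;
            od |= (uint32_t)((d >> (2 * k + 1)) & 1u) << k;
        }
        *e = ev;
        *o = od;
    }

This split is the usual trick for Keccak on 32-bit cores: a 64-bit lane rotation turns into two independent 32-bit rotations of the planes.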
@@ -99,13 +90,21 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_xor_bytes_asm)
    tmp               .req r4
    off               .req r5
    lane_offset_bytes .req r6
-     mask              .req r7
+     const8            .req r8
+     const16           .req r9
+     const32           .req r10
+     const128          .req r11
+     const32768        .req r12
    nB                .req lr
    // ---- Vector naming ----
    qd                .req q0
    qs                .req q1

-
+     mov const8, #8
+     mov const16, #16
+     mov const32, #32
+     mov const128, #128
+     mov const32768, #32768

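These immediates exist only to feed vqdmulh: to emulate an arithmetic right shift by k on elements of width bits, the multiplier is 1 << (bits - 1 - k), where the -1 accounts for vqdmulh's doubling. A hypothetical helper (not in the source) makes the mapping explicit:

    #include <stdint.h>

    /* Multiplier for VQDMULH to act as an arithmetic shift right
     * by k on elements of width `bits`. */
    static int32_t vqdmulh_multiplier(int bits, int k) {
        return (int32_t)1 << (bits - 1 - k);
    }

    /* vqdmulh_multiplier(8, 2)   ==    32   (const32)
     * vqdmulh_multiplier(8, 3)   ==    16   (const16)
     * vqdmulh_multiplier(8, 4)   ==     8   (const8)
     * vqdmulh_multiplier(16, 8)  ==   128   (const128)
     * vqdmulh_multiplier(32, 16) == 32768   (const32768) */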
    cmp length, #0              // if len == 0, done
    beq keccak_f1600_x1_state_xor_bytes_asm_exit
@@ -136,17 +135,17 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_xor_bytes_asm)
    // left-shift to align the active bytes within the 8-byte lane, and write
    // it back to p0 to predicate the subsequent byte loads.
    vctp.8 nB
-     vmrs mask, p0
+     vmrs tmp, p0
    // mask << offset
-     lsl mask, mask, off
-     vmsr p0, mask
+     lsl tmp, tmp, off
+     vmsr p0, tmp
    // now load the partial lanes
    vpst
    vldrbt.u8 qd, [dp], #16
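The predicate juggling above has a simple scalar model: vctp.8 sets the nB lowest of p0's 16 byte-predicate bits, and the left shift moves that window up to the byte offset of the partial lane. A C sketch (mine, not from the source; it assumes off + nB <= 16 so no active byte is shifted out):

    #include <stdint.h>

    /* Scalar model of the tail predicate: nB active bytes
     * starting at byte offset `off` within the 16-byte vector. */
    static uint16_t tail_mask(unsigned nB, unsigned off) {
        uint32_t m = (nB >= 16) ? 0xFFFFu : ((1u << nB) - 1u);
        return (uint16_t)(m << off);
    }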

    // Bit interleave
    // NOTE: q2, q3, q4 are dead here and not preserved.
-     to_bit_interleaving_x1 tmp
+     to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768

    vldrw.u32 qs, [state]
    veor qs, qs, qd
@@ -169,7 +168,7 @@ keccak_f1600_x1_state_xor_bytes_asm_main_loop_start:
    vldrw.u32 qd, [dp], #16
    // Bit interleave
    // NOTE: q2, q3, q4 are dead here and not preserved.
-     to_bit_interleaving_x1 tmp
+     to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768

    // XOR into state (stores post-increment state by 16)
    vldrw.u32 qs, [state]
@@ -196,7 +195,7 @@ keccak_f1600_x1_state_xor_bytes_asm_main_loop_end:

    // Bit interleave
    // NOTE: q2, q3, q4 are dead here and not preserved.
-     to_bit_interleaving_x1 tmp
+     to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768

    vldrw.u32 qs, [state]
    veor qs, qs, qd
@@ -206,5 +205,22 @@ keccak_f1600_x1_state_xor_bytes_asm_exit:
    @ vpop {d8-d15}
    pop {r4-r12, pc}

+ /****************** REGISTER DEALLOCATIONS *******************/
+ .unreq state
+ .unreq dp
+ .unreq off_full
+ .unreq length
+ .unreq tmp
+ .unreq off
+ .unreq lane_offset_bytes
+ .unreq nB
+ .unreq qd
+ .unreq qs
+ .unreq const8
+ .unreq const16
+ .unreq const32
+ .unreq const128
+ .unreq const32768
+
/* simpasm: footer-start */
#endif /* MLK_FIPS202_ARMV81M_NEED_X4 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */