Skip to content

Commit 278a9ac

Browse files
bremoran authored and mkannwischer committed
Do vqdmulh optimisation for extractbytes as well
Signed-off-by: Brendan Moran <brendan.moran@arm.com>
1 parent 4d0a127 commit 278a9ac

2 files changed

Lines changed: 19 additions & 19 deletions

File tree

mlkem/src/fips202/native/armv81m/src/state_extract_bytes_x1_mve_asm.S

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,21 +25,21 @@
2525

2626
// deinterleave_even: inverse of the even-bit compaction. Inflates the even
2727
// bit-plane stored in \e back into byte positions (odd bits garbage).
//
// Operands, as used by the body below:
//   \e       - vector register; expanded in place (input and output)
//   \tmp     - vector register; overwritten as scratch
//   \const4  - scalar operand of the second vqdmulh.s8 (takes over from
//              vshr.u8 #5)
//   \const16 - scalar operand of the first vqdmulh.s8 (takes over from
//              vshr.u8 #3)
// The macro does not load \const4 / \const16 itself; the caller must have
// set them up (to 4 and 16 respectively) before expanding the macro.
//
// Why vqdmulh.s8 can stand in for a right shift: per Arm's definition,
// VQDMULH.S8 returns the high byte of the doubled product, (2 * a * b) >> 8.
// With b = 16 that is a >> 3, and with b = 4 it is a >> 5. The shift is
// arithmetic (sign-filling) rather than logical, but each following vsli only
// inserts the low bits of \tmp, so the sign-filled bits never reach \e.
28-
.macro deinterleave_even e, tmp
28+
.macro deinterleave_even e, tmp, const4, const16
2929
// | e[31:24] | e[23:16] | e[15:8] | e[7:0] |
3030
vsli.u32 \e, \e, #8 // | e[23:16] | e[15:8] | e[7:0] | e[7:0] |
3131
vsli.u16 \e, \e, #4 // | e[19:12] | e[11:8,11:8] | e[3:0,7:4] | e[3:0,3:0] |
3232
vsli.u8 \e, \e, #1 // | e[18:12,12] | e[10:8,11:8,8] | e[2:0,7:4,4] | e[2:0,3:0,0] |
33-
vshr.u8 \tmp, \e, #3
33+
vqdmulh.s8 \tmp, \e, \const16 // tmp = e >> 3 per byte: (2 * e * 16) >> 8; vsli #4 below keeps only tmp[3:0]
3434
vsli.u8 \e, \tmp, #4 // | e[17:14,14:12,12] | e[9:8,11:10,10:8,8] | e[1:0,7:6,6:4,4] | e[1:0,3:2,2:0,0] |
35-
vshr.u8 \tmp, \e, #5
35+
vqdmulh.s8 \tmp, \e, \const4 // tmp = e >> 5 per byte: (2 * e * 4) >> 8; vsli #6 below keeps only tmp[1:0]
3636
vsli.u8 \e, \tmp, #6 // | e[16:15,15:14,14:12,12] | e[8,11,11:10,10:8,8] | e[0,7,7:6,6:4,4] | e[0,3,3:2,2:0,0] |
3737
// after 0x55 mask
3838
// | e[15,14,13,12] | e[11,10,9,8] | e[7,6,5,4] | e[3,2,1,0] |
3939
.endm
4040

4141
.balign 8
42-
.macro from_bit_interleaving_x1 tmp
42+
.macro from_bit_interleaving_x1 tmp, const4, const16
4343
// Input: q0 = [e0, o0, e1, o1]
4444
// Output: q0 = [d0l, d0h, d1l, d1h]
4545
// Clobbers: r0, q1, q2, q3, q4
@@ -55,8 +55,8 @@
5555
// construct an o vector
5656
vpsel q1, q3, q1 // q0.u16: [o0l, o0h, o0h, o0l, o1l, o1h, o1l, o1h]
5757
// expand vectors
58-
deinterleave_even q0, q2
59-
deinterleave_even q1, q2
58+
deinterleave_even q0, q2, \const4, \const16
59+
deinterleave_even q1, q2, \const4, \const16
6060
// Zero garbage bits
6161
mov \tmp, #0x55
6262
vdup.u8 q2, \tmp
@@ -90,12 +90,17 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_extract_bytes_asm)
9090
tmp .req r4
9191
nB .req lr
9292
off .req r5
93+
const4 .req r6
94+
const16 .req r7
9395
lane_offset_bytes .req r9
9496
mask .req r11
9597
// ---- Vector naming ----
9698
qd .req q0
9799
qs .req q1
98100

101+
mov const4, #4
102+
mov const16, #16
103+
99104
cmp length, #0 // if len==0 done
100105
beq keccak_f1600_x1_state_extract_bytes_asm_exit
101106

@@ -123,7 +128,7 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_extract_bytes_asm)
123128
// Load state for the partial lane
124129
vldrw.u32 qd, [state], #16
125130
// Deinterleave to bytes
126-
from_bit_interleaving_x1 tmp
131+
from_bit_interleaving_x1 tmp, const4, const16
127132
// Predicated byte store of up to 16 bytes
128133
// calculate the predicates
129134
// mask = (1 << nB) - 1 over 8-bit lanes, then shift by 'off'.
@@ -154,7 +159,7 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_start:
154159
// Load 16B (two lanes) from state and bump pointer
155160
vldrw.u32 qd, [state], #16
156161
// Deinterleave to bytes
157-
from_bit_interleaving_x1 tmp
162+
from_bit_interleaving_x1 tmp, const4, const16
158163
// Store 16B of output bytes (post-increment by 16)
159164
vstrw.u32 qd, [dp], #16
160165

@@ -172,7 +177,7 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_end:
172177

173178
// Load next state lane, deinterleave, store tail
174179
vldrw.u32 qd, [state], #16
175-
from_bit_interleaving_x1 tmp
180+
from_bit_interleaving_x1 tmp, const4, const16
176181
// Tail via predicated byte stores like prologue, but off=0 (no base adjust)
177182
vctp.8 length
178183
vpst

mlkem/src/fips202/native/armv81m/src/state_xor_bytes_x1_mve_asm.S

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,15 @@
2727
// bits of each byte/halfword/word in \t toward the lower half, preparing the
2828
// even bit-plane. Comments show the equivalent masks after each stage.
2929
// Operands, as used by the body below:
//   \t - vector register, transformed in place
//   \u - vector register, overwritten as scratch
//   \const8, \const16, \const32, \const128, \const32768 - scalar operands of
//        the vqdmulh instructions. NOTE(review): presumably each holds the
//        value in its name; they are not loaded inside this macro -- confirm
//        at the call site.
//
// Each vqdmulh takes the place of a vshr (shown commented out on the removed
// lines). Per Arm's definition VQDMULH returns the high half of the doubled
// product, (2 * a * b) >> lane_bits, so multiplying by 2^k shifts right by
// lane_bits - 1 - k.
// NOTE(review): vqdmulh is a signed operation, so the shift is arithmetic
// (sign-filling) where vshr.u* was logical; results can differ in the high
// bits of each lane. Presumably those bits are overwritten or masked later --
// confirm against the caller.
.macro interleave_evens t, u, const8, const16, const32, const128, const32768
30-
//vshr.u8 \u, \t, #2 // stage 1 within bytes
31-
vqdmulh.s8 \u, \t, \const32 // shift right 2 = shift left 5+1, shift right 8; 1<<5 = 32
30+
vqdmulh.s8 \u, \t, \const32 // stage 1 within bytes: u = t >> 2, as (2 * t * 32) >> 8
3231
vsli.u8 \t, \u, #1 // t = ((t >> 1) & 0x7E7E7E7E) | (t & 0x01010101)
33-
//vshr.u8 \u, \t, #3 // stage 2 within nibbles
34-
vqdmulh.s8 \u, \t, \const16 // shift right 3 = shift left 5, shift right 8; 1<<5 = 32
32+
vqdmulh.s8 \u, \t, \const16 // stage 2 within nibbles: u = t >> 3, as (2 * t * 16) >> 8
3533
vsli.u8 \t, \u, #2 // t = ((t >> 2) & 0x1C1C1C1C) | (t & 0x03030303)
36-
//vshr.u8 \u, \t, #4 // stage 3 across bytes
37-
vqdmulh.s8 \u, \t, \const8 // shift right 4 = shift left 4, shift right 8; 1<<4 = 16
34+
vqdmulh.s8 \u, \t, \const8 // stage 3 across bytes: u = t >> 4, as (2 * t * 8) >> 8
3835
vsli.u8 \t, \u, #3 // t = ((t >> 3) & 0x08080808) | (t & 0x07070707)
39-
//vshr.u16 \u, \t, #8 // widen within halfwords
40-
vqdmulh.s16 \u, \t, \const128 // shift right by 8 = shift left 7+1, shift right 16; 1<<7 = 128
36+
vqdmulh.s16 \u, \t, \const128 // widen within halfwords: u = t >> 8, as (2 * t * 128) >> 16
4137
vsli.u8 \t, \u, #4 // t = ((t >> 4) & 0x00F000F0) | (t & 0x000F000F)
42-
//vshr.u32 \u, \t, #16 // widen within words
43-
vqdmulh.s32 \u, \t, \const32768 // shift right by 16 = shift left 15+1, shift right 32; 1<<15 = 32768
38+
vqdmulh.s32 \u, \t, \const32768 // widen within words: u = t >> 16, as (2 * t * 32768) >> 32
4439
vsli.u16 \t, \u, #8 // t = ((t >> 8) & 0x0000FF00) | (t & 0x000000FF)
4540
.endm
4641

0 commit comments

Comments
 (0)