Skip to content

Commit ed6f7f0

Browse files
committed
Format files and switch from vshr to vqdmulh for better pipelining
1 parent 75e788a commit ed6f7f0

7 files changed

Lines changed: 118 additions & 74 deletions

File tree

mlkem/src/fips202/keccakf1600.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636

3737
MLK_STATIC_TESTABLE
3838
void mlk_keccakf1600_extract_bytes_c(uint64_t *state, unsigned char *data,
39-
unsigned offset, unsigned length)
39+
unsigned offset, unsigned length)
4040
{
4141
unsigned i;
4242
#if defined(MLK_SYS_LITTLE_ENDIAN)
@@ -61,7 +61,8 @@ void mlk_keccakf1600_extract_bytes(uint64_t *state, unsigned char *data,
6161
unsigned offset, unsigned length)
6262
{
6363
#if defined(MLK_USE_FIPS202_X1_EXTRACT_BYTES_NATIVE)
64-
if(mlk_keccakf1600_extract_bytes_x1_native(state, data, offset, length) == MLK_NATIVE_FUNC_SUCCESS)
64+
if (mlk_keccakf1600_extract_bytes_x1_native(state, data, offset, length) ==
65+
MLK_NATIVE_FUNC_SUCCESS)
6566
{
6667
return;
6768
}
@@ -71,7 +72,7 @@ void mlk_keccakf1600_extract_bytes(uint64_t *state, unsigned char *data,
7172

7273
MLK_STATIC_TESTABLE
7374
void mlk_keccakf1600_xor_bytes_c(uint64_t *state, const unsigned char *data,
74-
unsigned offset, unsigned length)
75+
unsigned offset, unsigned length)
7576
{
7677
unsigned i;
7778
#if defined(MLK_SYS_LITTLE_ENDIAN)
@@ -96,7 +97,9 @@ void mlk_keccakf1600_xor_bytes(uint64_t *state, const unsigned char *data,
9697
unsigned offset, unsigned length)
9798
{
9899
#if defined(MLK_USE_FIPS202_X1_XOR_BYTES_NATIVE)
99-
if (mlk_keccakf1600_xor_bytes_x1_native(state, data, offset, length) == MLK_NATIVE_FUNC_SUCCESS) {
100+
if (mlk_keccakf1600_xor_bytes_x1_native(state, data, offset, length) ==
101+
MLK_NATIVE_FUNC_SUCCESS)
102+
{
100103
return;
101104
}
102105
#endif

mlkem/src/fips202/native/armv81m/mve.h

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,15 @@ static MLK_INLINE int mlk_keccak_f1600_x4_native(uint64_t *state)
4747
*/
4848
#define mlk_keccak_f1600_x1_state_xor_bytes_impl \
4949
MLK_NAMESPACE(mlk_keccak_f1600_x1_state_xor_bytes_impl)
50-
void mlk_keccak_f1600_x1_state_xor_bytes_impl(uint64_t *state, const uint8_t *data,
51-
unsigned offset,
52-
unsigned length);
50+
void mlk_keccak_f1600_x1_state_xor_bytes_impl(uint64_t *state,
51+
const uint8_t *data,
52+
unsigned offset, unsigned length);
5353

5454
MLK_MUST_CHECK_RETURN_VALUE
55-
static MLK_INLINE int mlk_keccakf1600_xor_bytes_x1_native(
56-
uint64_t *state, const uint8_t *data, unsigned offset,
57-
unsigned length)
55+
static MLK_INLINE int mlk_keccakf1600_xor_bytes_x1_native(uint64_t *state,
56+
const uint8_t *data,
57+
unsigned offset,
58+
unsigned length)
5859
{
5960
mlk_keccak_f1600_x1_state_xor_bytes_impl(state, data, offset, length);
6061
return MLK_NATIVE_FUNC_SUCCESS;
@@ -65,13 +66,16 @@ static MLK_INLINE int mlk_keccakf1600_xor_bytes_x1_native(
6566
*/
6667
#define mlk_keccak_f1600_x1_state_extract_bytes_impl \
6768
MLK_NAMESPACE(mlk_keccak_f1600_x1_state_extract_bytes_impl)
68-
void mlk_keccak_f1600_x1_state_extract_bytes_impl(uint64_t *state, uint8_t *data,
69-
unsigned offset,
70-
unsigned length);
69+
void mlk_keccak_f1600_x1_state_extract_bytes_impl(uint64_t *state,
70+
uint8_t *data,
71+
unsigned offset,
72+
unsigned length);
7173

7274
MLK_MUST_CHECK_RETURN_VALUE
73-
static MLK_INLINE int mlk_keccakf1600_extract_bytes_x1_native(
74-
uint64_t *state, uint8_t *data, unsigned offset, unsigned length)
75+
static MLK_INLINE int mlk_keccakf1600_extract_bytes_x1_native(uint64_t *state,
76+
uint8_t *data,
77+
unsigned offset,
78+
unsigned length)
7579
{
7680
mlk_keccak_f1600_x1_state_extract_bytes_impl(state, data, offset, length);
7781
return MLK_NATIVE_FUNC_SUCCESS;

mlkem/src/fips202/native/armv81m/src/fips202_native_armv81m.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,18 @@ void mlk_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100],
2020
#define mlk_keccak_f1600_x1_armv7m_asm MLK_NAMESPACE(keccak_f1600_x1_armv7m_asm)
2121
void mlk_keccak_f1600_x1_armv7m_asm(uint32_t state[50], const uint32_t rc[49]);
2222

23-
#define mlk_keccak_f1600_x1_state_xor_bytes_asm MLK_NAMESPACE(keccak_f1600_x1_state_xor_bytes_asm)
24-
void mlk_keccak_f1600_x1_state_xor_bytes_asm(
25-
uint64_t *state, const uint8_t *data, unsigned offset,
26-
unsigned length);
27-
28-
#define mlk_keccak_f1600_x1_state_extract_bytes_asm MLK_NAMESPACE(keccak_f1600_x1_state_extract_bytes_asm)
29-
void mlk_keccak_f1600_x1_state_extract_bytes_asm(
30-
uint64_t *state, const uint8_t *data, unsigned offset,
31-
unsigned length);
23+
#define mlk_keccak_f1600_x1_state_xor_bytes_asm \
24+
MLK_NAMESPACE(keccak_f1600_x1_state_xor_bytes_asm)
25+
void mlk_keccak_f1600_x1_state_xor_bytes_asm(uint64_t *state,
26+
const uint8_t *data,
27+
unsigned offset, unsigned length);
28+
29+
#define mlk_keccak_f1600_x1_state_extract_bytes_asm \
30+
MLK_NAMESPACE(keccak_f1600_x1_state_extract_bytes_asm)
31+
void mlk_keccak_f1600_x1_state_extract_bytes_asm(uint64_t *state,
32+
const uint8_t *data,
33+
unsigned offset,
34+
unsigned length);
3235

3336

3437
#endif /* !MLK_FIPS202_NATIVE_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */

mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv81m.c

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,17 @@
1212
#include <stdint.h>
1313
#include "fips202_native_armv81m.h"
1414

15-
void mlk_keccak_f1600_x1_state_extract_bytes_impl(
16-
uint64_t *state, uint8_t *data, unsigned offset,
17-
unsigned length)
15+
void mlk_keccak_f1600_x1_state_extract_bytes_impl(uint64_t *state,
16+
uint8_t *data,
17+
unsigned offset,
18+
unsigned length)
1819
{
1920
mlk_keccak_f1600_x1_state_extract_bytes_asm(state, data, offset, length);
2021
}
2122

22-
void mlk_keccak_f1600_x1_state_xor_bytes_impl(
23-
uint64_t *state, const uint8_t *data, unsigned offset,
24-
unsigned length)
23+
void mlk_keccak_f1600_x1_state_xor_bytes_impl(uint64_t *state,
24+
const uint8_t *data,
25+
unsigned offset, unsigned length)
2526
{
2627
mlk_keccak_f1600_x1_state_xor_bytes_asm(state, data, offset, length);
2728
}
@@ -32,7 +33,8 @@ void mlk_keccak_f1600_x1_state_xor_bytes_impl(
3233
int mlk_keccak_f1600_x1_native_impl(uint64_t *state)
3334
{
3435
/* Run the permutation */
35-
mlk_keccak_f1600_x1_armv7m_asm((void*)state, mlk_keccakf1600_round_constants);
36+
mlk_keccak_f1600_x1_armv7m_asm((void *)state,
37+
mlk_keccakf1600_round_constants);
3638
return MLK_NATIVE_FUNC_SUCCESS;
3739
}
3840

mlkem/src/fips202/native/armv81m/src/state_extract_bytes_x1_mve_asm.S

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,18 @@ keccak_f1600_x1_state_extract_bytes_asm_main_loop_end:
181181
keccak_f1600_x1_state_extract_bytes_asm_exit:
182182
@ vpop {d8-d15}
183183
pop {r4-r12, pc}
184+
/****************** REGISTER DEALLOCATIONS *******************/
185+
.unreq state
186+
.unreq dp
187+
.unreq off_full
188+
.unreq length
189+
.unreq tmp
190+
.unreq nB
191+
.unreq off
192+
.unreq lane_offset_bytes
193+
.unreq mask
194+
.unreq qd
195+
.unreq qs
184196

185197
/* simpasm: footer-start */
186198
#endif /* MLK_FIPS202_ARMV81M_NEED_X1 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */

mlkem/src/fips202/native/armv81m/src/state_xor_bytes_x1_mve_asm.S

Lines changed: 51 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -22,55 +22,46 @@
2222
// -----------------------------------------------------------------------------
2323
// Interleave macros
2424
// -----------------------------------------------------------------------------
25-
// interleave_odds: in-place SWAR bit permutation that compacts odd-numbered
26-
// bits of each byte/halfword/word in \t toward the upper half, preparing the
27-
// odd bit-plane. Uses vshl + vsri insertion per the semantics above.
28-
.macro interleave_odds t, u
29-
vshl.u8 \u, \t, #2 // u = t[5..0],00
30-
vsri.u8 \t, \u, #1 // t = t[7],u[6..0] => t = t[7],t[5..0],0
31-
vshl.u8 \u, \t, #3 // u = t[3..0],0000
32-
vsri.u8 \t, \u, #2 // t = t[7..6],u[5..0] => t = t[7],t[5],t[3..0],00
33-
vshl.u8 \u, \t, #4 // u = t[1..0],000000
34-
vsri.u8 \t, \u, #3 // t = t[7],t[5],t[3],u[4..0] => t = t[7],t[5],t[3],t[1..0],000
35-
// t16 = t[15],t[13],t[11],t[9..8],000,t[7],t[5],t[3],t[1..0],000
36-
vshl.u16 \u, \t, #8 // u16 = t[7],t[5],t[3],t[1..0],000
37-
vsri.u8 \t, \u, #4 // t16 = t[15,13,11,9,7,5,3,1]
38-
vshl.u32 \u, \t, #16 // u32 = t[15,13,11,9,7,5,3,1]
39-
vsri.u16 \t, \u, #8 // u16 = t[31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1]
40-
.endm
4125

4226
// interleave_evens: in-place SWAR bit permutation that compacts even-numbered
4327
// bits of each byte/halfword/word in \t toward the lower half, preparing the
4428
// even bit-plane. Comments show the equivalent masks after each stage.
45-
.macro interleave_evens t, u
46-
vshr.u8 \u, \t, #2 // stage 1 within bytes
29+
.macro interleave_evens t, u, const8, const16, const32, const128, const32768
30+
//vshr.u8 \u, \t, #2 // stage 1 within bytes
31+
vqdmulh.s8 \u, \t, \const32 // shift right 2 = shift left 5+1, shift right 8; 1<<5 = 32
4732
vsli.u8 \t, \u, #1 // t = ((t >> 1) & 0x7E7E7E7E) | (t & 0x01010101)
48-
vshr.u8 \u, \t, #3 // stage 2 within nibbles
33+
//vshr.u8 \u, \t, #3 // stage 2 within nibbles
34+
vqdmulh.s8 \u, \t, \const16 // shift right 3 = shift left 4+1, shift right 8; 1<<4 = 16
4935
vsli.u8 \t, \u, #2 // t = ((t >> 2) & 0x1C1C1C1C) | (t & 0x03030303)
50-
vshr.u8 \u, \t, #4 // stage 3 across bytes
36+
//vshr.u8 \u, \t, #4 // stage 3 across bytes
37+
vqdmulh.s8 \u, \t, \const8 // shift right 4 = shift left 3+1, shift right 8; 1<<3 = 8
5138
vsli.u8 \t, \u, #3 // t = ((t >> 3) & 0x08080808) | (t & 0x07070707)
52-
vshr.u16 \u, \t, #8 // widen within halfwords
39+
//vshr.u16 \u, \t, #8 // widen within halfwords
40+
vqdmulh.s16 \u, \t, \const128 // shift right by 8 = shift left 7+1, shift right 16; 1<<7 = 128
5341
vsli.u8 \t, \u, #4 // t = ((t >> 4) & 0x00F000F0) | (t & 0x000F000F)
54-
vshr.u32 \u, \t, #16 // widen within words
42+
//vshr.u32 \u, \t, #16 // widen within words
43+
vqdmulh.s32 \u, \t, \const32768 // shift right by 16 = shift left 15+1, shift right 32; 1<<15 = 32768
5544
vsli.u16 \t, \u, #8 // t = ((t >> 8) & 0x0000FF00) | (t & 0x000000FF)
5645
.endm
5746

5847
.balign 8
59-
.macro to_bit_interleaving_x1 tmp
48+
.macro to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768
6049
// NOTE: This macro clobbers r0, q0, q1, q2, q3
6150
// Inputs on entry:
6251
// q0 = [d0l, d0h, d1l, d1h] (Two complete 64-bit lanes in 32-bit chunks)
6352
// Output on return:
6453
// q0 = Even bit-plane packed (e0, o0, e1, o1)
6554
// Vectors: || q0 || q1 || q2 || q3 ||
6655
// Elements: || d0l | d0h | d1l | d1h || X | X | X | X || X | X | X | X || X | X | X | X ||
67-
vshl.u32 q1, q0, #0 // || d0l | d0h | d1l | d1h || d0l | d0h | d1l | d1h || X | X | X | X || X | X | X | X ||
68-
interleave_evens q1, q2 // || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || X | X | X | X || X | X | X | X ||
56+
vshl.u32 q1, q0, #1 // || d0l | d0h | d1l | d1h || d0l | d0h | d1l | d1h || X | X | X | X || X | X | X | X ||
57+
interleave_evens q1, q2, \const8, \const16, \const32, \const128, \const32768
58+
// || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || X | X | X | X || X | X | X | X ||
6959
vrev64.u32 q2, q1 // || d0l | d0h | d1l | d1h || e0l | e0h | e1l | e1h || e0h | e0l | e1h | e1l || X | X | X | X ||
7060
vsli.u32 q1, q2, #16 // || d0l | d0h | d1l | d1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
71-
interleave_odds q0, q3 // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
61+
interleave_evens q0, q3, \const8, \const16, \const32, \const128, \const32768
62+
// || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || X | X | X | X ||
7263
vrev64.u32 q3, q0 // || o0l | o0h | o1l | o1h || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
73-
vsri.u32 q0, q3, #16 // || X | o0 | X | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
64+
vsli.u32 q0, q3, #16 // || X | o0 | X | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
7465
mov \tmp, #0x0F0F
7566
vmsr p0, \tmp
7667
vpsel q0, q1, q0 // || e0 | o0 | e1 | o1 || e0 | X | e1 | X || e0h | e0l | e1h | e1l || o0h | o0l | o1h | o1l ||
@@ -99,13 +90,21 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_xor_bytes_asm)
9990
tmp .req r4
10091
off .req r5
10192
lane_offset_bytes .req r6
102-
mask .req r7
93+
const8 .req r8
94+
const16 .req r9
95+
const32 .req r10
96+
const128 .req r11
97+
const32768 .req r12
10398
nB .req lr
10499
// ---- Vector naming ----
105100
qd .req q0
106101
qs .req q1
107102

108-
103+
mov const8, #8
104+
mov const16, #16
105+
mov const32, #32
106+
mov const128, #128
107+
mov const32768, #32768
109108

110109
cmp length, #0 // if len==0 done
111110
beq keccak_f1600_x1_state_xor_bytes_asm_exit
@@ -136,17 +135,17 @@ MLK_ASM_FN_SYMBOL(keccak_f1600_x1_state_xor_bytes_asm)
136135
// left-shift to align the active bytes within the 8-byte lane, and write
137136
// it back to p0 to predicate the subsequent byte gathers.
138137
vctp.8 nB
139-
vmrs mask, p0
138+
vmrs tmp, p0
140139
// mask << offset
141-
lsl mask, mask, off
142-
vmsr p0, mask
140+
lsl tmp, tmp, off
141+
vmsr p0, tmp
143142
// now load the partial lanes
144143
vpst
145144
vldrbt.u8 qd, [dp], #16
146145

147146
// Bit interleave
148147
// NOTE: q2,q3,q4 are dead here and not preserved.
149-
to_bit_interleaving_x1 tmp
148+
to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768
150149

151150
vldrw.u32 qs, [state]
152151
veor qs, qs, qd
@@ -169,7 +168,7 @@ keccak_f1600_x1_state_xor_bytes_asm_main_loop_start:
169168
vldrw.u32 qd, [dp], #16
170169
// Bit interleave
171170
// NOTE: q2,q3,q4 are dead here and not preserved.
172-
to_bit_interleaving_x1 tmp
171+
to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768
173172

174173
// XOR into state (stores post-increment state by 16)
175174
vldrw.u32 qs, [state]
@@ -196,7 +195,7 @@ keccak_f1600_x1_state_xor_bytes_asm_main_loop_end:
196195

197196
// Bit interleave
198197
// NOTE: q2,q3,q4 are dead here and not preserved.
199-
to_bit_interleaving_x1 tmp
198+
to_bit_interleaving_x1 tmp, const8, const16, const32, const128, const32768
200199

201200
vldrw.u32 qs, [state]
202201
veor qs, qs, qd
@@ -206,5 +205,22 @@ keccak_f1600_x1_state_xor_bytes_asm_exit:
206205
@ vpop {d8-d15}
207206
pop {r4-r12, pc}
208207

208+
/****************** REGISTER DEALLOCATIONS *******************/
209+
.unreq state
210+
.unreq dp
211+
.unreq off_full
212+
.unreq length
213+
.unreq tmp
214+
.unreq off
215+
.unreq lane_offset_bytes
216+
.unreq nB
217+
.unreq qd
218+
.unreq qs
219+
.unreq const8
220+
.unreq const16
221+
.unreq const32
222+
.unreq const128
223+
.unreq const32768
224+
209225
/* simpasm: footer-start */
210226
#endif /* MLK_FIPS202_ARMV81M_NEED_X4 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */

test/src/test_unit.c

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,9 @@ void mlk_polyvec_basemul_acc_montgomery_cached_c(
4444
void mlk_poly_mulcache_compute_c(mlk_poly_mulcache *x, const mlk_poly *a);
4545
void mlk_keccakf1600_permute_c(uint64_t *state);
4646
void mlk_keccakf1600_xor_bytes_c(uint64_t *state, const unsigned char *data,
47-
unsigned offset, unsigned length);
47+
unsigned offset, unsigned length);
4848
void mlk_keccakf1600_extract_bytes_c(uint64_t *state, unsigned char *data,
49-
unsigned offset, unsigned length);
49+
unsigned offset, unsigned length);
5050
#define CHECK(x) \
5151
do \
5252
{ \
@@ -654,8 +654,8 @@ static int test_keccakf1600_xor_permute_extract(void)
654654

655655
for (i = 0; i < NUM_RANDOM_TESTS; i++)
656656
{
657-
randombytes(&xor_offset,1);
658-
randombytes(&xor_length,1);
657+
randombytes(&xor_offset, 1);
658+
randombytes(&xor_length, 1);
659659
xor_offset = xor_offset % MAX_RATE;
660660
xor_length = (uint8_t)(1 + (xor_length % (MAX_RATE - xor_offset)));
661661
randombytes(&ext_offset, 1);
@@ -667,15 +667,19 @@ static int test_keccakf1600_xor_permute_extract(void)
667667
memset(state_native, 0, sizeof(state_native));
668668
memset(output_native, 0, sizeof(output_native));
669669

670-
mlk_keccakf1600_xor_bytes(state_native, (uint8_t *)input, xor_offset, xor_length);
670+
mlk_keccakf1600_xor_bytes(state_native, (uint8_t *)input, xor_offset,
671+
xor_length);
671672
mlk_keccakf1600_permute(state_native);
672-
mlk_keccakf1600_extract_bytes(state_native, (uint8_t *)output_native, ext_offset, ext_length);
673+
mlk_keccakf1600_extract_bytes(state_native, (uint8_t *)output_native,
674+
ext_offset, ext_length);
673675

674676
memset(state_c, 0, sizeof(state_c));
675677
memset(output_c, 0, sizeof(output_c));
676-
mlk_keccakf1600_xor_bytes_c(state_c, (uint8_t *)input, xor_offset, xor_length);
678+
mlk_keccakf1600_xor_bytes_c(state_c, (uint8_t *)input, xor_offset,
679+
xor_length);
677680
mlk_keccakf1600_permute_c(state_c);
678-
mlk_keccakf1600_extract_bytes_c(state_c, (uint8_t *)output_c, ext_offset, ext_length);
681+
mlk_keccakf1600_extract_bytes_c(state_c, (uint8_t *)output_c, ext_offset,
682+
ext_length);
679683

680684
CHECK(compare_u64_arrays(output_native, output_c, MLK_KECCAK_LANES,
681685
"keccakf1600_permute"));

0 commit comments

Comments
 (0)