Skip to content

Commit bb40f72

Browse files
Armv8.1-M: Add native Keccak x4 XORBytes and ExtractBytes
Add native MVE implementations of XORBytes and ExtractBytes that perform bit-interleaving/deinterleaving on-the-fly, enabling use of a bit-interleaved state representation without temporary conversions in the permutation.

This improves performance by:
- Reducing the number of bit-interleaving operations
- Accelerating bit-interleaving using MVE vector instructions

The backend uses a bit-interleaved state representation where each 64-bit lane is split into even and odd 32-bit halves for efficient 32-bit MVE processing.

Co-Authored-By: Brendan Moran <brendan.moran@arm.com>
Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
1 parent 1601cb6 commit bb40f72

16 files changed

Lines changed: 1507 additions & 207 deletions

dev/fips202/armv81m/mve.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,18 @@
1111

1212
/* Part of backend API */
1313
#define MLD_USE_FIPS202_X4_NATIVE
14+
#define MLD_USE_FIPS202_X4_XOR_BYTES_NATIVE
15+
#define MLD_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE
1416
/* Guard for assembly file */
1517
#define MLD_FIPS202_ARMV81M_NEED_X4
1618

1719
#if !defined(__ASSEMBLER__)
1820
#include "../api.h"
1921

22+
/*
23+
* Native x4 permutation
24+
* State is kept in bit-interleaved format.
25+
*/
2026
#define mld_keccak_f1600_x4_native_impl \
2127
MLD_NAMESPACE(keccak_f1600_x4_native_impl)
2228
/*
 * Out-of-line x4 permutation entry point; the inline wrapper
 * mld_keccak_f1600_x4_native below simply returns its result.
 *
 * state: batched x4 Keccak state, modified in place.
 *        NOTE(review): presumably the 800-byte bit-interleaved buffer
 *        (uint64_t[100]) -- confirm against the definition.
 */
int mld_keccak_f1600_x4_native_impl(uint64_t *state);
@@ -27,6 +33,48 @@ static MLD_INLINE int mld_keccak_f1600_x4_native(uint64_t *state)
2733
return mld_keccak_f1600_x4_native_impl(state);
2834
}
2935

36+
/*
37+
* Native x4 XOR bytes (with on-the-fly bit interleaving)
38+
*/
39+
/*
 * Header-local name for the XOR-bytes routine. Note that the macro name has
 * no _asm suffix, but it expands to
 * MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm).
 */
#define mld_keccak_f1600_x4_state_xor_bytes \
40+
MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
41+
/*
 * state:         batched x4 Keccak state, updated in place.
 *                NOTE(review): presumably the bit-interleaved x4 buffer used
 *                by the permutation -- confirm against the implementation.
 * data0..data3:  read-only input bytes; presumably one pointer per Keccak
 *                instance (inferred from the x4 naming -- confirm).
 * offset/length: NOTE(review): presumably a byte offset into each
 *                instance's state and the number of bytes to XOR -- confirm.
 */
void mld_keccak_f1600_x4_state_xor_bytes(void *state, const uint8_t *data0,
42+
const uint8_t *data1,
43+
const uint8_t *data2,
44+
const uint8_t *data3, unsigned offset,
45+
unsigned length);
46+
47+
/*
 * Backend wrapper for the x4 XOR-bytes operation.
 *
 * Forwards every argument unchanged to mld_keccak_f1600_x4_state_xor_bytes.
 * That routine returns void, so this wrapper has no failure path and always
 * returns MLD_NATIVE_FUNC_SUCCESS; callers must still consume the result
 * because of MLD_MUST_CHECK_RETURN_VALUE.
 *
 * NOTE(review): the meaning of offset/length (presumably byte offset and
 * byte count per instance) is defined by the callee -- confirm there.
 */
MLD_MUST_CHECK_RETURN_VALUE
48+
static MLD_INLINE int mld_keccakf1600_xor_bytes_x4_native(
49+
uint64_t *state, const uint8_t *data0, const uint8_t *data1,
50+
const uint8_t *data2, const uint8_t *data3, unsigned offset,
51+
unsigned length)
52+
{
53+
/* uint64_t *state converts implicitly to the callee's void *state. */
mld_keccak_f1600_x4_state_xor_bytes(state, data0, data1, data2, data3, offset,
54+
length);
55+
return MLD_NATIVE_FUNC_SUCCESS;
56+
}
57+
58+
/*
59+
* Native x4 extract bytes (with on-the-fly bit de-interleaving)
60+
*/
61+
/*
 * Header-local name for the extract-bytes routine. Note that the macro name
 * has no _asm suffix, but it expands to
 * MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm).
 */
#define mld_keccak_f1600_x4_state_extract_bytes \
61+
MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
62+
/*
 * state:         batched x4 Keccak state.
 *                NOTE(review): declared as non-const void *; presumably only
 *                read by this routine -- confirm against the implementation.
 * data0..data3:  writable output buffers; presumably one per Keccak instance
 *                (inferred from the x4 naming -- confirm).
 * offset/length: NOTE(review): presumably a byte offset into each
 *                instance's state and the number of bytes to extract --
 *                confirm.
 */
void mld_keccak_f1600_x4_state_extract_bytes(void *state, uint8_t *data0,
63+
uint8_t *data1, uint8_t *data2,
64+
uint8_t *data3, unsigned offset,
65+
unsigned length);
67+
68+
/*
 * Backend wrapper for the x4 extract-bytes operation.
 *
 * Forwards every argument unchanged to
 * mld_keccak_f1600_x4_state_extract_bytes. That routine returns void, so
 * this wrapper has no failure path and always returns
 * MLD_NATIVE_FUNC_SUCCESS; callers must still consume the result because of
 * MLD_MUST_CHECK_RETURN_VALUE.
 *
 * NOTE(review): the meaning of offset/length (presumably byte offset and
 * byte count per instance) is defined by the callee -- confirm there.
 */
MLD_MUST_CHECK_RETURN_VALUE
69+
static MLD_INLINE int mld_keccakf1600_extract_bytes_x4_native(
70+
uint64_t *state, uint8_t *data0, uint8_t *data1, uint8_t *data2,
71+
uint8_t *data3, unsigned offset, unsigned length)
72+
{
73+
/* uint64_t *state converts implicitly to the callee's void *state. */
mld_keccak_f1600_x4_state_extract_bytes(state, data0, data1, data2, data3,
74+
offset, length);
75+
return MLD_NATIVE_FUNC_SUCCESS;
76+
}
77+
3078
#endif /* !__ASSEMBLER__ */
3179

3280
#endif /* !MLD_DEV_FIPS202_ARMV81M_MVE_H */

dev/fips202/armv81m/src/fips202_native_armv81m.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,20 @@ extern const uint32_t mld_keccakf1600_round_constants[48];
1717
void mld_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100],
1818
const uint32_t rc[48]);
1919

20+
/*
 * XOR-bytes routine operating on the batched x4 Keccak state.
 * NOTE(review): presumably implemented in MVE assembly (per the _asm
 * suffix) -- confirm.
 */
#define mld_keccak_f1600_x4_state_xor_bytes_asm \
21+
MLD_NAMESPACE(keccak_f1600_x4_state_xor_bytes_asm)
22+
/*
 * state:         batched x4 Keccak state, updated in place (presumed).
 * d0..d3:        read-only input bytes; presumably one pointer per Keccak
 *                instance -- confirm.
 *                NOTE(review): the extract-bytes prototype in this header
 *                names the corresponding parameters data0..data3; consider
 *                aligning the names.
 * offset/length: NOTE(review): presumably byte offset into each instance's
 *                state and byte count -- confirm against the implementation.
 */
void mld_keccak_f1600_x4_state_xor_bytes_asm(void *state, const uint8_t *d0,
23+
const uint8_t *d1,
24+
const uint8_t *d2,
25+
const uint8_t *d3, unsigned offset,
26+
unsigned length);
27+
28+
/*
 * Extract-bytes routine operating on the batched x4 Keccak state.
 * NOTE(review): presumably implemented in MVE assembly (per the _asm
 * suffix) -- confirm.
 */
#define mld_keccak_f1600_x4_state_extract_bytes_asm \
29+
MLD_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
30+
/*
 * state:         batched x4 Keccak state; declared non-const, presumably
 *                read-only here -- confirm.
 * data0..data3:  writable output buffers; presumably one per Keccak
 *                instance -- confirm.
 * offset/length: NOTE(review): presumably byte offset into each instance's
 *                state and byte count -- confirm against the implementation.
 */
void mld_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0,
31+
uint8_t *data1, uint8_t *data2,
32+
uint8_t *data3,
33+
unsigned offset,
34+
unsigned length);
35+
2036
#endif /* !MLD_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */

dev/fips202/armv81m/src/keccak_f1600_x4_mve.S

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77

88
/*yaml
99
Name: keccak_f1600_x4_mve_asm
10-
Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state
10+
Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state
1111
Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc)
1212
ABI:
1313
r0:
1414
type: buffer
1515
size_bytes: 800
1616
permissions: read/write
1717
c_parameter: void *state
18-
description: Four bit-interleaved Keccak states (low halves followed by high halves)
18+
description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves)
1919
r1:
2020
type: buffer
2121
size_bytes: 800
@@ -33,6 +33,36 @@
3333
description: register preservation (44) + SIMD registers (64) + temporary storage (128)
3434
*/
3535

36+
// ---------------------------------------------------------------------------
37+
// Bit-interleaving background
38+
// ---------------------------------------------------------------------------
39+
// Each 64-bit Keccak lane is stored as two 32-bit words:
40+
// even half -- bits 0, 2, 4, ..., 62 of the lane
41+
// odd half -- bits 1, 3, 5, ..., 63 of the lane
42+
// This representation allows 64-bit lane rotations (used in the Keccak
43+
// round function) to be implemented as pairs of 32-bit rotations.
44+
//
45+
// Batched (x4) processing:
46+
// Four Keccak instances are processed as a batch. Their states are
47+
// stored interleaved in a single 800-byte buffer: first the even
48+
// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes).
49+
// Within each 16-byte row, the four u32 words correspond to
50+
// instances 0..3 of the same lane, enabling SIMD-parallel operations
51+
// across all four instances.
52+
//
53+
// State memory layout (25 lanes x 4 instances x 2 halves):
54+
// S[i][l]_even/odd = even/odd half of lane l, instance i (u32)
55+
// Each row is 16 bytes (one Q-register).
56+
// Offset Contents
57+
// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even
58+
// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even
59+
// ...
60+
// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even
61+
// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd
62+
// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd
63+
// ...
64+
// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd
65+
3666
#include "../../../../common.h"
3767
#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
3868
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
@@ -426,7 +456,7 @@ qA20_l .req q2
426456
.endm
427457

428458
.text
429-
.balign 8
459+
.balign 4
430460
.type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function
431461
.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm)
432462
MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)

dev/fips202/armv81m/src/keccak_f1600_x4_mve.c

Lines changed: 5 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -12,114 +12,19 @@
1212

1313
#include "fips202_native_armv81m.h"
1414

15-
/*
16-
* TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
17-
* TODO: Replace with optimized MVE assembly implementations
18-
* (as a part of XORBytes and ExtractBytes)
19-
*/
20-
21-
/* Extract even-indexed bits from 64-bit value into lower 32 bits */
22-
static uint32_t bitinterleave_even(uint64_t x)
23-
{
24-
uint64_t t;
25-
t = x & 0x5555555555555555ULL;
26-
t = (t | (t >> 1)) & 0x3333333333333333ULL;
27-
t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL;
28-
t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL;
29-
t = (t | (t >> 8)) & 0x0000ffff0000ffffULL;
30-
t = (t | (t >> 16)) & 0x00000000ffffffffULL;
31-
return (uint32_t)t;
32-
}
33-
34-
/* Extract odd-indexed bits from 64-bit value into lower 32 bits */
35-
static uint32_t bitinterleave_odd(uint64_t x)
36-
{
37-
return bitinterleave_even(x >> 1);
38-
}
39-
40-
/* Spread 32-bit value across even bit positions of 64-bit result */
41-
static uint64_t spread_even(uint32_t x)
42-
{
43-
uint64_t t = x;
44-
t = (t | (t << 16)) & 0x0000ffff0000ffffULL;
45-
t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL;
46-
t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL;
47-
t = (t | (t << 2)) & 0x3333333333333333ULL;
48-
t = (t | (t << 1)) & 0x5555555555555555ULL;
49-
return t;
50-
}
51-
52-
/* Combine even and odd 32-bit halves into interleaved 64-bit value */
53-
static uint64_t bitdeinterleave(uint32_t even, uint32_t odd)
54-
{
55-
return spread_even(even) | (spread_even(odd) << 1);
56-
}
5715

5816
/*
59-
* TEMPORARY: Naive C interleaving functions.
60-
* These will be replaced with optimized MVE assembly implementations.
17+
* Keccak-f1600 x4 permutation (on bit-interleaved state)
18+
* State is expected to already be in bit-interleaved format.
6119
*/
62-
static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0,
63-
const uint64_t *state1, const uint64_t *state2,
64-
const uint64_t *state3)
65-
{
66-
uint32_t *state_4xl = (uint32_t *)state_4x;
67-
uint32_t *state_4xh = (uint32_t *)state_4x + 100;
68-
69-
for (size_t i = 0; i < 25; i++)
70-
{
71-
state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]);
72-
state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]);
73-
state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]);
74-
state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]);
75-
76-
state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]);
77-
state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]);
78-
state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]);
79-
state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]);
80-
}
81-
}
82-
83-
static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0,
84-
uint64_t *state1, uint64_t *state2,
85-
uint64_t *state3)
86-
{
87-
uint32_t *state_4xl = (uint32_t *)state_4x;
88-
uint32_t *state_4xh = (uint32_t *)state_4x + 100;
89-
90-
for (size_t i = 0; i < 25; i++)
91-
{
92-
state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]);
93-
state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]);
94-
state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]);
95-
state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]);
96-
}
97-
}
98-
9920
#define mld_keccak_f1600_x4_native_impl \
10021
MLD_NAMESPACE(keccak_f1600_x4_native_impl)
10122
/*
 * Diff view: lines marked '-' below are the pre-commit body, lines marked
 * '+' are the post-commit body.
 *
 * Post-commit behavior: call mld_keccak_f1600_x4_mve_asm directly on the
 * caller's `state`, with a local scratch buffer as tmpstate and
 * mld_keccakf1600_round_constants as rc, then zeroize the scratch buffer.
 * No interleave/deinterleave step is performed here any more.
 *
 * Returns MLD_NATIVE_FUNC_SUCCESS unconditionally.
 */
int mld_keccak_f1600_x4_native_impl(uint64_t *state)
10223
{
103-
/*
104-
* TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
105-
* TODO: Replace with optimized MVE assembly implementations
106-
* (as a part of XORBytes and ExtractBytes)
107-
*/
108-
MLD_ALIGN uint64_t state_4x[100];
109-
MLD_ALIGN uint64_t state_4x_tmp[100];
110-
111-
/* Interleave the 4 states into bit-interleaved format */
112-
interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);
113-
114-
/* Run the permutation */
115-
mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp,
24+
/* Scratch buffer handed to the assembly routine as its tmpstate argument. */
MLD_ALIGN uint64_t state_tmp[100];
25+
mld_keccak_f1600_x4_mve_asm(state, state_tmp,
11626
mld_keccakf1600_round_constants);
117-
118-
/* Deinterleave back to 4 separate states */
119-
deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);
120-
121-
mld_zeroize(state_4x, sizeof(state_4x));
122-
mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp));
27+
/* Wipe the scratch buffer before it goes out of scope. */
mld_zeroize(state_tmp, sizeof(state_tmp));
12328
return MLD_NATIVE_FUNC_SUCCESS;
12429
}
12530

0 commit comments

Comments
 (0)