Skip to content

Commit a34c798

Browse files
committed
Armv8.1-M: Add CFI directives for stack unwinding
Ports pq-code-package/mlkem-native#1558. Signed-off-by: Matthias J. Kannwischer <matthias@zerorisc.com>
1 parent f57eae6 commit a34c798

4 files changed

Lines changed: 396 additions & 79 deletions

File tree

dev/fips202/armv81m/src/keccak_f1600_x4_mve.S

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77

88
/*yaml
99
Name: keccak_f1600_x4_mve_asm
10-
Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state
10+
Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state
1111
Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc)
1212
ABI:
1313
r0:
1414
type: buffer
1515
size_bytes: 800
1616
permissions: read/write
1717
c_parameter: void *state
18-
description: Four bit-interleaved Keccak states (low halves followed by high halves)
18+
description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves)
1919
r1:
2020
type: buffer
2121
size_bytes: 800
@@ -29,10 +29,40 @@
2929
c_parameter: const uint32_t *rc
3030
description: Keccak round constants in bit-interleaved form (24 pairs of 32-bit words)
3131
Stack:
32-
bytes: 236
33-
description: register preservation (44) + SIMD registers (64) + temporary storage (128)
32+
bytes: 228
33+
description: register preservation (36) + SIMD registers (64) + temporary storage (128)
3434
*/
3535

36+
// ---------------------------------------------------------------------------
37+
// Bit-interleaving background
38+
// ---------------------------------------------------------------------------
39+
// Each 64-bit Keccak lane is stored as two 32-bit words:
40+
// even half -- bits 0, 2, 4, ..., 62 of the lane
41+
// odd half -- bits 1, 3, 5, ..., 63 of the lane
42+
// This representation allows 64-bit lane rotations (used in the Keccak
43+
// round function) to be implemented as pairs of 32-bit rotations.
44+
//
45+
// Batched (x4) processing:
46+
// Four Keccak instances are processed as a batch. Their states are
47+
// stored interleaved in a single 800-byte buffer: first the even
48+
// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes).
49+
// Within each 16-byte row, the four u32 words correspond to
50+
// instances 0..3 of the same lane, enabling SIMD-parallel operations
51+
// across all four instances.
52+
//
53+
// State memory layout (25 lanes x 4 instances x 2 halves):
54+
// S[i][l]_even/odd = even/odd half of lane l, instance i (u32)
55+
// Each row is 16 bytes (one Q-register).
56+
// Offset Contents
57+
// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even
58+
// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even
59+
// ...
60+
// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even
61+
// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd
62+
// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd
63+
// ...
64+
// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd
65+
3666
#include "../../../../common.h"
3767
#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
3868
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
@@ -426,12 +456,12 @@ qA20_l .req q2
426456
.endm
427457

428458
.text
429-
.balign 8
459+
.balign 4
430460
.type MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function
431461
.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm)
432462
MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)
433463

434-
push {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
464+
push {r4,r5,r6,r7,r8,r9,r10,r11,lr}
435465
vpush {d8-d15}
436466
sub sp, #8*16
437467

mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S

Lines changed: 82 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77

88
/*yaml
99
Name: keccak_f1600_x4_mve_asm
10-
Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state
10+
Description: Armv8.1-M MVE implementation of batched (x4) Keccak-f[1600] permutation using bit-interleaved state
1111
Signature: void mld_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc)
1212
ABI:
1313
r0:
1414
type: buffer
1515
size_bytes: 800
1616
permissions: read/write
1717
c_parameter: void *state
18-
description: Four bit-interleaved Keccak states (low halves followed by high halves)
18+
description: Bit-interleaved state for 4 Keccak instances (even halves followed by odd halves)
1919
r1:
2020
type: buffer
2121
size_bytes: 800
@@ -29,10 +29,40 @@
2929
c_parameter: const uint32_t *rc
3030
description: Keccak round constants in bit-interleaved form (24 pairs of 32-bit words)
3131
Stack:
32-
bytes: 236
33-
description: register preservation (44) + SIMD registers (64) + temporary storage (128)
32+
bytes: 228
33+
description: register preservation (36) + SIMD registers (64) + temporary storage (128)
3434
*/
3535

36+
// ---------------------------------------------------------------------------
37+
// Bit-interleaving background
38+
// ---------------------------------------------------------------------------
39+
// Each 64-bit Keccak lane is stored as two 32-bit words:
40+
// even half -- bits 0, 2, 4, ..., 62 of the lane
41+
// odd half -- bits 1, 3, 5, ..., 63 of the lane
42+
// This representation allows 64-bit lane rotations (used in the Keccak
43+
// round function) to be implemented as pairs of 32-bit rotations.
44+
//
45+
// Batched (x4) processing:
46+
// Four Keccak instances are processed as a batch. Their states are
47+
// stored interleaved in a single 800-byte buffer: first the even
48+
// halves of all 25 lanes (400 bytes), then the odd halves (400 bytes).
49+
// Within each 16-byte row, the four u32 words correspond to
50+
// instances 0..3 of the same lane, enabling SIMD-parallel operations
51+
// across all four instances.
52+
//
53+
// State memory layout (25 lanes x 4 instances x 2 halves):
54+
// S[i][l]_even/odd = even/odd half of lane l, instance i (u32)
55+
// Each row is 16 bytes (one Q-register).
56+
// Offset Contents
57+
// 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even
58+
// 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even
59+
// ...
60+
// 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even
61+
// 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd
62+
// 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd
63+
// ...
64+
// 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd
65+
3666
#include "../../../../common.h"
3767
#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
3868
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
@@ -50,9 +80,30 @@
5080
.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm)
5181
MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)
5282

53-
push.w {r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
83+
.cfi_startproc
84+
push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
85+
.cfi_adjust_cfa_offset 0x24
86+
.cfi_rel_offset r4, 0x0
87+
.cfi_rel_offset r5, 0x4
88+
.cfi_rel_offset r6, 0x8
89+
.cfi_rel_offset r7, 0xc
90+
.cfi_rel_offset r8, 0x10
91+
.cfi_rel_offset r9, 0x14
92+
.cfi_rel_offset r10, 0x18
93+
.cfi_rel_offset r11, 0x1c
94+
.cfi_rel_offset lr, 0x20
5495
vpush {d8, d9, d10, d11, d12, d13, d14, d15}
96+
.cfi_adjust_cfa_offset 0x40
97+
.cfi_rel_offset d8, 0x0
98+
.cfi_rel_offset d9, 0x8
99+
.cfi_rel_offset d10, 0x10
100+
.cfi_rel_offset d11, 0x18
101+
.cfi_rel_offset d12, 0x20
102+
.cfi_rel_offset d13, 0x28
103+
.cfi_rel_offset d14, 0x30
104+
.cfi_rel_offset d15, 0x38
55105
sub sp, #0x80
106+
.cfi_adjust_cfa_offset 0x80
56107
mov r6, r2
57108
mov.w lr, #0x18
58109
mov r2, r0
@@ -61,9 +112,9 @@ MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)
61112
vldrw.u32 q0, [r3]
62113
vldrw.u32 q1, [r2]
63114
vldrw.u32 q2, [r2, #32]
64-
wls lr, lr, keccak_f1600_x4_mve_asm_roundend @ imm = #0x8c0
115+
wls lr, lr, Lkeccak_f1600_x4_mve_asm_roundend @ imm = #0x8c0
65116

66-
keccak_f1600_x4_mve_asm_roundstart:
117+
Lkeccak_f1600_x4_mve_asm_roundstart:
67118
vldrw.u32 q6, [r2, #112]
68119
veor q7, q6, q2
69120
vldrw.u32 q2, [r2, #80]
@@ -624,13 +675,34 @@ keccak_f1600_x4_mve_asm_roundstart:
624675
veor q0, q4, q6
625676
vstrw.32 q0, [r5]
626677

627-
keccak_f1600_x4_mve_asm_roundend_pre:
628-
le lr, keccak_f1600_x4_mve_asm_roundstart @ imm = #-0x8c0
678+
Lkeccak_f1600_x4_mve_asm_roundend_pre:
679+
le lr, Lkeccak_f1600_x4_mve_asm_roundstart @ imm = #-0x8c0
629680

630-
keccak_f1600_x4_mve_asm_roundend:
681+
Lkeccak_f1600_x4_mve_asm_roundend:
631682
add sp, #0x80 @ release the 0x80-byte scratch area reserved by `sub sp, #0x80` in the prologue
683+
.cfi_adjust_cfa_offset -0x80
632684
vpop {d8, d9, d10, d11, d12, d13, d14, d15}
685+
.cfi_restore d8
686+
.cfi_restore d9
687+
.cfi_restore d10
688+
.cfi_restore d11
689+
.cfi_restore d12
690+
.cfi_restore d13
691+
.cfi_restore d14
692+
.cfi_restore d15
693+
.cfi_adjust_cfa_offset -0x40
633694
pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @ must mirror the prologue push {r4-r11, lr}: 9 words, saved lr is popped into pc
695+
.cfi_restore r4
696+
.cfi_restore r5
697+
.cfi_restore r6
698+
.cfi_restore r7
699+
.cfi_restore r8
700+
.cfi_restore r9
701+
.cfi_restore r10
702+
.cfi_restore r11
703+
.cfi_restore lr
704+
.cfi_adjust_cfa_offset -0x24
705+
.cfi_endproc
634706
nop
635707

636708
MLD_ASM_FN_SIZE(keccak_f1600_x4_mve_asm)

scripts/autogen

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2312,9 +2312,7 @@ def update_via_simpasm(
23122312
"-o",
23132313
tmp.name,
23142314
]
2315-
# TODO: Support CFI for Armv8.1-M
2316-
if arch != "armv81m":
2317-
cmd += ["--cfify"]
2315+
cmd += ["--cfify"]
23182316
if cross_prefix is not None:
23192317
# Stick with llvm-objdump for disassembly
23202318
cmd += ["--cc", cross_prefix + "gcc"]

0 commit comments

Comments
 (0)