77
88/ * yaml
99 Name: keccak_f1600_x4_mve_asm
10- Description: Armv8. 1 - M MVE implementation of 4 - way parallel Keccak- f [ 1600 ] permutation using bit - interleaved state
10+ Description: Armv8. 1 - M MVE implementation of batched (x4) Keccak - f [ 1600 ] permutation using bit - interleaved state
1111 Signature: void mld_keccak_f1600_x4_mve_asm(void * state , void * tmpstate , const uint32_t * rc)
1212 ABI:
1313 r0:
1414 type: buffer
1515 size_bytes: 800
1616 permissions: read/write
1717 c_parameter: void * state
18- description: Four bit - interleaved Keccak states (low halves followed by high halves)
18+ description: Bit - interleaved state for 4 Keccak instances (even halves followed by odd halves)
1919 r1:
2020 type: buffer
2121 size_bytes: 800
2929 c_parameter: const uint32_t * rc
3030 description: Keccak round constants in bit - interleaved form ( 24 pairs of 32 - bit words)
3131 Stack:
32- bytes: 236
33- description: register preservation ( 44 ) + SIMD registers ( 64 ) + temporary storage ( 128 )
32+ bytes: 228
33+ description: register preservation ( 36 ) + SIMD registers ( 64 ) + temporary storage ( 128 )
3434* /
3535
36+ // ---------------------------------------------------------------------------
37+ // Bit - interleaving background
38+ // ---------------------------------------------------------------------------
39+ // Each 64 - bit Keccak lane is stored as two 32 - bit words:
40+ // even half -- bits 0 , 2 , 4 , ... , 62 of the lane
41+ // odd half -- bits 1 , 3 , 5 , ... , 63 of the lane
42+ // This representation allows 64 - bit lane rotations (used in the Keccak
43+ // round function) to be implemented as pairs of 32 - bit rotations.
44+ //
45+ // Batched (x4) processing:
46+ // Four Keccak instances are processed as a batch. Their states are
47+ // stored interleaved in a single 800 - byte buffer: first the even
48+ // halves of all 25 lanes ( 400 bytes) , then the odd halves ( 400 bytes).
49+ // Within each 16 - byte row , the four u32 words correspond to
50+ // instances 0 .. 3 of the same lane , enabling SIMD - parallel operations
51+ // across all four instances.
52+ //
53+ // State memory layout ( 25 lanes x 4 instances x 2 halves):
54+ // S [ i ][ l ] _even/odd = even/odd half of lane l , instance i (u32)
55+ // Each row is 16 bytes (one Q - register).
56+ // Offset Contents
57+ // 0 S [ 0 ][ 0 ] _even , S [ 1 ][ 0 ] _even , S [ 2 ][ 0 ] _even , S [ 3 ][ 0 ] _even
58+ // 16 S [ 0 ][ 1 ] _even , S [ 1 ][ 1 ] _even , S [ 2 ][ 1 ] _even , S [ 3 ][ 1 ] _even
59+ // ...
60+ // 384 S [ 0 ][ 24 ] _even , S [ 1 ][ 24 ] _even , S [ 2 ][ 24 ] _even , S [ 3 ][ 24 ] _even
61+ // 400 S [ 0 ][ 0 ] _odd , S [ 1 ][ 0 ] _odd , S [ 2 ][ 0 ] _odd , S [ 3 ][ 0 ] _odd
62+ // 416 S [ 0 ][ 1 ] _odd , S [ 1 ][ 1 ] _odd , S [ 2 ][ 1 ] _odd , S [ 3 ][ 1 ] _odd
63+ // ...
64+ // 784 S [ 0 ][ 24 ] _odd , S [ 1 ][ 24 ] _odd , S [ 2 ][ 24 ] _odd , S [ 3 ][ 24 ] _odd
65+
3666#include "../../../../common.h"
3767#if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
3868 !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
.global MLD_ASM_NAMESPACE(keccak_f1600_x4_mve_asm)
MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)

        // Prologue. The frame is built in three steps; each step is followed
        // by CFI annotations that keep the CFA and the save slots described:
        //   0x24 bytes  r4-r11 and lr
        //   0x40 bytes  d8-d15
        //   0x80 bytes  scratch area below the saved registers
        .cfi_startproc

        push.w {r4-r11, lr}
        .cfi_adjust_cfa_offset 0x24
        .cfi_rel_offset r4, 0x0
        .cfi_rel_offset r5, 0x4
        .cfi_rel_offset r6, 0x8
        .cfi_rel_offset r7, 0xc
        .cfi_rel_offset r8, 0x10
        .cfi_rel_offset r9, 0x14
        .cfi_rel_offset r10, 0x18
        .cfi_rel_offset r11, 0x1c
        .cfi_rel_offset lr, 0x20

        vpush {d8-d15}
        .cfi_adjust_cfa_offset 0x40
        .cfi_rel_offset d8, 0x0
        .cfi_rel_offset d9, 0x8
        .cfi_rel_offset d10, 0x10
        .cfi_rel_offset d11, 0x18
        .cfi_rel_offset d12, 0x20
        .cfi_rel_offset d13, 0x28
        .cfi_rel_offset d14, 0x30
        .cfi_rel_offset d15, 0x38

        sub sp, #0x80
        .cfi_adjust_cfa_offset 0x80

        // Move the incoming arguments out of the way:
        //   r6 <- r2 (rc, third argument)
        //   lr <- 0x18 (24), the count later handed to `wls lr, lr, ...`
        //   r2 <- r0 (state, first argument)
        mov r6, r2
        mov.w lr, #0x18
        mov r2, r0
@@ -61,9 +112,9 @@ MLD_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm)
61112 vldrw.u32 q0 , [ r3 ]
62113 vldrw.u32 q1 , [ r2 ]
63114 vldrw.u32 q2 , [ r2 , # 32 ]
64- wls lr , lr , keccak_f1600_x4_mve_asm_roundend @ imm = # 0x8c0
115+ wls lr , lr , Lkeccak_f1600_x4_mve_asm_roundend @ imm = # 0x8c0
65116
66- keccak_f1600_x4_mve_asm_roundstart :
117+ Lkeccak_f1600_x4_mve_asm_roundstart :
67118 vldrw.u32 q6 , [ r2 , # 112 ]
68119 veor q7 , q6 , q2
69120 vldrw.u32 q2 , [ r2 , # 80 ]
@@ -624,13 +675,34 @@ keccak_f1600_x4_mve_asm_roundstart:
624675 veor q0 , q4 , q6
625676 vstrw. 32 q0 , [ r5 ]
626677
627- keccak_f1600_x4_mve_asm_roundend_pre :
628- le lr , keccak_f1600_x4_mve_asm_roundstart @ imm = # - 0x8c0
678+ Lkeccak_f1600_x4_mve_asm_roundend_pre :
679+ le lr , Lkeccak_f1600_x4_mve_asm_roundstart @ imm = # - 0x8c0
629680
630- keccak_f1600_x4_mve_asm_roundend :
Lkeccak_f1600_x4_mve_asm_roundend:
        // Epilogue: tear the frame down in the reverse order of the prologue,
        // keeping the CFA adjustments balanced with the ones made on entry
        // (0x80 scratch, 0x40 for d8-d15, 0x24 for r4-r11 + lr).
        add sp, #0x80
        .cfi_adjust_cfa_offset -0x80

        vpop {d8, d9, d10, d11, d12, d13, d14, d15}
        .cfi_restore d8
        .cfi_restore d9
        .cfi_restore d10
        .cfi_restore d11
        .cfi_restore d12
        .cfi_restore d13
        .cfi_restore d14
        .cfi_restore d15
        .cfi_adjust_cfa_offset -0x40

        // The register list must mirror `push.w {r4-r11, lr}` from the
        // prologue exactly: nine words, with the saved lr loaded directly
        // into pc to return. Popping the former eleven-register list
        // ({r3-r12, pc}) would read 8 bytes beyond what was pushed, restore
        // every register from the wrong slot and return to a bogus address.
        pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
        .cfi_restore r4
        .cfi_restore r5
        .cfi_restore r6
        .cfi_restore r7
        .cfi_restore r8
        .cfi_restore r9
        .cfi_restore r10
        .cfi_restore r11
        .cfi_restore lr
        // 9 registers * 4 bytes = 0x24, matching the prologue's adjustment.
        .cfi_adjust_cfa_offset -0x24
        .cfi_endproc
        nop
635707
636708MLD_ASM_FN_SIZE(keccak_f1600_x4_mve_asm)
0 commit comments