Skip to content

Commit 77618aa

Browse files
committed
AArch64: Add NEON polyw1_pack to AArch64 native backend
Add AArch64 assembly implementations of polyw1_pack for both GAMMA2 variants using TBL-based byte extraction from 32-bit coefficient lanes. Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
1 parent 1bb503b commit 77618aa

8 files changed

Lines changed: 580 additions & 0 deletions

File tree

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
#include "../../../common.h"
7+
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
8+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
9+
MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
10+
/* simpasm: header-end */
11+
12+
/*
 * polyw1_pack_32_asm -- w1 packing for GAMMA2 = (Q-1)/32.
 *
 * Input:  256 coefficients, one per 32-bit word, each in [0, 15].
 * Output: 128 bytes holding two 4-bit coefficients each:
 *             r[i] = a[2i] | (a[2i+1] << 4)
 *
 * Register usage:
 *   x0        output pointer, post-incremented by 16 per iteration
 *   x1        input pointer, advanced by 128 per iteration
 *   x2        iteration counter (8 iterations of 32 coefficients)
 *   x3        scratch used to assemble the TBL index
 *   v0-v3     loaded coefficient words, then packing temporaries
 *   v16, v17  low bytes of coefficients 0..15 / 16..31 of the batch
 *   v20       TBL index vector
 * The condition flags are updated by the loop counter.
 *
 * Only byte 0 of every 32-bit word is read, and no masking is applied
 * to it: the even coefficient is OR-ed in as-is, so the result relies
 * on the input range stated above.
 */

out_ptr   .req x0
in_ptr    .req x1
iters     .req x2
scratch   .req x3

byte0_idx .req v20

        .text
        .global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
        .balign 4
MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)

        /* byte0_idx = {0, 4, 8, ..., 60}.
         * Viewing four Q-registers as one 64-byte table, these are the
         * offsets of the least significant byte of each 32-bit word.
         * The constant is assembled 16 bits at a time (little-endian:
         * the low byte of each immediate is the earlier index). */
        movz    scratch, #0x0400
        movk    scratch, #0x0C08, lsl #16
        movk    scratch, #0x1410, lsl #32
        movk    scratch, #0x1C18, lsl #48
        ins     byte0_idx.d[0], scratch
        movz    scratch, #0x2420
        movk    scratch, #0x2C28, lsl #16
        movk    scratch, #0x3430, lsl #32
        movk    scratch, #0x3C38, lsl #48
        ins     byte0_idx.d[1], scratch

        mov     iters, #(256 / 32)

polyw1_pack_32_loop:
        /* First 16 words of the batch -> 16 bytes in v16 */
        ldp     q0, q1, [in_ptr]
        ldp     q2, q3, [in_ptr, #32]
        tbl     v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, byte0_idx.16b

        /* Next 16 words of the batch -> 16 bytes in v17 */
        ldp     q0, q1, [in_ptr, #64]
        ldp     q2, q3, [in_ptr, #96]
        tbl     v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, byte0_idx.16b
        add     in_ptr, in_ptr, #128

        /* High nibbles: odd-numbered coefficients a[1], a[3], ...
         * moved into bits 7..4 of their output byte */
        uzp2    v1.16b, v16.16b, v17.16b
        shl     v1.16b, v1.16b, #4

        /* Low nibbles: even-numbered coefficients a[0], a[2], ... */
        uzp1    v0.16b, v16.16b, v17.16b

        /* r[i] = a[2i] | (a[2i+1] << 4), 16 bytes per iteration */
        orr     v0.16b, v0.16b, v1.16b
        str     q0, [out_ptr], #16

        subs    iters, iters, #1
        bne     polyw1_pack_32_loop

        ret

        .unreq  out_ptr
        .unreq  in_ptr
        .unreq  iters
        .unreq  scratch
        .unreq  byte0_idx
84+
/* simpasm: footer-start */
85+
#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
86+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
87+
|| MLD_CONFIG_PARAMETER_SET == 87) */
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
#include "../../../common.h"
7+
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
8+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
9+
/* simpasm: header-end */
10+
11+
/*
 * polyw1_pack_88_asm -- w1 packing for GAMMA2 = (Q-1)/88.
 *
 * Input:  256 coefficients, one per 32-bit word, each in [0, 43].
 * Output: 192 bytes; every group of 4 coefficients (6 bits each)
 *         becomes 3 bytes, each expression truncated to 8 bits:
 *             r[3i+0] =  a[4i+0]       | (a[4i+1] << 6)
 *             r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4)
 *             r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2)
 *
 * Every output byte is therefore (lo | hi) with
 *
 *     position in triplet      0           1            2
 *     lo source / shift     a[4i+0]>>0  a[4i+1]>>2   a[4i+2]>>4
 *     hi source / shift     a[4i+1]<<6  a[4i+2]<<4   a[4i+3]<<2
 *
 * so the hi source is always the coefficient following the lo source,
 * and the hi shift is always the lo shift plus 6.
 *
 * Each of the 16 iterations handles 16 coefficients: two TBL lookups
 * gather the lo and hi source bytes directly in output order, USHL
 * applies the per-byte shift, and an OR merges them into 12 bytes.
 *
 * Register usage:
 *   x0        output pointer, post-incremented by 12 per iteration
 *   x1        input pointer, advanced by 64 per iteration
 *   x2        iteration counter
 *   x3        scratch used to assemble vector constants
 *   v0-v3     loaded coefficient words (the 64-byte TBL table)
 *   v4, v5    lo / hi components, v4 also holds the merged result
 *   v20, v21  TBL indices for the lo / hi components
 *   v22, v23  USHL shift counts for the lo / hi components
 *   v24       temporary for the byte constants 4 and 6
 * The condition flags are updated by the loop counter.
 */

out_ptr  .req x0
in_ptr   .req x1
iters    .req x2
scratch  .req x3

lo_idx   .req v20
hi_idx   .req v21
lo_shift .req v22
hi_shift .req v23
splat    .req v24

        .text
        .global MLD_ASM_NAMESPACE(polyw1_pack_88_asm)
        .balign 4
MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm)

        /* lo_idx: source byte for the lo component of each output byte.
         * With {v0-v3} viewed as one 64-byte table, coefficient k has
         * its least significant byte at offset 4*k, so triplet i reads
         * offsets 16i+0, 16i+4, 16i+8:
         *
         *   lo_idx = { 0, 4, 8,  16,20,24,  32,36,40,  48,52,56,
         *              0x80,0x80,0x80,0x80 }
         *
         * The last four lanes are padding; 0x80 is out of range for a
         * four-register TBL, which yields 0 for such lanes. */
        movz    scratch, #0x0400
        movk    scratch, #0x1008, lsl #16
        movk    scratch, #0x1814, lsl #32
        movk    scratch, #0x2420, lsl #48
        ins     lo_idx.d[0], scratch
        movz    scratch, #0x3028
        movk    scratch, #0x3834, lsl #16
        movk    scratch, #0x8080, lsl #32
        movk    scratch, #0x8080, lsl #48
        ins     lo_idx.d[1], scratch

        /* lo_shift = { 0,-2,-4,  0,-2,-4,  0,-2,-4,  0,-2,-4,  0,0,0,0 }
         * USHL takes a signed per-lane count, so a negative byte
         * (0xFE = -2, 0xFC = -4) requests a right shift. The upper
         * 32 bits of the second half stay zero from the MOVZ. */
        movz    scratch, #0xFE00
        movk    scratch, #0x00FC, lsl #16
        movk    scratch, #0xFCFE, lsl #32
        movk    scratch, #0xFE00, lsl #48
        ins     lo_shift.d[0], scratch
        movz    scratch, #0x00FC
        movk    scratch, #0xFCFE, lsl #16
        ins     lo_shift.d[1], scratch

        /* hi_idx = lo_idx + 4: the next coefficient's low byte.
         * Padding lanes become 0x84, still >= 64, so TBL gives 0. */
        movi    splat.16b, #4
        add     hi_idx.16b, lo_idx.16b, splat.16b

        /* hi_shift = lo_shift + 6 = { 6, 4, 2, ... } */
        movi    splat.16b, #6
        add     hi_shift.16b, lo_shift.16b, splat.16b

        mov     iters, #(256 / 16)

polyw1_pack_88_loop:
        /* Fetch 16 coefficients (64 bytes) */
        ldp     q0, q1, [in_ptr]
        ldp     q2, q3, [in_ptr, #32]
        add     in_ptr, in_ptr, #64

        /* lo component: gather, then shift right by 0 / 2 / 4 */
        tbl     v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, lo_idx.16b
        ushl    v4.16b, v4.16b, lo_shift.16b

        /* hi component: gather, then shift left by 6 / 4 / 2
         * (bits shifted past bit 7 of the byte lane are dropped) */
        tbl     v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, hi_idx.16b
        ushl    v5.16b, v5.16b, hi_shift.16b

        /* Merge both components */
        orr     v4.16b, v4.16b, v5.16b

        /* Emit the 12 valid bytes as 8 + 4; the padding lanes
         * (bytes 12..15 of v4) are never written out */
        str     d4, [out_ptr], #8
        st1     {v4.s}[2], [out_ptr], #4

        subs    iters, iters, #1
        bne     polyw1_pack_88_loop

        ret

        .unreq  out_ptr
        .unreq  in_ptr
        .unreq  iters
        .unreq  scratch
        .unreq  lo_idx
        .unreq  hi_idx
        .unreq  lo_shift
        .unreq  hi_shift
        .unreq  splat
128+
/* simpasm: footer-start */
129+
#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
130+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
131+
*/
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
#include "../../../common.h"
7+
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
8+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
9+
MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
10+
/* simpasm: header-end */
11+
12+
/*
 * polyw1_pack_32_asm -- w1 packing for GAMMA2 = (Q-1)/32.
 *
 * Input:  256 coefficients, one per 32-bit word, each in [0, 15].
 * Output: 128 bytes holding two 4-bit coefficients each:
 *             r[i] = a[2i] | (a[2i+1] << 4)
 *
 * Register usage:
 *   x0        output pointer, post-incremented by 16 per iteration
 *   x1        input pointer, advanced by 128 per iteration
 *   x2        iteration counter (8 iterations of 32 coefficients)
 *   x3        scratch used to assemble the TBL index
 *   v0-v3     loaded coefficient words, then packing temporaries
 *   v16, v17  low bytes of coefficients 0..15 / 16..31 of the batch
 *   v20       TBL index vector
 * The condition flags are updated by the loop counter.
 *
 * Only byte 0 of every 32-bit word is read, and no masking is applied
 * to it: the even coefficient is OR-ed in as-is, so the result relies
 * on the input range stated above.
 */

out_ptr   .req x0
in_ptr    .req x1
iters     .req x2
scratch   .req x3

byte0_idx .req v20

        .text
        .global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
        .balign 4
MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)

        /* byte0_idx = {0, 4, 8, ..., 60}.
         * Viewing four Q-registers as one 64-byte table, these are the
         * offsets of the least significant byte of each 32-bit word.
         * The constant is assembled 16 bits at a time (little-endian:
         * the low byte of each immediate is the earlier index). */
        movz    scratch, #0x0400
        movk    scratch, #0x0C08, lsl #16
        movk    scratch, #0x1410, lsl #32
        movk    scratch, #0x1C18, lsl #48
        ins     byte0_idx.d[0], scratch
        movz    scratch, #0x2420
        movk    scratch, #0x2C28, lsl #16
        movk    scratch, #0x3430, lsl #32
        movk    scratch, #0x3C38, lsl #48
        ins     byte0_idx.d[1], scratch

        mov     iters, #(256 / 32)

polyw1_pack_32_loop:
        /* First 16 words of the batch -> 16 bytes in v16 */
        ldp     q0, q1, [in_ptr]
        ldp     q2, q3, [in_ptr, #32]
        tbl     v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, byte0_idx.16b

        /* Next 16 words of the batch -> 16 bytes in v17 */
        ldp     q0, q1, [in_ptr, #64]
        ldp     q2, q3, [in_ptr, #96]
        tbl     v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, byte0_idx.16b
        add     in_ptr, in_ptr, #128

        /* High nibbles: odd-numbered coefficients a[1], a[3], ...
         * moved into bits 7..4 of their output byte */
        uzp2    v1.16b, v16.16b, v17.16b
        shl     v1.16b, v1.16b, #4

        /* Low nibbles: even-numbered coefficients a[0], a[2], ... */
        uzp1    v0.16b, v16.16b, v17.16b

        /* r[i] = a[2i] | (a[2i+1] << 4), 16 bytes per iteration */
        orr     v0.16b, v0.16b, v1.16b
        str     q0, [out_ptr], #16

        subs    iters, iters, #1
        bne     polyw1_pack_32_loop

        ret

        .unreq  out_ptr
        .unreq  in_ptr
        .unreq  iters
        .unreq  scratch
        .unreq  byte0_idx
84+
/* simpasm: footer-start */
85+
#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
86+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
87+
|| MLD_CONFIG_PARAMETER_SET == 87) */

0 commit comments

Comments
 (0)