Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions BIBLIOGRAPHY.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ source code and documentation.
- [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
- [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
- [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
- [dev/x86_64/src/polyw1_pack_32_avx2.c](dev/x86_64/src/polyw1_pack_32_avx2.c)
- [dev/x86_64/src/polyw1_pack_88_avx2.c](dev/x86_64/src/polyw1_pack_88_avx2.c)
- [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
- [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
- [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
Expand All @@ -243,6 +245,8 @@ source code and documentation.
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
- [mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c)
- [mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c)
- [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
- [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
- [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)
Expand Down
40 changes: 40 additions & 0 deletions dev/aarch64_clean/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#define MLD_USE_NATIVE_POLY_CHKNORM
#define MLD_USE_NATIVE_POLYZ_UNPACK_17
#define MLD_USE_NATIVE_POLYZ_UNPACK_19
#define MLD_USE_NATIVE_POLYW1_PACK_32
#define MLD_USE_NATIVE_POLYW1_PACK_88
#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
Expand Down Expand Up @@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87 */

#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
/* Native AArch64 hook for packing a w1 polynomial whose coefficients fit in
 * 4 bits. Forwards to mld_polyw1_pack_32_asm, which reads the 32-bit
 * coefficients from a and writes the packed bytes to r.
 *
 * The assembly routine returns void and has no failure path, so this wrapper
 * always reports MLD_NATIVE_FUNC_SUCCESS. */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
{
mld_polyw1_pack_32_asm(r, a);
return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87 */

#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
/* Constants handed to mld_polyw1_pack_88_asm, organised as four 16-byte rows:
 *
 *   bytes  0..15  v_shifts: USHL shift counts {0, 6, 12, 18}, one per .4s lane
 *   bytes 16..31  v_tbl0:   TBL byte selectors for out0, taken from {v16, v17}
 *   bytes 32..47  v_tbl1:   TBL byte selectors for out1, taken from {v17, v18}
 *   bytes 48..63  v_tbl2:   TBL byte selectors for out2, taken from {v18, v19}
 *
 * In each selector row, values 0..15 address the first register of the pair
 * and 16..31 the second. No selector ever names byte 3 of a 32-bit lane
 * (3, 7, 11, ...): only the three low bytes of every lane are kept. */
/* clang-format off */
MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
  /* v_shifts: four little-endian uint32_t lanes */
   0, 0, 0, 0,                                       /* lane 0: shift by 0  */
   6, 0, 0, 0,                                       /* lane 1: shift by 6  */
  12, 0, 0, 0,                                       /* lane 2: shift by 12 */
  18, 0, 0, 0,                                       /* lane 3: shift by 18 */
  /* v_tbl0: 12 bytes from the first register, 4 from the second */
   0,  1,  2,    4,  5,  6,    8,  9, 10,   12, 13, 14,
  16, 17, 18,   20,
  /* v_tbl1: 8 bytes from the first register, 8 from the second */
   5,  6,    8,  9, 10,   12, 13, 14,
  16, 17, 18,   20, 21, 22,   24, 25,
  /* v_tbl2: 4 bytes from the first register, 12 from the second */
  10,   12, 13, 14,
  16, 17, 18,   20, 21, 22,   24, 25, 26,   28, 29, 30,
};
/* clang-format on */

/* Native AArch64 hook for packing a w1 polynomial with 6-bit coefficients.
 * Calls mld_polyw1_pack_88_asm with the constant table above; the assembly
 * routine returns void, so the result is always MLD_NATIVE_FUNC_SUCCESS. */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
{
  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
  return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
*/

MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_poly_pointwise_montgomery_native(
int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
Expand Down
6 changes: 6 additions & 0 deletions dev/aarch64_clean/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
const uint8_t *indices);

/* Assembly routine (polyw1_pack_32_asm.S): packs 256 coefficients, each in
 * [0, 15] and stored one per 32-bit word of a, into 128 bytes at r, with
 * r[i] = a[2i] | (a[2i+1] << 4). */
#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a);

/* Assembly routine (polyw1_pack_88_asm.S): packs 256 coefficients, each in
 * [0, 43] and stored one per 32-bit word of a, into 192 bytes at r (four
 * 6-bit coefficients per 3 output bytes). table must point to 64 bytes of
 * constants: 16 bytes of per-lane shift counts followed by three 16-byte
 * rows of TBL byte selectors. */
#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table);

#define mld_poly_pointwise_montgomery_asm \
MLD_NAMESPACE(poly_pointwise_montgomery_asm)
void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,
Expand Down
112 changes: 112 additions & 0 deletions dev/aarch64_clean/src/polyw1_pack_32_asm.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
/* simpasm: header-end */

/*
* polyw1_pack_32: Pack w1 polynomial for GAMMA2 = (Q-1)/32.
*
* Each coefficient is in [0, 15] (4 bits) stored in a 32-bit word.
* Pack 2 coefficients per byte: r[i] = a[2i] | (a[2i+1] << 4)
* 256 coefficients -> 128 output bytes.
*
* UZP1 narrowing chain (32->16->8 bit) extracts the low byte from
* each coefficient; UZP1/UZP2 separate even/odd coefficients;
* SLI shifts and inserts the odd nibbles.
*
* 4x unrolled, 2 iterations for 256 coefficients.
*/

/* Register roles: x0 = output byte pointer, x1 = input coefficient pointer,
 * x2 = remaining loop iterations. Vector usage is limited to v0-v7 (scratch)
 * and v16-v19 (packed results); v8-v15 are never touched, so nothing has to
 * be saved or restored. */
output .req x0
input .req x1
count .req x2

.text
.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
.balign 4
MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)

/* 256 coefficients in total, 4 blocks of 32 coefficients per iteration
 * => 2 iterations. */
mov count, #(256 / (32 * 4))

polyw1_pack_32_loop:

/* Coefficient ranges named in the block comments below are relative to the
 * start of the current iteration (128 coefficients = 512 input bytes). */

/* Block 0: coefficients 0-31 */
/* The first load post-increments input by the full 512 bytes consumed in
 * this iteration, so every later load addresses its data as
 * (byte offset within the iteration) - 512. */
ldp q0, q1, [input], #512
ldp q2, q3, [input, #(32 - 512)]
ldp q4, q5, [input, #(64 - 512)]
ldp q6, q7, [input, #(96 - 512)]
/* 32 -> 16 bit: keep the even-numbered halfwords, i.e. the low half of each
 * 32-bit coefficient (8 coefficients per result register). */
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
uzp1 v4.8h, v4.8h, v5.8h
uzp1 v6.8h, v6.8h, v7.8h
/* 16 -> 8 bit: keep the even-numbered bytes, i.e. the low byte of each
 * coefficient. v0 = coefficients 0-15, v4 = coefficients 16-31. */
uzp1 v0.16b, v0.16b, v2.16b
uzp1 v4.16b, v4.16b, v6.16b
/* Split into even-indexed (v16) and odd-indexed (v0) coefficients. */
uzp1 v16.16b, v0.16b, v4.16b
uzp2 v0.16b, v0.16b, v4.16b
/* Shift each odd coefficient left by 4 and insert it above the low nibble
 * of the matching even coefficient: one output byte per coefficient pair. */
sli v16.16b, v0.16b, #4

/* Block 1: coefficients 32-63 */
/* Same narrowing and nibble-merge sequence as block 0; result in v17. */
ldp q0, q1, [input, #(128 - 512)]
ldp q2, q3, [input, #(160 - 512)]
ldp q4, q5, [input, #(192 - 512)]
ldp q6, q7, [input, #(224 - 512)]
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
uzp1 v4.8h, v4.8h, v5.8h
uzp1 v6.8h, v6.8h, v7.8h
uzp1 v0.16b, v0.16b, v2.16b
uzp1 v4.16b, v4.16b, v6.16b
uzp1 v17.16b, v0.16b, v4.16b
uzp2 v0.16b, v0.16b, v4.16b
sli v17.16b, v0.16b, #4

/* Block 2: coefficients 64-95 */
/* Same sequence again; result in v18. */
ldp q0, q1, [input, #(256 - 512)]
ldp q2, q3, [input, #(288 - 512)]
ldp q4, q5, [input, #(320 - 512)]
ldp q6, q7, [input, #(352 - 512)]
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
uzp1 v4.8h, v4.8h, v5.8h
uzp1 v6.8h, v6.8h, v7.8h
uzp1 v0.16b, v0.16b, v2.16b
uzp1 v4.16b, v4.16b, v6.16b
uzp1 v18.16b, v0.16b, v4.16b
uzp2 v0.16b, v0.16b, v4.16b
sli v18.16b, v0.16b, #4

/* Block 3: coefficients 96-127 */
/* Same sequence again; result in v19. */
ldp q0, q1, [input, #(384 - 512)]
ldp q2, q3, [input, #(416 - 512)]
ldp q4, q5, [input, #(448 - 512)]
ldp q6, q7, [input, #(480 - 512)]
uzp1 v0.8h, v0.8h, v1.8h
uzp1 v2.8h, v2.8h, v3.8h
uzp1 v4.8h, v4.8h, v5.8h
uzp1 v6.8h, v6.8h, v7.8h
uzp1 v0.16b, v0.16b, v2.16b
uzp1 v4.16b, v4.16b, v6.16b
uzp1 v19.16b, v0.16b, v4.16b
uzp2 v0.16b, v0.16b, v4.16b
sli v19.16b, v0.16b, #4

/* Write the four 16-byte results (64 packed bytes for 128 coefficients)
 * and advance the output pointer past them. */
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [output], #64

subs count, count, #1
bne polyw1_pack_32_loop

ret

.unreq output
.unreq input
.unreq count
/* simpasm: footer-start */
#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87) */
126 changes: 126 additions & 0 deletions dev/aarch64_clean/src/polyw1_pack_88_asm.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
/* simpasm: header-end */

/*
* polyw1_pack_88: Pack w1 polynomial for GAMMA2 = (Q-1)/88.
*
* Each coefficient is in [0, 43] (6 bits) stored in a 32-bit word.
* Pack 4 coefficients into 3 bytes:
* r[3i+0] = a[4i+0] | (a[4i+1] << 6)
* r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4)
* r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2)
* 256 coefficients -> 192 output bytes.
*
* Each group of 4 coefficients in a .4s vector is shifted to its
* bit position using USHL, then reduced with ADDP to form one
* 24-bit packed value per 32-bit lane.
*
* Three 2-register TBL instructions then extract the useful 3 bytes
* from each 32-bit lane across pairs of adjacent result vectors,
* producing 3 contiguous 16-byte output vectors (48 bytes total).
*
* 4x unrolled, 4 iterations for 256 coefficients.
*/

/* Register roles: x0 = output byte pointer, x1 = input coefficient pointer,
 * x2 = pointer to the 64-byte constant table, x3 = remaining loop
 * iterations. */
output .req x0
input .req x1
table .req x2
count .req x3

/* v24-v27 hold the table constants for the whole function. Working
 * registers are v0-v3 (scratch), v16-v19 (packed 24-bit values) and v20-v22
 * (compacted output); v8-v15 are never touched. */
v_shifts .req v24
v_tbl0 .req v25
v_tbl1 .req v26
v_tbl2 .req v27

.text
.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm)
.balign 4
MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm)

/* Load constants from table pointer (x2):
 * [0:15] = v_shifts.4s = {0, 6, 12, 18}
 * [16:31] = v_tbl0: TBL indices for out0 from {v16, v17}
 * [32:47] = v_tbl1: TBL indices for out1 from {v17, v18}
 * [48:63] = v_tbl2: TBL indices for out2 from {v18, v19} */
/* q24..q27 are the same registers as the v_shifts/v_tbl0/v_tbl1/v_tbl2
 * aliases declared above. */
ldp q24, q25, [table]
ldp q26, q27, [table, #32]

/* 256 coefficients in total, 4 blocks of 16 coefficients per iteration
 * => 4 iterations. */
mov count, #(256 / (16 * 4))

polyw1_pack_88_loop:

/* Coefficient ranges named in the block comments below are relative to the
 * start of the current iteration (64 coefficients = 256 input bytes). */

/* Block 0: coefficients 0-15 */
/* The first load post-increments input by the full 256 bytes consumed in
 * this iteration, so every later load addresses its data as
 * (byte offset within the iteration) - 256. */
ldp q0, q1, [input], #256
ldp q2, q3, [input, #(32 - 256)]
/* Move the four coefficients of each register to bit offsets 0, 6, 12, 18. */
ushl v0.4s, v0.4s, v_shifts.4s
ushl v1.4s, v1.4s, v_shifts.4s
ushl v2.4s, v2.4s, v_shifts.4s
ushl v3.4s, v3.4s, v_shifts.4s
/* Two rounds of pairwise adds sum the four lanes of each input register
 * into a single lane. For coefficients in [0, 43] the shifted fields do not
 * overlap, so the sum equals the bitwise OR: lane k of v16 ends up holding
 * the 24-bit packing of coefficients 4k..4k+3. */
addp v0.4s, v0.4s, v1.4s
addp v2.4s, v2.4s, v3.4s
addp v16.4s, v0.4s, v2.4s

/* Block 1: coefficients 16-31 */
/* Same shift-and-add sequence as block 0; result in v17. */
ldp q0, q1, [input, #(64 - 256)]
ldp q2, q3, [input, #(96 - 256)]
ushl v0.4s, v0.4s, v_shifts.4s
ushl v1.4s, v1.4s, v_shifts.4s
ushl v2.4s, v2.4s, v_shifts.4s
ushl v3.4s, v3.4s, v_shifts.4s
addp v0.4s, v0.4s, v1.4s
addp v2.4s, v2.4s, v3.4s
addp v17.4s, v0.4s, v2.4s

/* Block 2: coefficients 32-47 */
/* Same sequence again; result in v18. */
ldp q0, q1, [input, #(128 - 256)]
ldp q2, q3, [input, #(160 - 256)]
ushl v0.4s, v0.4s, v_shifts.4s
ushl v1.4s, v1.4s, v_shifts.4s
ushl v2.4s, v2.4s, v_shifts.4s
ushl v3.4s, v3.4s, v_shifts.4s
addp v0.4s, v0.4s, v1.4s
addp v2.4s, v2.4s, v3.4s
addp v18.4s, v0.4s, v2.4s

/* Block 3: coefficients 48-63 */
/* Same sequence again; result in v19. */
ldp q0, q1, [input, #(192 - 256)]
ldp q2, q3, [input, #(224 - 256)]
ushl v0.4s, v0.4s, v_shifts.4s
ushl v1.4s, v1.4s, v_shifts.4s
ushl v2.4s, v2.4s, v_shifts.4s
ushl v3.4s, v3.4s, v_shifts.4s
addp v0.4s, v0.4s, v1.4s
addp v2.4s, v2.4s, v3.4s
addp v19.4s, v0.4s, v2.4s

/* Compact + splice into 3 output vectors */
/* v16-v19 each carry 12 useful bytes (3 per 32-bit lane). Each TBL reads a
 * 32-byte pair of adjacent result registers and gathers the bytes selected
 * by its index row, so the 48 useful bytes come out contiguous. */
tbl v20.16b, {v16.16b, v17.16b}, v_tbl0.16b
tbl v21.16b, {v17.16b, v18.16b}, v_tbl1.16b
tbl v22.16b, {v18.16b, v19.16b}, v_tbl2.16b

/* Write 48 packed bytes (64 coefficients) and advance the output pointer. */
st1 {v20.16b, v21.16b, v22.16b}, [output], #48

subs count, count, #1
bne polyw1_pack_88_loop

ret

.unreq output
.unreq input
.unreq table
.unreq count
.unreq v_shifts
.unreq v_tbl0
.unreq v_tbl1
.unreq v_tbl2
/* simpasm: footer-start */
#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
*/
40 changes: 40 additions & 0 deletions dev/aarch64_opt/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#define MLD_USE_NATIVE_POLY_CHKNORM
#define MLD_USE_NATIVE_POLYZ_UNPACK_17
#define MLD_USE_NATIVE_POLYZ_UNPACK_19
#define MLD_USE_NATIVE_POLYW1_PACK_32
#define MLD_USE_NATIVE_POLYW1_PACK_88
#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
Expand Down Expand Up @@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87 */

#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
/* Native AArch64 hook for the "32" variant of w1 packing. Forwards r and a
 * unchanged to mld_polyw1_pack_32_asm.
 *
 * The assembly routine returns void and has no failure path, so this wrapper
 * always reports MLD_NATIVE_FUNC_SUCCESS.
 *
 * NOTE(review): presumably a holds 256 coefficients in [0, 15] and r receives
 * 128 packed bytes -- confirm against polyw1_pack_32_asm.S of this backend. */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
{
mld_polyw1_pack_32_asm(r, a);
return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87 */

#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
/* Constants handed to mld_polyw1_pack_88_asm, organised as four 16-byte rows:
 *
 *   bytes  0..15  v_shifts: USHL shift counts {0, 6, 12, 18}, one per .4s lane
 *   bytes 16..31  v_tbl0:   TBL byte selectors for out0, taken from {v16, v17}
 *   bytes 32..47  v_tbl1:   TBL byte selectors for out1, taken from {v17, v18}
 *   bytes 48..63  v_tbl2:   TBL byte selectors for out2, taken from {v18, v19}
 *
 * In each selector row, values 0..15 address the first register of the pair
 * and 16..31 the second. No selector ever names byte 3 of a 32-bit lane
 * (3, 7, 11, ...): only the three low bytes of every lane are kept. */
/* clang-format off */
MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
  /* v_shifts: four little-endian uint32_t lanes */
   0, 0, 0, 0,                                       /* lane 0: shift by 0  */
   6, 0, 0, 0,                                       /* lane 1: shift by 6  */
  12, 0, 0, 0,                                       /* lane 2: shift by 12 */
  18, 0, 0, 0,                                       /* lane 3: shift by 18 */
  /* v_tbl0: 12 bytes from the first register, 4 from the second */
   0,  1,  2,    4,  5,  6,    8,  9, 10,   12, 13, 14,
  16, 17, 18,   20,
  /* v_tbl1: 8 bytes from the first register, 8 from the second */
   5,  6,    8,  9, 10,   12, 13, 14,
  16, 17, 18,   20, 21, 22,   24, 25,
  /* v_tbl2: 4 bytes from the first register, 12 from the second */
  10,   12, 13, 14,
  16, 17, 18,   20, 21, 22,   24, 25, 26,   28, 29, 30,
};
/* clang-format on */

/* Native AArch64 hook for the "88" variant of w1 packing. Calls
 * mld_polyw1_pack_88_asm with the constant table above; the assembly routine
 * returns void, so the result is always MLD_NATIVE_FUNC_SUCCESS. */
MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
{
  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
  return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
*/

MLD_MUST_CHECK_RETURN_VALUE
static MLD_INLINE int mld_poly_pointwise_montgomery_native(
int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
Expand Down
6 changes: 6 additions & 0 deletions dev/aarch64_opt/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
const uint8_t *indices);

/* Assembly routine for the "32" variant of w1 packing: reads 32-bit
 * coefficients from a and writes packed bytes to r.
 * NOTE(review): presumably 256 coefficients in [0, 15] -> 128 bytes, two
 * coefficients per byte -- confirm against this backend's .S file. */
#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a);

/* Assembly routine for the "88" variant of w1 packing: reads 32-bit
 * coefficients from a and writes packed bytes to r, using the constants
 * that table points to.
 * NOTE(review): presumably 256 coefficients in [0, 43] -> 192 bytes, with
 * table laid out as 16 bytes of shift counts followed by three 16-byte rows
 * of TBL selectors -- confirm against this backend's .S file. */
#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table);

#define mld_poly_pointwise_montgomery_asm \
MLD_NAMESPACE(poly_pointwise_montgomery_asm)
void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,
Expand Down
Loading
Loading