pq-code-package · mkannwischer · Feb 21, 2026 · Feb 21, 2026
@@ -225,6 +225,8 @@ source code and documentation.
   - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
   - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
   - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
+  - [dev/x86_64/src/polyw1_pack_32_avx2.c](dev/x86_64/src/polyw1_pack_32_avx2.c)
+  - [dev/x86_64/src/polyw1_pack_88_avx2.c](dev/x86_64/src/polyw1_pack_88_avx2.c)
   - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
   - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
   - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
@@ -243,6 +245,8 @@ source code and documentation.
   - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
+  - [mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c)
+  - [mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
   - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)

@@ -21,6 +21,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{ /* Native hook for the 4-bit w1 packing: hands both pointers unchanged
+   * to mld_polyw1_pack_32_asm, which reads the coefficients from `a` and
+   * stores the packed bytes through `r`. The assembly routine returns
+   * nothing, so this wrapper always reports MLD_NATIVE_FUNC_SUCCESS. */
+  mld_polyw1_pack_32_asm(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Table of constants for polyw1_pack_88_asm:
+ *   [0:15]  v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s
+ *   [16:31] v_tbl0: TBL indices for out0 from {v16, v17}
+ *   [32:47] v_tbl1: TBL indices for out1 from {v17, v18}
+ *   [48:63] v_tbl2: TBL indices for out2 from {v18, v19}
+ *
+ * The bracketed ranges are byte offsets into the 64-byte table, i.e. four
+ * rows of 16 bytes each.
+ *
+ * In the three TBL rows, indices 0-15 address the first register of the
+ * named pair and 16-31 the second. No row contains an index with
+ * (index % 4) == 3: those are the top bytes of the 32-bit lanes, and
+ * only the low three bytes of each lane are copied to the output.
+ *
+ * NOTE(review): the table is `static` at file scope, so every translation
+ * unit that includes this file gets its own copy -- confirm that this is
+ * intended. */
+/* clang-format off */
+MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
+  /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */
+  0, 0, 0, 0,  6, 0, 0, 0,  12, 0, 0, 0,  18, 0, 0, 0,
+  /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */
+  0, 1, 2, 4,  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,
+  /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */
+  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,
+  /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */
+  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,  26, 28, 29, 30,
+};
+/* clang-format on */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{ /* Native hook for the 6-bit w1 packing: forwards `r` and `a` to
+   * mld_polyw1_pack_88_asm together with the constant table defined
+   * above, which the routine expects as its third argument. The assembly
+   * routine returns nothing, so this wrapper always reports
+   * MLD_NATIVE_FUNC_SUCCESS. */
+  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],

@@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
 void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
                              const uint8_t *indices);
 
+#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
+void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a); /*
+ * Assembly routine, declared here for the C wrappers.
+ *   r: destination of the packed bytes
+ *   a: source coefficients, one per 32-bit word (read only) */
+
+#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
+void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table); /*
+ * Assembly routine, declared here for the C wrappers.
+ *   r:     destination of the packed bytes
+ *   a:     source coefficients, one per 32-bit word (read only)
+ *   table: constants consumed by the routine (read only) */
+
 #define mld_poly_pointwise_montgomery_asm \
   MLD_NAMESPACE(poly_pointwise_montgomery_asm)
 void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,

@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+            MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/* simpasm: header-end */
+
+/*
+ * polyw1_pack_32: Pack w1 polynomial for GAMMA2 = (Q-1)/32.
+ *
+ * Each coefficient is in [0, 15] (4 bits) stored in a 32-bit word.
+ * Pack 2 coefficients per byte: r[i] = a[2i] | (a[2i+1] << 4)
+ * 256 coefficients -> 128 output bytes.
+ *
+ * UZP1 narrowing chain (32->16->8 bit) extracts the low byte from
+ * each coefficient; UZP1/UZP2 separate even/odd coefficients;
+ * SLI shifts and inserts the odd nibbles.
+ *
+ * 4x unrolled, 2 iterations for 256 coefficients.
+ * Each iteration reads 512 input bytes (4 blocks x 32 words) and
+ * writes 64 output bytes.
+ *
+ * Arguments:
+ *   x0 (output): destination for the 128 packed bytes
+ *   x1 (input):  256 coefficients as 32-bit words (1024 bytes)
+ *
+ * Registers written: x0, x1, x2, v0-v7, v16-v19, and the condition
+ * flags (via SUBS).
+ */
+
+        output          .req x0
+        input           .req x1
+        count           .req x2
+
+.text
+.global MLD_ASM_NAMESPACE(polyw1_pack_32_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyw1_pack_32_asm)
+
+        mov count, #(256 / (32 * 4)) /* = 2 iterations of 128 coefficients */
+
+polyw1_pack_32_loop:
+
+        /* Block 0: coefficients 0-31 of this iteration.
+         *
+         * The first LDP post-increments input by 512, the number of
+         * input bytes consumed per iteration; every later load in the
+         * iteration therefore uses an offset of the form (k - 512) to
+         * reach byte k of the iteration's input.
+         *
+         * uzp1 .8h  : keep the low halfword of each 32-bit word
+         * uzp1 .16b : keep the low byte of each halfword
+         * uzp1/uzp2 : split into even- and odd-indexed coefficients
+         * sli #4    : byte = (odd << 4) | (even & 0x0f)
+         *
+         * Result: 16 packed bytes in v16. */
+        ldp q0, q1, [input], #512
+        ldp q2, q3, [input, #(32 - 512)]
+        ldp q4, q5, [input, #(64 - 512)]
+        ldp q6, q7, [input, #(96 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v16.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v16.16b, v0.16b, #4
+
+        /* Block 1: coefficients 32-63 of this iteration.
+         * Same sequence as block 0; result in v17. */
+        ldp q0, q1, [input, #(128 - 512)]
+        ldp q2, q3, [input, #(160 - 512)]
+        ldp q4, q5, [input, #(192 - 512)]
+        ldp q6, q7, [input, #(224 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v17.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v17.16b, v0.16b, #4
+
+        /* Block 2: coefficients 64-95 of this iteration.
+         * Same sequence as block 0; result in v18. */
+        ldp q0, q1, [input, #(256 - 512)]
+        ldp q2, q3, [input, #(288 - 512)]
+        ldp q4, q5, [input, #(320 - 512)]
+        ldp q6, q7, [input, #(352 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v18.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v18.16b, v0.16b, #4
+
+        /* Block 3: coefficients 96-127 of this iteration.
+         * Same sequence as block 0; result in v19. */
+        ldp q0, q1, [input, #(384 - 512)]
+        ldp q2, q3, [input, #(416 - 512)]
+        ldp q4, q5, [input, #(448 - 512)]
+        ldp q6, q7, [input, #(480 - 512)]
+        uzp1 v0.8h, v0.8h, v1.8h
+        uzp1 v2.8h, v2.8h, v3.8h
+        uzp1 v4.8h, v4.8h, v5.8h
+        uzp1 v6.8h, v6.8h, v7.8h
+        uzp1 v0.16b, v0.16b, v2.16b
+        uzp1 v4.16b, v4.16b, v6.16b
+        uzp1 v19.16b, v0.16b, v4.16b
+        uzp2 v0.16b, v0.16b, v4.16b
+        sli v19.16b, v0.16b, #4
+
+        st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [output], #64
+
+        subs count, count, #1
+        bne polyw1_pack_32_loop
+
+        ret
+
+        .unreq output
+        .unreq input
+        .unreq count
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87) */
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
+           (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44)
+/* simpasm: header-end */
+
+/*
+ * polyw1_pack_88: Pack w1 polynomial for GAMMA2 = (Q-1)/88.
+ *
+ * Each coefficient is in [0, 43] (6 bits) stored in a 32-bit word.
+ * Pack 4 coefficients into 3 bytes:
+ *   r[3i+0] =  a[4i+0]       | (a[4i+1] << 6)
+ *   r[3i+1] = (a[4i+1] >> 2) | (a[4i+2] << 4)
+ *   r[3i+2] = (a[4i+2] >> 4) | (a[4i+3] << 2)
+ * 256 coefficients -> 192 output bytes.
+ *
+ * Each group of 4 coefficients in a .4s vector is shifted to its
+ * bit position using USHL, then reduced with ADDP to form one
+ * 24-bit packed value per 32-bit lane.
+ *
+ * Three 2-register TBL instructions then extract the useful 3 bytes
+ * from each 32-bit lane across pairs of adjacent result vectors,
+ * producing 3 contiguous 16-byte output vectors (48 bytes total).
+ *
+ * 4x unrolled, 4 iterations for 256 coefficients.
+ * Each iteration reads 256 input bytes (4 blocks x 16 words) and
+ * writes 48 output bytes.
+ *
+ * Arguments:
+ *   x0 (output): destination for the 192 packed bytes
+ *   x1 (input):  256 coefficients as 32-bit words (1024 bytes)
+ *   x2 (table):  64 bytes of constants, layout given at the load below
+ *
+ * Registers written: x0, x1, x3, v0-v3, v16-v22, v24-v27, and the
+ * condition flags (via SUBS).
+ */
+
+        output          .req x0
+        input           .req x1
+        table           .req x2
+        count           .req x3
+
+        v_shifts        .req v24
+        v_tbl0          .req v25
+        v_tbl1          .req v26
+        v_tbl2          .req v27
+
+.text
+.global MLD_ASM_NAMESPACE(polyw1_pack_88_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyw1_pack_88_asm)
+
+        /* Load constants from table pointer (x2):
+         * [0:15]  = v_shifts.4s = {0, 6, 12, 18}
+         * [16:31] = v_tbl0: TBL indices for out0 from {v16, v17}
+         * [32:47] = v_tbl1: TBL indices for out1 from {v17, v18}
+         * [48:63] = v_tbl2: TBL indices for out2 from {v18, v19}
+         *
+         * The constants are loop-invariant, so they are loaded once
+         * here rather than inside the loop. */
+        ldp q24, q25, [table]
+        ldp q26, q27, [table, #32]
+
+        mov count, #(256 / (16 * 4)) /* = 4 iterations of 64 coefficients */
+
+polyw1_pack_88_loop:
+
+        /* Block 0: coefficients 0-15 of this iteration.
+         *
+         * The first LDP post-increments input by 256, the number of
+         * input bytes consumed per iteration; every later load in the
+         * iteration therefore uses an offset of the form (k - 256) to
+         * reach byte k of the iteration's input.
+         *
+         * USHL moves the four words of each vector to bit positions
+         * 0, 6, 12 and 18. The two ADDP levels then sum the four lanes
+         * of each vector, leaving the sums for v0, v1, v2, v3 in lanes
+         * 0-3 of v16. With 6-bit coefficients the shifted fields do
+         * not overlap, so each sum equals the bitwise OR of its terms. */
+        ldp q0, q1, [input], #256
+        ldp q2, q3, [input, #(32 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v16.4s, v0.4s, v2.4s
+
+        /* Block 1: coefficients 16-31 of this iteration.
+         * Same sequence as block 0; result in v17. */
+        ldp q0, q1, [input, #(64 - 256)]
+        ldp q2, q3, [input, #(96 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v17.4s, v0.4s, v2.4s
+
+        /* Block 2: coefficients 32-47 of this iteration.
+         * Same sequence as block 0; result in v18. */
+        ldp q0, q1, [input, #(128 - 256)]
+        ldp q2, q3, [input, #(160 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v18.4s, v0.4s, v2.4s
+
+        /* Block 3: coefficients 48-63 of this iteration.
+         * Same sequence as block 0; result in v19. */
+        ldp q0, q1, [input, #(192 - 256)]
+        ldp q2, q3, [input, #(224 - 256)]
+        ushl v0.4s, v0.4s, v_shifts.4s
+        ushl v1.4s, v1.4s, v_shifts.4s
+        ushl v2.4s, v2.4s, v_shifts.4s
+        ushl v3.4s, v3.4s, v_shifts.4s
+        addp v0.4s, v0.4s, v1.4s
+        addp v2.4s, v2.4s, v3.4s
+        addp v19.4s, v0.4s, v2.4s
+
+        /* Compact + splice into 3 output vectors.
+         * Each of v16-v19 carries 12 useful bytes (the low 3 bytes of
+         * every 32-bit lane). The three TBLs skip each lane's top byte
+         * and join the remainder into 48 contiguous bytes in v20-v22. */
+        tbl v20.16b, {v16.16b, v17.16b}, v_tbl0.16b
+        tbl v21.16b, {v17.16b, v18.16b}, v_tbl1.16b
+        tbl v22.16b, {v18.16b, v19.16b}, v_tbl2.16b
+
+        st1 {v20.16b, v21.16b, v22.16b}, [output], #48
+
+        subs count, count, #1
+        bne polyw1_pack_88_loop
+
+        ret
+
+        .unreq output
+        .unreq input
+        .unreq table
+        .unreq count
+        .unreq v_shifts
+        .unreq v_tbl0
+        .unreq v_tbl1
+        .unreq v_tbl2
+/* simpasm: footer-start */
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
+        */
@@ -21,6 +21,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -198,6 +200,44 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *buf)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{ /* Native hook for the 4-bit w1 packing: hands both pointers unchanged
+   * to mld_polyw1_pack_32_asm, which reads the coefficients from `a` and
+   * stores the packed bytes through `r`. The assembly routine returns
+   * nothing, so this wrapper always reports MLD_NATIVE_FUNC_SUCCESS. */
+  mld_polyw1_pack_32_asm(r, a);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Table of constants for polyw1_pack_88_asm:
+ *   [0:15]  v_shifts: USHL shift amounts {0, 6, 12, 18} as .4s
+ *   [16:31] v_tbl0: TBL indices for out0 from {v16, v17}
+ *   [32:47] v_tbl1: TBL indices for out1 from {v17, v18}
+ *   [48:63] v_tbl2: TBL indices for out2 from {v18, v19}
+ *
+ * The bracketed ranges are byte offsets into the 64-byte table, i.e. four
+ * rows of 16 bytes each.
+ *
+ * In the three TBL rows, indices 0-15 address the first register of the
+ * named pair and 16-31 the second. No row contains an index with
+ * (index % 4) == 3: those are the top bytes of the 32-bit lanes, and
+ * only the low three bytes of each lane are copied to the output.
+ *
+ * NOTE(review): the table is `static` at file scope, so every translation
+ * unit that includes this file gets its own copy -- confirm that this is
+ * intended. */
+/* clang-format off */
+MLD_ALIGN static const uint8_t mld_polyw1_pack_88_consts[] = {
+  /* v_shifts: {0, 6, 12, 18} as uint32_t little-endian */
+  0, 0, 0, 0,  6, 0, 0, 0,  12, 0, 0, 0,  18, 0, 0, 0,
+  /* v_tbl0: {0,1,2, 4,5,6, 8,9,10, 12,13,14, 16,17,18, 20} */
+  0, 1, 2, 4,  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,
+  /* v_tbl1: {5,6, 8,9,10, 12,13,14, 16,17,18, 20,21,22, 24,25} */
+  5, 6, 8, 9,  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,
+  /* v_tbl2: {10, 12,13,14, 16,17,18, 20,21,22, 24,25,26, 28,29,30} */
+  10, 12, 13, 14,  16, 17, 18, 20,  21, 22, 24, 25,  26, 28, 29, 30,
+};
+/* clang-format on */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{ /* Native hook for the 6-bit w1 packing: forwards `r` and `a` to
+   * mld_polyw1_pack_88_asm together with the constant table defined
+   * above, which the routine expects as its third argument. The assembly
+   * routine returns nothing, so this wrapper always reports
+   * MLD_NATIVE_FUNC_SUCCESS. */
+  mld_polyw1_pack_88_asm(r, a, mld_polyw1_pack_88_consts);
+  return MLD_NATIVE_FUNC_SUCCESS;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],

@@ -105,6 +105,12 @@ void mld_polyz_unpack_17_asm(int32_t *r, const uint8_t *buf,
 void mld_polyz_unpack_19_asm(int32_t *r, const uint8_t *buf,
                              const uint8_t *indices);
 
+#define mld_polyw1_pack_32_asm MLD_NAMESPACE(polyw1_pack_32_asm)
+void mld_polyw1_pack_32_asm(uint8_t *r, const int32_t *a); /*
+ * Assembly routine, declared here for the C wrappers.
+ *   r: destination of the packed bytes
+ *   a: source coefficients, one per 32-bit word (read only) */
+
+#define mld_polyw1_pack_88_asm MLD_NAMESPACE(polyw1_pack_88_asm)
+void mld_polyw1_pack_88_asm(uint8_t *r, const int32_t *a, const uint8_t *table); /*
+ * Assembly routine, declared here for the C wrappers.
+ *   r:     destination of the packed bytes
+ *   a:     source coefficients, one per 32-bit word (read only)
+ *   table: constants consumed by the routine (read only) */
+
 #define mld_poly_pointwise_montgomery_asm \
   MLD_NAMESPACE(poly_pointwise_montgomery_asm)
 void mld_poly_pointwise_montgomery_asm(int32_t *, const int32_t *,