x86_64: Add AVX2 polyw1_pack to x86_64 native backend

mkannwischer · mkannwischer · commit 1bb503b58566 · 2026-02-21T15:19:16.000+08:00
Integrate polyw1_pack AVX2 implementations for both GAMMA2 variants
into the native backend.

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md
@@ -225,6 +225,8 @@ source code and documentation.
   - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
   - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
   - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
+  - [dev/x86_64/src/polyw1_pack_32_avx2.c](dev/x86_64/src/polyw1_pack_32_avx2.c)
+  - [dev/x86_64/src/polyw1_pack_88_avx2.c](dev/x86_64/src/polyw1_pack_88_avx2.c)
   - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
   - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
   - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
@@ -243,6 +245,8 @@ source code and documentation.
   - [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
   - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
+  - [mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c)
+  - [mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
   - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
   - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)
diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h
@@ -25,6 +25,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/* Dispatch polyw1_pack for GAMMA2 = (Q-1)/32 to the AVX2 kernel.
+ * If AVX2 is not available, r is left untouched and
+ * MLD_NATIVE_FUNC_FALLBACK is returned. */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    mld_polyw1_pack_32_avx2(r, a);
+    return MLD_NATIVE_FUNC_SUCCESS;
+  }
+  return MLD_NATIVE_FUNC_FALLBACK;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Dispatch polyw1_pack for GAMMA2 = (Q-1)/88 to the AVX2 kernel.
+ * If AVX2 is not available, r is left untouched and
+ * MLD_NATIVE_FUNC_FALLBACK is returned. */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    mld_polyw1_pack_88_avx2(r, a);
+    return MLD_NATIVE_FUNC_SUCCESS;
+  }
+  return MLD_NATIVE_FUNC_FALLBACK;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h
@@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
 #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
 void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
 
+/* AVX2 w1 packing for GAMMA2 = (Q-1)/32; see polyw1_pack_32_avx2.c */
+#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2)
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a);
+
+/* AVX2 w1 packing for GAMMA2 = (Q-1)/88; see polyw1_pack_88_avx2.c */
+#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2)
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a);
+
 #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
 void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
                         const int32_t *qdata);
diff --git a/dev/x86_64/src/polyw1_pack_32_avx2.c b/dev/x86_64/src/polyw1_pack_32_avx2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
+
+#include <immintrin.h>
+#include "arith_native_x86_64.h"
+
+/*
+ * Pack the w1 polynomial for GAMMA2 = (Q-1)/32.
+ *
+ * Coefficients are expected to lie in [0,15]; two of them are stored per
+ * output byte: r[k] = a[2k] | (a[2k+1] << 4).
+ *
+ * Arguments:   - uint8_t *r: output byte array; MLDSA_N / 2 bytes are
+ *                written using unaligned stores
+ *              - const int32_t *a: MLDSA_N input coefficients; must be
+ *                32-byte aligned because aligned 256-bit loads are used
+ *
+ * Each loop iteration consumes 64 coefficients and emits 32 bytes.
+ */
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a)
+{
+  unsigned int i;
+  /* maddubs weights: byte pair (lo, hi) -> lo + 16 * hi */
+  const __m256i shift = _mm256_set1_epi16((16 << 8) + 1);
+  /* Per-lane byte shuffle restoring the natural byte order after the
+   * lane-wise pack instructions and the 64-bit permute below. */
+  const __m256i shufbidx =
+      _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, 15,
+                      14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+
+  for (i = 0; i < MLDSA_N / 64; ++i)
+  {
+    /* Keep the const qualifier of `a` in the load casts. */
+    __m256i f0 = _mm256_load_si256((const __m256i *)&a[64 * i + 0]);
+    __m256i f1 = _mm256_load_si256((const __m256i *)&a[64 * i + 8]);
+    __m256i f2 = _mm256_load_si256((const __m256i *)&a[64 * i + 16]);
+    __m256i f3 = _mm256_load_si256((const __m256i *)&a[64 * i + 24]);
+    __m256i f4 = _mm256_load_si256((const __m256i *)&a[64 * i + 32]);
+    __m256i f5 = _mm256_load_si256((const __m256i *)&a[64 * i + 40]);
+    __m256i f6 = _mm256_load_si256((const __m256i *)&a[64 * i + 48]);
+    __m256i f7 = _mm256_load_si256((const __m256i *)&a[64 * i + 56]);
+    /* Narrow 32-bit -> 16-bit -> 8-bit (per 128-bit lane). */
+    f0 = _mm256_packus_epi32(f0, f1);
+    f1 = _mm256_packus_epi32(f2, f3);
+    f2 = _mm256_packus_epi32(f4, f5);
+    f3 = _mm256_packus_epi32(f6, f7);
+    f0 = _mm256_packus_epi16(f0, f1);
+    f1 = _mm256_packus_epi16(f2, f3);
+    /* Merge adjacent coefficients into one 8-bit value held in 16 bits. */
+    f0 = _mm256_maddubs_epi16(f0, shift);
+    f1 = _mm256_maddubs_epi16(f1, shift);
+    f0 = _mm256_packus_epi16(f0, f1);
+    /* Undo the lane interleaving introduced by the pack instructions. */
+    f0 = _mm256_permute4x64_epi64(f0, 0xD8);
+    f0 = _mm256_shuffle_epi8(f0, shufbidx);
+    _mm256_storeu_si256((__m256i *)&r[32 * i], f0);
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+         65 || MLD_CONFIG_PARAMETER_SET == 87) */
+
+MLD_EMPTY_CU(avx2_polyw1_pack_32)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                                \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                                  \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87)) */
diff --git a/dev/x86_64/src/polyw1_pack_88_avx2.c b/dev/x86_64/src/polyw1_pack_88_avx2.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/* References
+ * ==========
+ *
+ * - [REF_AVX2]
+ *   CRYSTALS-Dilithium optimized AVX2 implementation
+ *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
+ *   https://github.com/pq-crystals/dilithium/tree/master/avx2
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) &&   \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) &&   \
+    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+     MLD_CONFIG_PARAMETER_SET == 44)
+
+#include <immintrin.h>
+#include "arith_native_x86_64.h"
+
+/*
+ * Pack the w1 polynomial for GAMMA2 = (Q-1)/88.
+ *
+ * Coefficients are expected to lie in [0,43] and are stored in 6 bits each;
+ * four coefficients a[4k..4k+3] form the 24-bit little-endian value
+ *   a[4k] | (a[4k+1] << 6) | (a[4k+2] << 12) | (a[4k+3] << 18)
+ * written to r[3k..3k+2].
+ *
+ * Arguments:   - uint8_t *r: output byte array; 3 * MLDSA_N / 4 bytes are
+ *                written using unaligned stores
+ *              - const int32_t *a: MLDSA_N input coefficients; must be
+ *                32-byte aligned because aligned 256-bit loads are used
+ *
+ * Each loop iteration consumes 32 coefficients and emits 24 bytes.
+ */
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a)
+{
+  unsigned int i;
+  /* maddubs weights: byte pair (lo, hi) -> lo + 64 * hi (12 bits) */
+  const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1);
+  /* madd weights: 16-bit pair (lo, hi) -> lo + 4096 * hi (24 bits) */
+  const __m256i shift2 = _mm256_set1_epi32(((1 << 12) << 16) + 1);
+  /* Restores the natural dword order after the lane-wise packs. */
+  const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+  /* Moves the 12 valid bytes of each lane next to each other. */
+  const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0);
+  /* Drops the unused top byte of every dword within each lane. */
+  const __m256i shufbidx =
+      _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
+                      -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
+
+  for (i = 0; i < MLDSA_N / 32; i++)
+  {
+    /* Keep the const qualifier of `a` in the load casts. */
+    __m256i f0 = _mm256_load_si256((const __m256i *)&a[32 * i + 0]);
+    __m256i f1 = _mm256_load_si256((const __m256i *)&a[32 * i + 8]);
+    __m256i f2 = _mm256_load_si256((const __m256i *)&a[32 * i + 16]);
+    __m256i f3 = _mm256_load_si256((const __m256i *)&a[32 * i + 24]);
+    /* Narrow 32-bit -> 16-bit -> 8-bit (per 128-bit lane). */
+    f0 = _mm256_packus_epi32(f0, f1);
+    f1 = _mm256_packus_epi32(f2, f3);
+    f0 = _mm256_packus_epi16(f0, f1);
+    /* Merge 4 coefficients into one 24-bit value per 32-bit element. */
+    f0 = _mm256_maddubs_epi16(f0, shift1);
+    f0 = _mm256_madd_epi16(f0, shift2);
+    /* Reorder elements and squeeze out the unused bytes. */
+    f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1);
+    f0 = _mm256_shuffle_epi8(f0, shufbidx);
+    f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2);
+
+    /* Each iteration produces 24 valid bytes in the low 192 bits.
+     * Store as 128-bit + 64-bit to avoid writing past the output buffer. */
+    {
+      __m128i lo = _mm256_castsi256_si128(f0);
+      __m128i hi = _mm256_extracti128_si256(f0, 1);
+      _mm_storeu_si128((__m128i *)&r[24 * i], lo);
+      _mm_storel_epi64((__m128i *)&r[24 * i + 16], hi);
+    }
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+         && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+         44) */
+
+MLD_EMPTY_CU(avx2_polyw1_pack_88)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&                             \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED &&                               \
+          (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
+          44)) */
diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c
@@ -88,6 +88,8 @@
 #include "src/native/x86_64/src/poly_decompose_88_avx2.c"
 #include "src/native/x86_64/src/poly_use_hint_32_avx2.c"
 #include "src/native/x86_64/src/poly_use_hint_88_avx2.c"
+#include "src/native/x86_64/src/polyw1_pack_32_avx2.c"
+#include "src/native/x86_64/src/polyw1_pack_88_avx2.c"
 #include "src/native/x86_64/src/polyz_unpack_17_avx2.c"
 #include "src/native/x86_64/src/polyz_unpack_19_avx2.c"
 #include "src/native/x86_64/src/rej_uniform_avx2.c"
@@ -672,6 +674,8 @@
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYW1_PACK_32
+#undef MLD_USE_NATIVE_POLYW1_PACK_88
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19
 #undef MLD_USE_NATIVE_POLY_CADDQ
@@ -701,6 +705,8 @@
 #undef mld_poly_decompose_88_avx2
 #undef mld_poly_use_hint_32_avx2
 #undef mld_poly_use_hint_88_avx2
+#undef mld_polyw1_pack_32_avx2
+#undef mld_polyw1_pack_88_avx2
 #undef mld_polyz_unpack_17_avx2
 #undef mld_polyz_unpack_19_avx2
 #undef mld_rej_uniform_avx2
diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S
@@ -675,6 +675,8 @@
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
 #undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
+#undef MLD_USE_NATIVE_POLYW1_PACK_32
+#undef MLD_USE_NATIVE_POLYW1_PACK_88
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_17
 #undef MLD_USE_NATIVE_POLYZ_UNPACK_19
 #undef MLD_USE_NATIVE_POLY_CADDQ
@@ -704,6 +706,8 @@
 #undef mld_poly_decompose_88_avx2
 #undef mld_poly_use_hint_32_avx2
 #undef mld_poly_use_hint_88_avx2
+#undef mld_polyw1_pack_32_avx2
+#undef mld_polyw1_pack_88_avx2
 #undef mld_polyz_unpack_17_avx2
 #undef mld_polyz_unpack_19_avx2
 #undef mld_rej_uniform_avx2
diff --git a/mldsa/src/native/api.h b/mldsa/src/native/api.h
@@ -498,6 +498,57 @@ __contract__(
           || MLD_CONFIG_PARAMETER_SET == 87 */
 #endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */
 
+#if defined(MLD_USE_NATIVE_POLYW1_PACK_32)
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/*************************************************
+ * Name:        mld_polyw1_pack_32_native
+ *
+ * Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/32.
+ *              Bit-pack polynomial w1 with coefficients in [0, 15],
+ *              packing 2 nibbles per byte.
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                (MLDSA_POLYW1_PACKEDBYTES bytes)
+ *              - const int32_t *a: pointer to input polynomial coefficients
+ *                (MLDSA_N coefficients)
+ *
+ * Returns:     - MLD_NATIVE_FUNC_SUCCESS if the native implementation
+ *                performed the packing
+ *              - MLD_NATIVE_FUNC_FALLBACK if it did not (e.g. a required
+ *                CPU feature is unavailable); presumably the caller must
+ *                then use the default implementation
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
+  assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES))
+  ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)
+);
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+#endif /* MLD_USE_NATIVE_POLYW1_PACK_32 */
+
+#if defined(MLD_USE_NATIVE_POLYW1_PACK_88)
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/*************************************************
+ * Name:        mld_polyw1_pack_88_native
+ *
+ * Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/88.
+ *              Bit-pack polynomial w1 with coefficients in [0, 43],
+ *              using 6-bit encoding (4 coefficients -> 3 bytes).
+ *
+ * Arguments:   - uint8_t *r: pointer to output byte array
+ *                (MLDSA_POLYW1_PACKEDBYTES bytes)
+ *              - const int32_t *a: pointer to input polynomial coefficients
+ *                (MLDSA_N coefficients)
+ *
+ * Returns:     - MLD_NATIVE_FUNC_SUCCESS if the native implementation
+ *                performed the packing
+ *              - MLD_NATIVE_FUNC_FALLBACK if it did not (e.g. a required
+ *                CPU feature is unavailable); presumably the caller must
+ *                then use the default implementation
+ **************************************************/
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+__contract__(
+  requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
+  requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
+  requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
+  assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES))
+  ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)
+);
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+#endif /* MLD_USE_NATIVE_POLYW1_PACK_88 */
+
 #if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
 /*************************************************
  * Name:        mld_poly_pointwise_montgomery_native
diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h
@@ -25,6 +25,8 @@
 #define MLD_USE_NATIVE_POLY_CHKNORM
 #define MLD_USE_NATIVE_POLYZ_UNPACK_17
 #define MLD_USE_NATIVE_POLYZ_UNPACK_19
+#define MLD_USE_NATIVE_POLYW1_PACK_32
+#define MLD_USE_NATIVE_POLYW1_PACK_88
 #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
 #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
           || MLD_CONFIG_PARAMETER_SET == 87 */
 
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
+    (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
+/* Dispatch polyw1_pack for GAMMA2 = (Q-1)/32 to the AVX2 kernel.
+ * If AVX2 is not available, r is left untouched and
+ * MLD_NATIVE_FUNC_FALLBACK is returned. */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
+{
+  if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    mld_polyw1_pack_32_avx2(r, a);
+    return MLD_NATIVE_FUNC_SUCCESS;
+  }
+  return MLD_NATIVE_FUNC_FALLBACK;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
+          || MLD_CONFIG_PARAMETER_SET == 87 */
+
+#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
+/* Dispatch polyw1_pack for GAMMA2 = (Q-1)/88 to the AVX2 kernel.
+ * If AVX2 is not available, r is left untouched and
+ * MLD_NATIVE_FUNC_FALLBACK is returned. */
+MLD_MUST_CHECK_RETURN_VALUE
+static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
+{
+  if (mld_sys_check_capability(MLD_SYS_CAP_AVX2))
+  {
+    mld_polyw1_pack_88_avx2(r, a);
+    return MLD_NATIVE_FUNC_SUCCESS;
+  }
+  return MLD_NATIVE_FUNC_FALLBACK;
+}
+#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
+        */
+
 MLD_MUST_CHECK_RETURN_VALUE
 static MLD_INLINE int mld_poly_pointwise_montgomery_native(
     int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h
@@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
 #define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
 void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
 
+/* AVX2 w1 packing for GAMMA2 = (Q-1)/32; see polyw1_pack_32_avx2.c */
+#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2)
+void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a);
+
+/* AVX2 w1 packing for GAMMA2 = (Q-1)/88; see polyw1_pack_88_avx2.c */
+#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2)
+void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a);
+
 #define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
 void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
                         const int32_t *qdata);
diff --git a/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c b/mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c
diff --git a/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c b/mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c
diff --git a/mldsa/src/poly_kl.c b/mldsa/src/poly_kl.c