Skip to content

Commit 1bb503b

Browse files
committed
x86_64: Add AVX2 polyw1_pack to x86_64 native backend
Integrate polyw1_pack AVX2 implementations for both GAMMA2 variants into the native backend. Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
1 parent 0b1c536 commit 1bb503b

13 files changed

Lines changed: 483 additions & 18 deletions

BIBLIOGRAPHY.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,8 @@ source code and documentation.
225225
- [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
226226
- [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
227227
- [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
228+
- [dev/x86_64/src/polyw1_pack_32_avx2.c](dev/x86_64/src/polyw1_pack_32_avx2.c)
229+
- [dev/x86_64/src/polyw1_pack_88_avx2.c](dev/x86_64/src/polyw1_pack_88_avx2.c)
228230
- [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
229231
- [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
230232
- [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
@@ -243,6 +245,8 @@ source code and documentation.
243245
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
244246
- [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
245247
- [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
248+
- [mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_32_avx2.c)
249+
- [mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c](mldsa/src/native/x86_64/src/polyw1_pack_88_avx2.c)
246250
- [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
247251
- [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
248252
- [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)

dev/x86_64/meta.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#define MLD_USE_NATIVE_POLY_CHKNORM
2626
#define MLD_USE_NATIVE_POLYZ_UNPACK_17
2727
#define MLD_USE_NATIVE_POLYZ_UNPACK_19
28+
#define MLD_USE_NATIVE_POLYW1_PACK_32
29+
#define MLD_USE_NATIVE_POLYW1_PACK_88
2830
#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
2931
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
3032
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
253255
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
254256
|| MLD_CONFIG_PARAMETER_SET == 87 */
255257

258+
#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
259+
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
260+
MLD_MUST_CHECK_RETURN_VALUE
261+
static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
262+
{
263+
if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
264+
{
265+
return MLD_NATIVE_FUNC_FALLBACK;
266+
}
267+
mld_polyw1_pack_32_avx2(r, a);
268+
return MLD_NATIVE_FUNC_SUCCESS;
269+
}
270+
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
271+
|| MLD_CONFIG_PARAMETER_SET == 87 */
272+
273+
#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
274+
MLD_MUST_CHECK_RETURN_VALUE
275+
static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
276+
{
277+
if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
278+
{
279+
return MLD_NATIVE_FUNC_FALLBACK;
280+
}
281+
mld_polyw1_pack_88_avx2(r, a);
282+
return MLD_NATIVE_FUNC_SUCCESS;
283+
}
284+
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
285+
*/
286+
256287
MLD_MUST_CHECK_RETURN_VALUE
257288
static MLD_INLINE int mld_poly_pointwise_montgomery_native(
258289
int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])

dev/x86_64/src/arith_native_x86_64.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
102102
#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
103103
void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
104104

105+
#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2)
106+
void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a);
107+
108+
#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2)
109+
void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a);
110+
105111
#define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
106112
void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
107113
const int32_t *qdata);
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
/* References
7+
* ==========
8+
*
9+
* - [REF_AVX2]
10+
* CRYSTALS-Dilithium optimized AVX2 implementation
11+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
13+
*/
14+
15+
/*
16+
* This file is derived from the public domain
17+
* AVX2 Dilithium implementation @[REF_AVX2].
18+
*/
19+
20+
#include "../../../common.h"
21+
22+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
24+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
25+
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
26+
27+
#include <immintrin.h>
28+
#include "arith_native_x86_64.h"
29+
30+
/* Pack w1 polynomial (coefficients in [0,15]) for GAMMA2 = (Q-1)/32.
31+
* Packs 2 nibbles per byte; 64 coefficients per iteration. */
32+
void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a)
33+
{
34+
unsigned int i;
35+
const __m256i shift = _mm256_set1_epi16((16 << 8) + 1);
36+
const __m256i shufbidx =
37+
_mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, 15,
38+
14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
39+
40+
for (i = 0; i < MLDSA_N / 64; ++i)
41+
{
42+
__m256i f0 = _mm256_load_si256((__m256i *)&a[64 * i + 0]);
43+
__m256i f1 = _mm256_load_si256((__m256i *)&a[64 * i + 8]);
44+
__m256i f2 = _mm256_load_si256((__m256i *)&a[64 * i + 16]);
45+
__m256i f3 = _mm256_load_si256((__m256i *)&a[64 * i + 24]);
46+
__m256i f4 = _mm256_load_si256((__m256i *)&a[64 * i + 32]);
47+
__m256i f5 = _mm256_load_si256((__m256i *)&a[64 * i + 40]);
48+
__m256i f6 = _mm256_load_si256((__m256i *)&a[64 * i + 48]);
49+
__m256i f7 = _mm256_load_si256((__m256i *)&a[64 * i + 56]);
50+
f0 = _mm256_packus_epi32(f0, f1);
51+
f1 = _mm256_packus_epi32(f2, f3);
52+
f2 = _mm256_packus_epi32(f4, f5);
53+
f3 = _mm256_packus_epi32(f6, f7);
54+
f0 = _mm256_packus_epi16(f0, f1);
55+
f1 = _mm256_packus_epi16(f2, f3);
56+
f0 = _mm256_maddubs_epi16(f0, shift);
57+
f1 = _mm256_maddubs_epi16(f1, shift);
58+
f0 = _mm256_packus_epi16(f0, f1);
59+
f0 = _mm256_permute4x64_epi64(f0, 0xD8);
60+
f0 = _mm256_shuffle_epi8(f0, shufbidx);
61+
_mm256_storeu_si256((__m256i *)&r[32 * i], f0);
62+
}
63+
}
64+
65+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
66+
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
67+
65 || MLD_CONFIG_PARAMETER_SET == 87) */
68+
69+
MLD_EMPTY_CU(avx2_polyw1_pack_32)
70+
71+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
72+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
73+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
74+
|| MLD_CONFIG_PARAMETER_SET == 87)) */
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
/* References
7+
* ==========
8+
*
9+
* - [REF_AVX2]
10+
* CRYSTALS-Dilithium optimized AVX2 implementation
11+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
13+
*/
14+
15+
/*
16+
* This file is derived from the public domain
17+
* AVX2 Dilithium implementation @[REF_AVX2].
18+
*/
19+
20+
#include "../../../common.h"
21+
22+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
24+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
25+
MLD_CONFIG_PARAMETER_SET == 44)
26+
27+
#include <immintrin.h>
28+
#include "arith_native_x86_64.h"
29+
30+
/* Pack w1 polynomial (coefficients in [0,43]) for GAMMA2 = (Q-1)/88.
31+
* 6-bit encoding, 4 coefficients per 3 bytes; 32 coefficients per iteration. */
32+
void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a)
33+
{
34+
unsigned int i;
35+
const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1);
36+
const __m256i shift2 = _mm256_set1_epi32(((1 << 12) << 16) + 1);
37+
const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
38+
const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0);
39+
const __m256i shufbidx =
40+
_mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0,
41+
-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);
42+
43+
for (i = 0; i < MLDSA_N / 32; i++)
44+
{
45+
__m256i f0 = _mm256_load_si256((__m256i *)&a[32 * i + 0]);
46+
__m256i f1 = _mm256_load_si256((__m256i *)&a[32 * i + 8]);
47+
__m256i f2 = _mm256_load_si256((__m256i *)&a[32 * i + 16]);
48+
__m256i f3 = _mm256_load_si256((__m256i *)&a[32 * i + 24]);
49+
f0 = _mm256_packus_epi32(f0, f1);
50+
f1 = _mm256_packus_epi32(f2, f3);
51+
f0 = _mm256_packus_epi16(f0, f1);
52+
f0 = _mm256_maddubs_epi16(f0, shift1);
53+
f0 = _mm256_madd_epi16(f0, shift2);
54+
f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1);
55+
f0 = _mm256_shuffle_epi8(f0, shufbidx);
56+
f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2);
57+
58+
/* Each iteration produces 24 valid bytes in the low 192 bits.
59+
* Store as 128-bit + 64-bit to avoid writing past the output buffer. */
60+
{
61+
__m128i lo = _mm256_castsi256_si128(f0);
62+
__m128i hi = _mm256_extracti128_si256(f0, 1);
63+
_mm_storeu_si128((__m128i *)&r[24 * i], lo);
64+
_mm_storel_epi64((__m128i *)&r[24 * i + 16], hi);
65+
}
66+
}
67+
}
68+
69+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
70+
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
71+
44) */
72+
73+
MLD_EMPTY_CU(avx2_polyw1_pack_88)
74+
75+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
76+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
77+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
78+
44)) */

mldsa/mldsa_native.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@
8888
#include "src/native/x86_64/src/poly_decompose_88_avx2.c"
8989
#include "src/native/x86_64/src/poly_use_hint_32_avx2.c"
9090
#include "src/native/x86_64/src/poly_use_hint_88_avx2.c"
91+
#include "src/native/x86_64/src/polyw1_pack_32_avx2.c"
92+
#include "src/native/x86_64/src/polyw1_pack_88_avx2.c"
9193
#include "src/native/x86_64/src/polyz_unpack_17_avx2.c"
9294
#include "src/native/x86_64/src/polyz_unpack_19_avx2.c"
9395
#include "src/native/x86_64/src/rej_uniform_avx2.c"
@@ -672,6 +674,8 @@
672674
#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
673675
#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
674676
#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
677+
#undef MLD_USE_NATIVE_POLYW1_PACK_32
678+
#undef MLD_USE_NATIVE_POLYW1_PACK_88
675679
#undef MLD_USE_NATIVE_POLYZ_UNPACK_17
676680
#undef MLD_USE_NATIVE_POLYZ_UNPACK_19
677681
#undef MLD_USE_NATIVE_POLY_CADDQ
@@ -701,6 +705,8 @@
701705
#undef mld_poly_decompose_88_avx2
702706
#undef mld_poly_use_hint_32_avx2
703707
#undef mld_poly_use_hint_88_avx2
708+
#undef mld_polyw1_pack_32_avx2
709+
#undef mld_polyw1_pack_88_avx2
704710
#undef mld_polyz_unpack_17_avx2
705711
#undef mld_polyz_unpack_19_avx2
706712
#undef mld_rej_uniform_avx2

mldsa/mldsa_native_asm.S

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,6 +675,8 @@
675675
#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
676676
#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
677677
#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7
678+
#undef MLD_USE_NATIVE_POLYW1_PACK_32
679+
#undef MLD_USE_NATIVE_POLYW1_PACK_88
678680
#undef MLD_USE_NATIVE_POLYZ_UNPACK_17
679681
#undef MLD_USE_NATIVE_POLYZ_UNPACK_19
680682
#undef MLD_USE_NATIVE_POLY_CADDQ
@@ -704,6 +706,8 @@
704706
#undef mld_poly_decompose_88_avx2
705707
#undef mld_poly_use_hint_32_avx2
706708
#undef mld_poly_use_hint_88_avx2
709+
#undef mld_polyw1_pack_32_avx2
710+
#undef mld_polyw1_pack_88_avx2
707711
#undef mld_polyz_unpack_17_avx2
708712
#undef mld_polyz_unpack_19_avx2
709713
#undef mld_rej_uniform_avx2

mldsa/src/native/api.h

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,57 @@ __contract__(
498498
|| MLD_CONFIG_PARAMETER_SET == 87 */
499499
#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */
500500

501+
#if defined(MLD_USE_NATIVE_POLYW1_PACK_32)
502+
#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
503+
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
504+
/*************************************************
505+
* Name: mld_polyw1_pack_32_native
506+
*
507+
* Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/32.
508+
* Bit-pack polynomial w1 with coefficients in [0, 15],
509+
* packing 2 nibbles per byte.
510+
*
511+
* Arguments: - uint8_t *r: pointer to output byte array
512+
* - const int32_t *a: pointer to input polynomial coefficients
513+
**************************************************/
514+
MLD_MUST_CHECK_RETURN_VALUE
515+
static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
516+
__contract__(
517+
requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
518+
requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
519+
requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
520+
assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES))
521+
ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)
522+
);
523+
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
524+
|| MLD_CONFIG_PARAMETER_SET == 87 */
525+
#endif /* MLD_USE_NATIVE_POLYW1_PACK_32 */
526+
527+
#if defined(MLD_USE_NATIVE_POLYW1_PACK_88)
528+
#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
529+
/*************************************************
530+
* Name: mld_polyw1_pack_88_native
531+
*
532+
* Description: Native implementation of polyw1_pack for GAMMA2 = (Q-1)/88.
533+
* Bit-pack polynomial w1 with coefficients in [0, 43],
534+
* using 6-bit encoding (4 coefficients -> 3 bytes).
535+
*
536+
* Arguments: - uint8_t *r: pointer to output byte array
537+
* - const int32_t *a: pointer to input polynomial coefficients
538+
**************************************************/
539+
MLD_MUST_CHECK_RETURN_VALUE
540+
static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
541+
__contract__(
542+
requires(memory_no_alias(r, MLDSA_POLYW1_PACKEDBYTES))
543+
requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N))
544+
requires(array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))
545+
assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES))
546+
ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS)
547+
);
548+
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
549+
*/
550+
#endif /* MLD_USE_NATIVE_POLYW1_PACK_88 */
551+
501552
#if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY)
502553
/*************************************************
503554
* Name: mld_poly_pointwise_montgomery_native

mldsa/src/native/x86_64/meta.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#define MLD_USE_NATIVE_POLY_CHKNORM
2626
#define MLD_USE_NATIVE_POLYZ_UNPACK_17
2727
#define MLD_USE_NATIVE_POLYZ_UNPACK_19
28+
#define MLD_USE_NATIVE_POLYW1_PACK_32
29+
#define MLD_USE_NATIVE_POLYW1_PACK_88
2830
#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
2931
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
3032
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
@@ -253,6 +255,35 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
253255
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
254256
|| MLD_CONFIG_PARAMETER_SET == 87 */
255257

258+
#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
259+
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)
260+
MLD_MUST_CHECK_RETURN_VALUE
261+
static MLD_INLINE int mld_polyw1_pack_32_native(uint8_t *r, const int32_t *a)
262+
{
263+
if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
264+
{
265+
return MLD_NATIVE_FUNC_FALLBACK;
266+
}
267+
mld_polyw1_pack_32_avx2(r, a);
268+
return MLD_NATIVE_FUNC_SUCCESS;
269+
}
270+
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
271+
|| MLD_CONFIG_PARAMETER_SET == 87 */
272+
273+
#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44
274+
MLD_MUST_CHECK_RETURN_VALUE
275+
static MLD_INLINE int mld_polyw1_pack_88_native(uint8_t *r, const int32_t *a)
276+
{
277+
if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2))
278+
{
279+
return MLD_NATIVE_FUNC_FALLBACK;
280+
}
281+
mld_polyw1_pack_88_avx2(r, a);
282+
return MLD_NATIVE_FUNC_SUCCESS;
283+
}
284+
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
285+
*/
286+
256287
MLD_MUST_CHECK_RETURN_VALUE
257288
static MLD_INLINE int mld_poly_pointwise_montgomery_native(
258289
int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])

mldsa/src/native/x86_64/src/arith_native_x86_64.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
102102
#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
103103
void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
104104

105+
#define mld_polyw1_pack_32_avx2 MLD_NAMESPACE(mld_polyw1_pack_32_avx2)
106+
void mld_polyw1_pack_32_avx2(uint8_t *r, const int32_t *a);
107+
108+
#define mld_polyw1_pack_88_avx2 MLD_NAMESPACE(mld_polyw1_pack_88_avx2)
109+
void mld_polyw1_pack_88_avx2(uint8_t *r, const int32_t *a);
110+
105111
#define mld_pointwise_avx2 MLD_NAMESPACE(pointwise_avx2)
106112
void mld_pointwise_avx2(int32_t *c, const int32_t *a, const int32_t *b,
107113
const int32_t *qdata);

0 commit comments

Comments
 (0)