-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathpoly_use_hint_32_avx2.c
More file actions
103 lines (87 loc) · 3.68 KB
/
poly_use_hint_32_avx2.c
File metadata and controls
103 lines (87 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Dilithium optimized AVX2 implementation
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/dilithium/tree/master/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Dilithium implementation @[REF_AVX2].
*/
#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_NO_VERIFY_API) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
#include <immintrin.h>
#include "arith_native_x86_64.h"
#include "consts.h"
/*
 * Lane-wise select on 32-bit elements of 256-bit integer vectors.
 *
 * For every 32-bit lane the result is taken from `b` when the most
 * significant (sign) bit of the matching lane in `mask` is set, and from `a`
 * otherwise. The selection is carried out by the single-precision blend
 * _mm256_blendv_ps; the surrounding casts only reinterpret the integer
 * vectors as float vectors and back.
 *
 * Only the top bit of each mask lane is inspected, so an all-ones/all-zeros
 * compare result can be passed directly as `mask`.
 */
#define MLD_MM256_BLENDV_EPI32(a, b, mask) \
_mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
_mm256_castsi256_ps(b), \
_mm256_castsi256_ps(mask)))
void mld_poly_use_hint_32_avx2(int32_t *a, const int32_t *hint)
{
unsigned int i;
__m256i f, f0, f1, h, t;
const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32));
/* check-magic: 1025 == floor(2**22 / 4092) */
const __m256i v = _mm256_set1_epi32(1025);
const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
const __m256i off = _mm256_set1_epi32(127);
const __m256i shift = _mm256_set1_epi32(512);
const __m256i mask = _mm256_set1_epi32(15);
const __m256i zero = _mm256_setzero_si256();
for (i = 0; i < MLDSA_N / 8; i++)
{
f = _mm256_load_si256((const __m256i *)&a[8 * i]);
h = _mm256_load_si256((const __m256i *)&hint[8 * i]);
/* Reference:
* - @[REF_AVX2] calls poly_decompose to compute all a1, a0 before the loop.
* - Our implementation of decompose() is slightly different from that in
* @[REF_AVX2]. See poly_decompose_32_avx2.c for more information.
*/
/* f1, f2 = decompose(f) */
f1 = _mm256_add_epi32(f, off);
f1 = _mm256_srli_epi32(f1, 7);
f1 = _mm256_mulhi_epu16(f1, v);
f1 = _mm256_mulhrs_epi16(f1, shift);
t = _mm256_cmpgt_epi32(f, q_bound);
f0 = _mm256_mullo_epi32(f1, alpha);
f0 = _mm256_sub_epi32(f, f0);
f1 = _mm256_andnot_si256(t, f1);
f0 = _mm256_add_epi32(f0, t);
/* Reference: The reference avx2 implementation checks a0 >= 0, which is
* different from the specification and the reference C implementation. We
* follow the specification and check a0 > 0.
*/
/* t = (f0 > 0) ? h : -h */
f0 = _mm256_cmpgt_epi32(f0, zero);
t = MLD_MM256_BLENDV_EPI32(h, zero, f0);
t = _mm256_slli_epi32(t, 1);
h = _mm256_sub_epi32(h, t);
/* f1 = (f1 + t) % 16 */
f1 = _mm256_add_epi32(f1, h);
f1 = _mm256_and_si256(f1, mask);
_mm256_store_si256((__m256i *)&a[8 * i], f1);
}
}
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87) */
MLD_EMPTY_CU(avx2_poly_use_hint_32)
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|| MLD_CONFIG_PARAMETER_SET == 87)) */
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef MLD_MM256_BLENDV_EPI32