-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathpointwise_acc_l5_avx2_asm.S
More file actions
150 lines (129 loc) · 4.02 KB
/
pointwise_acc_l5_avx2_asm.S
File metadata and controls
150 lines (129 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Dilithium optimized AVX2 implementation
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/dilithium/tree/master/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Dilithium implementation @[REF_AVX2].
*/
#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5)
/* simpasm: header-end */
#include "consts.h"
.intel_syntax noprefix
.text
/*
 * Macro: pointwise off
 *
 * Multiplies 16 consecutive 32-bit coefficients of a with the 16
 * coefficients of b at the same position, giving 16 signed 64-bit
 * products (vpmuldq multiplies the signed low dword of every qword lane).
 *
 * Inputs:
 *   rsi + off: 64 bytes of a
 *   rdx + off: 64 bytes of b
 *   All four loads use vmovdqa, so both addresses must be 32-byte aligned.
 *
 * Outputs (one 64-bit product per qword lane):
 *   ymm6: products of coefficients 0, 2, 4, 6
 *   ymm7: products of coefficients 1, 3, 5, 7
 *   ymm8: products of coefficients 8, 10, 12, 14
 *   ymm9: products of coefficients 9, 11, 13, 15
 *
 * Clobbers: ymm6-ymm13. General-purpose registers and flags are untouched.
 */
.macro pointwise off
// Load
vmovdqa ymm6, [rsi + \off]
vmovdqa ymm8, [rsi + \off + 32]
vmovdqa ymm10, [rdx + \off]
vmovdqa ymm12, [rdx + \off + 32]
// Bring the odd-indexed coefficients into the low dword of each qword lane,
// which is the only dword vpmuldq reads. For a this is done with a 64-bit
// logical shift, for b by duplicating the odd dwords; the upper dword left
// behind differs between the two but is ignored by the multiply.
vpsrlq ymm7, ymm6, 32
vpsrlq ymm9, ymm8, 32
vmovshdup ymm11, ymm10
vmovshdup ymm13, ymm12
/*
* ymm{i} stores a's coefficients for i in 6...9, and b's coefficients
* for i in 10...13.
*
* Bounds: |ymm{i}| < q for i in 6...9
* < 9q for i in 10...13
*/
// Multiply
// Even-indexed coefficients: ymm6, ymm8. Odd-indexed coefficients: ymm7, ymm9.
vpmuldq ymm6, ymm6, ymm10
vpmuldq ymm7, ymm7, ymm11
vpmuldq ymm8, ymm8, ymm12
vpmuldq ymm9, ymm9, ymm13
/* Bounds: |ymm{i}| < 9q^2 for i in 6...9 */
.endm
/*
 * Macro: acc
 *
 * Adds the four product registers ymm6...ymm9 into the four running
 * accumulators ymm2...ymm5, lane by lane as 64-bit integers.
 *
 * Inputs:   ymm6...ymm9 (products), ymm2...ymm5 (running sums)
 * Outputs:  ymm2...ymm5 updated in place
 * Clobbers: nothing else; ymm6...ymm9 are left unchanged.
 *
 * vpaddq wraps on overflow without signalling, so the caller is
 * responsible for keeping the running sum within 64-bit range.
 */
.macro acc
vpaddq ymm2, ymm6, ymm2
vpaddq ymm3, ymm7, ymm3
vpaddq ymm4, ymm8, ymm4
vpaddq ymm5, ymm9, ymm5
.endm
/*
* void mld_pointwise_acc_l5_avx2_asm(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *qdata)
*
* Pointwise multiplication with accumulation across multiple polynomial vectors
*
* For every coefficient index, computes the sum of five products
* a[k][i] * b[k][i], k = 0...4, Montgomery-reduces the 64-bit sum and
* stores the 32-bit result to c[i].
*
* Arguments:
* rdi: pointer to output polynomial c
* rsi: pointer to input polynomial a (multiple vectors)
* rdx: pointer to input polynomial b (multiple vectors)
* rcx: pointer to qdata constants
*
* Data layout, as implied by the addressing below:
* - a and b each consist of 5 polynomials placed 1024 bytes apart
*   (offsets 0, 1024, 2048, 3072, 4096).
* - One polynomial is 1024 bytes: the loop runs 16 times and handles
*   64 bytes (16 coefficients of 32 bits) per iteration.
* - c receives 1024 bytes.
* - Every access uses vmovdqa, so c, a, b and the two constants read
*   from qdata must be 32-byte aligned.
*
* Register roles inside the loop:
* ymm0: constant loaded from the 8XQINV offset of qdata
* ymm1: constant loaded from the 8XQ offset of qdata
* ymm2...ymm5: 64-bit accumulators (even/odd coefficients, two halves)
* ymm6...ymm13: scratch, written by the pointwise macro and the reduction
* eax: iteration counter, 0...15
*
* Clobbers: rax, rdi, rsi, rdx, flags, ymm0...ymm13. rcx is only read.
* The argument registers follow the System V AMD64 order; no stack is used.
* No vzeroupper is issued before ret.
*/
.balign 4
.global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2_asm)
MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2_asm)
// Load constants
// The offsets are multiplied by 4 to convert them to bytes; presumably they
// are expressed in 32-bit words (confirm against consts.h).
vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4]
vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4]
// eax = number of 64-byte chunks already processed
xor eax, eax
pointwise_acc_l5_avx2_looptop2:
// First polynomial pair: its products initialise the accumulators.
pointwise 0
// Move
vmovdqa ymm2, ymm6
vmovdqa ymm3, ymm7
vmovdqa ymm4, ymm8
vmovdqa ymm5, ymm9
/* Bounds: |ymm{i}| < 9q^2 */
// Remaining four polynomial pairs: multiply and add to the accumulators.
pointwise 1024
acc
/* Bounds: |ymm{i}| < 18q^2 */
pointwise 2048
acc
/* Bounds: |ymm{i}| < 27q^2 */
pointwise 3072
acc
/* Bounds: |ymm{i}| < 36q^2 */
pointwise 4096
acc
/* Bounds: |ymm{i}| < 45q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX */
// Reduce
// Step 1: t = low32(sum) * low32(ymm0), a signed 64-bit product per lane.
vpmuldq ymm6, ymm0, ymm2
vpmuldq ymm7, ymm0, ymm3
vpmuldq ymm8, ymm0, ymm4
vpmuldq ymm9, ymm0, ymm5
// Step 2: u = low32(ymm1) * low32(t); only the low dword of t is used.
vpmuldq ymm6, ymm1, ymm6
vpmuldq ymm7, ymm1, ymm7
vpmuldq ymm8, ymm1, ymm8
vpmuldq ymm9, ymm1, ymm9
// Step 3: sum - u. Assuming ymm0 holds q^-1 mod 2^32 and ymm1 holds q, as
// the constant names suggest, the low dword of each lane becomes zero and
// the high dword carries the reduced value.
vpsubq ymm2, ymm2, ymm6
vpsubq ymm3, ymm3, ymm7
vpsubq ymm4, ymm4, ymm8
vpsubq ymm5, ymm5, ymm9
/*
* All coefficients are Montgomery-reduced, resulting in
*
* Bounds: |ymm{i}| < q for i in 2...5
*
* See description of mld_montgomery_reduce() in mldsa/src/reduce.h.
*/
// Store
// Results sit in the high dword of each qword lane. For the even-indexed
// coefficients (ymm2, ymm4) move that dword down to the low position; the
// odd-indexed ones (ymm3, ymm5) are already in the odd dword slots.
vpsrlq ymm2, ymm2, 32
vmovshdup ymm4, ymm4
// Mask 0xAA takes the odd dwords from the second source and the even dwords
// from the first, restoring the original coefficient order.
vpblendd ymm2, ymm2, ymm3, 0xAA
vpblendd ymm4, ymm4, ymm5, 0xAA
vmovdqa [rdi], ymm2
vmovdqa [rdi + 32], ymm4
// Advance all three pointers to the next 16 coefficients.
add rsi, 64
add rdx, 64
add rdi, 64
add eax, 1
// 16 iterations of 64 bytes cover one 1024-byte polynomial.
cmp eax, 16
jb pointwise_acc_l5_avx2_looptop2
ret
/* simpasm: footer-start */
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5) */