-
Notifications
You must be signed in to change notification settings - Fork 51
Expand file tree
/
Copy pathpoly_caddq_avx2_asm.S
More file actions
100 lines (87 loc) · 2.31 KB
/
Copy pathpoly_caddq_avx2_asm.S
File metadata and controls
100 lines (87 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Dilithium optimized AVX2 implementation
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/dilithium/tree/master/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Dilithium implementation @[REF_AVX2].
*/
/*yaml
   Name: poly_caddq_avx2_asm
   Description: x86_64 AVX2 conditional addition of q to each coefficient.
     For all coefficients of the in/out polynomial, add Q if the coefficient
     is negative.
   Signature: void mld_poly_caddq_avx2_asm(int32_t *r)
   ABI:
     Architecture: x86_64
     CallingConvention: SysV
     Features: [AVX2]
     rdi:
       type: buffer
       size_bytes: 1024
       permissions: read/write
       c_parameter: int32_t *r
       description: Input/output polynomial (256 x int32_t)
*/
#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
/* simpasm: header-end */
/*
 * caddq_vector offset, reg
 *
 * Updates, in place, the eight packed 32-bit lanes stored at offset(%rdi):
 * every lane that is smaller (signed compare) than the matching lane of ymm2
 * gets the matching lane of ymm1 added to it; all other lanes are written
 * back unchanged.
 *
 * The caller in this file sets ymm2 to all-zero and ymm1 to 8380417 in every
 * lane before invoking the macro, so the net effect is "add 8380417 to each
 * negative coefficient".
 *
 *   offset  byte displacement from rdi of the 32-byte vector to update
 *   reg     ymm register used as scratch; its previous contents are lost
 *
 * Implicit inputs: rdi (base pointer), ymm1 (addend), ymm2 (compare value).
 * The write-back uses vmovdqa, so offset(%rdi) must be 32-byte aligned.
 */
.macro caddq_vector offset, reg
vpcmpgtd \offset(%rdi), %ymm2, \reg /* reg = all-ones in lanes where ymm2 > mem, else zero */
vpand %ymm1, \reg, \reg /* keep the ymm1 lane value only where the mask is set */
vpaddd \offset(%rdi), \reg, \reg /* add the original lanes, re-read from memory */
vmovdqa \reg, \offset(%rdi) /* aligned store back to the same location */
.endm
/*
 * void mld_poly_caddq_avx2_asm(int32_t *r)
 *
 * Walks the 1024-byte buffer at r as 32 vectors of eight signed 32-bit
 * coefficients and adds 8380417 (Q) to every coefficient that is negative.
 * Non-negative coefficients are written back unchanged.
 *
 * ABI:    SysV AMD64 (see the yaml header of this file)
 * In:     rdi = r, read and written in place at byte offsets 0..1023.
 *         r must be 32-byte aligned because caddq_vector stores with vmovdqa.
 * Out:    none
 * Clobb:  rax, ymm0-ymm5; the upper 128 bits of all ymm registers are zeroed
 *         by the final vzeroupper. No stack is used.
 *
 * Register roles for the whole routine:
 *   ymm1 = 8380417 in each of the eight 32-bit lanes
 *   ymm2 = zero (compare value for the "is negative" test)
 *   ymm0, ymm3, ymm4, ymm5 = per-vector scratch handed to caddq_vector
 */
        .text
        .global MLD_ASM_NAMESPACE(poly_caddq_avx2_asm)
        .balign 16
MLD_ASM_FN_SYMBOL(poly_caddq_avx2_asm)
        /* VEX-encoded xmm write also clears bits 255:128, so all of ymm2 = 0 */
        vpxor           %xmm2, %xmm2, %xmm2
        /* ymm1 = Q broadcast to every 32-bit lane */
        mov             $8380417, %eax
        vmovd           %eax, %xmm1
        vpbroadcastd    %xmm1, %ymm1

        /*
         * 32 vectors x 32 bytes cover the whole buffer. The scratch register
         * rotates through ymm0/ymm3/ymm4/ymm5; each invocation is independent
         * of the others (distinct memory, scratch fully overwritten).
         */
        caddq_vector    0, %ymm0
        caddq_vector    32, %ymm3
        caddq_vector    64, %ymm4
        caddq_vector    96, %ymm5

        caddq_vector    128, %ymm0
        caddq_vector    160, %ymm3
        caddq_vector    192, %ymm4
        caddq_vector    224, %ymm5

        caddq_vector    256, %ymm0
        caddq_vector    288, %ymm3
        caddq_vector    320, %ymm4
        caddq_vector    352, %ymm5

        caddq_vector    384, %ymm0
        caddq_vector    416, %ymm3
        caddq_vector    448, %ymm4
        caddq_vector    480, %ymm5

        caddq_vector    512, %ymm0
        caddq_vector    544, %ymm3
        caddq_vector    576, %ymm4
        caddq_vector    608, %ymm5

        caddq_vector    640, %ymm0
        caddq_vector    672, %ymm3
        caddq_vector    704, %ymm4
        caddq_vector    736, %ymm5

        caddq_vector    768, %ymm0
        caddq_vector    800, %ymm3
        caddq_vector    832, %ymm4
        caddq_vector    864, %ymm5

        caddq_vector    896, %ymm0
        caddq_vector    928, %ymm3
        caddq_vector    960, %ymm4
        caddq_vector    992, %ymm5

        /*
         * Leave the upper ymm halves clean so that a caller executing
         * legacy-encoded SSE instructions does not pay the AVX<->SSE
         * transition / false-dependency penalty. The upper halves are
         * caller-saved, so zeroing them is ABI-safe.
         */
        vzeroupper
        ret
/* simpasm: footer-start */
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
*/