/* Source: mldsa-native/dev/x86_64/src/pointwise_acc_l5_avx2_asm.S (pq-code-package/mldsa-native, branch main) */

/*
 * Copyright (c) The mldsa-native project authors
 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 */

/* References
 * ==========
 *
 * - [REF_AVX2]
 *   CRYSTALS-Dilithium optimized AVX2 implementation
 *   Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
 *   https://github.com/pq-crystals/dilithium/tree/master/avx2
 */

/*
 * This file is derived from the public domain
 * AVX2 Dilithium implementation @[REF_AVX2].
 */

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
    (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5)
/* simpasm: header-end */

#include "consts.h"

        .intel_syntax noprefix
        .text

/*
 * pointwise off
 *
 * Multiplies 16 coefficient pairs of a and b without reduction.
 *
 * Inputs:
 *   rsi: base pointer of a; 64 bytes are read at byte offset off
 *   rdx: base pointer of b; 64 bytes are read at byte offset off
 *   off: assemble-time byte offset applied to both pointers
 *
 * Outputs (one signed 64-bit product per 64-bit lane):
 *   ymm6: products of the even-indexed 32-bit elements of bytes  0..31
 *   ymm7: products of the odd-indexed  32-bit elements of bytes  0..31
 *   ymm8: products of the even-indexed 32-bit elements of bytes 32..63
 *   ymm9: products of the odd-indexed  32-bit elements of bytes 32..63
 *
 * Clobbers: ymm6...ymm13. rsi and rdx are not modified.
 *
 * The loads are vmovdqa, so both addresses must be 32-byte aligned.
 */
.macro pointwise off
// Load
        vmovdqa ymm6, [rsi + \off]
        vmovdqa ymm8, [rsi + \off + 32]
        vmovdqa ymm10, [rdx + \off]
        vmovdqa ymm12, [rdx + \off + 32]
        /*
         * vpmuldq only reads the low 32 bits of each 64-bit lane. The
         * registers loaded above therefore already serve as the even-indexed
         * operands; the odd-indexed elements are brought into the low halves
         * below (64-bit right shift by 32 for a, odd-to-even duplication
         * for b).
         */
        vpsrlq  ymm7, ymm6, 32
        vpsrlq  ymm9, ymm8, 32
        vmovshdup ymm11, ymm10
        vmovshdup ymm13, ymm12
        /*
         * ymm{i} stores a's coefficients for i in 6...9, and b's coefficients
         * for i in 10...13.
         *
         * Bounds: |ymm{i}| < q  for i in 6...9
         *                  < 9q for i in 10...13
         */

// Multiply
        /* Signed 32x32 -> 64-bit multiplication, results overwrite ymm6...ymm9 */
        vpmuldq ymm6, ymm6, ymm10
        vpmuldq ymm7, ymm7, ymm11
        vpmuldq ymm8, ymm8, ymm12
        vpmuldq ymm9, ymm9, ymm13
        /* Bounds: |ymm{i}| < 9q^2 for i in 6...9 */
.endm

/*
 * acc
 *
 * Adds the four product vectors ymm6...ymm9 (as left by the pointwise macro)
 * onto the running sums ymm2...ymm5, lane by lane as 64-bit integers:
 *
 *   ymm2 += ymm6, ymm3 += ymm7, ymm4 += ymm8, ymm5 += ymm9
 *
 * Inputs:   ymm2...ymm5 (accumulators), ymm6...ymm9 (addends)
 * Outputs:  ymm2...ymm5
 * Clobbers: nothing else; ymm6...ymm9 are left unchanged.
 */
.macro acc
        vpaddq  ymm2, ymm6, ymm2
        vpaddq  ymm3, ymm7, ymm3
        vpaddq  ymm4, ymm8, ymm4
        vpaddq  ymm5, ymm9, ymm5
.endm

/*
 * void mld_pointwise_acc_l5_avx2_asm(__m256i *c, const __m256i *a,
 *                                    const __m256i *b, const __m256i *qdata)
 *
 * Coefficient-wise multiply-accumulate over five polynomial pairs, followed
 * by one Montgomery reduction of every accumulated sum:
 *
 *   c[j] = montgomery_reduce(sum_{k=0}^{4} a[k][j] * b[k][j])
 *
 * The k-th polynomial of a and of b is read at byte offset 1024*k from the
 * respective base pointer. Each loop iteration consumes 64 bytes of every
 * input polynomial and writes 64 bytes of c; 16 iterations cover the full
 * 1024 bytes.
 *
 * Arguments:
 *   rdi: c     - output polynomial
 *   rsi: a     - five input polynomials, 1024 bytes apart
 *   rdx: b     - five input polynomials, 1024 bytes apart
 *   rcx: qdata - table of constants
 *
 * Modified: rax (loop counter), rdi/rsi/rdx (each advanced by 1024 in
 *           total), ymm0...ymm13, flags.
 *
 * Every vector memory access is a vmovdqa, so c, a, b and the two qdata
 * entries that are read must be 32-byte aligned.
 */
        .balign 4
        .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2_asm)
MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2_asm)

// Constants, kept live for the whole function
        /* ymm0 <- 8XQINV entry of qdata, ymm1 <- 8XQ entry of qdata */
        vmovdqa ymm0, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV)*4]
        vmovdqa ymm1, [rcx + (MLD_AVX2_BACKEND_DATA_OFFSET_8XQ)*4]

// eax counts the 64-byte chunks processed so far
        xor     eax, eax
pointwise_acc_l5_avx2_looptop2:

// First product (k = 0) initialises the accumulators ymm2...ymm5
        pointwise 0
        vmovdqa ymm2, ymm6
        vmovdqa ymm3, ymm7
        vmovdqa ymm4, ymm8
        vmovdqa ymm5, ymm9
        /* Bounds: |ymm{i}| < 9q^2 for i in 2...5 */

// Products for k = 1...4 are added on top
        .irp    off, 1024, 2048, 3072, 4096
        pointwise \off
        acc
        .endr
        /*
         * Every pass adds a term bounded by 9q^2, so the bound on the
         * accumulators grows to 18q^2, 27q^2, 36q^2 and finally
         *
         * Bounds: |ymm{i}| < 45q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX
         *         for i in 2...5
         */

// Reduce
        /* t = (low 32 bits of the sum) * ymm0 */
        vpmuldq ymm6, ymm0, ymm2
        vpmuldq ymm7, ymm0, ymm3
        vpmuldq ymm8, ymm0, ymm4
        vpmuldq ymm9, ymm0, ymm5
        /* t = (low 32 bits of t) * ymm1 */
        vpmuldq ymm6, ymm1, ymm6
        vpmuldq ymm7, ymm1, ymm7
        vpmuldq ymm8, ymm1, ymm8
        vpmuldq ymm9, ymm1, ymm9
        /* sum - t; the Store step picks up the upper 32 bits of each lane */
        vpsubq  ymm2, ymm2, ymm6
        vpsubq  ymm3, ymm3, ymm7
        vpsubq  ymm4, ymm4, ymm8
        vpsubq  ymm5, ymm5, ymm9
        /*
         * The coefficients are now Montgomery-reduced:
         *
         * Bounds: |ymm{i}| < q for i in 2...5
         *
         * See description of mld_montgomery_reduce() in mldsa/src/reduce.h.
         */

// Store
        /*
         * ymm2/ymm4 carry the even-indexed results and ymm3/ymm5 the
         * odd-indexed ones, each in the upper half of its 64-bit lane.
         * Move the even results down into the even 32-bit slots, then take
         * the odd 32-bit slots (blend mask 0xAA) straight from ymm3/ymm5.
         */
        vpsrlq  ymm2, ymm2, 32
        vmovshdup ymm4, ymm4
        vpblendd ymm2, ymm2, ymm3, 0xAA
        vpblendd ymm4, ymm4, ymm5, 0xAA

        vmovdqa [rdi], ymm2
        vmovdqa [rdi + 32], ymm4

// Advance all three pointers to the next 64-byte chunk
        add     rsi, 64
        add     rdx, 64
        add     rdi, 64
        add     eax, 1
        cmp     eax, 16
        jb      pointwise_acc_l5_avx2_looptop2

        ret
/* simpasm: footer-start */
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
          && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5) */