Skip to content

Commit 8185ec7

Browse files
committed
HOL-Light: Add x86 AVX2 nttunpack proof
Signed-off-by: Jake Massimo <jakemas@amazon.com>
1 parent e6a137b commit 8185ec7

11 files changed

Lines changed: 793 additions & 7 deletions

File tree

.github/workflows/hol_light.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ jobs:
163163
# Dependencies on {name}.{S,ml} are implicit
164164
- name: mldsa_ntt
165165
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml"]
166+
- name: mldsa_nttunpack
167+
needs: ["mldsa_specs.ml", "mldsa_utils.ml"]
166168
name: x86_64 HOL Light proof for ${{ matrix.proof.name }}.S
167169
runs-on: pqcp-x64
168170
if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork

BIBLIOGRAPHY.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ source code and documentation.
249249
- [mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c)
250250
- [mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c)
251251
- [proofs/hol_light/x86_64/mldsa/mldsa_ntt.S](proofs/hol_light/x86_64/mldsa/mldsa_ntt.S)
252+
- [proofs/hol_light/x86_64/mldsa/mldsa_nttunpack.S](proofs/hol_light/x86_64/mldsa/mldsa_nttunpack.S)
252253

253254
### `Round3_Spec`
254255

dev/x86_64/src/nttunpack.S

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,12 @@
1313
* https://github.com/pq-crystals/dilithium/tree/master/avx2
1414
*/
1515

16-
/*
16+
/*
1717
* This file is derived from the public domain
1818
* AVX2 Dilithium implementation @[REF_AVX2].
1919
*/
2020

21-
#include "../../../common.h"
22-
21+
#include "../../../common.h"
2322
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
2423
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
2524
/* simpasm: header-end */

mldsa/src/native/x86_64/src/nttunpack.S

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,12 @@
1313
* https://github.com/pq-crystals/dilithium/tree/master/avx2
1414
*/
1515

16-
/*
16+
/*
1717
* This file is derived from the public domain
1818
* AVX2 Dilithium implementation @[REF_AVX2].
1919
*/
2020

21-
#include "../../../common.h"
22-
21+
#include "../../../common.h"
2322
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
2423
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
2524

proofs/hol_light/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,4 @@ echo '1+1;;' | nc -w 5 127.0.0.1 2012
5353

5454
- AArch64 poly_caddq: [mldsa_poly_caddq.S](aarch64/mldsa/mldsa_poly_caddq.S)
5555
- x86_64 forward NTT: [mldsa_ntt.S](x86_64/mldsa/mldsa_ntt.S)
56+
- x86_64 NTT unpack: [mldsa_nttunpack.S](x86_64/mldsa/mldsa_nttunpack.S)

proofs/hol_light/x86_64/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ endif
5050

5151
SPLIT=tr ';' '\n'
5252

53-
OBJ = mldsa/mldsa_ntt.o
53+
OBJ = mldsa/mldsa_ntt.o \
54+
mldsa/mldsa_nttunpack.o
5455

5556
# Build object files from assembly sources
5657
$(OBJ): %.o : %.S

proofs/hol_light/x86_64/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,6 @@ Currently this includes:
104104

105105
- ML-DSA Arithmetic:
106106
* x86_64 forward NTT: [mldsa_ntt.S](mldsa/mldsa_ntt.S)
107+
* x86_64 NTT unpack: [mldsa_nttunpack.S](mldsa/mldsa_nttunpack.S)
107108

108109
The NTT function is optimized using AVX2 instructions and follows the s2n-bignum x86_64 assembly verification patterns.
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
/*
2+
* Copyright (c) The mlkem-native project authors
3+
* Copyright (c) The mldsa-native project authors
4+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
5+
*/
6+
7+
/* References
8+
* ==========
9+
*
10+
* - [REF_AVX2]
11+
* CRYSTALS-Dilithium optimized AVX2 implementation
12+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
13+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
14+
*/
15+
16+
/*
17+
* This file is derived from the public domain
18+
* AVX2 Dilithium implementation @[REF_AVX2].
19+
*/
20+
21+
22+
/*
23+
* WARNING: This file is auto-derived from the mldsa-native source file
24+
* dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly.
25+
*/
26+
27+
#if defined(__ELF__)
28+
.section .note.GNU-stack,"",@progbits
29+
#endif
30+
31+
.text
32+
.balign 4
33+
#ifdef __APPLE__
34+
.global _PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2
35+
_PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2:
36+
#else
37+
.global PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2
38+
PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2:
39+
#endif
40+
41+
.cfi_startproc
42+
endbr64
43+
vmovdqa (%rdi), %ymm4
44+
vmovdqa 0x20(%rdi), %ymm5
45+
vmovdqa 0x40(%rdi), %ymm6
46+
vmovdqa 0x60(%rdi), %ymm7
47+
vmovdqa 0x80(%rdi), %ymm8
48+
vmovdqa 0xa0(%rdi), %ymm9
49+
vmovdqa 0xc0(%rdi), %ymm10
50+
vmovdqa 0xe0(%rdi), %ymm11
51+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
52+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
53+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
54+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
55+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
56+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
57+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
58+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
59+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
60+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
61+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
62+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
63+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
64+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
65+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
66+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
67+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
68+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
69+
vpsrlq $0x20, %ymm7, %ymm7
70+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
71+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
72+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
73+
vpsrlq $0x20, %ymm5, %ymm5
74+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
75+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
76+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
77+
vpsrlq $0x20, %ymm3, %ymm3
78+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
79+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
80+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
81+
vpsrlq $0x20, %ymm10, %ymm10
82+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
83+
vmovdqa %ymm9, (%rdi)
84+
vmovdqa %ymm8, 0x20(%rdi)
85+
vmovdqa %ymm7, 0x40(%rdi)
86+
vmovdqa %ymm6, 0x60(%rdi)
87+
vmovdqa %ymm5, 0x80(%rdi)
88+
vmovdqa %ymm4, 0xa0(%rdi)
89+
vmovdqa %ymm3, 0xc0(%rdi)
90+
vmovdqa %ymm11, 0xe0(%rdi)
91+
vmovdqa 0x100(%rdi), %ymm4
92+
vmovdqa 0x120(%rdi), %ymm5
93+
vmovdqa 0x140(%rdi), %ymm6
94+
vmovdqa 0x160(%rdi), %ymm7
95+
vmovdqa 0x180(%rdi), %ymm8
96+
vmovdqa 0x1a0(%rdi), %ymm9
97+
vmovdqa 0x1c0(%rdi), %ymm10
98+
vmovdqa 0x1e0(%rdi), %ymm11
99+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
100+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
101+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
102+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
103+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
104+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
105+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
106+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
107+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
108+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
109+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
110+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
111+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
112+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
113+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
114+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
115+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
116+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
117+
vpsrlq $0x20, %ymm7, %ymm7
118+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
119+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
120+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
121+
vpsrlq $0x20, %ymm5, %ymm5
122+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
123+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
124+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
125+
vpsrlq $0x20, %ymm3, %ymm3
126+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
127+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
128+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
129+
vpsrlq $0x20, %ymm10, %ymm10
130+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
131+
vmovdqa %ymm9, 0x100(%rdi)
132+
vmovdqa %ymm8, 0x120(%rdi)
133+
vmovdqa %ymm7, 0x140(%rdi)
134+
vmovdqa %ymm6, 0x160(%rdi)
135+
vmovdqa %ymm5, 0x180(%rdi)
136+
vmovdqa %ymm4, 0x1a0(%rdi)
137+
vmovdqa %ymm3, 0x1c0(%rdi)
138+
vmovdqa %ymm11, 0x1e0(%rdi)
139+
vmovdqa 0x200(%rdi), %ymm4
140+
vmovdqa 0x220(%rdi), %ymm5
141+
vmovdqa 0x240(%rdi), %ymm6
142+
vmovdqa 0x260(%rdi), %ymm7
143+
vmovdqa 0x280(%rdi), %ymm8
144+
vmovdqa 0x2a0(%rdi), %ymm9
145+
vmovdqa 0x2c0(%rdi), %ymm10
146+
vmovdqa 0x2e0(%rdi), %ymm11
147+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
148+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
149+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
150+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
151+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
152+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
153+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
154+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
155+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
156+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
157+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
158+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
159+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
160+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
161+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
162+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
163+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
164+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
165+
vpsrlq $0x20, %ymm7, %ymm7
166+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
167+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
168+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
169+
vpsrlq $0x20, %ymm5, %ymm5
170+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
171+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
172+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
173+
vpsrlq $0x20, %ymm3, %ymm3
174+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
175+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
176+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
177+
vpsrlq $0x20, %ymm10, %ymm10
178+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
179+
vmovdqa %ymm9, 0x200(%rdi)
180+
vmovdqa %ymm8, 0x220(%rdi)
181+
vmovdqa %ymm7, 0x240(%rdi)
182+
vmovdqa %ymm6, 0x260(%rdi)
183+
vmovdqa %ymm5, 0x280(%rdi)
184+
vmovdqa %ymm4, 0x2a0(%rdi)
185+
vmovdqa %ymm3, 0x2c0(%rdi)
186+
vmovdqa %ymm11, 0x2e0(%rdi)
187+
vmovdqa 0x300(%rdi), %ymm4
188+
vmovdqa 0x320(%rdi), %ymm5
189+
vmovdqa 0x340(%rdi), %ymm6
190+
vmovdqa 0x360(%rdi), %ymm7
191+
vmovdqa 0x380(%rdi), %ymm8
192+
vmovdqa 0x3a0(%rdi), %ymm9
193+
vmovdqa 0x3c0(%rdi), %ymm10
194+
vmovdqa 0x3e0(%rdi), %ymm11
195+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
196+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
197+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
198+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
199+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
200+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
201+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
202+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
203+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
204+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
205+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
206+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
207+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
208+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
209+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
210+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
211+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
212+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
213+
vpsrlq $0x20, %ymm7, %ymm7
214+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
215+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
216+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
217+
vpsrlq $0x20, %ymm5, %ymm5
218+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
219+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
220+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
221+
vpsrlq $0x20, %ymm3, %ymm3
222+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
223+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
224+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
225+
vpsrlq $0x20, %ymm10, %ymm10
226+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
227+
vmovdqa %ymm9, 0x300(%rdi)
228+
vmovdqa %ymm8, 0x320(%rdi)
229+
vmovdqa %ymm7, 0x340(%rdi)
230+
vmovdqa %ymm6, 0x360(%rdi)
231+
vmovdqa %ymm5, 0x380(%rdi)
232+
vmovdqa %ymm4, 0x3a0(%rdi)
233+
vmovdqa %ymm3, 0x3c0(%rdi)
234+
vmovdqa %ymm11, 0x3e0(%rdi)
235+
retq
236+
.cfi_endproc

proofs/hol_light/x86_64/proofs/dump_bytecode.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ needs "x86/proofs/base.ml";;
88
print_string "=== bytecode start: x86_64/mldsa/mldsa_ntt.o ================\n";;
99
print_literal_from_elf "x86_64/mldsa/mldsa_ntt.o";;
1010
print_string "==== bytecode end =====================================\n\n";;
11+
12+
print_string "=== bytecode start: x86_64/mldsa/mldsa_nttunpack.o ================\n";;
13+
print_literal_from_elf "x86_64/mldsa/mldsa_nttunpack.o";;
14+
print_string "==== bytecode end =====================================\n\n";;

0 commit comments

Comments
 (0)