Skip to content

Commit 1f57321

Browse files
committed
HOL-Light: Add x86 AVX2 nttunpack proof
Signed-off-by: Jake Massimo <jakemas@amazon.com>
1 parent e6a137b commit 1f57321

File tree

8 files changed

+789
-1
lines changed

8 files changed

+789
-1
lines changed

.github/workflows/hol_light.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ jobs:
163163
# Dependencies on {name}.{S,ml} are implicit
164164
- name: mldsa_ntt
165165
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml"]
166+
- name: mldsa_nttunpack
167+
needs: ["mldsa_specs.ml", "mldsa_utils.ml"]
166168
name: x86_64 HOL Light proof for ${{ matrix.proof.name }}.S
167169
runs-on: pqcp-x64
168170
if: github.repository_owner == 'pq-code-package' && !github.event.pull_request.head.repo.fork

proofs/hol_light/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,4 @@ echo '1+1;;' | nc -w 5 127.0.0.1 2012
5353

5454
- AArch64 poly_caddq: [mldsa_poly_caddq.S](aarch64/mldsa/mldsa_poly_caddq.S)
5555
- x86_64 forward NTT: [mldsa_ntt.S](x86_64/mldsa/mldsa_ntt.S)
56+
- x86_64 NTT unpack: [mldsa_nttunpack.S](x86_64/mldsa/mldsa_nttunpack.S)

proofs/hol_light/x86_64/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ endif
5050

5151
SPLIT=tr ';' '\n'
5252

53-
OBJ = mldsa/mldsa_ntt.o
53+
OBJ = mldsa/mldsa_ntt.o \
54+
mldsa/mldsa_nttunpack.o
5455

5556
# Build object files from assembly sources
5657
$(OBJ): %.o : %.S

proofs/hol_light/x86_64/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,5 +104,6 @@ Currently this includes:
104104

105105
- ML-DSA Arithmetic:
106106
* x86_64 forward NTT: [mldsa_ntt.S](mldsa/mldsa_ntt.S)
107+
* x86_64 NTT unpack: [mldsa_nttunpack.S](mldsa/mldsa_nttunpack.S)
107108

108109
The NTT function is optimized using AVX2 instructions and follows the s2n-bignum x86_64 assembly verification patterns.
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
/*
2+
* Copyright (c) The mlkem-native project authors
3+
* Copyright (c) The mldsa-native project authors
4+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
5+
*/
6+
7+
/* References
8+
* ==========
9+
*
10+
* - [REF_AVX2]
11+
* CRYSTALS-Dilithium optimized AVX2 implementation
12+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
13+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
14+
*/
15+
16+
/*
17+
* This file is derived from the public domain
18+
* AVX2 Dilithium implementation @[REF_AVX2].
19+
*/
20+
21+
22+
23+
/*
24+
* WARNING: This file is auto-derived from the mldsa-native source file
25+
* dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly.
26+
*/
27+
28+
#if defined(__ELF__)
29+
.section .note.GNU-stack,"",@progbits
30+
#endif
31+
32+
.text
.balign 4
34+
#ifdef __APPLE__
35+
.global _PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2
36+
_PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2:
37+
#else
38+
.global PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2
39+
PQCP_MLDSA_NATIVE_MLDSA44_nttunpack_avx2:
40+
#endif
41+
42+
/*
 * nttunpack_avx2 -- in-place dword un-interleaving of an NTT buffer.
 *
 * ABI:   SysV AMD64; single argument in %rdi.
 * In:    rdi = base of a 1024-byte buffer of 32-bit elements; every
 *        load/store is vmovdqa, so the buffer must be 32-byte aligned.
 * Out:   buffer permuted in place; no register return value.
 * Clobb: ymm3-ymm11, flags. Leaf function; no stack use.
 *
 * The body is one fixed permutation network applied to four
 * independent 256-byte blocks (base offsets 0x000, 0x100, 0x200,
 * 0x300). Each block loads 8 YMM registers and reorders their 64
 * dwords in three stages:
 *   1. vperm2i128            - exchange 128-bit halves across pairs
 *   2. vpunpcklqdq/hqdq      - interleave 64-bit quadwords
 *   3. vmovsldup + vpsrlq
 *      + vpblendd            - interleave 32-bit dwords
 * then stores the 8 registers back to the same offsets. The exact
 * coefficient-order specification lives in the HOL Light proof /
 * the dev source (dev/x86_64/src/nttunpack.S) -- not visible here.
 *
 * NOTE(review): no vzeroupper before retq; callers running legacy
 * SSE code may pay an AVX->SSE transition penalty -- confirm this
 * matches the project's convention for its AVX2 kernels.
 */
.cfi_startproc
43+
endbr64
44+
# Block 1: dwords at 0x000-0x0ff.
vmovdqa (%rdi), %ymm4
45+
vmovdqa 0x20(%rdi), %ymm5
46+
vmovdqa 0x40(%rdi), %ymm6
47+
vmovdqa 0x60(%rdi), %ymm7
48+
vmovdqa 0x80(%rdi), %ymm8
49+
vmovdqa 0xa0(%rdi), %ymm9
50+
vmovdqa 0xc0(%rdi), %ymm10
51+
vmovdqa 0xe0(%rdi), %ymm11
52+
# Stage 1: exchange 128-bit lanes between ymm4-7 and ymm8-11.
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
53+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
54+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
55+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
56+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
57+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
58+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
59+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
60+
# Stage 2: interleave 64-bit quadwords.
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
61+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
62+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
63+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
64+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
65+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
66+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
67+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
68+
# Stage 3: interleave 32-bit dwords (even-lane dup / shift + blend).
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
69+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
70+
vpsrlq $0x20, %ymm7, %ymm7
71+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
72+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
73+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
74+
vpsrlq $0x20, %ymm5, %ymm5
75+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
76+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
77+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
78+
vpsrlq $0x20, %ymm3, %ymm3
79+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
80+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
81+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
82+
vpsrlq $0x20, %ymm10, %ymm10
83+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
84+
# Store the reordered block back in place.
vmovdqa %ymm9, (%rdi)
85+
vmovdqa %ymm8, 0x20(%rdi)
86+
vmovdqa %ymm7, 0x40(%rdi)
87+
vmovdqa %ymm6, 0x60(%rdi)
88+
vmovdqa %ymm5, 0x80(%rdi)
89+
vmovdqa %ymm4, 0xa0(%rdi)
90+
vmovdqa %ymm3, 0xc0(%rdi)
91+
vmovdqa %ymm11, 0xe0(%rdi)
92+
# Block 2: dwords at 0x100-0x1ff; same permutation as block 1.
vmovdqa 0x100(%rdi), %ymm4
93+
vmovdqa 0x120(%rdi), %ymm5
94+
vmovdqa 0x140(%rdi), %ymm6
95+
vmovdqa 0x160(%rdi), %ymm7
96+
vmovdqa 0x180(%rdi), %ymm8
97+
vmovdqa 0x1a0(%rdi), %ymm9
98+
vmovdqa 0x1c0(%rdi), %ymm10
99+
vmovdqa 0x1e0(%rdi), %ymm11
100+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
101+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
102+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
103+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
104+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
105+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
106+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
107+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
108+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
109+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
110+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
111+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
112+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
113+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
114+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
115+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
116+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
117+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
118+
vpsrlq $0x20, %ymm7, %ymm7
119+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
120+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
121+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
122+
vpsrlq $0x20, %ymm5, %ymm5
123+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
124+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
125+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
126+
vpsrlq $0x20, %ymm3, %ymm3
127+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
128+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
129+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
130+
vpsrlq $0x20, %ymm10, %ymm10
131+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
132+
vmovdqa %ymm9, 0x100(%rdi)
133+
vmovdqa %ymm8, 0x120(%rdi)
134+
vmovdqa %ymm7, 0x140(%rdi)
135+
vmovdqa %ymm6, 0x160(%rdi)
136+
vmovdqa %ymm5, 0x180(%rdi)
137+
vmovdqa %ymm4, 0x1a0(%rdi)
138+
vmovdqa %ymm3, 0x1c0(%rdi)
139+
vmovdqa %ymm11, 0x1e0(%rdi)
140+
# Block 3: dwords at 0x200-0x2ff; same permutation as block 1.
vmovdqa 0x200(%rdi), %ymm4
141+
vmovdqa 0x220(%rdi), %ymm5
142+
vmovdqa 0x240(%rdi), %ymm6
143+
vmovdqa 0x260(%rdi), %ymm7
144+
vmovdqa 0x280(%rdi), %ymm8
145+
vmovdqa 0x2a0(%rdi), %ymm9
146+
vmovdqa 0x2c0(%rdi), %ymm10
147+
vmovdqa 0x2e0(%rdi), %ymm11
148+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
149+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
150+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
151+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
152+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
153+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
154+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
155+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
156+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
157+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
158+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
159+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
160+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
161+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
162+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
163+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
164+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
165+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
166+
vpsrlq $0x20, %ymm7, %ymm7
167+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
168+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
169+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
170+
vpsrlq $0x20, %ymm5, %ymm5
171+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
172+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
173+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
174+
vpsrlq $0x20, %ymm3, %ymm3
175+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
176+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
177+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
178+
vpsrlq $0x20, %ymm10, %ymm10
179+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
180+
vmovdqa %ymm9, 0x200(%rdi)
181+
vmovdqa %ymm8, 0x220(%rdi)
182+
vmovdqa %ymm7, 0x240(%rdi)
183+
vmovdqa %ymm6, 0x260(%rdi)
184+
vmovdqa %ymm5, 0x280(%rdi)
185+
vmovdqa %ymm4, 0x2a0(%rdi)
186+
vmovdqa %ymm3, 0x2c0(%rdi)
187+
vmovdqa %ymm11, 0x2e0(%rdi)
188+
# Block 4: dwords at 0x300-0x3ff; same permutation as block 1.
vmovdqa 0x300(%rdi), %ymm4
189+
vmovdqa 0x320(%rdi), %ymm5
190+
vmovdqa 0x340(%rdi), %ymm6
191+
vmovdqa 0x360(%rdi), %ymm7
192+
vmovdqa 0x380(%rdi), %ymm8
193+
vmovdqa 0x3a0(%rdi), %ymm9
194+
vmovdqa 0x3c0(%rdi), %ymm10
195+
vmovdqa 0x3e0(%rdi), %ymm11
196+
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
197+
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
198+
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
199+
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
200+
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
201+
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
202+
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
203+
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
204+
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
205+
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
206+
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
207+
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
208+
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
209+
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
210+
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
211+
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
212+
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
213+
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
214+
vpsrlq $0x20, %ymm7, %ymm7
215+
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
216+
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
217+
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
218+
vpsrlq $0x20, %ymm5, %ymm5
219+
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
220+
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
221+
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
222+
vpsrlq $0x20, %ymm3, %ymm3
223+
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
224+
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
225+
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
226+
vpsrlq $0x20, %ymm10, %ymm10
227+
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
228+
vmovdqa %ymm9, 0x300(%rdi)
229+
vmovdqa %ymm8, 0x320(%rdi)
230+
vmovdqa %ymm7, 0x340(%rdi)
231+
vmovdqa %ymm6, 0x360(%rdi)
232+
vmovdqa %ymm5, 0x380(%rdi)
233+
vmovdqa %ymm4, 0x3a0(%rdi)
234+
vmovdqa %ymm3, 0x3c0(%rdi)
235+
vmovdqa %ymm11, 0x3e0(%rdi)
236+
retq
237+
.cfi_endproc

proofs/hol_light/x86_64/proofs/dump_bytecode.ml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,7 @@ needs "x86/proofs/base.ml";;
88
print_string "=== bytecode start: x86_64/mldsa/mldsa_ntt.o ================\n";;
99
print_literal_from_elf "x86_64/mldsa/mldsa_ntt.o";;
1010
print_string "==== bytecode end =====================================\n\n";;
11+
12+
print_string "=== bytecode start: x86_64/mldsa/mldsa_nttunpack.o ================\n";;
13+
print_literal_from_elf "x86_64/mldsa/mldsa_nttunpack.o";;
14+
print_string "==== bytecode end =====================================\n\n";;

0 commit comments

Comments
 (0)