|
20 | 20 |
|
/*
 * WARNING: This file is auto-derived from the mldsa-native source file
 *   dev/x86_64/src/rej_uniform_avx2_asm.S using scripts/simpasm. Do not modify it directly.
 */

/*
 * rej_uniform_avx2_asm -- rejection sampling of 23-bit values (AVX2).
 *
 * Syntax: GAS / AT&T. Uses AVX2 and POPCNT instructions.
 *
 * Reads little-endian 3-byte groups from the input buffer, masks each group
 * to 23 bits, and appends every value that is below 0x7FE001 to the output
 * array as a 32-bit word, preserving input order.
 * NOTE(review): 0x7FE001 = 8380417 is presumably the ML-DSA modulus q.
 *
 * In:    %rdi  output array of 32-bit words. The vector path always stores
 *              8 words at the current write position, so room for 256
 *              words is required.
 *        %rsi  input byte buffer. Offsets 0..839 are read (32-byte vector
 *              loads at offsets <= 0x328, 3-byte scalar reads at offsets
 *              <= 0x345).
 *        %rdx  table of 8-byte entries indexed by the 8-bit accept mask;
 *              the bytes of an entry are used as vpermd lane indices.
 * Out:   %eax  number of words accepted (at most 256).
 * Clobb: %rcx, %r8, %r9, %r10, %ymm0-%ymm4, flags.
 *
 * NOTE(review): the argument registers above are the System V AMD64 ones;
 * confirm how Windows callers reach this symbol.
 */

.text
.balign 4
#ifdef __APPLE__
.global _PQCP_MLDSA_NATIVE_MLDSA44_rej_uniform_avx2_asm
_PQCP_MLDSA_NATIVE_MLDSA44_rej_uniform_avx2_asm:
#else
.global PQCP_MLDSA_NATIVE_MLDSA44_rej_uniform_avx2_asm
PQCP_MLDSA_NATIVE_MLDSA44_rej_uniform_avx2_asm:
#endif

        .cfi_startproc
        endbr64

        /*
         * ymm0 = vpshufb control. Within each 128-bit lane it spreads four
         * consecutive 3-byte groups into four dwords; index 0xff zeroes the
         * top byte of every dword.
         *   low lane : source bytes 0..11  of the lane
         *   high lane: source bytes 4..15  of the lane
         */
        movabsq $0xff050403ff020100, %r10
        vmovq   %r10, %xmm0
        movabsq $0xff0b0a09ff080706, %r10
        vpinsrq $0x1, %r10, %xmm0, %xmm0
        movabsq $0xff090807ff060504, %r10
        vmovq   %r10, %xmm3
        movabsq $0xff0f0e0dff0c0b0a, %r10
        vpinsrq $0x1, %r10, %xmm3, %xmm3
        vinserti128 $0x1, %xmm3, %ymm0, %ymm0

        /* ymm1 = 8 x 0x7FFFFF (23-bit mask) */
        movl    $0x7fffff, %r8d
        vmovd   %r8d, %xmm1
        vpbroadcastd %xmm1, %ymm1

        /* ymm2 = 8 x 0x7FE001 (exclusive upper bound for accepted values) */
        movl    $0x7fe001, %r8d
        vmovd   %r8d, %xmm2
        vpbroadcastd %xmm2, %ymm2

        xorl    %eax, %eax                      /* eax = words written so far */
        xorl    %ecx, %ecx                      /* ecx = input byte offset    */

/*
 * Vector path: consume 24 input bytes (8 candidates) per iteration.
 * Runs only while a full 8-word store fits (eax <= 248) and a 32-byte
 * load stays inside the first 840 input bytes (ecx <= 808).
 */
Lrej_uniform_avx2_asm_loop:
        cmpl    $0xf8, %eax
        ja      Lrej_uniform_avx2_asm_scalar
        cmpl    $0x328, %ecx
        ja      Lrej_uniform_avx2_asm_scalar
        vmovdqu (%rsi,%rcx), %ymm3              /* 32 bytes loaded, 24 used */
        addl    $0x18, %ecx
        vpermq  $0x94, %ymm3, %ymm3             /* qwords [0,1,1,2]: high lane = bytes 8..23 */
        vpshufb %ymm0, %ymm3, %ymm3             /* 8 x 24-bit value, zero-extended */
        vpand   %ymm1, %ymm3, %ymm3             /* keep the low 23 bits */
        vpsubd  %ymm2, %ymm3, %ymm4             /* negative <=> value < bound */
        vmovmskps %ymm4, %r8d                   /* r8d = 8-bit accept mask (sign bits) */
        popcntl %r8d, %r9d                      /* r9d = number of accepted lanes */
        vmovq   (%rdx,%r8,8), %xmm4             /* table entry selected by the mask */
        vpmovzxbd %xmm4, %ymm4                  /* 8 index bytes -> 8 dword indices */
        vpermd  %ymm3, %ymm4, %ymm3             /* reorder candidates by table indices */
        vmovdqu %ymm3, (%rdi,%rax,4)            /* store 8 words; only r9d of them are counted */
        addl    %r9d, %eax
        jmp     Lrej_uniform_avx2_asm_loop

/*
 * Scalar tail: one 3-byte candidate per iteration, until 256 words have
 * been written or the input offset passes 0x345 (last 3-byte read ends at
 * byte 839).
 */
Lrej_uniform_avx2_asm_scalar:
        cmpl    $0x100, %eax
        jae     Lrej_uniform_avx2_asm_done
        cmpl    $0x345, %ecx
        ja      Lrej_uniform_avx2_asm_done
        movzwl  (%rsi,%rcx), %r8d               /* bytes 0..1 of the group */
        movzbl  0x2(%rsi,%rcx), %r9d            /* byte 2 of the group */
        shll    $0x10, %r9d
        orl     %r9d, %r8d
        andl    $0x7fffff, %r8d                 /* keep the low 23 bits */
        addl    $0x3, %ecx                      /* input advances even on rejection */
        cmpl    $0x7fe001, %r8d
        jae     Lrej_uniform_avx2_asm_scalar    /* reject: value >= bound */
        movl    %r8d, (%rdi,%rax,4)
        addl    $0x1, %eax
        jmp     Lrej_uniform_avx2_asm_scalar

/*
 * Epilogue. vzeroupper clears the upper halves of the YMM registers so the
 * caller does not pay AVX/SSE transition penalties after this routine has
 * used ymm0-ymm4.
 * NOTE(review): this file is generated; the same instruction must be present
 * in the upstream source so that regeneration keeps it.
 */
Lrej_uniform_avx2_asm_done:
        vzeroupper
        retq
        .cfi_endproc

#if defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
0 commit comments