|
2 | 2 | // AVX/AVX2 instructions are db-encoded for broad assembler compatibility. |
3 | 3 | // Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1). |
4 | 4 | // Each pointer addresses 128 QWords (1024 bytes). |
5 | | -// Uses ymm0-ymm9. Non-volatile ymm6-ymm9 saved/restored on Windows. |
| 5 | +// Uses ymm0-ymm9; xmm6-xmm9 (the low halves of ymm6-ymm9) are MS x64 non-volatile, so ymm6-ymm9 are saved/restored in full. |
6 | 6 | // Register map during G rounds: ymm0 = A(v0..v3), ymm1 = B(v4..v7), |
7 | 7 | // ymm2 = C(v8..v11), ymm3 = D(v12..v15), ymm4-ymm5 = temps. |
8 | 8 | // Stack layout (sub rsp, 2184): |
9 | | -// [rsp+0..127] ymm6-9 save area (Windows only, 4 * 32 = 128 bytes) |
| 9 | +// [rsp+0..127] ymm6-9 save area (4 * 32 = 128 bytes) |
10 | 10 | // [rsp+128..1151] R_buf (1024 bytes) |
11 | 11 | // [rsp+1152..2175] Z_buf (1024 bytes) |
12 | 12 | // [rsp+2176..2183] alignment padding |
|
17 | 17 |
|
18 | 18 | sub rsp, 2184 |
19 | 19 |
|
20 | | -{$IFDEF MSWINDOWS} |
21 | 20 | db $C5, $FE, $7F, $34, $24 // vmovdqu yword [rsp], ymm6 |
22 | 21 | db $C5, $FE, $7F, $7C, $24, $20 // vmovdqu yword [rsp + $20], ymm7 |
23 | 22 | db $C5, $7E, $7F, $44, $24, $40 // vmovdqu yword [rsp + $40], ymm8 |
24 | 23 | db $C5, $7E, $7F, $4C, $24, $60 // vmovdqu yword [rsp + $60], ymm9 |
25 | | -{$ENDIF} |
26 | 24 |
|
27 | 25 | // ========================================================================= |
28 | 26 | // Step 1: Compute R_buf = Left XOR Right, store at [rsp+128] |
|
328 | 326 | jb @final_xor_loop |
329 | 327 |
|
330 | 328 | @epilogue: |
331 | | -{$IFDEF MSWINDOWS} |
332 | 329 | db $C5, $FE, $6F, $34, $24 // vmovdqu ymm6, yword [rsp] |
333 | 330 | db $C5, $FE, $6F, $7C, $24, $20 // vmovdqu ymm7, yword [rsp + $20] |
334 | 331 | db $C5, $7E, $6F, $44, $24, $40 // vmovdqu ymm8, yword [rsp + $40] |
335 | 332 | db $C5, $7E, $6F, $4C, $24, $60 // vmovdqu ymm9, yword [rsp + $60] |
336 | | -{$ENDIF} |
337 | 333 |
|
338 | 334 | add rsp, 2184 |
339 | 335 | db $C5, $F8, $77 // vzeroupper |
0 commit comments