Skip to content

Commit 325b08c

Browse files
committed
simd cleanup attempt
1 parent 6f9d091 commit 325b08c

37 files changed

Lines changed: 98 additions & 271 deletions

HashLib/src/Include/Simd/Adler32/Adler32BlocksAvx2_x86_64.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// ASums layout: [SumA: UInt32, SumB: UInt32].
44
// Constants layout: [weights: 32B, ones_16: 32B] at offsets 0 and 32.
55
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
6-
// Uses ymm0-ymm5 only (all volatile on Windows x64, no saves needed).
6+
// Uses ymm0-ymm5 only (volatile under both the MS x64 and System V AMD64 ABIs; no saves needed).
77
// Weights and ones are reloaded from memory each iteration to avoid
88
// using non-volatile ymm registers.
99
// AVX/AVX2 instructions are db-encoded for broad assembler compatibility.

HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_i386.inc

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,17 @@
44
// Processes num_blocks x 32-byte blocks; caller applies mod 65521.
55
//
66
// x64 uses xmm6-xmm9 for widened weights; IA-32 only has xmm0-xmm7, so widened weights
7-
// live on stack (4 x 16 bytes). xmm6–xmm7 are non-volatile on Win32 and are saved there.
7+
// live on stack (4 x 16 bytes). xmm6-xmm7 saved/restored defensively (volatile on i386).
88
//
99
// Same SSE2 emulation as Adler32BlocksSse2_x86_64.inc (punpcklbw/hbw + pmaddwd).
10+
//
11+
// Stack layout (sub esp, 96):
12+
// [esp + 0..15]: xmm6 save
13+
// [esp + 16..31]: xmm7 save
14+
// [esp + 32..47]: w0 (weights_hi low)
15+
// [esp + 48..63]: w1 (weights_hi high)
16+
// [esp + 64..79]: w2 (weights_lo low)
17+
// [esp + 80..95]: w3 (weights_lo high)
1018

1119
// Preserve constants pointer (eax) before GPR reloads from ASums
1220
push eax
@@ -23,7 +31,6 @@
2331
pxor xmm0, xmm0 // v_s1 = 0
2432
pxor xmm3, xmm3 // zero for unpack / psadbw
2533

26-
{$IFDEF MSWINDOWS}
2734
sub esp, 96
2835
movdqu oword ptr [esp], xmm6
2936
movdqu oword ptr [esp + $10], xmm7
@@ -43,24 +50,6 @@
4350
movdqa xmm6, xmm5
4451
punpckhbw xmm6, xmm3
4552
movdqu oword ptr [esp + $50], xmm6 // w3
46-
{$ELSE}
47-
sub esp, 64
48-
mov edx, dword ptr [esp + 64]
49-
movdqu xmm4, oword ptr [edx]
50-
movdqa xmm6, xmm4
51-
punpcklbw xmm6, xmm3
52-
movdqu oword ptr [esp], xmm6
53-
movdqa xmm6, xmm4
54-
punpckhbw xmm6, xmm3
55-
movdqu oword ptr [esp + $10], xmm6
56-
movdqu xmm5, oword ptr [edx + 16]
57-
movdqa xmm6, xmm5
58-
punpcklbw xmm6, xmm3
59-
movdqu oword ptr [esp + $20], xmm6
60-
movdqa xmm6, xmm5
61-
punpckhbw xmm6, xmm3
62-
movdqu oword ptr [esp + $30], xmm6
63-
{$ENDIF}
6453

6554
@adler32_sse2_loop:
6655
paddd xmm2, xmm0
@@ -74,13 +63,8 @@
7463
movdqa xmm5, xmm4
7564
punpcklbw xmm5, xmm3
7665
punpckhbw xmm4, xmm3
77-
{$IFDEF MSWINDOWS}
7866
movdqu xmm6, oword ptr [esp + $20]
7967
movdqu xmm7, oword ptr [esp + $30]
80-
{$ELSE}
81-
movdqu xmm6, oword ptr [esp]
82-
movdqu xmm7, oword ptr [esp + $10]
83-
{$ENDIF}
8468
pmaddwd xmm5, xmm6
8569
pmaddwd xmm4, xmm7
8670
paddd xmm5, xmm4
@@ -95,13 +79,8 @@
9579
movdqa xmm5, xmm4
9680
punpcklbw xmm5, xmm3
9781
punpckhbw xmm4, xmm3
98-
{$IFDEF MSWINDOWS}
9982
movdqu xmm6, oword ptr [esp + $40]
10083
movdqu xmm7, oword ptr [esp + $50]
101-
{$ELSE}
102-
movdqu xmm6, oword ptr [esp + $20]
103-
movdqu xmm7, oword ptr [esp + $30]
104-
{$ENDIF}
10584
pmaddwd xmm5, xmm6
10685
pmaddwd xmm4, xmm7
10786
paddd xmm5, xmm4
@@ -130,13 +109,9 @@
130109
mov dword ptr [edi], eax
131110
mov dword ptr [edi + 4], esi
132111

133-
{$IFDEF MSWINDOWS}
134112
movdqu xmm6, oword ptr [esp]
135113
movdqu xmm7, oword ptr [esp + $10]
136114
add esp, 96
137-
{$ELSE}
138-
add esp, 64
139-
{$ENDIF}
140115
add esp, 4 // discard saved AConstants
141116
pop edi
142117
pop esi

HashLib/src/Include/Simd/Adler32/Adler32BlocksSse2_x86_64.inc

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,18 @@
33
// ASums layout: [SumA: UInt32, SumB: UInt32].
44
// Constants layout: [weights_hi: 16B, weights_lo: 16B] (only first 32 bytes used).
55
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
6-
// Uses xmm0-xmm9 (xmm6-xmm9 saved/restored on Windows).
6+
// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored).
77
//
88
// Emulates SSSE3 pmaddubsw via punpcklbw/punpckhbw + pmaddwd:
99
// data bytes are zero-extended to i16, then multiplied with pre-widened
1010
// weight bytes via pmaddwd (SSE2), producing the same 4 x i32 weighted
1111
// sums per 16-byte half that pmaddubsw + pmaddwd would yield.
1212

13-
{$IFDEF MSWINDOWS}
1413
sub rsp, 64
1514
movdqu oword [rsp], xmm6
1615
movdqu oword [rsp + $10], xmm7
1716
movdqu oword [rsp + $20], xmm8
1817
movdqu oword [rsp + $30], xmm9
19-
{$ENDIF}
2018

2119
// Zero constant
2220
pxor xmm3, xmm3
@@ -104,10 +102,8 @@
104102
mov dword [r8], eax
105103
mov dword [r8 + 4], r10d
106104

107-
{$IFDEF MSWINDOWS}
108105
movdqu xmm6, oword [rsp]
109106
movdqu xmm7, oword [rsp + $10]
110107
movdqu xmm8, oword [rsp + $20]
111108
movdqu xmm9, oword [rsp + $30]
112109
add rsp, 64
113-
{$ENDIF}

HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_i386.inc

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
// Constants: [weights_hi: 16B, weights_lo: 16B, ones_16: 16B] (48 bytes; same as x64 SSSE3).
44
// No xmm8 on IA-32: psadbw uses a copy in xmm5 (first half) or xmm4 (second half), then reload weights.
55
// Caller applies mod 65521.
6+
//
7+
// xmm6-xmm7 saved/restored defensively (volatile on i386).
68

79
push eax
810

@@ -17,14 +19,10 @@
1719
pxor xmm0, xmm0
1820
pxor xmm3, xmm3
1921

20-
{$IFDEF MSWINDOWS}
2122
sub esp, 32
2223
movdqu oword ptr [esp], xmm6
2324
movdqu oword ptr [esp + $10], xmm7
2425
mov edx, dword ptr [esp + 32]
25-
{$ELSE}
26-
mov edx, dword ptr [esp]
27-
{$ENDIF}
2826

2927
movdqu xmm4, oword ptr [edx]
3028
movdqu xmm5, oword ptr [edx + 16]
@@ -76,11 +74,9 @@
7674
mov dword ptr [edi], eax
7775
mov dword ptr [edi + 4], esi
7876

79-
{$IFDEF MSWINDOWS}
8077
movdqu xmm6, oword ptr [esp]
8178
movdqu xmm7, oword ptr [esp + $10]
8279
add esp, 32
83-
{$ENDIF}
8480
add esp, 4
8581
pop edi
8682
pop esi

HashLib/src/Include/Simd/Adler32/Adler32BlocksSsse3_x86_64.inc

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
// ASums layout: [SumA: UInt32, SumB: UInt32].
44
// Constants layout: [weights_hi: 16B, weights_lo: 16B, ones_16: 16B].
55
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
6-
// Uses xmm0-xmm8 (xmm6-xmm8 saved/restored on Windows).
6+
// Uses xmm0-xmm8; xmm6-xmm8 are MS x64 non-volatile (saved/restored).
77

8-
{$IFDEF MSWINDOWS}
98
sub rsp, 48
109
movdqu oword [rsp], xmm6
1110
movdqu oword [rsp + $10], xmm7
1211
movdqu oword [rsp + $20], xmm8
13-
{$ENDIF}
1412

1513
// Load constants
1614
movdqu xmm4, oword [r9]
@@ -81,9 +79,7 @@
8179
mov dword [r8], eax
8280
mov dword [r8 + 4], r10d
8381

84-
{$IFDEF MSWINDOWS}
8582
movdqu xmm6, oword [rsp]
8683
movdqu xmm7, oword [rsp + $10]
8784
movdqu xmm8, oword [rsp + $20]
8885
add rsp, 48
89-
{$ENDIF}

HashLib/src/Include/Simd/Argon2/Argon2FillBlockAvx2_x86_64.inc

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
// AVX/AVX2 instructions are db-encoded for broad assembler compatibility.
33
// Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1).
44
// Each pointer addresses 128 QWords (1024 bytes).
5-
// Uses ymm0-ymm9. Non-volatile ymm6-ymm9 saved/restored on Windows.
5+
// Uses ymm0-ymm9; ymm6-ymm9 are MS x64 non-volatile (saved/restored).
66
// Register map during G rounds: ymm0 = A(v0..v3), ymm1 = B(v4..v7),
77
// ymm2 = C(v8..v11), ymm3 = D(v12..v15), ymm4-ymm5 = temps.
88
// Stack layout (sub rsp, 2184):
9-
// [rsp+0..127] ymm6-9 save area (Windows only, 4 * 32 = 128 bytes)
9+
// [rsp+0..127] ymm6-9 save area (4 * 32 = 128 bytes)
1010
// [rsp+128..1151] R_buf (1024 bytes)
1111
// [rsp+1152..2175] Z_buf (1024 bytes)
1212
// [rsp+2176..2183] alignment padding
@@ -17,12 +17,10 @@
1717

1818
sub rsp, 2184
1919

20-
{$IFDEF MSWINDOWS}
2120
db $C5, $FE, $7F, $34, $24 // vmovdqu yword [rsp], ymm6
2221
db $C5, $FE, $7F, $7C, $24, $20 // vmovdqu yword [rsp + $20], ymm7
2322
db $C5, $7E, $7F, $44, $24, $40 // vmovdqu yword [rsp + $40], ymm8
2423
db $C5, $7E, $7F, $4C, $24, $60 // vmovdqu yword [rsp + $60], ymm9
25-
{$ENDIF}
2624

2725
// =========================================================================
2826
// Step 1: Compute R_buf = Left XOR Right, store at [rsp+128]
@@ -328,12 +326,10 @@
328326
jb @final_xor_loop
329327

330328
@epilogue:
331-
{$IFDEF MSWINDOWS}
332329
db $C5, $FE, $6F, $34, $24 // vmovdqu ymm6, yword [rsp]
333330
db $C5, $FE, $6F, $7C, $24, $20 // vmovdqu ymm7, yword [rsp + $20]
334331
db $C5, $7E, $6F, $44, $24, $40 // vmovdqu ymm8, yword [rsp + $40]
335332
db $C5, $7E, $6F, $4C, $24, $60 // vmovdqu ymm9, yword [rsp + $60]
336-
{$ENDIF}
337333

338334
add rsp, 2184
339335
db $C5, $F8, $77 // vzeroupper

HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_i386.inc

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// IA-32: after SimdProc4Begin_i386 — ebx, esi, edi, eax = Left, Right, Current, WithXor
33
// (parallel to MS x64 ABI: rcx, rdx, r8, r9).
44
// Each pointer addresses 128 QWords (1024 bytes).
5-
// Uses xmm0–xmm7 only. Non-volatile xmm6–xmm7 saved/restored on Windows (MSWINDOWS).
5+
// Uses xmm0-xmm7; xmm6-xmm7 saved/restored defensively (volatile on i386).
66
// Register map during G rounds: xmm0-1 = A(0..3), xmm2-3 = B(4..7),
77
// xmm4-5 = C(8..11), xmm6-7 = D(12..15) / temps (same roles as x64, fewer XMM).
88
// IA-32 stack (sub esp, 2132): WithXor at [esp+2128]; spill slots [esp+2080],[esp+2096],[esp+2112];
@@ -15,10 +15,8 @@
1515

1616
mov dword ptr [esp + 2128], eax
1717

18-
{$IFDEF MSWINDOWS}
1918
movdqu oword ptr [esp], xmm6
2019
movdqu oword ptr [esp + 16], xmm7
21-
{$ENDIF}
2220

2321
// =========================================================================
2422
// Step 1: Compute R_buf = Left XOR Right, store at [esp+32]
@@ -729,10 +727,8 @@
729727
jb @final_xor_loop
730728

731729
@epilogue:
732-
{$IFDEF MSWINDOWS}
733730
movdqu xmm6, oword ptr [esp]
734731
movdqu xmm7, oword ptr [esp + 16]
735-
{$ENDIF}
736732

737733
add esp, 2132
738734
pop edi

HashLib/src/Include/Simd/Argon2/Argon2FillBlockSse2_x86_64.inc

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
// SSE2 implementation of Argon2 FillBlock.
22
// Expects MS x64 ABI: rcx = Left ptr, rdx = Right ptr, r8 = Current ptr, r9 = WithXor (0 or 1).
33
// Each pointer addresses 128 QWords (1024 bytes).
4-
// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows.
4+
// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored).
55
// Register map during G rounds: xmm0-1 = A(0..3), xmm2-3 = B(4..7),
66
// xmm4-5 = C(8..11), xmm6-7 = D(12..15), xmm8-9 = temps.
77
// Stack layout (sub rsp, 2120):
8-
// [rsp+0..63] xmm6-9 save area (Windows only)
8+
// [rsp+0..63] xmm6-9 save area
99
// [rsp+64..1087] R_buf (1024 bytes)
1010
// [rsp+1088..2111] Z_buf (1024 bytes)
1111
// [rsp+2112..2119] alignment padding
@@ -18,12 +18,10 @@
1818

1919
sub rsp, 2120
2020

21-
{$IFDEF MSWINDOWS}
2221
movdqu oword [rsp], xmm6
2322
movdqu oword [rsp + $10], xmm7
2423
movdqu oword [rsp + $20], xmm8
2524
movdqu oword [rsp + $30], xmm9
26-
{$ENDIF}
2725

2826
// =========================================================================
2927
// Step 1: Compute R_buf = Left XOR Right, store at [rsp+64]
@@ -558,11 +556,9 @@
558556
jb @final_xor_loop
559557

560558
@epilogue:
561-
{$IFDEF MSWINDOWS}
562559
movdqu xmm6, oword [rsp]
563560
movdqu xmm7, oword [rsp + $10]
564561
movdqu xmm8, oword [rsp + $20]
565562
movdqu xmm9, oword [rsp + $30]
566-
{$ENDIF}
567563

568564
add rsp, 2120

HashLib/src/Include/Simd/Blake2B/Blake2BCompressAvx2_x86_64.inc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// AVX2 implementation of BLAKE2b compress (fully unrolled 12 rounds).
22
// Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr.
3-
// Uses ymm0-ymm5 only (all volatile on Windows x64).
3+
// Uses ymm0-ymm5 only (volatile under both the MS x64 and System V AMD64 ABIs; no saves needed).
44
// Register map: ymm0 = a (v0-3), ymm1 = b (v4-7), ymm2 = c (v8-11), ymm3 = d (v12-15),
55
// ymm4 = message temp, ymm5 = computation temp.
66
// Rotations: ROT32 via vpshufd, ROT16/24/63 via shift+or.

HashLib/src/Include/Simd/Blake2B/Blake2BCompressSse2_x86_64.inc

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,15 @@
11
// SSE2 implementation of BLAKE2b compress (fully unrolled 12 rounds).
22
// Expects MS x64 ABI: rcx = state ptr, rdx = msg ptr, r8 = counter+flags ptr, r9 = IV ptr.
3-
// Uses xmm0-xmm9. Non-volatile xmm6-xmm9 saved/restored on Windows.
3+
// Uses xmm0-xmm9; xmm6-xmm9 are MS x64 non-volatile (saved/restored).
44
// Register map: xmm0-1 = row1 (v0-3), xmm2-3 = row2 (v4-7),
55
// xmm4-5 = row3 (v8-11), xmm6-7 = row4 (v12-15), xmm8-9 = temps.
66
// Reference: BLAKE2/BLAKE2 sse/ by Samuel Neves.
77

8-
{$IFDEF MSWINDOWS}
98
sub rsp, 64
109
movdqu oword [rsp], xmm6
1110
movdqu oword [rsp + $10], xmm7
1211
movdqu oword [rsp + $20], xmm8
1312
movdqu oword [rsp + $30], xmm9
14-
{$ENDIF}
1513

1614
// Initialize working vector
1715
movdqu xmm0, oword [rcx]
@@ -1995,10 +1993,8 @@
19951993
movdqu oword [rcx + $20], xmm2
19961994
movdqu oword [rcx + $30], xmm3
19971995

1998-
{$IFDEF MSWINDOWS}
19991996
movdqu xmm6, oword [rsp]
20001997
movdqu xmm7, oword [rsp + $10]
20011998
movdqu xmm8, oword [rsp + $20]
20021999
movdqu xmm9, oword [rsp + $30]
20032000
add rsp, 64
2004-
{$ENDIF}

0 commit comments

Comments
 (0)