Skip to content

Commit db4f740

Browse files
committed
Implement SIMD Support for Scrypt
1 parent b61d92b commit db4f740

10 files changed

Lines changed: 562 additions & 144 deletions

File tree

HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ uses
109109
HlpArgon2Dispatch in '..\..\HashLib\src\KDF\HlpArgon2Dispatch.pas',
110110
HlpArgon2TypeAndVersion in '..\..\HashLib\src\KDF\HlpArgon2TypeAndVersion.pas',
111111
HlpPBKDF_ScryptNotBuildInAdapter in '..\..\HashLib\src\KDF\HlpPBKDF_ScryptNotBuildInAdapter.pas',
112+
HlpScryptDispatch in '..\..\HashLib\src\KDF\HlpScryptDispatch.pas',
112113
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
113114
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
114115
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',

HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ uses
108108
HlpArgon2Dispatch in '..\..\HashLib\src\KDF\HlpArgon2Dispatch.pas',
109109
HlpArgon2TypeAndVersion in '..\..\HashLib\src\KDF\HlpArgon2TypeAndVersion.pas',
110110
HlpPBKDF_ScryptNotBuildInAdapter in '..\..\HashLib\src\KDF\HlpPBKDF_ScryptNotBuildInAdapter.pas',
111+
HlpScryptDispatch in '..\..\HashLib\src\KDF\HlpScryptDispatch.pas',
111112
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
112113
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
113114
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',

HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ uses
130130
HlpArgon2Dispatch in '..\..\HashLib\src\KDF\HlpArgon2Dispatch.pas',
131131
HlpArgon2TypeAndVersion in '..\..\HashLib\src\KDF\HlpArgon2TypeAndVersion.pas',
132132
HlpPBKDF_ScryptNotBuildInAdapter in '..\..\HashLib\src\KDF\HlpPBKDF_ScryptNotBuildInAdapter.pas',
133+
HlpScryptDispatch in '..\..\HashLib\src\KDF\HlpScryptDispatch.pas',
133134
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
134135
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
135136
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
// AVX2-tier (VEX-128) implementation of fused XOR + Salsa20/8 on
// Percival-permuted data.
//
// NOTE(review): every instruction below is a 128-bit VEX-encoded form
// operating on xmm registers; no ymm register is touched. The "AVX2" label
// presumably reflects the dispatch tier that selects this body - confirm
// against the dispatcher.
//
// Reference: Colin Percival, "Stronger Key Derivation via Sequential
// Memory-Hard Functions" (2009), and the Tarsnap scrypt reference
// implementation (crypto_scrypt-sse.c). The (i*5 mod 16) data permutation
// arranges each 16-word Salsa20 state into role-based diagonal order,
// enabling lane-parallel SIMD processing of column and row quarter-rounds
// with vpshufd-based diagonalize/undiagonalize between them.
//
// Identical algorithm to the SSE2 variant but uses VEX-128 3-operand
// encoding, eliminating movdqa register copies and reducing each QR step
// from 7 to 5 instructions.
//
// Inputs (MS x64 ABI register assignment):
//   rcx = State ptr (read, then overwritten with the result)
//   rdx = Input ptr (read only)
// Each pointer addresses 16 UInt32 (64 bytes) in permuted order:
//   xmm0 = A = {w0,w5,w10,w15},  xmm1 = B = {w4,w9,w14,w3},
//   xmm2 = C = {w8,w13,w2,w7},   xmm3 = D = {w12,w1,w6,w11}.
// State and Input are accessed with unaligned moves, so neither pointer
// needs any particular alignment.
//
// Operation: State = Salsa20/8(State XOR Input)
//
// Clobbers: xmm0-xmm5 and r10 (all volatile in the MS x64 ABI), flags.
//   xmm4/xmm5 are scratch for the add/rotate; r10d is the round counter.
//
// Stack: 72 bytes (64 for the saved XOR'd state + 8 padding). The padding
// makes the save area 16-byte aligned when rsp mod 16 = 8 at this point
// (function entry with nothing pushed). The save area is nevertheless
// accessed with vmovdqu, which has no alignment requirement, so including
// this body in a routine whose prologue has already moved rsp cannot raise
// an alignment fault.

    sub     rsp, 72

// =========================================================================
// Load state, XOR with input, save for final addition
// =========================================================================
// All four Input rows are read before anything is written, so the routine
// still produces the right result if rcx and rdx refer to the same buffer.
    vmovdqu xmm0, oword [rcx]
    vmovdqu xmm4, oword [rdx]
    vpxor   xmm0, xmm0, xmm4
    vmovdqu xmm1, oword [rcx + $10]
    vmovdqu xmm4, oword [rdx + $10]
    vpxor   xmm1, xmm1, xmm4
    vmovdqu xmm2, oword [rcx + $20]
    vmovdqu xmm4, oword [rdx + $20]
    vpxor   xmm2, xmm2, xmm4
    vmovdqu xmm3, oword [rcx + $30]
    vmovdqu xmm4, oword [rdx + $30]
    vpxor   xmm3, xmm3, xmm4

// Keep a copy of the XOR'd block: Salsa20 adds it back after the rounds.
    vmovdqu oword [rsp], xmm0
    vmovdqu oword [rsp + $10], xmm1
    vmovdqu oword [rsp + $20], xmm2
    vmovdqu oword [rsp + $30], xmm3

// =========================================================================
// 4 double-rounds (= 8 rounds = Salsa20/8)
// =========================================================================
    mov     r10d, 4
@double_round:

// --- Column quarter-round ---
// Each rotl(x, n) is built as (x << n) XOR (x >> (32 - n)); the two halves
// are XORed into the destination separately, which saves a combining step.

    // xmm1 ^= rotl(xmm0 + xmm3, 7)
    vpaddd  xmm4, xmm0, xmm3
    vpslld  xmm5, xmm4, 7
    vpsrld  xmm4, xmm4, 25
    vpxor   xmm1, xmm1, xmm5
    vpxor   xmm1, xmm1, xmm4

    // xmm2 ^= rotl(xmm1 + xmm0, 9)
    vpaddd  xmm4, xmm1, xmm0
    vpslld  xmm5, xmm4, 9
    vpsrld  xmm4, xmm4, 23
    vpxor   xmm2, xmm2, xmm5
    vpxor   xmm2, xmm2, xmm4

    // xmm3 ^= rotl(xmm2 + xmm1, 13)
    vpaddd  xmm4, xmm2, xmm1
    vpslld  xmm5, xmm4, 13
    vpsrld  xmm4, xmm4, 19
    vpxor   xmm3, xmm3, xmm5
    vpxor   xmm3, xmm3, xmm4

    // xmm0 ^= rotl(xmm3 + xmm2, 18)
    vpaddd  xmm4, xmm3, xmm2
    vpslld  xmm5, xmm4, 18
    vpsrld  xmm4, xmm4, 14
    vpxor   xmm0, xmm0, xmm5
    vpxor   xmm0, xmm0, xmm4

    // Diagonalize: rotate B right by 1, C by 2, D left by 1
    // (directions refer to the {e0,e1,e2,e3} list order used above).
    vpshufd xmm1, xmm1, $93
    vpshufd xmm2, xmm2, $4E
    vpshufd xmm3, xmm3, $39

// --- Row quarter-round (B/D roles swapped after diagonal shuffle) ---

    // xmm3 ^= rotl(xmm0 + xmm1, 7)
    vpaddd  xmm4, xmm0, xmm1
    vpslld  xmm5, xmm4, 7
    vpsrld  xmm4, xmm4, 25
    vpxor   xmm3, xmm3, xmm5
    vpxor   xmm3, xmm3, xmm4

    // xmm2 ^= rotl(xmm3 + xmm0, 9)
    vpaddd  xmm4, xmm3, xmm0
    vpslld  xmm5, xmm4, 9
    vpsrld  xmm4, xmm4, 23
    vpxor   xmm2, xmm2, xmm5
    vpxor   xmm2, xmm2, xmm4

    // xmm1 ^= rotl(xmm2 + xmm3, 13)
    vpaddd  xmm4, xmm2, xmm3
    vpslld  xmm5, xmm4, 13
    vpsrld  xmm4, xmm4, 19
    vpxor   xmm1, xmm1, xmm5
    vpxor   xmm1, xmm1, xmm4

    // xmm0 ^= rotl(xmm1 + xmm2, 18)
    vpaddd  xmm4, xmm1, xmm2
    vpslld  xmm5, xmm4, 18
    vpsrld  xmm4, xmm4, 14
    vpxor   xmm0, xmm0, xmm5
    vpxor   xmm0, xmm0, xmm4

    // Undiagonalize: reverse the shuffles so the next column round (or the
    // final addition) sees the rows in their original lane order.
    vpshufd xmm1, xmm1, $39
    vpshufd xmm2, xmm2, $4E
    vpshufd xmm3, xmm3, $93

    dec     r10d
    jnz     @double_round

// =========================================================================
// Final addition and store
// =========================================================================
// Add the saved XOR'd block back in (word-wise, mod 2^32) and write the
// result over State.
    vmovdqu xmm4, oword [rsp]
    vpaddd  xmm0, xmm0, xmm4
    vmovdqu xmm4, oword [rsp + $10]
    vpaddd  xmm1, xmm1, xmm4
    vmovdqu xmm4, oword [rsp + $20]
    vpaddd  xmm2, xmm2, xmm4
    vmovdqu xmm4, oword [rsp + $30]
    vpaddd  xmm3, xmm3, xmm4

    vmovdqu oword [rcx], xmm0
    vmovdqu oword [rcx + $10], xmm1
    vmovdqu oword [rcx + $20], xmm2
    vmovdqu oword [rcx + $30], xmm3

    add     rsp, 72
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
// SSE2 implementation of fused XOR + Salsa20/8 on Percival-permuted data.
//
// Reference: Colin Percival, "Stronger Key Derivation via Sequential
// Memory-Hard Functions" (2009), and the Tarsnap scrypt reference
// implementation (crypto_scrypt-sse.c). The (i*5 mod 16) data permutation
// arranges each 16-word Salsa20 state into role-based diagonal order,
// enabling lane-parallel SIMD processing of column and row quarter-rounds
// with pshufd-based diagonalize/undiagonalize between them.
//
// Inputs (MS x64 ABI register assignment):
//   rcx = State ptr (read, then overwritten with the result)
//   rdx = Input ptr (read only)
// Each pointer addresses 16 UInt32 (64 bytes) in permuted order:
//   xmm0 = A = {w0,w5,w10,w15},  xmm1 = B = {w4,w9,w14,w3},
//   xmm2 = C = {w8,w13,w2,w7},   xmm3 = D = {w12,w1,w6,w11}.
// State and Input are accessed with movdqu, so neither pointer needs any
// particular alignment.
//
// Operation: State = Salsa20/8(State XOR Input)
//
// Clobbers: xmm0-xmm5 (all volatile on Windows and System V), r10, flags.
//   xmm4/xmm5 are scratch for the add/rotate; r10d is the round counter.
//
// Stack: 72 bytes (64 for the saved XOR'd state + 8 padding). The padding
// makes the save area 16-byte aligned when rsp mod 16 = 8 at this point
// (function entry with nothing pushed). The save area is nevertheless
// accessed with movdqu, which has no alignment requirement, so including
// this body in a routine whose prologue has already moved rsp cannot raise
// an alignment fault.
//
// Column QR (lane-parallel):
//   B ^= rotl(A+D,7); C ^= rotl(B+A,9); D ^= rotl(C+B,13); A ^= rotl(D+C,18)
// Diag:   pshufd B,$93; pshufd C,$4E; pshufd D,$39
// Row QR (lane-parallel, swapped B/D roles):
//   D' ^= rotl(A+B',7); C' ^= rotl(D'+A,9); B' ^= rotl(C'+D',13); A ^= rotl(B'+C',18)
// Undiag: pshufd B,$39; pshufd C,$4E; pshufd D,$93

    sub     rsp, 72

// =========================================================================
// Load state, XOR with input, save for final addition
// =========================================================================
// All four Input rows are read before anything is written, so the routine
// still produces the right result if rcx and rdx refer to the same buffer.
    movdqu  xmm0, oword [rcx]
    movdqu  xmm4, oword [rdx]
    pxor    xmm0, xmm4
    movdqu  xmm1, oword [rcx + $10]
    movdqu  xmm4, oword [rdx + $10]
    pxor    xmm1, xmm4
    movdqu  xmm2, oword [rcx + $20]
    movdqu  xmm4, oword [rdx + $20]
    pxor    xmm2, xmm4
    movdqu  xmm3, oword [rcx + $30]
    movdqu  xmm4, oword [rdx + $30]
    pxor    xmm3, xmm4

// Keep a copy of the XOR'd block: Salsa20 adds it back after the rounds.
    movdqu  oword [rsp], xmm0
    movdqu  oword [rsp + $10], xmm1
    movdqu  oword [rsp + $20], xmm2
    movdqu  oword [rsp + $30], xmm3

// =========================================================================
// 4 double-rounds (= 8 rounds = Salsa20/8)
// =========================================================================
    mov     r10d, 4
@double_round:

// --- Column quarter-round ---
// Two-operand SSE2 forms overwrite their destination, so each step first
// copies an operand into xmm4 (sum) and xmm5 (left-shifted half). Each
// rotl(x, n) is built as (x << n) XOR (x >> (32 - n)), with both halves
// XORed into the destination separately.

    // xmm1 ^= rotl(xmm0 + xmm3, 7)
    movdqa  xmm4, xmm0
    paddd   xmm4, xmm3
    movdqa  xmm5, xmm4
    pslld   xmm5, 7
    psrld   xmm4, 25
    pxor    xmm1, xmm5
    pxor    xmm1, xmm4

    // xmm2 ^= rotl(xmm1 + xmm0, 9)
    movdqa  xmm4, xmm1
    paddd   xmm4, xmm0
    movdqa  xmm5, xmm4
    pslld   xmm5, 9
    psrld   xmm4, 23
    pxor    xmm2, xmm5
    pxor    xmm2, xmm4

    // xmm3 ^= rotl(xmm2 + xmm1, 13)
    movdqa  xmm4, xmm2
    paddd   xmm4, xmm1
    movdqa  xmm5, xmm4
    pslld   xmm5, 13
    psrld   xmm4, 19
    pxor    xmm3, xmm5
    pxor    xmm3, xmm4

    // xmm0 ^= rotl(xmm3 + xmm2, 18)
    movdqa  xmm4, xmm3
    paddd   xmm4, xmm2
    movdqa  xmm5, xmm4
    pslld   xmm5, 18
    psrld   xmm4, 14
    pxor    xmm0, xmm5
    pxor    xmm0, xmm4

    // Diagonalize: rotate B right by 1, C by 2, D left by 1
    // (directions refer to the {e0,e1,e2,e3} list order used above).
    pshufd  xmm1, xmm1, $93
    pshufd  xmm2, xmm2, $4E
    pshufd  xmm3, xmm3, $39

// --- Row quarter-round (B/D roles swapped after diagonal shuffle) ---

    // xmm3 ^= rotl(xmm0 + xmm1, 7)
    movdqa  xmm4, xmm0
    paddd   xmm4, xmm1
    movdqa  xmm5, xmm4
    pslld   xmm5, 7
    psrld   xmm4, 25
    pxor    xmm3, xmm5
    pxor    xmm3, xmm4

    // xmm2 ^= rotl(xmm3 + xmm0, 9)
    movdqa  xmm4, xmm3
    paddd   xmm4, xmm0
    movdqa  xmm5, xmm4
    pslld   xmm5, 9
    psrld   xmm4, 23
    pxor    xmm2, xmm5
    pxor    xmm2, xmm4

    // xmm1 ^= rotl(xmm2 + xmm3, 13)
    movdqa  xmm4, xmm2
    paddd   xmm4, xmm3
    movdqa  xmm5, xmm4
    pslld   xmm5, 13
    psrld   xmm4, 19
    pxor    xmm1, xmm5
    pxor    xmm1, xmm4

    // xmm0 ^= rotl(xmm1 + xmm2, 18)
    movdqa  xmm4, xmm1
    paddd   xmm4, xmm2
    movdqa  xmm5, xmm4
    pslld   xmm5, 18
    psrld   xmm4, 14
    pxor    xmm0, xmm5
    pxor    xmm0, xmm4

    // Undiagonalize: reverse the shuffles so the next column round (or the
    // final addition) sees the rows in their original lane order.
    pshufd  xmm1, xmm1, $39
    pshufd  xmm2, xmm2, $4E
    pshufd  xmm3, xmm3, $93

    dec     r10d
    jnz     @double_round

// =========================================================================
// Final addition and store
// =========================================================================
// Add the saved XOR'd block back in (word-wise, mod 2^32) and write the
// result over State. The saved rows are loaded into xmm4 first because a
// legacy-SSE memory operand on paddd would require 16-byte alignment.
    movdqu  xmm4, oword [rsp]
    paddd   xmm0, xmm4
    movdqu  xmm4, oword [rsp + $10]
    paddd   xmm1, xmm4
    movdqu  xmm4, oword [rsp + $20]
    paddd   xmm2, xmm4
    movdqu  xmm4, oword [rsp + $30]
    paddd   xmm3, xmm4

    movdqu  oword [rcx], xmm0
    movdqu  oword [rcx + $10], xmm1
    movdqu  oword [rcx + $20], xmm2
    movdqu  oword [rcx + $30], xmm3

    add     rsp, 72

0 commit comments

Comments
 (0)