Skip to content

Commit c8755fd

Browse files
authored
xxh3 simd (#44)
* XXH3 Simd implementation
1 parent c56a847 commit c8755fd

20 files changed

Lines changed: 804 additions & 27 deletions

HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ uses
8686
HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas',
8787
HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas',
8888
HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas',
89+
HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas',
8990
HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas',
9091
HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas',
9192
HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas',
@@ -108,6 +109,7 @@ uses
108109
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
109110
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
110111
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
112+
HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas',
111113
HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas',
112114
HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas';
113115

HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ uses
8585
HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas',
8686
HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas',
8787
HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas',
88+
HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas',
8889
HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas',
8990
HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas',
9091
HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas',
@@ -107,6 +108,7 @@ uses
107108
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
108109
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
109110
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
111+
HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas',
110112
HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas',
111113
HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas';
112114

HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ uses
107107
HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas',
108108
HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas',
109109
HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas',
110+
HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas',
110111
HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas',
111112
HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas',
112113
HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas',
@@ -129,6 +130,7 @@ uses
129130
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
130131
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
131132
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
133+
HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas',
132134
HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas',
133135
HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas',
134136
HashLibTestBase in '..\src\HashLibTestBase.pas',

HashLib/src/Hash64/HlpXXHash3.pas

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ TXXH3_State = record
179179

180180
implementation
181181

182+
uses
183+
HlpXXHash3Dispatch;
184+
182185
{ TXXH3Core }
183186

184187
class function TXXH3Core.XXH_mult32to64(AX, AY: UInt32): UInt64;
@@ -283,11 +286,8 @@ class procedure TXXH3Core.XXH3_scalarRound(var AAcc: TXXH3AccArray;
283286

284287
class procedure TXXH3Core.XXH3_accumulate_512(var AAcc: TXXH3AccArray;
285288
AInput, ASecret: PByte);
286-
var
287-
I: Int32;
288289
begin
289-
for I := 0 to XXH_ACC_NB - 1 do
290-
XXH3_scalarRound(AAcc, AInput, ASecret, I);
290+
HlpXXHash3Dispatch.XXH3_Accumulate512(@AAcc[0], AInput, ASecret);
291291
end;
292292

293293
class procedure TXXH3Core.XXH3_scalarScrambleRound(var AAcc: TXXH3AccArray;
@@ -305,21 +305,14 @@ class procedure TXXH3Core.XXH3_scalarScrambleRound(var AAcc: TXXH3AccArray;
305305

306306
class procedure TXXH3Core.XXH3_scrambleAcc(var AAcc: TXXH3AccArray;
307307
ASecret: PByte);
308-
var
309-
I: Int32;
310308
begin
311-
for I := 0 to XXH_ACC_NB - 1 do
312-
XXH3_scalarScrambleRound(AAcc, ASecret, I);
309+
HlpXXHash3Dispatch.XXH3_ScrambleAcc(@AAcc[0], ASecret);
313310
end;
314311

315312
class procedure TXXH3Core.XXH3_accumulate(var AAcc: TXXH3AccArray;
316313
AInput, ASecret: PByte; ANbStripes: Int32);
317-
var
318-
N: Int32;
319314
begin
320-
for N := 0 to ANbStripes - 1 do
321-
XXH3_accumulate_512(AAcc, AInput + N * XXH_STRIPE_LEN,
322-
ASecret + N * XXH_SECRET_CONSUME_RATE);
315+
HlpXXHash3Dispatch.XXH3_Accumulate(@AAcc[0], AInput, ASecret, ANbStripes);
323316
end;
324317

325318
class procedure TXXH3Core.XXH3_hashLong_internal_loop(
@@ -351,19 +344,8 @@ class procedure TXXH3Core.XXH3_hashLong_internal_loop(
351344

352345
class procedure TXXH3Core.XXH3_initCustomSecret(ACustomSecret: PByte;
353346
ASeed: UInt64);
354-
var
355-
I: Int32;
356-
LLo, LHi: UInt64;
357347
begin
358-
for I := 0 to (XXH3_SECRET_DEFAULT_SIZE div 16) - 1 do
359-
begin
360-
LLo := TConverters.ReadBytesAsUInt64LE(PByte(@XXH3_SECRET[0]),
361-
16 * I) + ASeed;
362-
LHi := TConverters.ReadBytesAsUInt64LE(PByte(@XXH3_SECRET[0]),
363-
16 * I + 8) - ASeed;
364-
PUInt64(ACustomSecret + 16 * I)^ := LLo;
365-
PUInt64(ACustomSecret + 16 * I + 8)^ := LHi;
366-
end;
348+
HlpXXHash3Dispatch.XXH3_InitSecret(ACustomSecret, @XXH3_SECRET[0], ASeed);
367349
end;
368350

369351
class procedure TXXH3Core.XXH3_consumeStripes(var AAcc: TXXH3AccArray;
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
unit HlpXXHash3Dispatch;
2+
3+
{$I ..\Include\HashLib.inc}
4+
5+
interface
6+
7+
type
8+
TXXH3Accumulate512Proc = procedure(AAcc: Pointer; AInput: Pointer;
9+
ASecret: Pointer);
10+
TXXH3AccumulateProc = procedure(AAcc: Pointer; AInput: Pointer;
11+
ASecret: Pointer; ANbStripes: Int32);
12+
TXXH3ScrambleAccProc = procedure(AAcc: Pointer; ASecret: Pointer);
13+
TXXH3InitSecretProc = procedure(ACustomSecret: Pointer;
14+
ADefaultSecret: Pointer; ASeed: UInt64);
15+
16+
var
17+
XXH3_Accumulate512: TXXH3Accumulate512Proc;
18+
XXH3_Accumulate: TXXH3AccumulateProc;
19+
XXH3_ScrambleAcc: TXXH3ScrambleAccProc;
20+
XXH3_InitSecret: TXXH3InitSecretProc;
21+
22+
implementation
23+
24+
uses
25+
HlpSimd;
26+
27+
const
28+
XXH_STRIPE_LEN = 64;
29+
XXH_ACC_NB = 8;
30+
XXH_SECRET_CONSUME_RATE = 8;
31+
XXH_PRIME32_1 = UInt32($9E3779B1);
32+
33+
// =============================================================================
34+
// Scalar fallback implementations
35+
// =============================================================================
36+
37+
// Portable 512-bit accumulate step: folds one 64-byte stripe into the eight
// 64-bit accumulator lanes.
//   AAcc    - XXH_ACC_NB consecutive UInt64 lanes, updated in place
//   AInput  - stripe data, read as XXH_ACC_NB native-endian UInt64 words
//   ASecret - key material, read the same way
// Typed-pointer indexing relies on {$POINTERMATH ON} from HashLib.inc.
procedure XXH3_accumulate_512_scalar(AAcc: Pointer; AInput: Pointer;
  ASecret: Pointer);
var
  LLanes, LWords, LKeys: PUInt64;
  LLane: Int32;
  LWord, LMixed: UInt64;
begin
  LLanes := PUInt64(AAcc);
  LWords := PUInt64(AInput);
  LKeys := PUInt64(ASecret);
  for LLane := 0 to XXH_ACC_NB - 1 do
  begin
    LWord := LWords[LLane];
    LMixed := LWord xor LKeys[LLane];
    // The raw input word goes to the partner lane (index with bit 0 flipped).
    LLanes[LLane xor 1] := LLanes[LLane xor 1] + LWord;
    // This lane receives the 32x32 -> 64 bit product of the low and high
    // halves of the keyed word.
    LLanes[LLane] := LLanes[LLane] +
      UInt64(UInt32(LMixed)) * UInt64(UInt32(LMixed shr 32));
  end;
end;
59+
60+
// Portable scramble step: rewrites each of the XXH_ACC_NB accumulator lanes as
//   ((lane xor (lane shr 47)) xor key) * XXH_PRIME32_1
// where key is the UInt64 at the same index in ASecret.
//   AAcc    - XXH_ACC_NB consecutive UInt64 lanes, updated in place
//   ASecret - XXH_ACC_NB native-endian UInt64 key words
// Typed-pointer indexing relies on {$POINTERMATH ON} from HashLib.inc.
procedure XXH3_scrambleAcc_scalar(AAcc: Pointer; ASecret: Pointer);
var
  LLanes, LKeys: PUInt64;
  LLane: Int32;
  LValue: UInt64;
begin
  LLanes := PUInt64(AAcc);
  LKeys := PUInt64(ASecret);
  for LLane := 0 to XXH_ACC_NB - 1 do
  begin
    LValue := LLanes[LLane];
    // xorshift by 47, then mix in the key word
    LValue := (LValue xor (LValue shr 47)) xor LKeys[LLane];
    LLanes[LLane] := LValue * XXH_PRIME32_1;
  end;
end;
79+
80+
// Portable custom-secret derivation: copies the 192-byte default secret into
// ACustomSecret in 16-byte pairs, adding ASeed to the first UInt64 of every
// pair and subtracting it from the second.
//   ACustomSecret  - destination buffer, at least 192 bytes
//   ADefaultSecret - source buffer, at least 192 bytes
//   ASeed          - value mixed into every pair
// Typed-pointer indexing relies on {$POINTERMATH ON} from HashLib.inc.
procedure XXH3_initSecret_scalar(ACustomSecret: Pointer;
  ADefaultSecret: Pointer; ASeed: UInt64);
const
  SECRET_SIZE = 192; // bytes processed
  PAIR_SIZE = 16; // two UInt64 words per step
var
  LSource, LTarget: PUInt64;
  LPair: Int32;
begin
  LSource := PUInt64(ADefaultSecret);
  LTarget := PUInt64(ACustomSecret);
  for LPair := 0 to (SECRET_SIZE div PAIR_SIZE) - 1 do
  begin
    LTarget[2 * LPair] := LSource[2 * LPair] + ASeed;
    LTarget[2 * LPair + 1] := LSource[2 * LPair + 1] - ASeed;
  end;
end;
94+
95+
// Portable multi-stripe accumulate: runs XXH3_accumulate_512_scalar once per
// stripe, advancing the input by XXH_STRIPE_LEN bytes and the secret by
// XXH_SECRET_CONSUME_RATE bytes between stripes.
//   AAcc       - accumulator lanes, shared by every stripe
//   AInput     - first byte of the first stripe
//   ASecret    - first byte of the secret window for the first stripe
//   ANbStripes - number of stripes to consume; nothing happens when <= 0
procedure XXH3_accumulate_scalar(AAcc: Pointer; AInput: Pointer;
  ASecret: Pointer; ANbStripes: Int32);
var
  LStripe: Int32;
  LData, LKey: PByte;
begin
  LData := PByte(AInput);
  LKey := PByte(ASecret);
  for LStripe := 1 to ANbStripes do
  begin
    XXH3_accumulate_512_scalar(AAcc, LData, LKey);
    Inc(LData, XXH_STRIPE_LEN);
    Inc(LKey, XXH_SECRET_CONSUME_RATE);
  end;
end;
104+
105+
// =============================================================================
106+
// SSE2 and AVX2 implementations (x86-64 only)
107+
// =============================================================================
108+
109+
{$IFDEF HASHLIB_X86_64}
110+
111+
// ----- SSE2 -----
112+
113+
// SSE2 variant of the 512-bit accumulate step. InitDispatch binds it to
// XXH3_Accumulate512 when the active level is TSimdLevel.SSE2.
// There is no Pascal body: SimdProc3Begin.inc supplies the assembler prologue
// and XXH3Acc512Sse2.inc the instructions, so only the closing "end;" is
// written here.
// NOTE(review): neither include is visible from this unit. Presumably the
// prologue leaves AAcc/AInput/ASecret in rcx/rdx/r8 like its 2-parameter
// sibling does for rcx/rdx - confirm. Also confirm the body does not clobber
// xmm6-xmm15, which are callee-saved under the Microsoft x64 ABI.
procedure XXH3_accumulate_512_sse2(AAcc: Pointer; AInput: Pointer;
114+
ASecret: Pointer);
115+
{$I ..\Include\Simd\Common\SimdProc3Begin.inc}
116+
{$I ..\Include\Simd\XXH3\XXH3Acc512Sse2.inc}
117+
end;
118+
119+
// SSE2 variant of the accumulator scramble step. InitDispatch binds it to
// XXH3_ScrambleAcc when the active level is TSimdLevel.SSE2.
// There is no Pascal body: SimdProc2Begin.inc opens the asm block with
// AAcc in rcx and ASecret in rdx, and XXH3ScrambleSse2.inc supplies the
// instructions; only the closing "end;" is written here.
// NOTE(review): XXH3ScrambleSse2.inc is not visible from this unit - confirm it
// does not clobber xmm6-xmm15 (callee-saved under the Microsoft x64 ABI).
procedure XXH3_scrambleAcc_sse2(AAcc: Pointer; ASecret: Pointer);
120+
{$I ..\Include\Simd\Common\SimdProc2Begin.inc}
121+
{$I ..\Include\Simd\XXH3\XXH3ScrambleSse2.inc}
122+
end;
123+
124+
// SSE2 variant of the custom-secret derivation. InitDispatch binds it to
// XXH3_InitSecret when the active level is TSimdLevel.SSE2.
// There is no Pascal body: SimdProc3Begin.inc supplies the assembler prologue
// and XXH3InitSecretSse2.inc the instructions, so only the closing "end;" is
// written here.
// NOTE(review): neither include is visible from this unit. ASeed is a by-value
// UInt64 in the third parameter slot - presumably delivered in r8 after the
// prologue; confirm, and confirm the body processes the same 192 bytes as
// XXH3_initSecret_scalar.
procedure XXH3_initSecret_sse2(ACustomSecret: Pointer;
125+
ADefaultSecret: Pointer; ASeed: UInt64);
126+
{$I ..\Include\Simd\Common\SimdProc3Begin.inc}
127+
{$I ..\Include\Simd\XXH3\XXH3InitSecretSse2.inc}
128+
end;
129+
130+
// SSE2 multi-stripe accumulate: runs XXH3_accumulate_512_sse2 once per stripe,
// advancing the input by XXH_STRIPE_LEN bytes and the secret by
// XXH_SECRET_CONSUME_RATE bytes between stripes.
//   AAcc       - accumulator lanes, shared by every stripe
//   AInput     - first byte of the first stripe
//   ASecret    - first byte of the secret window for the first stripe
//   ANbStripes - number of stripes to consume; nothing happens when <= 0
procedure XXH3_accumulate_sse2(AAcc: Pointer; AInput: Pointer;
  ASecret: Pointer; ANbStripes: Int32);
var
  LStripe: Int32;
  LData, LKey: PByte;
begin
  LData := PByte(AInput);
  LKey := PByte(ASecret);
  for LStripe := 1 to ANbStripes do
  begin
    XXH3_accumulate_512_sse2(AAcc, LData, LKey);
    Inc(LData, XXH_STRIPE_LEN);
    Inc(LKey, XXH_SECRET_CONSUME_RATE);
  end;
end;
139+
140+
{$IFDEF HASHLIB_AVX2_ASM_SUPPORTED}
141+
142+
// ----- AVX2 -----
143+
144+
// AVX2 variant of the 512-bit accumulate step; compiled only when
// HASHLIB_AVX2_ASM_SUPPORTED is defined. InitDispatch binds it to
// XXH3_Accumulate512 when the active level is TSimdLevel.AVX2.
// There is no Pascal body: SimdProc3Begin.inc supplies the assembler prologue
// and XXH3Acc512Avx2.inc the instructions, so only the closing "end;" is
// written here.
// NOTE(review): neither include is visible from this unit. Confirm the body
// issues vzeroupper before returning to non-AVX code and does not clobber
// xmm6-xmm15 (callee-saved under the Microsoft x64 ABI).
procedure XXH3_accumulate_512_avx2(AAcc: Pointer; AInput: Pointer;
145+
ASecret: Pointer);
146+
{$I ..\Include\Simd\Common\SimdProc3Begin.inc}
147+
{$I ..\Include\Simd\XXH3\XXH3Acc512Avx2.inc}
148+
end;
149+
150+
// AVX2 variant of the accumulator scramble step; compiled only when
// HASHLIB_AVX2_ASM_SUPPORTED is defined. InitDispatch binds it to
// XXH3_ScrambleAcc when the active level is TSimdLevel.AVX2.
// There is no Pascal body: SimdProc2Begin.inc opens the asm block with
// AAcc in rcx and ASecret in rdx, and XXH3ScrambleAvx2.inc supplies the
// instructions; only the closing "end;" is written here.
// NOTE(review): XXH3ScrambleAvx2.inc is not visible from this unit - confirm it
// issues vzeroupper before returning and does not clobber xmm6-xmm15.
procedure XXH3_scrambleAcc_avx2(AAcc: Pointer; ASecret: Pointer);
151+
{$I ..\Include\Simd\Common\SimdProc2Begin.inc}
152+
{$I ..\Include\Simd\XXH3\XXH3ScrambleAvx2.inc}
153+
end;
154+
155+
// AVX2 variant of the custom-secret derivation; compiled only when
// HASHLIB_AVX2_ASM_SUPPORTED is defined. InitDispatch binds it to
// XXH3_InitSecret when the active level is TSimdLevel.AVX2.
// There is no Pascal body: SimdProc3Begin.inc supplies the assembler prologue
// and XXH3InitSecretAvx2.inc the instructions, so only the closing "end;" is
// written here.
// NOTE(review): neither include is visible from this unit. ASeed is a by-value
// UInt64 in the third parameter slot - presumably delivered in r8 after the
// prologue; confirm, and confirm the body issues vzeroupper before returning.
procedure XXH3_initSecret_avx2(ACustomSecret: Pointer;
156+
ADefaultSecret: Pointer; ASeed: UInt64);
157+
{$I ..\Include\Simd\Common\SimdProc3Begin.inc}
158+
{$I ..\Include\Simd\XXH3\XXH3InitSecretAvx2.inc}
159+
end;
160+
161+
// AVX2 multi-stripe accumulate: runs XXH3_accumulate_512_avx2 once per stripe,
// advancing the input by XXH_STRIPE_LEN bytes and the secret by
// XXH_SECRET_CONSUME_RATE bytes between stripes.
//   AAcc       - accumulator lanes, shared by every stripe
//   AInput     - first byte of the first stripe
//   ASecret    - first byte of the secret window for the first stripe
//   ANbStripes - number of stripes to consume; nothing happens when <= 0
procedure XXH3_accumulate_avx2(AAcc: Pointer; AInput: Pointer;
  ASecret: Pointer; ANbStripes: Int32);
var
  LStripe: Int32;
  LData, LKey: PByte;
begin
  LData := PByte(AInput);
  LKey := PByte(ASecret);
  for LStripe := 1 to ANbStripes do
  begin
    XXH3_accumulate_512_avx2(AAcc, LData, LKey);
    Inc(LData, XXH_STRIPE_LEN);
    Inc(LKey, XXH_SECRET_CONSUME_RATE);
  end;
end;
170+
171+
{$ENDIF HASHLIB_AVX2_ASM_SUPPORTED}
172+
173+
{$ENDIF HASHLIB_X86_64}
174+
175+
// =============================================================================
176+
// Dispatch initialization
177+
// =============================================================================
178+
179+
// Binds the four XXH3 dispatch variables (XXH3_Accumulate512, XXH3_Accumulate,
// XXH3_ScrambleAcc, XXH3_InitSecret) to the implementation that matches the
// level reported by TSimd.GetActiveLevel(). Called once from this unit's
// initialization section.
procedure InitDispatch();
begin
  // Start from the portable routines. The case statement below only lists the
  // levels this build has code for; without this default, any other reported
  // level would leave the procedural variables nil and the first hash call
  // would jump through a nil pointer.
  XXH3_Accumulate512 := @XXH3_accumulate_512_scalar;
  XXH3_Accumulate := @XXH3_accumulate_scalar;
  XXH3_ScrambleAcc := @XXH3_scrambleAcc_scalar;
  XXH3_InitSecret := @XXH3_initSecret_scalar;

  case TSimd.GetActiveLevel() of
{$IFDEF HASHLIB_X86_64}
{$IFDEF HASHLIB_AVX2_ASM_SUPPORTED}
    TSimdLevel.AVX2:
      begin
        XXH3_Accumulate512 := @XXH3_accumulate_512_avx2;
        XXH3_Accumulate := @XXH3_accumulate_avx2;
        XXH3_ScrambleAcc := @XXH3_scrambleAcc_avx2;
        XXH3_InitSecret := @XXH3_initSecret_avx2;
      end;
{$ENDIF HASHLIB_AVX2_ASM_SUPPORTED}
    TSimdLevel.SSE2:
      begin
        XXH3_Accumulate512 := @XXH3_accumulate_512_sse2;
        XXH3_Accumulate := @XXH3_accumulate_sse2;
        XXH3_ScrambleAcc := @XXH3_scrambleAcc_sse2;
        XXH3_InitSecret := @XXH3_initSecret_sse2;
      end;
{$ENDIF HASHLIB_X86_64}
    TSimdLevel.Scalar:
      ; // scalar routines are already bound above
  end;
end;
209+
210+
initialization
211+
InitDispatch();
212+
213+
end.

HashLib/src/Include/HashLib.inc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,24 @@
5454
{$SCOPEDENUMS ON}
5555
{$POINTERMATH ON}
5656

57+
{============================== SIMD Settings =================================}
58+
59+
{$IF DEFINED(CPUX86_64) OR DEFINED(CPUX64)}
60+
{$DEFINE HASHLIB_X86_64}
61+
{$IFEND}
62+
63+
{$IFDEF FPC}
64+
{$IFDEF HASHLIB_X86_64}
65+
{$DEFINE HASHLIB_AVX2_ASM_SUPPORTED}
66+
{$ENDIF}
67+
{$ENDIF}
68+
69+
// Uncomment ONE of the following to force a specific SIMD dispatch level:
70+
// {$DEFINE HASHLIB_FORCE_SCALAR}
71+
// {$DEFINE HASHLIB_FORCE_SSE2}
72+
73+
{$IF DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SSE2)}
74+
{$MESSAGE ERROR 'HASHLIB_FORCE_SCALAR and HASHLIB_FORCE_SSE2 cannot both be defined. Enable only one.'}
75+
{$IFEND}
76+
5777
(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)

HashLib/src/Include/HashLibFPC.inc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737

3838
{$MODE DELPHI}
3939
{$MACRO ON}
40+
{$ASMMODE INTEL}
4041
{$NOTES OFF}
4142
{$OPTIMIZATION LEVEL3}
4243
{$OPTIMIZATION NOUSELOADMODIFYSTORE}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Shared SIMD procedure prologue for 2-parameter assembly functions.
2+
// After inclusion: rcx = param1, rdx = param2 (MS x64 ABI).
3+
// On FPC non-Windows (System V ABI), remaps rdi,rsi -> rcx,rdx.
// The fragment opens the asm block but does not close it; the including
// procedure supplies the final "end;" (see Usage).
// Assembly is Intel syntax (destination operand first); for FPC this is
// selected by {$ASMMODE INTEL} in HashLibFPC.inc.
4+
// Usage:
5+
// procedure MyProc(P1, P2: Pointer);
6+
// {$I SimdProc2Begin.inc}
7+
// // ... SIMD instructions using rcx, rdx ...
8+
// end;
9+
{$IFDEF FPC}
10+
assembler; nostackframe;
11+
asm
12+
{$IFNDEF MSWINDOWS}
// System V passes param1 in rdi and param2 in rsi; copy them into the
// registers the shared instruction bodies expect. rsi and rdi are only read.
13+
mov rdx, rsi
14+
mov rcx, rdi
15+
{$ENDIF}
16+
{$ELSE}
// Non-FPC (Delphi) path: no remap is emitted, parameters are used as passed.
17+
asm
18+
.noframe
19+
{$ENDIF}

0 commit comments

Comments
 (0)