diff --git a/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr b/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr
index 990185a..6e2a9c5 100644
--- a/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr
+++ b/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr
@@ -86,6 +86,7 @@ uses
HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas',
HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas',
HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas',
+ HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas',
HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas',
HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas',
HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas',
@@ -108,6 +109,7 @@ uses
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
+ HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas',
HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas',
HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas';
diff --git a/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr b/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr
index 66026a6..5f079cd 100644
--- a/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr
+++ b/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr
@@ -85,6 +85,7 @@ uses
HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas',
HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas',
HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas',
+ HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas',
HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas',
HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas',
HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas',
@@ -107,6 +108,7 @@ uses
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
+ HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas',
HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas',
HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas';
diff --git a/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr b/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr
index 4e61f05..deded47 100644
--- a/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr
+++ b/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr
@@ -107,6 +107,7 @@ uses
HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas',
HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas',
HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas',
+ HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas',
HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas',
HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas',
HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas',
@@ -129,6 +130,7 @@ uses
HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas',
HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas',
HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas',
+ HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas',
HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas',
HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas',
HashLibTestBase in '..\src\HashLibTestBase.pas',
diff --git a/HashLib/src/Hash64/HlpXXHash3.pas b/HashLib/src/Hash64/HlpXXHash3.pas
index 7851d39..76e13bc 100644
--- a/HashLib/src/Hash64/HlpXXHash3.pas
+++ b/HashLib/src/Hash64/HlpXXHash3.pas
@@ -179,6 +179,9 @@ TXXH3_State = record
implementation
+uses
+ HlpXXHash3Dispatch;
+
{ TXXH3Core }
class function TXXH3Core.XXH_mult32to64(AX, AY: UInt32): UInt64;
@@ -283,11 +286,8 @@ class procedure TXXH3Core.XXH3_scalarRound(var AAcc: TXXH3AccArray;
class procedure TXXH3Core.XXH3_accumulate_512(var AAcc: TXXH3AccArray;
AInput, ASecret: PByte);
-var
- I: Int32;
begin
- for I := 0 to XXH_ACC_NB - 1 do
- XXH3_scalarRound(AAcc, AInput, ASecret, I);
+ HlpXXHash3Dispatch.XXH3_Accumulate512(@AAcc[0], AInput, ASecret);
end;
class procedure TXXH3Core.XXH3_scalarScrambleRound(var AAcc: TXXH3AccArray;
@@ -305,21 +305,14 @@ class procedure TXXH3Core.XXH3_scalarScrambleRound(var AAcc: TXXH3AccArray;
class procedure TXXH3Core.XXH3_scrambleAcc(var AAcc: TXXH3AccArray;
ASecret: PByte);
-var
- I: Int32;
begin
- for I := 0 to XXH_ACC_NB - 1 do
- XXH3_scalarScrambleRound(AAcc, ASecret, I);
+ HlpXXHash3Dispatch.XXH3_ScrambleAcc(@AAcc[0], ASecret);
end;
class procedure TXXH3Core.XXH3_accumulate(var AAcc: TXXH3AccArray;
AInput, ASecret: PByte; ANbStripes: Int32);
-var
- N: Int32;
begin
- for N := 0 to ANbStripes - 1 do
- XXH3_accumulate_512(AAcc, AInput + N * XXH_STRIPE_LEN,
- ASecret + N * XXH_SECRET_CONSUME_RATE);
+ HlpXXHash3Dispatch.XXH3_Accumulate(@AAcc[0], AInput, ASecret, ANbStripes);
end;
class procedure TXXH3Core.XXH3_hashLong_internal_loop(
@@ -351,19 +344,8 @@ class procedure TXXH3Core.XXH3_hashLong_internal_loop(
class procedure TXXH3Core.XXH3_initCustomSecret(ACustomSecret: PByte;
ASeed: UInt64);
-var
- I: Int32;
- LLo, LHi: UInt64;
begin
- for I := 0 to (XXH3_SECRET_DEFAULT_SIZE div 16) - 1 do
- begin
- LLo := TConverters.ReadBytesAsUInt64LE(PByte(@XXH3_SECRET[0]),
- 16 * I) + ASeed;
- LHi := TConverters.ReadBytesAsUInt64LE(PByte(@XXH3_SECRET[0]),
- 16 * I + 8) - ASeed;
- PUInt64(ACustomSecret + 16 * I)^ := LLo;
- PUInt64(ACustomSecret + 16 * I + 8)^ := LHi;
- end;
+ HlpXXHash3Dispatch.XXH3_InitSecret(ACustomSecret, @XXH3_SECRET[0], ASeed);
end;
class procedure TXXH3Core.XXH3_consumeStripes(var AAcc: TXXH3AccArray;
diff --git a/HashLib/src/Hash64/HlpXXHash3Dispatch.pas b/HashLib/src/Hash64/HlpXXHash3Dispatch.pas
new file mode 100644
index 0000000..84433e4
--- /dev/null
+++ b/HashLib/src/Hash64/HlpXXHash3Dispatch.pas
@@ -0,0 +1,213 @@
+unit HlpXXHash3Dispatch;
+
+{$I ..\Include\HashLib.inc}
+
+interface
+
+type
+ TXXH3Accumulate512Proc = procedure(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer);
+ TXXH3AccumulateProc = procedure(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer; ANbStripes: Int32);
+ TXXH3ScrambleAccProc = procedure(AAcc: Pointer; ASecret: Pointer);
+ TXXH3InitSecretProc = procedure(ACustomSecret: Pointer;
+ ADefaultSecret: Pointer; ASeed: UInt64);
+
+var
+ XXH3_Accumulate512: TXXH3Accumulate512Proc;
+ XXH3_Accumulate: TXXH3AccumulateProc;
+ XXH3_ScrambleAcc: TXXH3ScrambleAccProc;
+ XXH3_InitSecret: TXXH3InitSecretProc;
+
+implementation
+
+uses
+ HlpSimd;
+
+const
+ XXH_STRIPE_LEN = 64;
+ XXH_ACC_NB = 8;
+ XXH_SECRET_CONSUME_RATE = 8;
+ XXH_PRIME32_1 = UInt32($9E3779B1);
+
+// =============================================================================
+// Scalar fallback implementations
+// =============================================================================
+
+procedure XXH3_accumulate_512_scalar(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer);
+var
+ LPAcc: PUInt64;
+ LPInput, LPSecret: PByte;
+ I: Int32;
+ LDataVal, LDataKey: UInt64;
+begin
+ LPAcc := PUInt64(AAcc);
+ LPInput := PByte(AInput);
+ LPSecret := PByte(ASecret);
+ for I := 0 to XXH_ACC_NB - 1 do
+ begin
+ LDataVal := PUInt64(LPInput + I * 8)^;
+ LDataKey := LDataVal xor PUInt64(LPSecret + I * 8)^;
+ PUInt64(PByte(LPAcc) + (I xor 1) * 8)^ :=
+ PUInt64(PByte(LPAcc) + (I xor 1) * 8)^ + LDataVal;
+ PUInt64(PByte(LPAcc) + I * 8)^ :=
+ PUInt64(PByte(LPAcc) + I * 8)^ +
+ UInt64(UInt32(LDataKey)) * UInt64(UInt32(LDataKey shr 32));
+ end;
+end;
+
+procedure XXH3_scrambleAcc_scalar(AAcc: Pointer; ASecret: Pointer);
+var
+ LPAcc: PUInt64;
+ LPSecret: PByte;
+ I: Int32;
+ LKey64, LAcc64: UInt64;
+begin
+ LPAcc := PUInt64(AAcc);
+ LPSecret := PByte(ASecret);
+ for I := 0 to XXH_ACC_NB - 1 do
+ begin
+ LKey64 := PUInt64(LPSecret + I * 8)^;
+ LAcc64 := PUInt64(PByte(LPAcc) + I * 8)^;
+ LAcc64 := LAcc64 xor (LAcc64 shr 47);
+ LAcc64 := LAcc64 xor LKey64;
+ LAcc64 := LAcc64 * XXH_PRIME32_1;
+ PUInt64(PByte(LPAcc) + I * 8)^ := LAcc64;
+ end;
+end;
+
+procedure XXH3_initSecret_scalar(ACustomSecret: Pointer;
+ ADefaultSecret: Pointer; ASeed: UInt64);
+var
+ I: Int32;
+ LPSrc, LPDst: PByte;
+begin
+ LPSrc := PByte(ADefaultSecret);
+ LPDst := PByte(ACustomSecret);
+ for I := 0 to (192 div 16) - 1 do
+ begin
+ PUInt64(LPDst + 16 * I)^ := PUInt64(LPSrc + 16 * I)^ + ASeed;
+ PUInt64(LPDst + 16 * I + 8)^ := PUInt64(LPSrc + 16 * I + 8)^ - ASeed;
+ end;
+end;
+
+procedure XXH3_accumulate_scalar(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer; ANbStripes: Int32);
+var
+ N: Int32;
+begin
+ for N := 0 to ANbStripes - 1 do
+ XXH3_accumulate_512_scalar(AAcc, PByte(AInput) + N * XXH_STRIPE_LEN,
+ PByte(ASecret) + N * XXH_SECRET_CONSUME_RATE);
+end;
+
+// =============================================================================
+// SSE2 and AVX2 implementations (x86-64 only)
+// =============================================================================
+
+{$IFDEF HASHLIB_X86_64}
+
+// ----- SSE2 -----
+
+procedure XXH3_accumulate_512_sse2(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer);
+ {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+ {$I ..\Include\Simd\XXH3\XXH3Acc512Sse2.inc}
+end;
+
+procedure XXH3_scrambleAcc_sse2(AAcc: Pointer; ASecret: Pointer);
+ {$I ..\Include\Simd\Common\SimdProc2Begin.inc}
+ {$I ..\Include\Simd\XXH3\XXH3ScrambleSse2.inc}
+end;
+
+procedure XXH3_initSecret_sse2(ACustomSecret: Pointer;
+ ADefaultSecret: Pointer; ASeed: UInt64);
+ {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+ {$I ..\Include\Simd\XXH3\XXH3InitSecretSse2.inc}
+end;
+
+procedure XXH3_accumulate_sse2(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer; ANbStripes: Int32);
+var
+ N: Int32;
+begin
+ for N := 0 to ANbStripes - 1 do
+ XXH3_accumulate_512_sse2(AAcc, PByte(AInput) + N * XXH_STRIPE_LEN,
+ PByte(ASecret) + N * XXH_SECRET_CONSUME_RATE);
+end;
+
+{$IFDEF HASHLIB_AVX2_ASM_SUPPORTED}
+
+// ----- AVX2 -----
+
+procedure XXH3_accumulate_512_avx2(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer);
+ {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+ {$I ..\Include\Simd\XXH3\XXH3Acc512Avx2.inc}
+end;
+
+procedure XXH3_scrambleAcc_avx2(AAcc: Pointer; ASecret: Pointer);
+ {$I ..\Include\Simd\Common\SimdProc2Begin.inc}
+ {$I ..\Include\Simd\XXH3\XXH3ScrambleAvx2.inc}
+end;
+
+procedure XXH3_initSecret_avx2(ACustomSecret: Pointer;
+ ADefaultSecret: Pointer; ASeed: UInt64);
+ {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+ {$I ..\Include\Simd\XXH3\XXH3InitSecretAvx2.inc}
+end;
+
+procedure XXH3_accumulate_avx2(AAcc: Pointer; AInput: Pointer;
+ ASecret: Pointer; ANbStripes: Int32);
+var
+ N: Int32;
+begin
+ for N := 0 to ANbStripes - 1 do
+ XXH3_accumulate_512_avx2(AAcc, PByte(AInput) + N * XXH_STRIPE_LEN,
+ PByte(ASecret) + N * XXH_SECRET_CONSUME_RATE);
+end;
+
+{$ENDIF HASHLIB_AVX2_ASM_SUPPORTED}
+
+{$ENDIF HASHLIB_X86_64}
+
+// =============================================================================
+// Dispatch initialization
+// =============================================================================
+
+procedure InitDispatch();
+begin
+ case TSimd.GetActiveLevel() of
+{$IFDEF HASHLIB_X86_64}
+ {$IFDEF HASHLIB_AVX2_ASM_SUPPORTED}
+ TSimdLevel.AVX2:
+ begin
+ XXH3_Accumulate512 := @XXH3_accumulate_512_avx2;
+ XXH3_Accumulate := @XXH3_accumulate_avx2;
+ XXH3_ScrambleAcc := @XXH3_scrambleAcc_avx2;
+ XXH3_InitSecret := @XXH3_initSecret_avx2;
+ end;
+ {$ENDIF HASHLIB_AVX2_ASM_SUPPORTED}
+ TSimdLevel.SSE2:
+ begin
+ XXH3_Accumulate512 := @XXH3_accumulate_512_sse2;
+ XXH3_Accumulate := @XXH3_accumulate_sse2;
+ XXH3_ScrambleAcc := @XXH3_scrambleAcc_sse2;
+ XXH3_InitSecret := @XXH3_initSecret_sse2;
+ end;
+{$ENDIF}
+ TSimdLevel.Scalar:
+ begin
+ XXH3_Accumulate512 := @XXH3_accumulate_512_scalar;
+ XXH3_Accumulate := @XXH3_accumulate_scalar;
+ XXH3_ScrambleAcc := @XXH3_scrambleAcc_scalar;
+ XXH3_InitSecret := @XXH3_initSecret_scalar;
+ end;
+ end;
+end;
+
+initialization
+ InitDispatch();
+
+end.
diff --git a/HashLib/src/Include/HashLib.inc b/HashLib/src/Include/HashLib.inc
index 1f96646..41b75a6 100644
--- a/HashLib/src/Include/HashLib.inc
+++ b/HashLib/src/Include/HashLib.inc
@@ -54,4 +54,24 @@
{$SCOPEDENUMS ON}
{$POINTERMATH ON}
+{============================== SIMD Settings =================================}
+
+{$IF DEFINED(CPUX86_64) OR DEFINED(CPUX64)}
+ {$DEFINE HASHLIB_X86_64}
+{$IFEND}
+
+{$IFDEF FPC}
+ {$IFDEF HASHLIB_X86_64}
+ {$DEFINE HASHLIB_AVX2_ASM_SUPPORTED}
+ {$ENDIF}
+{$ENDIF}
+
+// Uncomment ONE of the following to force a specific SIMD dispatch level:
+// {$DEFINE HASHLIB_FORCE_SCALAR}
+// {$DEFINE HASHLIB_FORCE_SSE2}
+
+{$IF DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SSE2)}
+ {$MESSAGE ERROR 'HASHLIB_FORCE_SCALAR and HASHLIB_FORCE_SSE2 cannot both be defined. Enable only one.'}
+{$IFEND}
+
(* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *)
diff --git a/HashLib/src/Include/HashLibFPC.inc b/HashLib/src/Include/HashLibFPC.inc
index 7434d78..4d5ae3e 100644
--- a/HashLib/src/Include/HashLibFPC.inc
+++ b/HashLib/src/Include/HashLibFPC.inc
@@ -37,6 +37,7 @@
{$MODE DELPHI}
{$MACRO ON}
+ {$ASMMODE INTEL}
{$NOTES OFF}
{$OPTIMIZATION LEVEL3}
{$OPTIMIZATION NOUSELOADMODIFYSTORE}
diff --git a/HashLib/src/Include/Simd/Common/SimdProc2Begin.inc b/HashLib/src/Include/Simd/Common/SimdProc2Begin.inc
new file mode 100644
index 0000000..0e1a35f
--- /dev/null
+++ b/HashLib/src/Include/Simd/Common/SimdProc2Begin.inc
@@ -0,0 +1,19 @@
+// Shared SIMD procedure prologue for 2-parameter assembly functions.
+// After inclusion: rcx = param1, rdx = param2 (MS x64 ABI).
+// On FPC non-Windows (System V ABI), remaps rdi,rsi -> rcx,rdx.
+// Usage:
+// procedure MyProc(P1, P2: Pointer);
+// {$I SimdProc2Begin.inc}
+// // ... SIMD instructions using rcx, rdx ...
+// end;
+{$IFDEF FPC}
+ assembler; nostackframe;
+asm
+ {$IFNDEF MSWINDOWS}
+ mov rdx, rsi
+ mov rcx, rdi
+ {$ENDIF}
+{$ELSE}
+asm
+ .noframe
+{$ENDIF}
diff --git a/HashLib/src/Include/Simd/Common/SimdProc3Begin.inc b/HashLib/src/Include/Simd/Common/SimdProc3Begin.inc
new file mode 100644
index 0000000..b5554b0
--- /dev/null
+++ b/HashLib/src/Include/Simd/Common/SimdProc3Begin.inc
@@ -0,0 +1,21 @@
+// Shared SIMD procedure prologue for 3-parameter assembly functions.
+// After inclusion: rcx = param1, rdx = param2, r8 = param3 (MS x64 ABI).
+// On FPC non-Windows (System V ABI), remaps rdi,rsi,rdx -> rcx,rdx,r8.
+// Move order avoids register clobbering: save rdx before overwriting.
+// Usage:
+// procedure MyProc(P1, P2, P3: Pointer);
+// {$I SimdProc3Begin.inc}
+// // ... SIMD instructions using rcx, rdx, r8 ...
+// end;
+{$IFDEF FPC}
+ assembler; nostackframe;
+asm
+ {$IFNDEF MSWINDOWS}
+ mov r8, rdx
+ mov rdx, rsi
+ mov rcx, rdi
+ {$ENDIF}
+{$ELSE}
+asm
+ .noframe
+{$ENDIF}
diff --git a/HashLib/src/Include/Simd/Common/SimdProc4Begin.inc b/HashLib/src/Include/Simd/Common/SimdProc4Begin.inc
new file mode 100644
index 0000000..f004bfa
--- /dev/null
+++ b/HashLib/src/Include/Simd/Common/SimdProc4Begin.inc
@@ -0,0 +1,23 @@
+// Shared SIMD procedure prologue for 4-parameter assembly functions.
+// After inclusion: rcx = param1, rdx = param2, r8 = param3, r9 = param4
+// (MS x64 ABI).
+// On FPC non-Windows (System V ABI), remaps rdi,rsi,rdx,rcx -> rcx,rdx,r8,r9.
+// Move order avoids register clobbering: save rcx and rdx first.
+// Usage:
+// procedure MyProc(P1, P2, P3: Pointer; P4: Int32);
+// {$I SimdProc4Begin.inc}
+// // ... SIMD instructions using rcx, rdx, r8, r9 ...
+// end;
+{$IFDEF FPC}
+ assembler; nostackframe;
+asm
+ {$IFNDEF MSWINDOWS}
+ mov r9, rcx
+ mov r8, rdx
+ mov rdx, rsi
+ mov rcx, rdi
+ {$ENDIF}
+{$ELSE}
+asm
+ .noframe
+{$ENDIF}
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3Acc512Avx2.inc b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Avx2.inc
new file mode 100644
index 0000000..d84ef20
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Avx2.inc
@@ -0,0 +1,30 @@
+// AVX2 implementation of XXH3_accumulate_512 (fully unrolled, 2 x 32-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = input ptr, r8 = secret ptr.
+// Uses only volatile registers: ymm0-ymm4.
+// Reference: official xxHash C - XXH3_accumulate_512_avx2 in xxhash.h
+
+ // --- Chunk 0: acc[0..3], input[0..31], secret[0..31] ---
+ vmovdqu ymm0, yword [rcx]
+ vmovdqu ymm1, yword [rdx]
+ vmovdqu ymm2, yword [r8]
+ vpxor ymm2, ymm2, ymm1
+ vpsrlq ymm3, ymm2, 32
+ vpmuludq ymm2, ymm2, ymm3
+ vpshufd ymm1, ymm1, $4E
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm0, ymm0, ymm2
+ vmovdqu yword [rcx], ymm0
+
+ // --- Chunk 1: acc[4..7], input[32..63], secret[32..63] ---
+ vmovdqu ymm0, yword [rcx + $20]
+ vmovdqu ymm1, yword [rdx + $20]
+ vmovdqu ymm2, yword [r8 + $20]
+ vpxor ymm2, ymm2, ymm1
+ vpsrlq ymm3, ymm2, 32
+ vpmuludq ymm2, ymm2, ymm3
+ vpshufd ymm1, ymm1, $4E
+ vpaddq ymm0, ymm0, ymm1
+ vpaddq ymm0, ymm0, ymm2
+ vmovdqu yword [rcx + $20], ymm0
+
+ vzeroupper
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3Acc512Sse2.inc b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Sse2.inc
new file mode 100644
index 0000000..e1ecd3a
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Sse2.inc
@@ -0,0 +1,52 @@
+// SSE2 implementation of XXH3_accumulate_512 (fully unrolled, 4 x 16-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = input ptr, r8 = secret ptr.
+// Uses only volatile registers: xmm0-xmm5.
+// Reference: official xxHash C - XXH3_accumulate_512_sse2 in xxhash.h
+
+ // --- Chunk 0: acc[0..1], input[0..15], secret[0..15] ---
+ movdqu xmm0, oword [rcx]
+ movdqu xmm1, oword [rdx]
+ movdqu xmm2, oword [r8]
+ pxor xmm2, xmm1
+ pshufd xmm3, xmm2, $31
+ pmuludq xmm2, xmm3
+ pshufd xmm1, xmm1, $4E
+ paddq xmm0, xmm1
+ paddq xmm0, xmm2
+ movdqu oword [rcx], xmm0
+
+ // --- Chunk 1: acc[2..3], input[16..31], secret[16..31] ---
+ movdqu xmm0, oword [rcx + $10]
+ movdqu xmm1, oword [rdx + $10]
+ movdqu xmm2, oword [r8 + $10]
+ pxor xmm2, xmm1
+ pshufd xmm3, xmm2, $31
+ pmuludq xmm2, xmm3
+ pshufd xmm1, xmm1, $4E
+ paddq xmm0, xmm1
+ paddq xmm0, xmm2
+ movdqu oword [rcx + $10], xmm0
+
+ // --- Chunk 2: acc[4..5], input[32..47], secret[32..47] ---
+ movdqu xmm0, oword [rcx + $20]
+ movdqu xmm1, oword [rdx + $20]
+ movdqu xmm2, oword [r8 + $20]
+ pxor xmm2, xmm1
+ pshufd xmm3, xmm2, $31
+ pmuludq xmm2, xmm3
+ pshufd xmm1, xmm1, $4E
+ paddq xmm0, xmm1
+ paddq xmm0, xmm2
+ movdqu oword [rcx + $20], xmm0
+
+ // --- Chunk 3: acc[6..7], input[48..63], secret[48..63] ---
+ movdqu xmm0, oword [rcx + $30]
+ movdqu xmm1, oword [rdx + $30]
+ movdqu xmm2, oword [r8 + $30]
+ pxor xmm2, xmm1
+ pshufd xmm3, xmm2, $31
+ pmuludq xmm2, xmm3
+ pshufd xmm1, xmm1, $4E
+ paddq xmm0, xmm1
+ paddq xmm0, xmm2
+ movdqu oword [rcx + $30], xmm0
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3InitSecretAvx2.inc b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretAvx2.inc
new file mode 100644
index 0000000..b2fa2ac
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretAvx2.inc
@@ -0,0 +1,44 @@
+// AVX2 implementation of XXH3_initCustomSecret (fully unrolled, 6 x 32-byte chunks).
+// Expects MS x64 ABI: rcx = customSecret ptr, rdx = defaultSecret ptr, r8 = seed (UInt64).
+// Uses only volatile registers: ymm0-ymm2, rax.
+// Reference: official xxHash C - XXH3_initCustomSecret_avx2 in xxhash.h
+
+ // Build seed vector ymm0 = [seed, -seed, seed, -seed]
+ vmovq xmm0, r8
+ mov rax, r8
+ neg rax
+ vmovq xmm1, rax
+ vpunpcklqdq xmm0, xmm0, xmm1
+ vinserti128 ymm0, ymm0, xmm0, 1
+
+ // Block 0
+ vmovdqu ymm1, yword [rdx]
+ vpaddq ymm1, ymm1, ymm0
+ vmovdqu yword [rcx], ymm1
+
+ // Block 1
+ vmovdqu ymm1, yword [rdx + $20]
+ vpaddq ymm1, ymm1, ymm0
+ vmovdqu yword [rcx + $20], ymm1
+
+ // Block 2
+ vmovdqu ymm1, yword [rdx + $40]
+ vpaddq ymm1, ymm1, ymm0
+ vmovdqu yword [rcx + $40], ymm1
+
+ // Block 3
+ vmovdqu ymm1, yword [rdx + $60]
+ vpaddq ymm1, ymm1, ymm0
+ vmovdqu yword [rcx + $60], ymm1
+
+ // Block 4
+ vmovdqu ymm1, yword [rdx + $80]
+ vpaddq ymm1, ymm1, ymm0
+ vmovdqu yword [rcx + $80], ymm1
+
+ // Block 5
+ vmovdqu ymm1, yword [rdx + $A0]
+ vpaddq ymm1, ymm1, ymm0
+ vmovdqu yword [rcx + $A0], ymm1
+
+ vzeroupper
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3InitSecretSse2.inc b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretSse2.inc
new file mode 100644
index 0000000..8b1306f
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretSse2.inc
@@ -0,0 +1,72 @@
+// SSE2 implementation of XXH3_initCustomSecret (fully unrolled, 12 x 16-byte chunks).
+// Expects MS x64 ABI: rcx = customSecret ptr, rdx = defaultSecret ptr, r8 = seed (UInt64).
+// Uses only volatile registers: xmm0-xmm2.
+// Algorithm: for each 16-byte block, lo_qword += seed, hi_qword -= seed.
+// Reference: official xxHash C - XXH3_initCustomSecret_sse2 in xxhash.h
+
+ // Build seed vector xmm0 = [seed, -seed]
+ movq xmm0, r8
+ mov rax, r8
+ neg rax
+ movq xmm1, rax
+ punpcklqdq xmm0, xmm1
+
+ // Block 0
+ movdqu xmm1, oword [rdx]
+ paddq xmm1, xmm0
+ movdqu oword [rcx], xmm1
+
+ // Block 1
+ movdqu xmm1, oword [rdx + $10]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $10], xmm1
+
+ // Block 2
+ movdqu xmm1, oword [rdx + $20]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $20], xmm1
+
+ // Block 3
+ movdqu xmm1, oword [rdx + $30]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $30], xmm1
+
+ // Block 4
+ movdqu xmm1, oword [rdx + $40]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $40], xmm1
+
+ // Block 5
+ movdqu xmm1, oword [rdx + $50]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $50], xmm1
+
+ // Block 6
+ movdqu xmm1, oword [rdx + $60]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $60], xmm1
+
+ // Block 7
+ movdqu xmm1, oword [rdx + $70]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $70], xmm1
+
+ // Block 8
+ movdqu xmm1, oword [rdx + $80]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $80], xmm1
+
+ // Block 9
+ movdqu xmm1, oword [rdx + $90]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $90], xmm1
+
+ // Block 10
+ movdqu xmm1, oword [rdx + $A0]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $A0], xmm1
+
+ // Block 11
+ movdqu xmm1, oword [rdx + $B0]
+ paddq xmm1, xmm0
+ movdqu oword [rcx + $B0], xmm1
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3ScrambleAvx2.inc b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleAvx2.inc
new file mode 100644
index 0000000..9bc2b28
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleAvx2.inc
@@ -0,0 +1,38 @@
+// AVX2 implementation of XXH3_scrambleAcc (fully unrolled, 2 x 32-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = secret ptr.
+// Uses only volatile registers: ymm0-ymm4, eax.
+// Reference: official xxHash C - XXH3_scrambleAcc_avx2 in xxhash.h
+
+ // Broadcast XXH_PRIME32_1 ($9E3779B1) to all dword lanes of ymm4
+ mov eax, $9E3779B1
+ vmovd xmm4, eax
+ vpbroadcastd ymm4, xmm4
+
+ // --- Chunk 0: acc[0..3], secret[0..31] ---
+ vmovdqu ymm0, yword [rcx]
+ vpsrlq ymm1, ymm0, 47
+ vpxor ymm0, ymm0, ymm1
+ vmovdqu ymm1, yword [rdx]
+ vpxor ymm0, ymm0, ymm1
+ // Multiply 64-bit lanes by PRIME32_1
+ vpsrlq ymm1, ymm0, 32
+ vpmuludq ymm2, ymm0, ymm4
+ vpmuludq ymm1, ymm1, ymm4
+ vpsllq ymm1, ymm1, 32
+ vpaddq ymm0, ymm2, ymm1
+ vmovdqu yword [rcx], ymm0
+
+ // --- Chunk 1: acc[4..7], secret[32..63] ---
+ vmovdqu ymm0, yword [rcx + $20]
+ vpsrlq ymm1, ymm0, 47
+ vpxor ymm0, ymm0, ymm1
+ vmovdqu ymm1, yword [rdx + $20]
+ vpxor ymm0, ymm0, ymm1
+ vpsrlq ymm1, ymm0, 32
+ vpmuludq ymm2, ymm0, ymm4
+ vpmuludq ymm1, ymm1, ymm4
+ vpsllq ymm1, ymm1, 32
+ vpaddq ymm0, ymm2, ymm1
+ vmovdqu yword [rcx + $20], ymm0
+
+ vzeroupper
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3ScrambleSse2.inc b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleSse2.inc
new file mode 100644
index 0000000..50d8218
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleSse2.inc
@@ -0,0 +1,70 @@
+// SSE2 implementation of XXH3_scrambleAcc (fully unrolled, 4 x 16-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = secret ptr.
+// Uses only volatile registers: xmm0-xmm5, eax.
+// Algorithm per 128-bit chunk:
+// acc ^= (acc >> 47)
+// acc ^= secret
+// acc *= XXH_PRIME32_1
+// Reference: official xxHash C - XXH3_scrambleAcc_sse2 in xxhash.h
+
+ // Load XXH_PRIME32_1 ($9E3779B1) into xmm5 as broadcast dword
+ mov eax, $9E3779B1
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+
+ // --- Chunk 0: acc[0..1], secret[0..15] ---
+ movdqu xmm0, oword [rcx]
+ movdqu xmm1, xmm0
+ psrlq xmm1, 47
+ pxor xmm0, xmm1
+ movdqu xmm1, oword [rdx]
+ pxor xmm0, xmm1
+ // Multiply 64-bit lanes by PRIME32_1 using two 32x32->64 multiplies
+ pshufd xmm1, xmm0, $31
+ pmuludq xmm0, xmm5
+ pmuludq xmm1, xmm5
+ psllq xmm1, 32
+ paddq xmm0, xmm1
+ movdqu oword [rcx], xmm0
+
+ // --- Chunk 1: acc[2..3], secret[16..31] ---
+ movdqu xmm0, oword [rcx + $10]
+ movdqu xmm1, xmm0
+ psrlq xmm1, 47
+ pxor xmm0, xmm1
+ movdqu xmm1, oword [rdx + $10]
+ pxor xmm0, xmm1
+ pshufd xmm1, xmm0, $31
+ pmuludq xmm0, xmm5
+ pmuludq xmm1, xmm5
+ psllq xmm1, 32
+ paddq xmm0, xmm1
+ movdqu oword [rcx + $10], xmm0
+
+ // --- Chunk 2: acc[4..5], secret[32..47] ---
+ movdqu xmm0, oword [rcx + $20]
+ movdqu xmm1, xmm0
+ psrlq xmm1, 47
+ pxor xmm0, xmm1
+ movdqu xmm1, oword [rdx + $20]
+ pxor xmm0, xmm1
+ pshufd xmm1, xmm0, $31
+ pmuludq xmm0, xmm5
+ pmuludq xmm1, xmm5
+ psllq xmm1, 32
+ paddq xmm0, xmm1
+ movdqu oword [rcx + $20], xmm0
+
+ // --- Chunk 3: acc[6..7], secret[48..63] ---
+ movdqu xmm0, oword [rcx + $30]
+ movdqu xmm1, xmm0
+ psrlq xmm1, 47
+ pxor xmm0, xmm1
+ movdqu xmm1, oword [rdx + $30]
+ pxor xmm0, xmm1
+ pshufd xmm1, xmm0, $31
+ pmuludq xmm0, xmm5
+ pmuludq xmm1, xmm5
+ psllq xmm1, 32
+ paddq xmm0, xmm1
+ movdqu oword [rcx + $30], xmm0
diff --git a/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk b/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk
index d796bde..b9eef24 100644
--- a/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk
+++ b/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk
@@ -113,6 +113,7 @@ contains
HlpMurmur2_64 in '..\..\Hash64\HlpMurmur2_64.pas',
HlpSipHash in '..\..\Hash64\HlpSipHash.pas',
HlpXXHash64 in '..\..\Hash64\HlpXXHash64.pas',
+ HlpXXHash3Dispatch in '..\..\Hash64\HlpXXHash3Dispatch.pas',
HlpXXHash3 in '..\..\Hash64\HlpXXHash3.pas',
HlpMurmurHash3_x64_128 in '..\..\Hash128\HlpMurmurHash3_x64_128.pas',
HlpMurmurHash3_x86_128 in '..\..\Hash128\HlpMurmurHash3_x86_128.pas',
@@ -132,6 +133,7 @@ contains
HlpConverters in '..\..\Utils\HlpConverters.pas',
HlpBitConverter in '..\..\Utils\HlpBitConverter.pas',
HlpBits in '..\..\Utils\HlpBits.pas',
+ HlpSimd in '..\..\Utils\HlpSimd.pas',
HlpArrayUtils in '..\..\Utils\HlpArrayUtils.pas',
HlpHashLibTypes in '..\..\Utils\HlpHashLibTypes.pas',
HlpBlake2SParams in '..\..\Crypto\Blake2SParams\HlpBlake2SParams.pas',
diff --git a/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk b/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk
index 1bc9c97..01871f6 100644
--- a/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk
+++ b/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk
@@ -23,7 +23,7 @@
"/>
-
+
@@ -448,6 +448,14 @@
+
+
+
+
+
+
+
+
diff --git a/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas b/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas
index 6c7c82e..349b28f 100644
--- a/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas
+++ b/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas
@@ -28,7 +28,8 @@ interface
HlpArgon2TypeAndVersion, HlpPBKDF_Argon2NotBuildInAdapter,
HlpPBKDF_ScryptNotBuildInAdapter, HlpArrayUtils, HlpBlake2BP, HlpBlake2SP,
HlpSipHash128, HlpBlake2SParams, HlpBlake2BParams, HlpIBlake2SParams,
- HlpIBlake2BParams, HlpBlake3, HlpXXHash3, HlpXXHash128;
+ HlpIBlake2BParams, HlpBlake3, HlpXXHash3, HlpXXHash128, HlpSimd,
+ HlpXXHash3Dispatch;
implementation
diff --git a/HashLib/src/Utils/HlpSimd.pas b/HashLib/src/Utils/HlpSimd.pas
new file mode 100644
index 0000000..392cc2e
--- /dev/null
+++ b/HashLib/src/Utils/HlpSimd.pas
@@ -0,0 +1,175 @@
+unit HlpSimd;
+
+{$I ..\Include\HashLib.inc}
+
+interface
+
+type
+ TSimdLevel = (Scalar, SSE2, AVX2);
+
+ TSimd = class sealed
+ private
+ class var FDetectedLevel: TSimdLevel;
+ class function CPUHasSSE2(): Boolean; static;
+ class function CPUHasAVX2(): Boolean; static;
+ class procedure DetectFeatures(); static;
+ public
+ class function GetActiveLevel(): TSimdLevel; static;
+ end;
+
+implementation
+
+{$IFDEF HASHLIB_X86_64}
+
+type
+ TCpuIdResult = record
+ RegEAX, RegEBX, RegECX, RegEDX: UInt32;
+ end;
+
+{$IFDEF FPC}
+procedure CpuIdQuery(ALeaf, ASubLeaf: UInt32; AResult: Pointer);
+ assembler; nostackframe;
+asm
+ push rbx
+ {$IFDEF MSWINDOWS}
+ mov eax, ecx
+ mov ecx, edx
+ cpuid
+ mov dword ptr [r8], eax
+ mov dword ptr [r8 + 4], ebx
+ mov dword ptr [r8 + 8], ecx
+ mov dword ptr [r8 + 12], edx
+ {$ELSE}
+ mov eax, edi
+ mov ecx, esi
+ mov r8, rdx
+ cpuid
+ mov dword ptr [r8], eax
+ mov dword ptr [r8 + 4], ebx
+ mov dword ptr [r8 + 8], ecx
+ mov dword ptr [r8 + 12], edx
+ {$ENDIF}
+ pop rbx
+end;
+{$ELSE}
+procedure CpuIdQuery(ALeaf, ASubLeaf: UInt32; AResult: Pointer);
+asm
+ .PUSHNV RBX
+ mov eax, ecx
+ mov ecx, edx
+ cpuid
+ mov dword ptr [r8], eax
+ mov dword ptr [r8 + 4], ebx
+ mov dword ptr [r8 + 8], ecx
+ mov dword ptr [r8 + 12], edx
+end;
+{$ENDIF}
+
+{$IFDEF FPC}
+procedure XGetBvQuery(AResult: Pointer);
+ assembler; nostackframe;
+asm
+ {$IFDEF MSWINDOWS}
+ mov r8, rcx
+ {$ELSE}
+ mov r8, rdi
+ {$ENDIF}
+ xor ecx, ecx
+ xgetbv
+ mov dword ptr [r8], eax
+ mov dword ptr [r8 + 4], edx
+end;
+{$ELSE}
+procedure XGetBvQuery(AResult: Pointer);
+asm
+ .noframe
+ mov r8, rcx
+ xor ecx, ecx
+ xgetbv
+ mov dword ptr [r8], eax
+ mov dword ptr [r8 + 4], edx
+end;
+{$ENDIF}
+
+{$ENDIF HASHLIB_X86_64}
+
+{ TSimd }
+
+class function TSimd.CPUHasSSE2(): Boolean;
+{$IFDEF HASHLIB_X86_64}
+var
+ LCpuId: TCpuIdResult;
+{$ENDIF}
+begin
+{$IFDEF HASHLIB_X86_64}
+ CpuIdQuery(1, 0, @LCpuId);
+ Result := (LCpuId.RegEDX and (1 shl 26)) <> 0;
+{$ELSE}
+ Result := False;
+{$ENDIF}
+end;
+
+class function TSimd.CPUHasAVX2(): Boolean;
+{$IFDEF HASHLIB_X86_64}
+var
+ LCpuId: TCpuIdResult;
+ LXcr0: UInt64;
+{$ENDIF}
+begin
+{$IFDEF HASHLIB_X86_64}
+ CpuIdQuery(1, 0, @LCpuId);
+
+ // OSXSAVE: ECX bit 27 (required for OS AVX state saving)
+ if (LCpuId.RegECX and (1 shl 27)) = 0 then
+ Exit(False);
+
+ // XCR0 bits 1 and 2 must be set for AVX state support
+ LXcr0 := 0;
+ XGetBvQuery(@LXcr0);
+ if (UInt32(LXcr0) and $06) <> $06 then
+ Exit(False);
+
+ CpuIdQuery(7, 0, @LCpuId);
+
+ // AVX2: EBX bit 5
+ Result := (LCpuId.RegEBX and (1 shl 5)) <> 0;
+{$ELSE}
+ Result := False;
+{$ENDIF}
+end;
+
+class procedure TSimd.DetectFeatures();
+begin
+ FDetectedLevel := TSimdLevel.Scalar;
+
+ if CPUHasSSE2() then
+ begin
+ FDetectedLevel := TSimdLevel.SSE2;
+ if CPUHasAVX2() then
+ FDetectedLevel := TSimdLevel.AVX2;
+ end;
+
+ // Cap based on compiler assembler capability
+{$IFNDEF HASHLIB_AVX2_ASM_SUPPORTED}
+ if FDetectedLevel > TSimdLevel.SSE2 then
+ FDetectedLevel := TSimdLevel.SSE2;
+{$ENDIF}
+
+ // Cap based on user force defines
+{$IF DEFINED(HASHLIB_FORCE_SCALAR)}
+ FDetectedLevel := TSimdLevel.Scalar;
+{$ELSEIF DEFINED(HASHLIB_FORCE_SSE2)}
+ if FDetectedLevel > TSimdLevel.SSE2 then
+ FDetectedLevel := TSimdLevel.SSE2;
+{$IFEND}
+end;
+
+class function TSimd.GetActiveLevel(): TSimdLevel;
+begin
+ Result := FDetectedLevel;
+end;
+
+initialization
+ TSimd.DetectFeatures();
+
+end.