diff --git a/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr b/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr index 990185a..6e2a9c5 100644 --- a/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr +++ b/HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr @@ -86,6 +86,7 @@ uses HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas', HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas', HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas', + HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas', HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas', HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas', HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas', @@ -108,6 +109,7 @@ uses HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas', HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas', HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas', + HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas', HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas', HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas'; diff --git a/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr b/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr index 66026a6..5f079cd 100644 --- a/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr +++ b/HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr @@ -85,6 +85,7 @@ uses HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas', HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas', HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas', + HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas', HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas', HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas', HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas', @@ -107,6 +108,7 @@ uses HlpConverters in 
'..\..\HashLib\src\Utils\HlpConverters.pas', HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas', HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas', + HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas', HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas', HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas'; diff --git a/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr b/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr index 4e61f05..deded47 100644 --- a/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr +++ b/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr @@ -107,6 +107,7 @@ uses HlpMurmur2_64 in '..\..\HashLib\src\Hash64\HlpMurmur2_64.pas', HlpSipHash in '..\..\HashLib\src\Hash64\HlpSipHash.pas', HlpXXHash64 in '..\..\HashLib\src\Hash64\HlpXXHash64.pas', + HlpXXHash3Dispatch in '..\..\HashLib\src\Hash64\HlpXXHash3Dispatch.pas', HlpXXHash3 in '..\..\HashLib\src\Hash64\HlpXXHash3.pas', HlpMurmurHash3_x86_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x86_128.pas', HlpMurmurHash3_x64_128 in '..\..\HashLib\src\Hash128\HlpMurmurHash3_x64_128.pas', @@ -129,6 +130,7 @@ uses HlpConverters in '..\..\HashLib\src\Utils\HlpConverters.pas', HlpBitConverter in '..\..\HashLib\src\Utils\HlpBitConverter.pas', HlpBits in '..\..\HashLib\src\Utils\HlpBits.pas', + HlpSimd in '..\..\HashLib\src\Utils\HlpSimd.pas', HlpHashLibTypes in '..\..\HashLib\src\Utils\HlpHashLibTypes.pas', HlpArrayUtils in '..\..\HashLib\src\Utils\HlpArrayUtils.pas', HashLibTestBase in '..\src\HashLibTestBase.pas', diff --git a/HashLib/src/Hash64/HlpXXHash3.pas b/HashLib/src/Hash64/HlpXXHash3.pas index 7851d39..76e13bc 100644 --- a/HashLib/src/Hash64/HlpXXHash3.pas +++ b/HashLib/src/Hash64/HlpXXHash3.pas @@ -179,6 +179,9 @@ TXXH3_State = record implementation +uses + HlpXXHash3Dispatch; + { TXXH3Core } class function TXXH3Core.XXH_mult32to64(AX, AY: UInt32): UInt64; @@ -283,11 +286,8 @@ class procedure TXXH3Core.XXH3_scalarRound(var AAcc: TXXH3AccArray; class procedure 
TXXH3Core.XXH3_accumulate_512(var AAcc: TXXH3AccArray; AInput, ASecret: PByte); -var - I: Int32; begin - for I := 0 to XXH_ACC_NB - 1 do - XXH3_scalarRound(AAcc, AInput, ASecret, I); + HlpXXHash3Dispatch.XXH3_Accumulate512(@AAcc[0], AInput, ASecret); end; class procedure TXXH3Core.XXH3_scalarScrambleRound(var AAcc: TXXH3AccArray; @@ -305,21 +305,14 @@ class procedure TXXH3Core.XXH3_scalarScrambleRound(var AAcc: TXXH3AccArray; class procedure TXXH3Core.XXH3_scrambleAcc(var AAcc: TXXH3AccArray; ASecret: PByte); -var - I: Int32; begin - for I := 0 to XXH_ACC_NB - 1 do - XXH3_scalarScrambleRound(AAcc, ASecret, I); + HlpXXHash3Dispatch.XXH3_ScrambleAcc(@AAcc[0], ASecret); end; class procedure TXXH3Core.XXH3_accumulate(var AAcc: TXXH3AccArray; AInput, ASecret: PByte; ANbStripes: Int32); -var - N: Int32; begin - for N := 0 to ANbStripes - 1 do - XXH3_accumulate_512(AAcc, AInput + N * XXH_STRIPE_LEN, - ASecret + N * XXH_SECRET_CONSUME_RATE); + HlpXXHash3Dispatch.XXH3_Accumulate(@AAcc[0], AInput, ASecret, ANbStripes); end; class procedure TXXH3Core.XXH3_hashLong_internal_loop( @@ -351,19 +344,8 @@ class procedure TXXH3Core.XXH3_hashLong_internal_loop( class procedure TXXH3Core.XXH3_initCustomSecret(ACustomSecret: PByte; ASeed: UInt64); -var - I: Int32; - LLo, LHi: UInt64; begin - for I := 0 to (XXH3_SECRET_DEFAULT_SIZE div 16) - 1 do - begin - LLo := TConverters.ReadBytesAsUInt64LE(PByte(@XXH3_SECRET[0]), - 16 * I) + ASeed; - LHi := TConverters.ReadBytesAsUInt64LE(PByte(@XXH3_SECRET[0]), - 16 * I + 8) - ASeed; - PUInt64(ACustomSecret + 16 * I)^ := LLo; - PUInt64(ACustomSecret + 16 * I + 8)^ := LHi; - end; + HlpXXHash3Dispatch.XXH3_InitSecret(ACustomSecret, @XXH3_SECRET[0], ASeed); end; class procedure TXXH3Core.XXH3_consumeStripes(var AAcc: TXXH3AccArray; diff --git a/HashLib/src/Hash64/HlpXXHash3Dispatch.pas b/HashLib/src/Hash64/HlpXXHash3Dispatch.pas new file mode 100644 index 0000000..84433e4 --- /dev/null +++ b/HashLib/src/Hash64/HlpXXHash3Dispatch.pas @@ -0,0 +1,213 @@ 
+unit HlpXXHash3Dispatch;
+
+{$I ..\Include\HashLib.inc}
+
+// Runtime dispatch for the XXH3 stripe kernels.
+// The unit exposes four procedure variables; InitDispatch (run from the
+// initialization section) points each of them at the scalar, SSE2 or AVX2
+// routine according to TSimd.GetActiveLevel().
+
+interface
+
+type
+  // Adds one stripe of input into the accumulator block.
+  TXXH3Accumulate512Proc = procedure(AAcc: Pointer; AInput: Pointer;
+    ASecret: Pointer);
+  // Runs the stripe kernel ANbStripes times over consecutive stripes.
+  TXXH3AccumulateProc = procedure(AAcc: Pointer; AInput: Pointer;
+    ASecret: Pointer; ANbStripes: Int32);
+  // Scrambles the accumulator block with 64 bytes of secret.
+  TXXH3ScrambleAccProc = procedure(AAcc: Pointer; ASecret: Pointer);
+  // Derives a seeded secret from the default secret.
+  TXXH3InitSecretProc = procedure(ACustomSecret: Pointer;
+    ADefaultSecret: Pointer; ASeed: UInt64);
+
+var
+  // Assigned once by InitDispatch; callers invoke them like plain procedures.
+  XXH3_Accumulate512: TXXH3Accumulate512Proc;
+  XXH3_Accumulate: TXXH3AccumulateProc;
+  XXH3_ScrambleAcc: TXXH3ScrambleAccProc;
+  XXH3_InitSecret: TXXH3InitSecretProc;
+
+implementation
+
+uses
+  HlpSimd;
+
+const
+  XXH_STRIPE_LEN = 64;
+  XXH_ACC_NB = 8;
+  XXH_SECRET_CONSUME_RATE = 8;
+  XXH_PRIME32_1 = UInt32($9E3779B1);
+  // Size in bytes of the secret processed by the init-secret kernels
+  // (12 x 16-byte blocks in the SSE2 path, 6 x 32-byte blocks in AVX2).
+  XXH3_SECRET_DEFAULT_SIZE = 192;
+
+// =============================================================================
+// Scalar fallback implementations
+// =============================================================================
+// NOTE(review): all scalar routines read and write 64-bit words through
+// PUInt64 dereferences, i.e. in host byte order and without alignment
+// handling. Confirm whether big-endian or strict-alignment targets are
+// expected to use this unit.
+
+// Processes one 64-byte stripe: for each of the XXH_ACC_NB 64-bit lanes,
+// the input word is added to the neighbouring lane (I xor 1) and the product
+// of the low and high 32-bit halves of (input xor secret) is added to lane I.
+procedure XXH3_accumulate_512_scalar(AAcc: Pointer; AInput: Pointer;
+  ASecret: Pointer);
+var
+  LPAcc: PUInt64;
+  LPInput, LPSecret: PByte;
+  I: Int32;
+  LDataVal, LDataKey: UInt64;
+begin
+  LPAcc := PUInt64(AAcc);
+  LPInput := PByte(AInput);
+  LPSecret := PByte(ASecret);
+  for I := 0 to XXH_ACC_NB - 1 do
+  begin
+    LDataVal := PUInt64(LPInput + I * 8)^;
+    LDataKey := LDataVal xor PUInt64(LPSecret + I * 8)^;
+    // Swapped-lane add: lane 0 receives input word 1, lane 1 receives word 0...
+    PUInt64(PByte(LPAcc) + (I xor 1) * 8)^ :=
+      PUInt64(PByte(LPAcc) + (I xor 1) * 8)^ + LDataVal;
+    // 32x32 -> 64 multiply of the two halves of the keyed word.
+    PUInt64(PByte(LPAcc) + I * 8)^ :=
+      PUInt64(PByte(LPAcc) + I * 8)^ +
+      UInt64(UInt32(LDataKey)) * UInt64(UInt32(LDataKey shr 32));
+  end;
+end;
+
+// Scrambles every accumulator lane in place:
+//   acc := ((acc xor (acc shr 47)) xor secret[I]) * XXH_PRIME32_1
+procedure XXH3_scrambleAcc_scalar(AAcc: Pointer; ASecret: Pointer);
+var
+  LPAcc: PUInt64;
+  LPSecret: PByte;
+  I: Int32;
+  LKey64, LAcc64: UInt64;
+begin
+  LPAcc := PUInt64(AAcc);
+  LPSecret := PByte(ASecret);
+  for I := 0 to XXH_ACC_NB - 1 do
+  begin
+    LKey64 := PUInt64(LPSecret + I * 8)^;
+    LAcc64 := PUInt64(PByte(LPAcc) + I * 8)^;
+    LAcc64 := LAcc64 xor (LAcc64 shr 47);
+    LAcc64 := LAcc64 xor LKey64;
+    LAcc64 := LAcc64 * XXH_PRIME32_1;
+    PUInt64(PByte(LPAcc) + I * 8)^ := LAcc64;
+  end;
+end;
+
+// Builds a seeded secret: in every 16-byte block of ADefaultSecret the low
+// qword gets ASeed added and the high qword gets ASeed subtracted; the result
+// is written to the same offset of ACustomSecret.
+procedure XXH3_initSecret_scalar(ACustomSecret: Pointer;
+  ADefaultSecret: Pointer; ASeed: UInt64);
+var
+  I: Int32;
+  LPSrc, LPDst: PByte;
+begin
+  LPSrc := PByte(ADefaultSecret);
+  LPDst := PByte(ACustomSecret);
+  for I := 0 to (XXH3_SECRET_DEFAULT_SIZE div 16) - 1 do
+  begin
+    PUInt64(LPDst + 16 * I)^ := PUInt64(LPSrc + 16 * I)^ + ASeed;
+    PUInt64(LPDst + 16 * I + 8)^ := PUInt64(LPSrc + 16 * I + 8)^ - ASeed;
+  end;
+end;
+
+// Runs the scalar stripe kernel ANbStripes times. Per stripe the input
+// pointer advances by XXH_STRIPE_LEN bytes and the secret pointer by
+// XXH_SECRET_CONSUME_RATE bytes. Does nothing when ANbStripes <= 0.
+procedure XXH3_accumulate_scalar(AAcc: Pointer; AInput: Pointer;
+  ASecret: Pointer; ANbStripes: Int32);
+var
+  N: Int32;
+begin
+  for N := 0 to ANbStripes - 1 do
+    XXH3_accumulate_512_scalar(AAcc, PByte(AInput) + N * XXH_STRIPE_LEN,
+      PByte(ASecret) + N * XXH_SECRET_CONSUME_RATE);
+end;
+
+// =============================================================================
+// SSE2 and AVX2 implementations (x86-64 only)
+// =============================================================================
+// Each assembly routine below is assembled from two include files: a
+// SimdProcNBegin.inc prologue that opens the asm block (and, on FPC
+// non-Windows targets, moves the arguments into rcx/rdx/r8), followed by the
+// instruction body. The trailing "end;" closes the asm block.
+
+{$IFDEF HASHLIB_X86_64}
+
+// ----- SSE2 -----
+
+procedure XXH3_accumulate_512_sse2(AAcc: Pointer; AInput: Pointer;
+  ASecret: Pointer);
+  {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+  {$I ..\Include\Simd\XXH3\XXH3Acc512Sse2.inc}
+end;
+
+procedure XXH3_scrambleAcc_sse2(AAcc: Pointer; ASecret: Pointer);
+  {$I ..\Include\Simd\Common\SimdProc2Begin.inc}
+  {$I ..\Include\Simd\XXH3\XXH3ScrambleSse2.inc}
+end;
+
+procedure XXH3_initSecret_sse2(ACustomSecret: Pointer;
+  ADefaultSecret: Pointer; ASeed: UInt64);
+  {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+  {$I ..\Include\Simd\XXH3\XXH3InitSecretSse2.inc}
+end;
+
+// Multi-stripe driver for the SSE2 kernel; same pointer stepping as the
+// scalar driver.
+procedure XXH3_accumulate_sse2(AAcc: Pointer; AInput: Pointer;
+  ASecret: Pointer; ANbStripes: Int32);
+var
+  N: Int32;
+begin
+  for N := 0 to ANbStripes - 1 do
+    XXH3_accumulate_512_sse2(AAcc, PByte(AInput) + N * XXH_STRIPE_LEN,
+      PByte(ASecret) + N * XXH_SECRET_CONSUME_RATE);
+end;
+
+{$IFDEF HASHLIB_AVX2_ASM_SUPPORTED}
+
+// ----- AVX2 -----
+
+procedure XXH3_accumulate_512_avx2(AAcc: Pointer; AInput: Pointer;
+  ASecret: Pointer);
+  {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+  {$I ..\Include\Simd\XXH3\XXH3Acc512Avx2.inc}
+end;
+
+procedure XXH3_scrambleAcc_avx2(AAcc: Pointer; ASecret: Pointer);
+  {$I ..\Include\Simd\Common\SimdProc2Begin.inc}
+  {$I ..\Include\Simd\XXH3\XXH3ScrambleAvx2.inc}
+end;
+
+procedure XXH3_initSecret_avx2(ACustomSecret: Pointer;
+  ADefaultSecret: Pointer; ASeed: UInt64);
+  {$I ..\Include\Simd\Common\SimdProc3Begin.inc}
+  {$I ..\Include\Simd\XXH3\XXH3InitSecretAvx2.inc}
+end;
+
+// Multi-stripe driver for the AVX2 kernel; same pointer stepping as the
+// scalar driver.
+procedure XXH3_accumulate_avx2(AAcc: Pointer; AInput: Pointer;
+  ASecret: Pointer; ANbStripes: Int32);
+var
+  N: Int32;
+begin
+  for N := 0 to ANbStripes - 1 do
+    XXH3_accumulate_512_avx2(AAcc, PByte(AInput) + N * XXH_STRIPE_LEN,
+      PByte(ASecret) + N * XXH_SECRET_CONSUME_RATE);
+end;
+
+{$ENDIF HASHLIB_AVX2_ASM_SUPPORTED}
+
+{$ENDIF HASHLIB_X86_64}
+
+// =============================================================================
+// Dispatch initialization
+// =============================================================================
+
+// Binds the four public procedure variables.
+// The scalar routines are installed unconditionally first, so the variables
+// can never stay nil when the active level has no kernel compiled into this
+// build; the SIMD branches then override them where available.
+procedure InitDispatch();
+begin
+  XXH3_Accumulate512 := @XXH3_accumulate_512_scalar;
+  XXH3_Accumulate := @XXH3_accumulate_scalar;
+  XXH3_ScrambleAcc := @XXH3_scrambleAcc_scalar;
+  XXH3_InitSecret := @XXH3_initSecret_scalar;
+{$IFDEF HASHLIB_X86_64}
+  case TSimd.GetActiveLevel() of
+  {$IFDEF HASHLIB_AVX2_ASM_SUPPORTED}
+    TSimdLevel.AVX2:
+      begin
+        XXH3_Accumulate512 := @XXH3_accumulate_512_avx2;
+        XXH3_Accumulate := @XXH3_accumulate_avx2;
+        XXH3_ScrambleAcc := @XXH3_scrambleAcc_avx2;
+        XXH3_InitSecret := @XXH3_initSecret_avx2;
+      end;
+  {$ENDIF HASHLIB_AVX2_ASM_SUPPORTED}
+    TSimdLevel.SSE2:
+      begin
+        XXH3_Accumulate512 := @XXH3_accumulate_512_sse2;
+        XXH3_Accumulate := @XXH3_accumulate_sse2;
+        XXH3_ScrambleAcc := @XXH3_scrambleAcc_sse2;
+        XXH3_InitSecret := @XXH3_initSecret_sse2;
+      end;
+  else
+    // Scalar, or a level with no kernel in this build: keep the defaults.
+  end;
+{$ENDIF}
+end;
+
+initialization
+  InitDispatch();
+
+end.
diff --git a/HashLib/src/Include/HashLib.inc b/HashLib/src/Include/HashLib.inc index 1f96646..41b75a6 100644 --- a/HashLib/src/Include/HashLib.inc +++ b/HashLib/src/Include/HashLib.inc @@ -54,4 +54,24 @@ {$SCOPEDENUMS ON} {$POINTERMATH ON} +{============================== SIMD Settings =================================} + +{$IF DEFINED(CPUX86_64) OR DEFINED(CPUX64)} + {$DEFINE HASHLIB_X86_64} +{$IFEND} + +{$IFDEF FPC} + {$IFDEF HASHLIB_X86_64} + {$DEFINE HASHLIB_AVX2_ASM_SUPPORTED} + {$ENDIF} +{$ENDIF} + +// Uncomment ONE of the following to force a specific SIMD dispatch level: +// {$DEFINE HASHLIB_FORCE_SCALAR} +// {$DEFINE HASHLIB_FORCE_SSE2} + +{$IF DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SSE2)} + {$MESSAGE ERROR 'HASHLIB_FORCE_SCALAR and HASHLIB_FORCE_SSE2 cannot both be defined. Enable only one.'} +{$IFEND} + (* &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& *) diff --git a/HashLib/src/Include/HashLibFPC.inc b/HashLib/src/Include/HashLibFPC.inc index 7434d78..4d5ae3e 100644 --- a/HashLib/src/Include/HashLibFPC.inc +++ b/HashLib/src/Include/HashLibFPC.inc @@ -37,6 +37,7 @@ {$MODE DELPHI} {$MACRO ON} + {$ASMMODE INTEL} {$NOTES OFF} {$OPTIMIZATION LEVEL3} {$OPTIMIZATION NOUSELOADMODIFYSTORE}
diff --git a/HashLib/src/Include/Simd/Common/SimdProc2Begin.inc b/HashLib/src/Include/Simd/Common/SimdProc2Begin.inc
new file mode 100644
index 0000000..0e1a35f
--- /dev/null
+++ b/HashLib/src/Include/Simd/Common/SimdProc2Begin.inc
@@ -0,0 +1,19 @@
+// Shared SIMD procedure prologue for 2-parameter assembly functions.
+// After inclusion: rcx = param1, rdx = param2 (MS x64 ABI).
+// On FPC non-Windows (System V ABI), remaps rdi,rsi -> rcx,rdx.
+// Usage:
+//   procedure MyProc(P1, P2: Pointer);
+//   {$I SimdProc2Begin.inc}
+//   // ... SIMD instructions using rcx, rdx ...
+//   end;
+{$IFDEF FPC}
+  assembler; nostackframe;
+asm
+  {$IFNDEF MSWINDOWS}
+  mov rdx, rsi // param2: System V rsi -> rdx
+  mov rcx, rdi // param1: System V rdi -> rcx
+  {$ENDIF}
+{$ELSE}
+asm
+  .noframe
+{$ENDIF}
diff --git a/HashLib/src/Include/Simd/Common/SimdProc3Begin.inc b/HashLib/src/Include/Simd/Common/SimdProc3Begin.inc
new file mode 100644
index 0000000..b5554b0
--- /dev/null
+++ b/HashLib/src/Include/Simd/Common/SimdProc3Begin.inc
@@ -0,0 +1,21 @@
+// Shared SIMD procedure prologue for 3-parameter assembly functions.
+// After inclusion: rcx = param1, rdx = param2, r8 = param3 (MS x64 ABI).
+// On FPC non-Windows (System V ABI), remaps rdi,rsi,rdx -> rcx,rdx,r8.
+// Move order avoids register clobbering: rdx (incoming param3) is copied to
+// r8 before rdx is overwritten with param2.
+// The include opens the asm block and leaves it open; the including
+// procedure supplies the instruction body and the closing "end;".
+// NOTE(review): the non-FPC branch performs no remapping, so it assumes the
+// arguments already arrive in rcx/rdx/r8 - confirm that no non-Windows
+// Delphi target compiles this include.
+// Usage:
+//   procedure MyProc(P1, P2, P3: Pointer);
+//   {$I SimdProc3Begin.inc}
+//   // ... SIMD instructions using rcx, rdx, r8 ...
+//   end;
+{$IFDEF FPC}
+  assembler; nostackframe;
+asm
+  {$IFNDEF MSWINDOWS}
+  mov r8, rdx  // param3 first: rdx is about to be overwritten
+  mov rdx, rsi // param2
+  mov rcx, rdi // param1
+  {$ENDIF}
+{$ELSE}
+asm
+  .noframe
+{$ENDIF}
diff --git a/HashLib/src/Include/Simd/Common/SimdProc4Begin.inc b/HashLib/src/Include/Simd/Common/SimdProc4Begin.inc
new file mode 100644
index 0000000..f004bfa
--- /dev/null
+++ b/HashLib/src/Include/Simd/Common/SimdProc4Begin.inc
@@ -0,0 +1,23 @@
+// Shared SIMD procedure prologue for 4-parameter assembly functions.
+// After inclusion: rcx = param1, rdx = param2, r8 = param3, r9 = param4
+// (MS x64 ABI).
+// On FPC non-Windows (System V ABI), remaps rdi,rsi,rdx,rcx -> rcx,rdx,r8,r9.
+// Move order avoids register clobbering: the incoming rcx and rdx are copied
+// to r9 and r8 before they are overwritten.
+// Usage:
+//   procedure MyProc(P1, P2, P3: Pointer; P4: Int32);
+//   {$I SimdProc4Begin.inc}
+//   // ... SIMD instructions using rcx, rdx, r8, r9 ...
+//   end;
+{$IFDEF FPC}
+  assembler; nostackframe;
+asm
+  {$IFNDEF MSWINDOWS}
+  mov r9, rcx  // param4 first: rcx is about to be overwritten
+  mov r8, rdx  // param3 next: rdx is about to be overwritten
+  mov rdx, rsi // param2
+  mov rcx, rdi // param1
+  {$ENDIF}
+{$ELSE}
+asm
+  .noframe
+{$ENDIF}
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3Acc512Avx2.inc b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Avx2.inc
new file mode 100644
index 0000000..d84ef20
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Avx2.inc
@@ -0,0 +1,30 @@
+// AVX2 implementation of XXH3_accumulate_512 (fully unrolled, 2 x 32-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = input ptr, r8 = secret ptr.
+// Uses only volatile registers: ymm0-ymm3.
+// All loads and stores are unaligned (vmovdqu), so no pointer needs alignment.
+// Per 64-bit lane: acc += swapped input qword; acc += lo32(key) * hi32(key),
+// where key = input xor secret.
+// Reference: official xxHash C - XXH3_accumulate_512_avx2 in xxhash.h
+
+  // --- Chunk 0: acc[0..3], input[0..31], secret[0..31] ---
+  vmovdqu ymm0, yword [rcx]    // ymm0 = acc
+  vmovdqu ymm1, yword [rdx]    // ymm1 = input
+  vmovdqu ymm2, yword [r8]     // ymm2 = secret
+  vpxor ymm2, ymm2, ymm1       // ymm2 = key = input xor secret
+  vpsrlq ymm3, ymm2, 32        // ymm3 = high dword of each key qword
+  vpmuludq ymm2, ymm2, ymm3    // ymm2 = lo32(key) * hi32(key), 64-bit result
+  vpshufd ymm1, ymm1, $4E      // swap the two qwords inside each 128-bit half
+  vpaddq ymm0, ymm0, ymm1      // acc += swapped input
+  vpaddq ymm0, ymm0, ymm2      // acc += product
+  vmovdqu yword [rcx], ymm0
+
+  // --- Chunk 1: acc[4..7], input[32..63], secret[32..63] ---
+  // Same sequence as chunk 0 at offset $20.
+  vmovdqu ymm0, yword [rcx + $20]
+  vmovdqu ymm1, yword [rdx + $20]
+  vmovdqu ymm2, yword [r8 + $20]
+  vpxor ymm2, ymm2, ymm1
+  vpsrlq ymm3, ymm2, 32
+  vpmuludq ymm2, ymm2, ymm3
+  vpshufd ymm1, ymm1, $4E
+  vpaddq ymm0, ymm0, ymm1
+  vpaddq ymm0, ymm0, ymm2
+  vmovdqu yword [rcx + $20], ymm0
+
+  // Clear the upper ymm halves before returning to non-VEX code.
+  vzeroupper
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3Acc512Sse2.inc b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Sse2.inc
new file mode 100644
index 0000000..e1ecd3a
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3Acc512Sse2.inc
@@ -0,0 +1,52 @@
+// SSE2 implementation of XXH3_accumulate_512 (fully unrolled, 4 x 16-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = input ptr, r8 = secret ptr.
+// Uses only volatile registers: xmm0-xmm3.
+// Reference: official xxHash C - XXH3_accumulate_512_sse2 in xxhash.h
+
+  // --- Chunk 0: acc[0..1], input[0..15], secret[0..15] ---
+  movdqu xmm0, oword [rcx]     // xmm0 = acc
+  movdqu xmm1, oword [rdx]     // xmm1 = input
+  movdqu xmm2, oword [r8]      // xmm2 = secret
+  pxor xmm2, xmm1              // xmm2 = key = input xor secret
+  pshufd xmm3, xmm2, $31       // move each qword's high dword to its low dword
+  pmuludq xmm2, xmm3           // xmm2 = lo32(key) * hi32(key), 64-bit result
+  pshufd xmm1, xmm1, $4E       // swap the two input qwords
+  paddq xmm0, xmm1             // acc += swapped input
+  paddq xmm0, xmm2             // acc += product
+  movdqu oword [rcx], xmm0
+
+  // --- Chunk 1: acc[2..3], input[16..31], secret[16..31] ---
+  // Chunks 1-3 repeat the chunk 0 sequence at offsets $10, $20 and $30.
+  movdqu xmm0, oword [rcx + $10]
+  movdqu xmm1, oword [rdx + $10]
+  movdqu xmm2, oword [r8 + $10]
+  pxor xmm2, xmm1
+  pshufd xmm3, xmm2, $31
+  pmuludq xmm2, xmm3
+  pshufd xmm1, xmm1, $4E
+  paddq xmm0, xmm1
+  paddq xmm0, xmm2
+  movdqu oword [rcx + $10], xmm0
+
+  // --- Chunk 2: acc[4..5], input[32..47], secret[32..47] ---
+  movdqu xmm0, oword [rcx + $20]
+  movdqu xmm1, oword [rdx + $20]
+  movdqu xmm2, oword [r8 + $20]
+  pxor xmm2, xmm1
+  pshufd xmm3, xmm2, $31
+  pmuludq xmm2, xmm3
+  pshufd xmm1, xmm1, $4E
+  paddq xmm0, xmm1
+  paddq xmm0, xmm2
+  movdqu oword [rcx + $20], xmm0
+
+  // --- Chunk 3: acc[6..7], input[48..63], secret[48..63] ---
+  movdqu xmm0, oword [rcx + $30]
+  movdqu xmm1, oword [rdx + $30]
+  movdqu xmm2, oword [r8 + $30]
+  pxor xmm2, xmm1
+  pshufd xmm3, xmm2, $31
+  pmuludq xmm2, xmm3
+  pshufd xmm1, xmm1, $4E
+  paddq xmm0, xmm1
+  paddq xmm0, xmm2
+  movdqu oword [rcx + $30], xmm0
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3InitSecretAvx2.inc b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretAvx2.inc
new file mode 100644
index 0000000..b2fa2ac
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretAvx2.inc
@@ -0,0 +1,44 @@
+// AVX2 implementation of XXH3_initCustomSecret (fully unrolled, 6 x 32-byte chunks).
+// Expects MS x64 ABI: rcx = customSecret ptr, rdx = defaultSecret ptr, r8 = seed (UInt64).
+// Uses only volatile registers: ymm0-ymm1, rax.
+// Reference: official xxHash C - XXH3_initCustomSecret_avx2 in xxhash.h
+
+  // Build seed vector ymm0 = [seed, -seed, seed, -seed]
+  // Adding -seed to the odd qwords implements the "high qword -= seed" step.
+  vmovq xmm0, r8                   // xmm0 low qword = seed
+  mov rax, r8
+  neg rax                          // rax = -seed (two's complement)
+  vmovq xmm1, rax                  // xmm1 low qword = -seed
+  vpunpcklqdq xmm0, xmm0, xmm1     // xmm0 = [seed, -seed]
+  vinserti128 ymm0, ymm0, xmm0, 1  // copy the pair into the upper 128 bits
+
+  // Block 0
+  vmovdqu ymm1, yword [rdx]
+  vpaddq ymm1, ymm1, ymm0
+  vmovdqu yword [rcx], ymm1
+
+  // Block 1
+  vmovdqu ymm1, yword [rdx + $20]
+  vpaddq ymm1, ymm1, ymm0
+  vmovdqu yword [rcx + $20], ymm1
+
+  // Block 2
+  vmovdqu ymm1, yword [rdx + $40]
+  vpaddq ymm1, ymm1, ymm0
+  vmovdqu yword [rcx + $40], ymm1
+
+  // Block 3
+  vmovdqu ymm1, yword [rdx + $60]
+  vpaddq ymm1, ymm1, ymm0
+  vmovdqu yword [rcx + $60], ymm1
+
+  // Block 4
+  vmovdqu ymm1, yword [rdx + $80]
+  vpaddq ymm1, ymm1, ymm0
+  vmovdqu yword [rcx + $80], ymm1
+
+  // Block 5 (last: bytes $A0..$BF, 192 bytes processed in total)
+  vmovdqu ymm1, yword [rdx + $A0]
+  vpaddq ymm1, ymm1, ymm0
+  vmovdqu yword [rcx + $A0], ymm1
+
+  // Clear the upper ymm halves before returning to non-VEX code.
+  vzeroupper
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3InitSecretSse2.inc b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretSse2.inc
new file mode 100644
index 0000000..8b1306f
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3InitSecretSse2.inc
@@ -0,0 +1,72 @@
+// SSE2 implementation of XXH3_initCustomSecret (fully unrolled, 12 x 16-byte chunks).
+// Expects MS x64 ABI: rcx = customSecret ptr, rdx = defaultSecret ptr, r8 = seed (UInt64).
+// Uses only volatile registers: xmm0-xmm1, rax.
+// Algorithm: for each 16-byte block, lo_qword += seed, hi_qword -= seed.
+// Reference: official xxHash C - XXH3_initCustomSecret_sse2 in xxhash.h
+
+  // Build seed vector xmm0 = [seed, -seed]
+  // The subtraction of the high qword is done as an addition of -seed.
+  movq xmm0, r8            // xmm0 low qword = seed
+  mov rax, r8
+  neg rax                  // rax = -seed (two's complement)
+  movq xmm1, rax           // xmm1 low qword = -seed
+  punpcklqdq xmm0, xmm1    // xmm0 = [seed, -seed]
+
+  // Block 0
+  movdqu xmm1, oword [rdx]
+  paddq xmm1, xmm0
+  movdqu oword [rcx], xmm1
+
+  // Block 1
+  movdqu xmm1, oword [rdx + $10]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $10], xmm1
+
+  // Block 2
+  movdqu xmm1, oword [rdx + $20]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $20], xmm1
+
+  // Block 3
+  movdqu xmm1, oword [rdx + $30]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $30], xmm1
+
+  // Block 4
+  movdqu xmm1, oword [rdx + $40]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $40], xmm1
+
+  // Block 5
+  movdqu xmm1, oword [rdx + $50]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $50], xmm1
+
+  // Block 6
+  movdqu xmm1, oword [rdx + $60]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $60], xmm1
+
+  // Block 7
+  movdqu xmm1, oword [rdx + $70]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $70], xmm1
+
+  // Block 8
+  movdqu xmm1, oword [rdx + $80]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $80], xmm1
+
+  // Block 9
+  movdqu xmm1, oword [rdx + $90]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $90], xmm1
+
+  // Block 10
+  movdqu xmm1, oword [rdx + $A0]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $A0], xmm1
+
+  // Block 11 (last: bytes $B0..$BF, 192 bytes processed in total)
+  movdqu xmm1, oword [rdx + $B0]
+  paddq xmm1, xmm0
+  movdqu oword [rcx + $B0], xmm1
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3ScrambleAvx2.inc b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleAvx2.inc
new file mode 100644
index 0000000..9bc2b28
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleAvx2.inc
@@ -0,0 +1,38 @@
+// AVX2 implementation of XXH3_scrambleAcc (fully unrolled, 2 x 32-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = secret ptr.
+// Uses only volatile registers: ymm0-ymm2, ymm4, eax.
+// Reference: official xxHash C - XXH3_scrambleAcc_avx2 in xxhash.h
+
+  // Broadcast XXH_PRIME32_1 ($9E3779B1) to all dword lanes of ymm4
+  mov eax, $9E3779B1
+  vmovd xmm4, eax
+  vpbroadcastd ymm4, xmm4
+
+  // --- Chunk 0: acc[0..3], secret[0..31] ---
+  vmovdqu ymm0, yword [rcx]      // ymm0 = acc
+  vpsrlq ymm1, ymm0, 47          // ymm1 = acc >> 47
+  vpxor ymm0, ymm0, ymm1         // acc ^= acc >> 47
+  vmovdqu ymm1, yword [rdx]      // ymm1 = secret
+  vpxor ymm0, ymm0, ymm1         // acc ^= secret
+  // Multiply 64-bit lanes by PRIME32_1: there is no 64-bit lane multiply
+  // here, so the low and high dwords are multiplied separately and the
+  // high product is shifted back up before the two are summed.
+  vpsrlq ymm1, ymm0, 32          // ymm1 = high dword of each lane
+  vpmuludq ymm2, ymm0, ymm4      // ymm2 = lo32(acc) * prime
+  vpmuludq ymm1, ymm1, ymm4      // ymm1 = hi32(acc) * prime
+  vpsllq ymm1, ymm1, 32          // keep only the part that lands in bits 32..63
+  vpaddq ymm0, ymm2, ymm1        // acc = low product + shifted high product
+  vmovdqu yword [rcx], ymm0
+
+  // --- Chunk 1: acc[4..7], secret[32..63] ---
+  // Same sequence as chunk 0 at offset $20.
+  vmovdqu ymm0, yword [rcx + $20]
+  vpsrlq ymm1, ymm0, 47
+  vpxor ymm0, ymm0, ymm1
+  vmovdqu ymm1, yword [rdx + $20]
+  vpxor ymm0, ymm0, ymm1
+  vpsrlq ymm1, ymm0, 32
+  vpmuludq ymm2, ymm0, ymm4
+  vpmuludq ymm1, ymm1, ymm4
+  vpsllq ymm1, ymm1, 32
+  vpaddq ymm0, ymm2, ymm1
+  vmovdqu yword [rcx + $20], ymm0
+
+  // Clear the upper ymm halves before returning to non-VEX code.
+  vzeroupper
diff --git a/HashLib/src/Include/Simd/XXH3/XXH3ScrambleSse2.inc b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleSse2.inc
new file mode 100644
index 0000000..50d8218
--- /dev/null
+++ b/HashLib/src/Include/Simd/XXH3/XXH3ScrambleSse2.inc
@@ -0,0 +1,70 @@
+// SSE2 implementation of XXH3_scrambleAcc (fully unrolled, 4 x 16-byte chunks).
+// Expects MS x64 ABI: rcx = acc ptr, rdx = secret ptr.
+// Uses only volatile registers: xmm0, xmm1, xmm5, eax.
+// Algorithm per 128-bit chunk:
+//   acc ^= (acc >> 47)
+//   acc ^= secret
+//   acc *= XXH_PRIME32_1
+// Reference: official xxHash C - XXH3_scrambleAcc_sse2 in xxhash.h
+
+  // Load XXH_PRIME32_1 ($9E3779B1) into xmm5 as broadcast dword
+  mov eax, $9E3779B1
+  movd xmm5, eax
+  pshufd xmm5, xmm5, 0         // replicate dword 0 into all four dwords
+
+  // --- Chunk 0: acc[0..1], secret[0..15] ---
+  movdqu xmm0, oword [rcx]     // xmm0 = acc
+  movdqu xmm1, xmm0            // register copy of acc
+  psrlq xmm1, 47               // xmm1 = acc >> 47
+  pxor xmm0, xmm1              // acc ^= acc >> 47
+  movdqu xmm1, oword [rdx]     // xmm1 = secret
+  pxor xmm0, xmm1              // acc ^= secret
+  // Multiply 64-bit lanes by PRIME32_1 using two 32x32->64 multiplies
+  pshufd xmm1, xmm0, $31       // move each qword's high dword to its low dword
+  pmuludq xmm0, xmm5           // xmm0 = lo32(acc) * prime
+  pmuludq xmm1, xmm5           // xmm1 = hi32(acc) * prime
+  psllq xmm1, 32               // keep only the part that lands in bits 32..63
+  paddq xmm0, xmm1             // acc = low product + shifted high product
+  movdqu oword [rcx], xmm0
+
+  // --- Chunk 1: acc[2..3], secret[16..31] ---
+  // Chunks 1-3 repeat the chunk 0 sequence at offsets $10, $20 and $30.
+  movdqu xmm0, oword [rcx + $10]
+  movdqu xmm1, xmm0
+  psrlq xmm1, 47
+  pxor xmm0, xmm1
+  movdqu xmm1, oword [rdx + $10]
+  pxor xmm0, xmm1
+  pshufd xmm1, xmm0, $31
+  pmuludq xmm0, xmm5
+  pmuludq xmm1, xmm5
+  psllq xmm1, 32
+  paddq xmm0, xmm1
+  movdqu oword [rcx + $10], xmm0
+
+  // --- Chunk 2: acc[4..5], secret[32..47] ---
+  movdqu xmm0, oword [rcx + $20]
+  movdqu xmm1, xmm0
+  psrlq xmm1, 47
+  pxor xmm0, xmm1
+  movdqu xmm1, oword [rdx + $20]
+  pxor xmm0, xmm1
+  pshufd xmm1, xmm0, $31
+  pmuludq xmm0, xmm5
+  pmuludq xmm1, xmm5
+  psllq xmm1, 32
+  paddq xmm0, xmm1
+  movdqu oword [rcx + $20], xmm0
+
+  // --- Chunk 3: acc[6..7], secret[48..63] ---
+  movdqu xmm0, oword [rcx + $30]
+  movdqu xmm1, xmm0
+  psrlq xmm1, 47
+  pxor xmm0, xmm1
+  movdqu xmm1, oword [rdx + $30]
+  pxor xmm0, xmm1
+  pshufd xmm1, xmm0, $31
+  pmuludq xmm0, xmm5
+  pmuludq xmm1, xmm5
+  psllq xmm1, 32
+  paddq xmm0, xmm1
+  movdqu oword [rcx + $30], xmm0
diff --git a/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk b/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk index d796bde..b9eef24 100644 --- a/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk +++ b/HashLib/src/Packages/Delphi/HashLib4PascalPackage.dpk @@ -113,6 +113,7 @@ contains HlpMurmur2_64 in 
'..\..\Hash64\HlpMurmur2_64.pas', HlpSipHash in '..\..\Hash64\HlpSipHash.pas', HlpXXHash64 in '..\..\Hash64\HlpXXHash64.pas', + HlpXXHash3Dispatch in '..\..\Hash64\HlpXXHash3Dispatch.pas', HlpXXHash3 in '..\..\Hash64\HlpXXHash3.pas', HlpMurmurHash3_x64_128 in '..\..\Hash128\HlpMurmurHash3_x64_128.pas', HlpMurmurHash3_x86_128 in '..\..\Hash128\HlpMurmurHash3_x86_128.pas', @@ -132,6 +133,7 @@ contains HlpConverters in '..\..\Utils\HlpConverters.pas', HlpBitConverter in '..\..\Utils\HlpBitConverter.pas', HlpBits in '..\..\Utils\HlpBits.pas', + HlpSimd in '..\..\Utils\HlpSimd.pas', HlpArrayUtils in '..\..\Utils\HlpArrayUtils.pas', HlpHashLibTypes in '..\..\Utils\HlpHashLibTypes.pas', HlpBlake2SParams in '..\..\Crypto\Blake2SParams\HlpBlake2SParams.pas', diff --git a/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk b/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk index 1bc9c97..01871f6 100644 --- a/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk +++ b/HashLib/src/Packages/FPC/HashLib4PascalPackage.lpk @@ -23,7 +23,7 @@ "/> - + @@ -448,6 +448,14 @@ + + + + + + + + diff --git a/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas b/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas index 6c7c82e..349b28f 100644 --- a/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas +++ b/HashLib/src/Packages/FPC/HashLib4PascalPackage.pas @@ -28,7 +28,8 @@ interface HlpArgon2TypeAndVersion, HlpPBKDF_Argon2NotBuildInAdapter, HlpPBKDF_ScryptNotBuildInAdapter, HlpArrayUtils, HlpBlake2BP, HlpBlake2SP, HlpSipHash128, HlpBlake2SParams, HlpBlake2BParams, HlpIBlake2SParams, - HlpIBlake2BParams, HlpBlake3, HlpXXHash3, HlpXXHash128; + HlpIBlake2BParams, HlpBlake3, HlpXXHash3, HlpXXHash128, HlpSimd, + HlpXXHash3Dispatch; implementation diff --git a/HashLib/src/Utils/HlpSimd.pas b/HashLib/src/Utils/HlpSimd.pas new file mode 100644 index 0000000..392cc2e --- /dev/null +++ b/HashLib/src/Utils/HlpSimd.pas @@ -0,0 +1,175 @@ +unit HlpSimd; + +{$I ..\Include\HashLib.inc} + +interface + +type + 
TSimdLevel = (Scalar, SSE2, AVX2);
+  // Declared from least to most capable: DetectFeatures compares levels
+  // with ">" when it caps the detected level.
+
+  // Process-wide SIMD capability probe. The level is detected once in this
+  // unit's initialization section and then only read.
+  TSimd = class sealed
+  private
+    class var FDetectedLevel: TSimdLevel;
+    class function CPUHasSSE2(): Boolean; static;
+    class function CPUHasAVX2(): Boolean; static;
+    class procedure DetectFeatures(); static;
+  public
+    // Returns the level chosen by DetectFeatures (CPU capability, capped by
+    // compiler support and the HASHLIB_FORCE_* defines).
+    class function GetActiveLevel(): TSimdLevel; static;
+  end;
+
+implementation
+
+{$IFDEF HASHLIB_X86_64}
+
+type
+  // Register values returned by one CPUID invocation, in the order
+  // CpuIdQuery stores them.
+  TCpuIdResult = record
+    RegEAX, RegEBX, RegECX, RegEDX: UInt32;
+  end;
+
+// Executes CPUID with EAX = ALeaf and ECX = ASubLeaf and stores
+// EAX, EBX, ECX, EDX as four consecutive dwords at AResult.
+// rbx is saved and restored around the instruction because CPUID
+// overwrites ebx and rbx is callee-saved.
+{$IFDEF FPC}
+procedure CpuIdQuery(ALeaf, ASubLeaf: UInt32; AResult: Pointer);
+  assembler; nostackframe;
+asm
+  push rbx
+  {$IFDEF MSWINDOWS}
+  // MS x64: ecx = ALeaf, edx = ASubLeaf, r8 = AResult
+  mov eax, ecx
+  mov ecx, edx
+  cpuid
+  mov dword ptr [r8], eax
+  mov dword ptr [r8 + 4], ebx
+  mov dword ptr [r8 + 8], ecx
+  mov dword ptr [r8 + 12], edx
+  {$ELSE}
+  // System V: edi = ALeaf, esi = ASubLeaf, rdx = AResult.
+  // AResult is moved to r8 first because CPUID overwrites edx.
+  mov eax, edi
+  mov ecx, esi
+  mov r8, rdx
+  cpuid
+  mov dword ptr [r8], eax
+  mov dword ptr [r8 + 4], ebx
+  mov dword ptr [r8 + 8], ecx
+  mov dword ptr [r8 + 12], edx
+  {$ENDIF}
+  pop rbx
+end;
+{$ELSE}
+procedure CpuIdQuery(ALeaf, ASubLeaf: UInt32; AResult: Pointer);
+asm
+  .PUSHNV RBX
+  mov eax, ecx
+  mov ecx, edx
+  cpuid
+  mov dword ptr [r8], eax
+  mov dword ptr [r8 + 4], ebx
+  mov dword ptr [r8 + 8], ecx
+  mov dword ptr [r8 + 12], edx
+end;
+{$ENDIF}
+
+// Executes XGETBV with ECX = 0 (XCR0) and stores EAX at AResult and EDX at
+// AResult + 4, i.e. the 64-bit XCR0 value in little-endian layout.
+// Callers must first confirm OSXSAVE; CPUHasAVX2 does so.
+{$IFDEF FPC}
+procedure XGetBvQuery(AResult: Pointer);
+  assembler; nostackframe;
+asm
+  {$IFDEF MSWINDOWS}
+  mov r8, rcx
+  {$ELSE}
+  mov r8, rdi
+  {$ENDIF}
+  xor ecx, ecx
+  xgetbv
+  mov dword ptr [r8], eax
+  mov dword ptr [r8 + 4], edx
+end;
+{$ELSE}
+procedure XGetBvQuery(AResult: Pointer);
+asm
+  .noframe
+  mov r8, rcx
+  xor ecx, ecx
+  xgetbv
+  mov dword ptr [r8], eax
+  mov dword ptr [r8 + 4], edx
+end;
+{$ENDIF}
+
+{$ENDIF HASHLIB_X86_64}
+
+{ TSimd }
+
+// True when CPUID leaf 1 reports SSE2 (EDX bit 26). Always False when
+// HASHLIB_X86_64 is not defined.
+class function TSimd.CPUHasSSE2(): Boolean;
+{$IFDEF HASHLIB_X86_64}
+var
+  LCpuId: TCpuIdResult;
+{$ENDIF}
+begin
+{$IFDEF HASHLIB_X86_64}
+  CpuIdQuery(1, 0, @LCpuId);
+  Result := (LCpuId.RegEDX and (1 shl 26)) <> 0;
+{$ELSE}
+  Result := False;
+{$ENDIF}
+end;
+
+// True only when AVX2 is usable: the CPU exposes leaf 7, reports OSXSAVE and
+// AVX, the OS has enabled XMM and YMM state in XCR0, and leaf 7 reports AVX2.
+// Always False when HASHLIB_X86_64 is not defined.
+class function TSimd.CPUHasAVX2(): Boolean;
+{$IFDEF HASHLIB_X86_64}
+const
+  // CPUID.1:ECX bit 27 (OSXSAVE) and bit 28 (AVX)
+  OSXSAVE_AND_AVX = UInt32($18000000);
+var
+  LCpuId: TCpuIdResult;
+  LXcr0: UInt64;
+{$ENDIF}
+begin
+{$IFDEF HASHLIB_X86_64}
+  // Leaf 0 returns the highest supported basic leaf in EAX. Leaf 7 may only
+  // be interpreted as a feature mask when it is within that range.
+  CpuIdQuery(0, 0, @LCpuId);
+  if LCpuId.RegEAX < 7 then
+    Exit(False);
+
+  CpuIdQuery(1, 0, @LCpuId);
+
+  // OSXSAVE makes XGETBV usable; AVX is the prerequisite feature for AVX2.
+  if (LCpuId.RegECX and OSXSAVE_AND_AVX) <> OSXSAVE_AND_AVX then
+    Exit(False);
+
+  // XCR0 bits 1 and 2 must be set for AVX state support
+  LXcr0 := 0;
+  XGetBvQuery(@LXcr0);
+  if (UInt32(LXcr0) and $06) <> $06 then
+    Exit(False);
+
+  CpuIdQuery(7, 0, @LCpuId);
+
+  // AVX2: EBX bit 5
+  Result := (LCpuId.RegEBX and (1 shl 5)) <> 0;
+{$ELSE}
+  Result := False;
+{$ENDIF}
+end;
+
+// Computes FDetectedLevel: start from what the CPU supports, then lower it
+// to what this build can assemble, then apply the HASHLIB_FORCE_* defines.
+class procedure TSimd.DetectFeatures();
+begin
+  FDetectedLevel := TSimdLevel.Scalar;
+
+  if CPUHasSSE2() then
+  begin
+    FDetectedLevel := TSimdLevel.SSE2;
+    if CPUHasAVX2() then
+      FDetectedLevel := TSimdLevel.AVX2;
+  end;
+
+  // Cap based on compiler assembler capability
+{$IFNDEF HASHLIB_AVX2_ASM_SUPPORTED}
+  if FDetectedLevel > TSimdLevel.SSE2 then
+    FDetectedLevel := TSimdLevel.SSE2;
+{$ENDIF}
+
+  // Cap based on user force defines
+{$IF DEFINED(HASHLIB_FORCE_SCALAR)}
+  FDetectedLevel := TSimdLevel.Scalar;
+{$ELSEIF DEFINED(HASHLIB_FORCE_SSE2)}
+  // Only lowers the level: a CPU without SSE2 stays on Scalar.
+  if FDetectedLevel > TSimdLevel.SSE2 then
+    FDetectedLevel := TSimdLevel.SSE2;
+{$IFEND}
+end;
+
+class function TSimd.GetActiveLevel(): TSimdLevel;
+begin
+  Result := FDetectedLevel;
+end;
+
+initialization
+  TSimd.DetectFeatures();
+
+end.