Skip to content

Commit a93fa1f

Browse files
committed
Add SIMD support for Adler32
1 parent fa16dd8 commit a93fa1f

11 files changed

Lines changed: 478 additions & 53 deletions

File tree

HashLib.Benchmark/Delphi/PerformanceBenchmarkConsole.dpr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ uses
1717
HlpHMACNotBuildInAdapter in '..\..\HashLib\src\Base\HlpHMACNotBuildInAdapter.pas',
1818
HlpMultipleTransformNonBlock in '..\..\HashLib\src\Base\HlpMultipleTransformNonBlock.pas',
1919
HlpAdler32 in '..\..\HashLib\src\Checksum\HlpAdler32.pas',
20+
HlpAdler32Dispatch in '..\..\HashLib\src\Checksum\HlpAdler32Dispatch.pas',
2021
HlpCRC in '..\..\HashLib\src\Checksum\HlpCRC.pas',
2122
HlpCRC16 in '..\..\HashLib\src\Checksum\HlpCRC16.pas',
2223
HlpCRC32 in '..\..\HashLib\src\Checksum\HlpCRC32.pas',

HashLib.Benchmark/Delphi/PerformanceBenchmarkFMX.dpr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ uses
1616
HlpHMACNotBuildInAdapter in '..\..\HashLib\src\Base\HlpHMACNotBuildInAdapter.pas',
1717
HlpMultipleTransformNonBlock in '..\..\HashLib\src\Base\HlpMultipleTransformNonBlock.pas',
1818
HlpAdler32 in '..\..\HashLib\src\Checksum\HlpAdler32.pas',
19+
HlpAdler32Dispatch in '..\..\HashLib\src\Checksum\HlpAdler32Dispatch.pas',
1920
HlpCRC in '..\..\HashLib\src\Checksum\HlpCRC.pas',
2021
HlpCRC16 in '..\..\HashLib\src\Checksum\HlpCRC16.pas',
2122
HlpCRC32 in '..\..\HashLib\src\Checksum\HlpCRC32.pas',

HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ uses
3838
HlpHMACNotBuildInAdapter in '..\..\HashLib\src\Base\HlpHMACNotBuildInAdapter.pas',
3939
HlpMultipleTransformNonBlock in '..\..\HashLib\src\Base\HlpMultipleTransformNonBlock.pas',
4040
HlpAdler32 in '..\..\HashLib\src\Checksum\HlpAdler32.pas',
41+
HlpAdler32Dispatch in '..\..\HashLib\src\Checksum\HlpAdler32Dispatch.pas',
4142
HlpCRC in '..\..\HashLib\src\Checksum\HlpCRC.pas',
4243
HlpCRC16 in '..\..\HashLib\src\Checksum\HlpCRC16.pas',
4344
HlpCRC32 in '..\..\HashLib\src\Checksum\HlpCRC32.pas',

HashLib/src/Checksum/HlpAdler32.pas

Lines changed: 9 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ TAdler32 = class sealed(THash, IChecksum, IHash32, ITransformBlock)
2020
var
2121
FSumA, FSumB: UInt32;
2222

23-
const
24-
ModAdler = UInt32(65521);
25-
2623
public
2724
constructor Create();
2825
procedure Initialize(); override;
@@ -35,6 +32,9 @@ TAdler32 = class sealed(THash, IChecksum, IHash32, ITransformBlock)
3532

3633
implementation
3734

35+
uses
36+
HlpAdler32Dispatch;
37+
3838
{ TAdler32 }
3939

4040
function TAdler32.Clone(): IHash;
@@ -62,60 +62,18 @@ procedure TAdler32.Initialize;
6262
// Feeds ALength bytes of AData, starting at AIndex, into the running Adler-32
// state (FSumA / FSumB) through the Adler32_Update dispatch pointer.
//
// AData   - source buffer.
// AIndex  - offset of the first byte to hash.
// ALength - number of bytes to hash; values <= 0 leave the state untouched.
procedure TAdler32.TransformBytes(const AData: THashLibByteArray;
  AIndex, ALength: Int32);
var
  LSums: array [0 .. 1] of UInt32;
begin
{$IFDEF DEBUG}
  System.Assert(AIndex >= 0);
  System.Assert(ALength >= 0);
  System.Assert(AIndex + ALength <= System.Length(AData));
{$ENDIF DEBUG}
  // The previous implementation looped "while ALength > 0", so a non-positive
  // length was a no-op. Adler32_Update takes an unsigned length, and casting a
  // negative Int32 to UInt32 would turn it into a multi-gigabyte read when the
  // asserts above are compiled out. Keep the old no-op behaviour explicitly.
  if ALength <= 0 then
    Exit;

  // The dispatch routines operate on a contiguous [SumA, SumB] pair.
  LSums[0] := FSumA;
  LSums[1] := FSumB;
  Adler32_Update(PByte(AData) + AIndex, UInt32(ALength), @LSums[0]);
  FSumA := LSums[0];
  FSumB := LSums[1];
end;
12078

12179
function TAdler32.TransformFinal: IHashResult;
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
unit HlpAdler32Dispatch;
2+
3+
{$I ..\Include\HashLib.inc}
4+
5+
interface
6+
7+
// Signature shared by every Adler-32 update routine in this unit.
//   AData   - first byte to process.
//   ALength - number of bytes to process.
//   ASums   - points at two consecutive UInt32 values, SumA then SumB, which
//             are read and updated in place.
type
8+
TAdler32UpdateProc = procedure(AData: PByte; ALength: UInt32; ASums: Pointer);
9+
10+
// Active update routine; assigned by InitDispatch in this unit's
// initialization section.
var
11+
Adler32_Update: TAdler32UpdateProc;
12+
13+
implementation
14+
15+
uses
16+
HlpSimd;
17+
18+
const
19+
// Modulus applied to both sums at the end of every chunk.
ModAdler = UInt32(65521);
20+
// Maximum number of bytes the update loops accumulate before reducing the
// sums with "mod ModAdler".
NMAX = UInt32(5552);
21+
// Number of bytes one SIMD block covers.
BLOCK_SIZE = UInt32(32);
22+
// NOTE(review): not referenced anywhere in the visible part of this unit.
MAX_BLOCKS_PER_CHUNK = NMAX div BLOCK_SIZE; // 173
23+
24+
// Constant table handed to the SIMD block kernels as AConstants.
Adler32Constants: array [0 .. 63] of Byte = (
25+
// Offset 0..31: weights [32,31,...,1]
26+
// SSE2/SSSE3 use as two 16-byte halves; AVX2 uses full 32 bytes.
27+
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
28+
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
29+
// Offset 32..63: ones_16 (16-bit value 1 in little-endian, repeated)
30+
// SSSE3 uses first 16 bytes; AVX2 uses all 32 bytes.
31+
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
32+
1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0
33+
);
34+
35+
// =============================================================================
36+
// Scalar fallback implementation
37+
// =============================================================================
38+
39+
// Portable Adler-32 update.
//   AData   - first byte to process.
//   ALength - number of bytes to process.
//   ASums   - points at [SumA, SumB]; both are read on entry and written back.
// Bytes are consumed in runs of at most NMAX so the "mod ModAdler" reduction
// is only needed once per run instead of once per byte.
procedure Adler32_Update_Scalar(AData: PByte; ALength: UInt32; ASums: Pointer);
var
  LSumAPtr, LSumBPtr: PUInt32;
  LA, LB, LRun: UInt32;
begin
  // SumB is stored directly after SumA.
  LSumAPtr := PUInt32(ASums);
  LSumBPtr := LSumAPtr;
  Inc(LSumBPtr);

  LA := LSumAPtr^;
  LB := LSumBPtr^;

  while ALength <> 0 do
  begin
    // Size of this run: whatever is left, capped at NMAX.
    if ALength > NMAX then
      LRun := NMAX
    else
      LRun := ALength;
    ALength := ALength - LRun;

    // LRun is at least 1 here because ALength was non-zero.
    repeat
      LA := LA + AData^;
      LB := LB + LA;
      Inc(AData);
      Dec(LRun);
    until LRun = 0;

    LA := LA mod ModAdler;
    LB := LB mod ModAdler;
  end;

  LSumAPtr^ := LA;
  LSumBPtr^ := LB;
end;
66+
67+
// =============================================================================
68+
// SIMD implementations (x86-64 only)
69+
// =============================================================================
70+
71+
{$IFDEF HASHLIB_X86_64}
72+
73+
// Signature of the SIMD block kernels called from Adler32_Update_Simd.
//   AData      - first byte of the first block.
//   ANumBlocks - number of BLOCK_SIZE-byte blocks to process.
//   ASums      - points at [SumA, SumB], updated in place.
//   AConstants - points at the kernel constant table (Adler32Constants).
type
74+
TProcessBlocksProc = procedure(AData: PByte; ANumBlocks: UInt32;
75+
ASums, AConstants: Pointer);
76+
77+
// SSE2 block kernel. The whole body comes from the two include files below.
// NOTE(review): neither include is visible here; SimdProc4Begin.inc presumably
// opens the asm block for a four-argument routine - confirm against the file.
procedure Adler32_ProcessBlocks_Sse2(AData: PByte; ANumBlocks: UInt32;
78+
ASums, AConstants: Pointer);
79+
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
80+
{$I ..\Include\Simd\Adler32\Adler32BlocksSse2.inc}
81+
end;
82+
83+
// SSSE3 block kernel. The whole body comes from the two include files below.
// NOTE(review): neither include is visible here; SimdProc4Begin.inc presumably
// opens the asm block for a four-argument routine - confirm against the file.
procedure Adler32_ProcessBlocks_Ssse3(AData: PByte; ANumBlocks: UInt32;
84+
ASums, AConstants: Pointer);
85+
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
86+
{$I ..\Include\Simd\Adler32\Adler32BlocksSsse3.inc}
87+
end;
88+
89+
// AVX2 block kernel. The whole body comes from the two include files below.
// NOTE(review): SimdProc4Begin.inc is not visible here; it presumably opens
// the asm block for a four-argument routine - confirm against the file.
// The kernel include requires ANumBlocks > 0 (its loop is bottom-tested);
// Adler32_Update_Simd only calls the kernel when LBlocks > 0.
procedure Adler32_ProcessBlocks_Avx2(AData: PByte; ANumBlocks: UInt32;
90+
ASums, AConstants: Pointer);
91+
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
92+
{$I ..\Include\Simd\Adler32\Adler32BlocksAvx2.inc}
93+
end;
94+
95+
// Shared driver for the SIMD variants.
//   AData          - first byte to process.
//   ALength        - number of bytes to process.
//   ASums          - points at [SumA, SumB], updated in place.
//   AProcessBlocks - kernel that consumes whole BLOCK_SIZE-byte blocks.
// The input is cut into runs of at most NMAX bytes. Within a run the kernel
// handles every complete block, the leftover bytes are added one at a time,
// and both sums are then reduced with "mod ModAdler".
procedure Adler32_Update_Simd(AData: PByte; ALength: UInt32; ASums: Pointer;
  AProcessBlocks: TProcessBlocksProc);
var
  LSumAPtr, LSumBPtr: PUInt32;
  LA, LB: UInt32;
  LRun, LBlockCount, LBlockBytes: UInt32;
begin
  // SumB is stored directly after SumA.
  LSumAPtr := PUInt32(ASums);
  LSumBPtr := LSumAPtr;
  Inc(LSumBPtr);

  while ALength <> 0 do
  begin
    // Size of this run: whatever is left, capped at NMAX.
    if ALength > NMAX then
      LRun := NMAX
    else
      LRun := ALength;
    ALength := ALength - LRun;

    // Whole blocks go through the kernel, which updates ASums in memory.
    LBlockCount := LRun div BLOCK_SIZE;
    if LBlockCount <> 0 then
    begin
      LBlockBytes := LBlockCount * BLOCK_SIZE;
      AProcessBlocks(AData, LBlockCount, ASums, @Adler32Constants[0]);
      Inc(AData, LBlockBytes);
      LRun := LRun - LBlockBytes;
    end;

    // Pick up the sums as the kernel left them and finish the tail bytes.
    LA := LSumAPtr^;
    LB := LSumBPtr^;
    while LRun <> 0 do
    begin
      LA := LA + AData^;
      LB := LB + LA;
      Inc(AData);
      Dec(LRun);
    end;

    LSumAPtr^ := LA mod ModAdler;
    LSumBPtr^ := LB mod ModAdler;
  end;
end;
131+
132+
// Adler-32 update that routes whole blocks through the SSE2 kernel.
procedure Adler32_Update_Sse2(AData: PByte; ALength: UInt32; ASums: Pointer);
var
  LKernel: TProcessBlocksProc;
begin
  LKernel := @Adler32_ProcessBlocks_Sse2;
  Adler32_Update_Simd(AData, ALength, ASums, LKernel);
end;
136+
137+
// Adler-32 update that routes whole blocks through the SSSE3 kernel.
procedure Adler32_Update_Ssse3(AData: PByte; ALength: UInt32; ASums: Pointer);
var
  LKernel: TProcessBlocksProc;
begin
  LKernel := @Adler32_ProcessBlocks_Ssse3;
  Adler32_Update_Simd(AData, ALength, ASums, LKernel);
end;
141+
142+
// Adler-32 update that routes whole blocks through the AVX2 kernel.
procedure Adler32_Update_Avx2(AData: PByte; ALength: UInt32; ASums: Pointer);
var
  LKernel: TProcessBlocksProc;
begin
  LKernel := @Adler32_ProcessBlocks_Avx2;
  Adler32_Update_Simd(AData, ALength, ASums, LKernel);
end;
146+
147+
{$ENDIF HASHLIB_X86_64}
148+
149+
// =============================================================================
150+
// Dispatch initialization
151+
// =============================================================================
152+
153+
// Selects the Adler-32 update routine matching the SIMD level reported by
// TSimd.GetActiveLevel() and stores it in Adler32_Update.
procedure InitDispatch();
begin
  // Install the portable routine first. The case below has no default branch,
  // so any level it does not list (or any SIMD level reported on a build
  // without HASHLIB_X86_64) would otherwise leave Adler32_Update nil and the
  // first checksum call would jump through a nil pointer.
  Adler32_Update := @Adler32_Update_Scalar;

  case TSimd.GetActiveLevel() of
{$IFDEF HASHLIB_X86_64}
    TSimdLevel.AVX2:
      Adler32_Update := @Adler32_Update_Avx2;
    TSimdLevel.SSSE3:
      Adler32_Update := @Adler32_Update_Ssse3;
    TSimdLevel.SSE2:
      Adler32_Update := @Adler32_Update_Sse2;
{$ENDIF HASHLIB_X86_64}
    TSimdLevel.Scalar:
      Adler32_Update := @Adler32_Update_Scalar;
  end;
end;
176+
177+
initialization
178+
InitDispatch();
179+
180+
end.
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// AVX2 implementation of Adler-32 block processing.
2+
// Expects MS x64 ABI: rcx = data ptr, edx = num_blocks, r8 = sums ptr, r9 = constants ptr.
3+
// ASums layout: [SumA: UInt32, SumB: UInt32].
4+
// Constants layout: [weights: 32B, ones_16: 32B] at offsets 0 and 32.
5+
// Processes num_blocks x 32-byte blocks. Does NOT apply mod 65521 (caller does it).
6+
// Uses ymm0-ymm5 only (all volatile on Windows x64, no saves needed).
7+
// Weights and ones are reloaded from memory each iteration to avoid
8+
// using non-volatile ymm registers.
9+
// AVX/AVX2 instructions are db-encoded for broad assembler compatibility.
//
// Precondition: num_blocks > 0. The loop is bottom-tested (dec / jnz), so a
// count of 0 would wrap around and run 2^32 iterations.
//
// Register roles:
//   eax  = SumA (loaded on entry, stored on exit)
//   r10d = SumB (loaded on entry, stored on exit)
//   r11d = remaining block count
//   edx  = scratch: SumA * num_blocks, later the horizontal sum of v_s1
//   rcx  = data cursor, advanced by 32 per block
//   ymm0 = v_s1 (byte sums), ymm1 = v_s2 (weighted sums),
//   ymm2 = v_ps (sum of v_s1 over all earlier blocks, seeded with SumA * num_blocks),
//   ymm3 = all zero, ymm4 / ymm5 = scratch
// Clobbers: rax, rcx, rdx, r10, r11, ymm0-ymm5, flags.
10+
11+
// Zero register
12+
db $C5, $E5, $EF, $DB // vpxor ymm3, ymm3, ymm3
13+
14+
// Load initial sums
15+
mov eax, dword [r8]
16+
mov r10d, dword [r8 + 4]
// Keep the block count in r11d; edx is overwritten by the multiply below.
17+
mov r11d, edx
18+
19+
// v_ps = [SumA * num_blocks, 0, 0, 0]
// Together with the final shift by 5 this adds SumA * 32 * num_blocks to SumB,
// i.e. the incoming SumA counted once for every byte processed.
20+
imul edx, eax
21+
db $C5, $F9, $6E, $D2 // vmovd xmm2, edx
22+
23+
// v_s2 = [SumB, 0, 0, 0]
24+
db $C4, $C1, $79, $6E, $CA // vmovd xmm1, r10d
25+
26+
// v_s1 = 0
27+
db $C5, $F9, $EF, $C0 // vpxor xmm0, xmm0, xmm0
28+
29+
@loop:
// v_ps += v_s1, taken before this block's bytes are added to v_s1.
30+
db $C5, $ED, $FE, $D0 // vpaddd ymm2, ymm2, ymm0
31+
32+
// Load 32 data bytes
33+
db $C5, $FE, $6F, $21 // vmovdqu ymm4, yword [rcx]
34+
35+
// Byte sum for s1
// Sum of absolute differences against zero is the plain sum of each 8-byte group.
36+
db $C5, $DD, $F6, $EB // vpsadbw ymm5, ymm4, ymm3
37+
db $C5, $FD, $FE, $C5 // vpaddd ymm0, ymm0, ymm5
38+
39+
// Weighted sum for s2
// Data bytes (unsigned operand) times weights 32..1 (signed operand), added in
// adjacent pairs to 16-bit lanes; the multiply by ones_16 then adds adjacent
// 16-bit lanes into 32-bit lanes.
40+
db $C4, $C1, $7E, $6F, $29 // vmovdqu ymm5, yword [r9]
41+
db $C4, $E2, $5D, $04, $E5 // vpmaddubsw ymm4, ymm4, ymm5
42+
db $C4, $C1, $7E, $6F, $69, $20 // vmovdqu ymm5, yword [r9 + 32]
43+
db $C5, $DD, $F5, $E5 // vpmaddwd ymm4, ymm4, ymm5
44+
db $C5, $F5, $FE, $CC // vpaddd ymm1, ymm1, ymm4
45+
46+
add rcx, 32
47+
dec r11d
48+
jnz @loop
49+
50+
// v_s2 += v_ps * 32
51+
db $C5, $ED, $72, $F2, $05 // vpslld ymm2, ymm2, 5
52+
db $C5, $F5, $FE, $CA // vpaddd ymm1, ymm1, ymm2
53+
54+
// Horizontal reduce v_s1: extract high 128 + 128-bit hsum
// $B1 swaps adjacent dwords, $4E swaps the two 64-bit halves; after both
// adds lane 0 holds the total of all eight dword lanes.
55+
db $C4, $E3, $7D, $39, $C5, $01 // vextracti128 xmm5, ymm0, 1
56+
db $C5, $F9, $FE, $C5 // vpaddd xmm0, xmm0, xmm5
57+
db $C5, $F9, $70, $E8, $B1 // vpshufd xmm5, xmm0, $B1
58+
db $C5, $F9, $FE, $C5 // vpaddd xmm0, xmm0, xmm5
59+
db $C5, $F9, $70, $E8, $4E // vpshufd xmm5, xmm0, $4E
60+
db $C5, $F9, $FE, $C5 // vpaddd xmm0, xmm0, xmm5
61+
db $C5, $F9, $7E, $C2 // vmovd edx, xmm0
// SumA += total of all bytes processed.
62+
add eax, edx
63+
64+
// Horizontal reduce v_s2: extract high 128 + 128-bit hsum
// v_s2 was seeded with the incoming SumB, so lane 0 ends up as the new SumB.
65+
db $C4, $E3, $7D, $39, $CD, $01 // vextracti128 xmm5, ymm1, 1
66+
db $C5, $F1, $FE, $CD // vpaddd xmm1, xmm1, xmm5
67+
db $C5, $F9, $70, $E9, $B1 // vpshufd xmm5, xmm1, $B1
68+
db $C5, $F1, $FE, $CD // vpaddd xmm1, xmm1, xmm5
69+
db $C5, $F9, $70, $E9, $4E // vpshufd xmm5, xmm1, $4E
70+
db $C5, $F1, $FE, $CD // vpaddd xmm1, xmm1, xmm5
71+
db $C4, $C1, $79, $7E, $CA // vmovd r10d, xmm1
72+
73+
// Store results
74+
mov dword [r8], eax
75+
mov dword [r8 + 4], r10d
76+
// Clear the upper ymm halves before returning to non-AVX code.
77+
db $C5, $F8, $77 // vzeroupper

0 commit comments

Comments
 (0)