Skip to content

Commit 6e0ae04

Browse files
committed
Initial attempt at CRC SIMD (vpclmulqdq)
1 parent 1875149 commit 6e0ae04

4 files changed

Lines changed: 216 additions & 10 deletions

File tree

HashLib/src/Checksum/HlpCRCDispatch.pas

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,22 @@ function CRC_Fold_Pclmul(AData: PByte; ALength: UInt32;
2727
{$I ..\Include\Simd\CRC\CRCFoldPclmul.inc}
2828
end;
2929

30+
// VPCLMULQDQ variant of the CRC fold routine. The function has no Pascal
// body of its own: the shared 4-argument SIMD prologue is included first,
// followed by the db-encoded assembly in CRCFoldVpclmul.inc.
// InitDispatch assigns this routine to CRC_Fold_Lsb when
// TSimd.HasVPCLMULQDQ() is true; otherwise the PCLMULQDQ variant is tried.
function CRC_Fold_Vpclmul(AData: PByte; ALength: UInt32;
31+
AState: Pointer; AConstants: Pointer): UInt64;
32+
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
33+
{$I ..\Include\Simd\CRC\CRCFoldVpclmul.inc}
34+
end;
35+
3036
{$ENDIF HASHLIB_X86_64}
3137

3238
procedure InitDispatch();
3339
begin
3440
CRC_Fold_Lsb := nil;
3541
{$IFDEF HASHLIB_X86_64}
36-
if TSimd.HasPCLMULQDQ() then
37-
begin
42+
if TSimd.HasVPCLMULQDQ() then
43+
CRC_Fold_Lsb := @CRC_Fold_Vpclmul
44+
else if TSimd.HasPCLMULQDQ() then
3845
CRC_Fold_Lsb := @CRC_Fold_Pclmul;
39-
end;
4046
{$ENDIF}
4147
end;
4248

HashLib/src/Checksum/HlpGF2.pas

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@ TUInt128 = record
99
Lo, Hi: UInt64;
1010
end;
1111

12-
// PCLMULQDQ CRC folding and Barrett reduction constants.
13-
// Layout must match the assembly expectations in CRCFoldPclmul.inc.
12+
// PCLMULQDQ / VPCLMULQDQ CRC folding and Barrett reduction constants.
13+
// Layout must match the assembly expectations in CRCFoldPclmul.inc
14+
// and CRCFoldVpclmul.inc.
1415
TCRCFoldConstants = packed record
15-
Fold_4x128: array [0 .. 1] of UInt64; // offset 0: fold-by-4 constants
16-
Fold_1x128: array [0 .. 1] of UInt64; // offset 16: fold-by-1 constants
16+
Fold_4x128: array [0 .. 1] of UInt64; // offset 0: fold-by-4 constants (stride 512)
17+
Fold_1x128: array [0 .. 1] of UInt64; // offset 16: fold-by-1 constants (stride 128)
1718
Barrett: array [0 .. 1] of UInt64; // offset 32: Barrett reduction constants
19+
Fold_8x128: array [0 .. 1] of UInt64; // offset 48: fold-by-8 constants (stride 1024)
1820
end;
1921

2022
TGF2 = class sealed
@@ -254,6 +256,20 @@ class procedure TGF2.GenerateFoldConstants(APoly: UInt64; ABits: Int32;
254256
AConstants.Fold_1x128[1] := LConst0;
255257
end;
256258

259+
// --- Fold-by-8 constants (stride = 1024 bits, for VPCLMULQDQ) ---
260+
LConst0 := PowerMod(1024 + 64 + LK, APoly, ABits);
261+
LConst1 := PowerMod(1024 + LK, APoly, ABits);
262+
if AReflected then
263+
begin
264+
AConstants.Fold_8x128[0] := BitReverse(LConst0 shl (64 - ABits), 64);
265+
AConstants.Fold_8x128[1] := BitReverse(LConst1 shl (64 - ABits), 64);
266+
end
267+
else
268+
begin
269+
AConstants.Fold_8x128[0] := LConst1;
270+
AConstants.Fold_8x128[1] := LConst0;
271+
end;
272+
257273
// --- Barrett reduction constants ---
258274
// barrett[0] = floor(x^(63+n) / G)
259275
LDiv128.Lo := 0;

HashLib/src/Include/Simd/CRC/CRCFoldPclmul.inc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants
1010
// AState: [0..7] = initial CRC (reflected), [8..15] = 0.
1111
// AConstants layout (TCRCFoldConstants):
12-
// [0..15] = fold_4x128
13-
// [16..31] = fold_1x128
14-
// [32..47] = barrett
12+
// [0..15] = Fold_4x128
13+
// [16..31] = Fold_1x128
14+
// [32..47] = Barrett
15+
// [48..63] = Fold_8x128 (unused by this path)
1516
// Returns: final CRC in RAX.
1617
//
1718
// Reference: Linux kernel crc-pclmul-template.S by Eric Biggers (Google).
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// VPCLMULQDQ + AVX2 CRC folding + Barrett reduction for reflected (LSB-first)
// CRCs with width 8..32 (CRC-64 requires a separate path).
//
// Function signature (included after SimdProc4Begin.inc):
//   function CRC_Fold_Vpclmul(AData: PByte; ALength: UInt32;
//     AState: Pointer; AConstants: Pointer): UInt64;
//
// Register mapping (MS x64 ABI after prologue):
//   rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants
//   AState: [0..7] = initial CRC (reflected), [8..15] = 0.
//   AConstants layout (TCRCFoldConstants):
//     [0..15]  = Fold_4x128 (stride 512)
//     [16..31] = Fold_1x128 (stride 128)
//     [32..47] = Barrett
//     [48..63] = Fold_8x128 (stride 1024)
// Returns: final CRC in RAX.
//
// Clobbers: rax, rcx, rdx, flags, ymm0..ymm5.
//
// Register budget: only xmm0..xmm5 are used, because xmm6..xmm15 are
// callee-saved under the MS x64 ABI. Every vector instruction below is
// emitted as raw bytes, so the compiler sees no register operands and cannot
// save or restore anything on our behalf. To stay inside six registers the
// input blocks are XORed directly from memory (VEX memory operands carry no
// alignment requirement) instead of being staged through a scratch register.
//
//   ymm0..ymm3 / xmm0..xmm3 : fold accumulators
//   ymm4 / xmm4             : high-half product scratch, initial CRC
//   ymm5 / xmm5             : current fold constants
//   xmm2                    : Barrett constants (final reduction only)
//
// NOTE(review): the >= 128 byte path executes AVX2 instructions
// (vbroadcasti128, vextracti128, 256-bit vpxor). Confirm that
// TSimd.HasVPCLMULQDQ() also implies AVX2 and OS-enabled YMM state.
// NOTE(review): input is consumed in whole 16-byte blocks only; any trailing
// ALength mod 16 bytes are not folded here. Presumably the caller passes a
// multiple of 16 or handles the tail itself -- confirm.
//
// Reference: zlib-ng crc32_pclmulqdq_tpl.h (256-bit VPCLMULQDQ path)
// and Linux kernel crc-pclmul-template.S by Eric Biggers (Barrett reduction).
//
// All VEX-encoded instructions are db-encoded for FPC compatibility.
// VEX byte layout reference (R, X, B and vvvv are stored inverted):
//   2-byte: C5 [R.vvvv.L.pp]
//   3-byte: C4 [R.X.B.mmmmm] [W.vvvv.L.pp]

  cmp edx, 128
  jb @xmm_path

  // ===== YMM PATH (>= 128 bytes) =====

  // Fold_8x128 (stride 1024) replicated into both 128-bit lanes of ymm5
  db $C4, $C2, $7D, $5A, $69, $30        // vbroadcasti128 ymm5, [r9 + 48]

  // Prime the four accumulators with the first 128 input bytes
  db $C5, $FE, $6F, $01                  // vmovdqu ymm0, [rcx]
  db $C5, $FE, $6F, $49, $20             // vmovdqu ymm1, [rcx + 32]
  db $C5, $FE, $6F, $51, $40             // vmovdqu ymm2, [rcx + 64]
  db $C5, $FE, $6F, $59, $60             // vmovdqu ymm3, [rcx + 96]

  // Mix the initial CRC into the low 64 bits of ymm0
  // (vmovq zero-extends, so the remaining bits of ymm0 are left unchanged)
  db $C4, $C1, $7A, $7E, $20             // vmovq xmm4, [r8]
  db $C5, $FD, $EF, $C4                  // vpxor ymm0, ymm0, ymm4

  add rcx, 128
  sub edx, 128

  // --- Main fold-by-8 loop (4 ymm accumulators, 128 bytes per iteration) ---
  cmp edx, 128
  jb @ymm_done

@ymm_loop:
  // ymm0 = fold(ymm0) XOR data[0..31]
  db $C4, $E3, $7D, $44, $E5, $11        // vpclmulqdq ymm4, ymm0, ymm5, $11
  db $C4, $E3, $7D, $44, $C5, $00        // vpclmulqdq ymm0, ymm0, ymm5, $00
  db $C5, $FD, $EF, $C4                  // vpxor ymm0, ymm0, ymm4
  db $C5, $FD, $EF, $01                  // vpxor ymm0, ymm0, [rcx]

  // ymm1 = fold(ymm1) XOR data[32..63]
  db $C4, $E3, $75, $44, $E5, $11        // vpclmulqdq ymm4, ymm1, ymm5, $11
  db $C4, $E3, $75, $44, $CD, $00        // vpclmulqdq ymm1, ymm1, ymm5, $00
  db $C5, $F5, $EF, $CC                  // vpxor ymm1, ymm1, ymm4
  db $C5, $F5, $EF, $49, $20             // vpxor ymm1, ymm1, [rcx + 32]

  // ymm2 = fold(ymm2) XOR data[64..95]
  db $C4, $E3, $6D, $44, $E5, $11        // vpclmulqdq ymm4, ymm2, ymm5, $11
  db $C4, $E3, $6D, $44, $D5, $00        // vpclmulqdq ymm2, ymm2, ymm5, $00
  db $C5, $ED, $EF, $D4                  // vpxor ymm2, ymm2, ymm4
  db $C5, $ED, $EF, $51, $40             // vpxor ymm2, ymm2, [rcx + 64]

  // ymm3 = fold(ymm3) XOR data[96..127]
  db $C4, $E3, $65, $44, $E5, $11        // vpclmulqdq ymm4, ymm3, ymm5, $11
  db $C4, $E3, $65, $44, $DD, $00        // vpclmulqdq ymm3, ymm3, ymm5, $00
  db $C5, $E5, $EF, $DC                  // vpxor ymm3, ymm3, ymm4
  db $C5, $E5, $EF, $59, $60             // vpxor ymm3, ymm3, [rcx + 96]

  add rcx, 128
  sub edx, 128
  cmp edx, 128
  jae @ymm_loop

@ymm_done:
  // --- Reduce 4 ymm -> 2 ymm using Fold_4x128 (stride 512) ---
  // ymm0 sits 64 bytes ahead of ymm2 and ymm1 sits 64 bytes ahead of ymm3,
  // hence the 512-bit stride constants.
  db $C4, $C2, $7D, $5A, $29             // vbroadcasti128 ymm5, [r9]

  // ymm0 = fold(ymm0) XOR ymm2
  db $C4, $E3, $7D, $44, $E5, $11        // vpclmulqdq ymm4, ymm0, ymm5, $11
  db $C4, $E3, $7D, $44, $C5, $00        // vpclmulqdq ymm0, ymm0, ymm5, $00
  db $C5, $FD, $EF, $C4                  // vpxor ymm0, ymm0, ymm4
  db $C5, $FD, $EF, $C2                  // vpxor ymm0, ymm0, ymm2

  // ymm1 = fold(ymm1) XOR ymm3
  db $C4, $E3, $75, $44, $E5, $11        // vpclmulqdq ymm4, ymm1, ymm5, $11
  db $C4, $E3, $75, $44, $CD, $00        // vpclmulqdq ymm1, ymm1, ymm5, $00
  db $C5, $F5, $EF, $CC                  // vpxor ymm1, ymm1, ymm4
  db $C5, $F5, $EF, $CB                  // vpxor ymm1, ymm1, ymm3

  // --- Split 2 ymm into 4 xmm ---
  // ymm0 = [result0 | result1], ymm1 = [result2 | result3]
  // Order matters: both halves of ymm1 must be copied out before xmm1 is
  // overwritten with the high half of ymm0.
  db $C5, $F9, $6F, $D1                  // vmovdqa xmm2, xmm1        (result2)
  db $C4, $E3, $7D, $39, $CB, $01        // vextracti128 xmm3, ymm1, 1 (result3)
  db $C4, $E3, $7D, $39, $C1, $01        // vextracti128 xmm1, ymm0, 1 (result1)
  // xmm0 already holds result0 (low half of ymm0)

  db $C5, $F8, $77                       // vzeroupper
  jmp @fold4to1

@xmm_path:
  // ===== XMM PATH (64..127 bytes) =====

  // Prime the four accumulators with the first 64 input bytes
  db $C5, $FA, $6F, $01                  // vmovdqu xmm0, [rcx]
  db $C5, $FA, $6F, $49, $10             // vmovdqu xmm1, [rcx + 16]
  db $C5, $FA, $6F, $51, $20             // vmovdqu xmm2, [rcx + 32]
  db $C5, $FA, $6F, $59, $30             // vmovdqu xmm3, [rcx + 48]

  // Mix the initial CRC into the low 64 bits of xmm0
  db $C4, $C1, $7A, $7E, $20             // vmovq xmm4, [r8]
  db $C5, $F9, $EF, $C4                  // vpxor xmm0, xmm0, xmm4

  add rcx, 64
  sub edx, 64

@fold4to1:
  // ===== FOLD 4 XMM -> 1 XMM using Fold_1x128 (stride 128) =====
  // xmm5 keeps Fold_1x128 from here through step 1 of the final reduction.
  db $C4, $C1, $7A, $6F, $69, $10        // vmovdqu xmm5, [r9 + 16]

  // xmm0 = fold(xmm0) XOR xmm1
  db $C4, $E3, $79, $44, $E5, $11        // vpclmulqdq xmm4, xmm0, xmm5, $11
  db $C4, $E3, $79, $44, $C5, $00        // vpclmulqdq xmm0, xmm0, xmm5, $00
  db $C5, $F9, $EF, $C4                  // vpxor xmm0, xmm0, xmm4
  db $C5, $F9, $EF, $C1                  // vpxor xmm0, xmm0, xmm1

  // xmm0 = fold(xmm0) XOR xmm2
  db $C4, $E3, $79, $44, $E5, $11        // vpclmulqdq xmm4, xmm0, xmm5, $11
  db $C4, $E3, $79, $44, $C5, $00        // vpclmulqdq xmm0, xmm0, xmm5, $00
  db $C5, $F9, $EF, $C4                  // vpxor xmm0, xmm0, xmm4
  db $C5, $F9, $EF, $C2                  // vpxor xmm0, xmm0, xmm2

  // xmm0 = fold(xmm0) XOR xmm3
  db $C4, $E3, $79, $44, $E5, $11        // vpclmulqdq xmm4, xmm0, xmm5, $11
  db $C4, $E3, $79, $44, $C5, $00        // vpclmulqdq xmm0, xmm0, xmm5, $00
  db $C5, $F9, $EF, $C4                  // vpxor xmm0, xmm0, xmm4
  db $C5, $F9, $EF, $C3                  // vpxor xmm0, xmm0, xmm3

  // ===== FOLD REMAINING 16-BYTE BLOCKS =====
  // edx = bytes not yet consumed; the loop stops once fewer than 16 remain.
  cmp edx, 16
  jb @tail_done

@tail_loop:
  db $C4, $E3, $79, $44, $E5, $11        // vpclmulqdq xmm4, xmm0, xmm5, $11
  db $C4, $E3, $79, $44, $C5, $00        // vpclmulqdq xmm0, xmm0, xmm5, $00
  db $C5, $F9, $EF, $C4                  // vpxor xmm0, xmm0, xmm4
  db $C5, $F9, $EF, $01                  // vpxor xmm0, xmm0, [rcx]
  add rcx, 16
  sub edx, 16
  cmp edx, 16
  jae @tail_loop

@tail_done:
  // =================================================================
  // Final reduction: 128-bit xmm0 -> CRC in rax
  // Following Linux kernel crc-pclmul-template.S Barrett reduction.
  // =================================================================

  // Step 1: Multiply by x^n and reduce 128 bits to 64+n bits.
  // imm $10 = low qword of xmm0 times high qword of Fold_1x128.
  db $C4, $E3, $79, $44, $CD, $10        // vpclmulqdq xmm1, xmm0, xmm5, $10
  db $C5, $F9, $73, $D8, $08             // vpsrldq xmm0, xmm0, 8
  db $C5, $F9, $EF, $C1                  // vpxor xmm0, xmm0, xmm1

  // Step 2: Barrett reduction. xmm2 is free by now (its accumulator value
  // was folded into xmm0 above), so it carries the Barrett constants.
  db $C4, $C1, $7A, $6F, $51, $20        // vmovdqu xmm2, [r9 + 32]
  db $C4, $E3, $79, $44, $CA, $00        // vpclmulqdq xmm1, xmm0, xmm2, $00
  db $C4, $E3, $71, $44, $CA, $10        // vpclmulqdq xmm1, xmm1, xmm2, $10
  db $C5, $F9, $EF, $C1                  // vpxor xmm0, xmm0, xmm1

  // Extract high qword (bits 64..127) into rax and return.
  db $C5, $F8, $77                       // vzeroupper
  db $C4, $E3, $F9, $16, $C0, $01        // vpextrq rax, xmm0, 1

0 commit comments

Comments
 (0)