Skip to content

Commit def47d7

Browse files
committed
CRC SIMD-related updates
1 parent b036e31 commit def47d7

7 files changed

Lines changed: 550 additions & 48 deletions

File tree

HashLib/src/Checksum/HlpCRC.pas

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1468,9 +1468,10 @@ procedure TCRC.Initialize;
14681468

14691469
if not FHasPclmulConstants then
14701470
begin
1471-
if (Width >= 8) and (Width <= 32) and IsInputReflected then
1471+
if (Width >= 8) and (Width <= 64) then
14721472
begin
1473-
TGF2.GenerateFoldConstants(Polynomial, Width, True, FPclmulConstants);
1473+
TGF2.GenerateFoldConstants(Polynomial, Width, IsInputReflected,
1474+
FPclmulConstants);
14741475
FHasPclmulConstants := True;
14751476
end;
14761477
end;
@@ -1515,17 +1516,35 @@ procedure TCRC.TransformBytes(const AData: THashLibByteArray;
15151516

15161517
if Width > Delta then
15171518
begin
1518-
if FHasPclmulConstants and IsInputReflected and (ALength >= 64)
1519-
and Assigned(CRC_Fold_Lsb) then
1519+
if FHasPclmulConstants and (ALength >= 64) then
15201520
begin
1521-
LState[0] := FHash;
1522-
LState[1] := 0;
1523-
LProcessed := ALength and (not Int32(15));
1524-
FHash := CRC_Fold_Lsb(LPtrAData + AIndex, UInt32(LProcessed),
1525-
@LState[0], @FPclmulConstants) and FCRCMask;
1526-
LTail := ALength - LProcessed;
1527-
if LTail > 0 then
1528-
CalculateCRCbyTable(LPtrAData, LTail, AIndex + LProcessed);
1521+
if IsInputReflected and Assigned(CRC_Fold_Lsb) then
1522+
begin
1523+
LState[0] := FHash;
1524+
LState[1] := 0;
1525+
LProcessed := ALength and (not Int32(15));
1526+
FHash := CRC_Fold_Lsb(LPtrAData + AIndex, UInt32(LProcessed),
1527+
@LState[0], @FPclmulConstants) and FCRCMask;
1528+
LTail := ALength - LProcessed;
1529+
if LTail > 0 then
1530+
CalculateCRCbyTable(LPtrAData, LTail, AIndex + LProcessed);
1531+
end
1532+
else if (not IsInputReflected) and Assigned(CRC_Fold_Msb) then
1533+
begin
1534+
if Width < 64 then
1535+
LState[0] := FHash shl (64 - Width)
1536+
else
1537+
LState[0] := FHash;
1538+
LState[1] := 0;
1539+
LProcessed := ALength and (not Int32(15));
1540+
FHash := CRC_Fold_Msb(LPtrAData + AIndex, UInt32(LProcessed),
1541+
@LState[0], @FPclmulConstants) and FCRCMask;
1542+
LTail := ALength - LProcessed;
1543+
if LTail > 0 then
1544+
CalculateCRCbyTable(LPtrAData, LTail, AIndex + LProcessed);
1545+
end
1546+
else
1547+
CalculateCRCbyTable(LPtrAData, ALength, AIndex);
15291548
end
15301549
else
15311550
CalculateCRCbyTable(LPtrAData, ALength, AIndex);

HashLib/src/Checksum/HlpCRCDispatch.pas

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ interface
1313

1414
var
1515
CRC_Fold_Lsb: TCRCFoldFunc;
16+
CRC_Fold_Msb: TCRCFoldFunc;
1617

1718
implementation
1819

@@ -33,16 +34,35 @@ function CRC_Fold_Vpclmul(AData: PByte; ALength: UInt32;
3334
{$I ..\Include\Simd\CRC\CRCFoldVpclmul.inc}
3435
end;
3536

37+
function CRC_Fold_Pclmul_Msb(AData: PByte; ALength: UInt32;
38+
AState: Pointer; AConstants: Pointer): UInt64;
39+
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
40+
{$I ..\Include\Simd\CRC\CRCFoldPclmulMsb.inc}
41+
end;
42+
43+
function CRC_Fold_Vpclmul_Msb(AData: PByte; ALength: UInt32;
44+
AState: Pointer; AConstants: Pointer): UInt64;
45+
{$I ..\Include\Simd\Common\SimdProc4Begin.inc}
46+
{$I ..\Include\Simd\CRC\CRCFoldVpclmulMsb.inc}
47+
end;
48+
3649
{$ENDIF HASHLIB_X86_64}
3750

3851
procedure InitDispatch();
3952
begin
4053
CRC_Fold_Lsb := nil;
54+
CRC_Fold_Msb := nil;
4155
{$IFDEF HASHLIB_X86_64}
4256
if TSimd.HasVPCLMULQDQ() then
43-
CRC_Fold_Lsb := @CRC_Fold_Vpclmul
57+
begin
58+
CRC_Fold_Lsb := @CRC_Fold_Vpclmul;
59+
CRC_Fold_Msb := @CRC_Fold_Vpclmul_Msb;
60+
end
4461
else if TSimd.HasPCLMULQDQ() then
62+
begin
4563
CRC_Fold_Lsb := @CRC_Fold_Pclmul;
64+
CRC_Fold_Msb := @CRC_Fold_Pclmul_Msb;
65+
end;
4666
{$ENDIF}
4767
end;
4868

HashLib/src/Checksum/HlpGF2.pas

Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@ TUInt128 = record
1010
end;
1111

1212
// PCLMULQDQ / VPCLMULQDQ CRC folding and Barrett reduction constants.
13-
// Layout must match the assembly expectations in CRCFoldPclmul.inc
14-
// and CRCFoldVpclmul.inc.
13+
// Layout must match the assembly expectations in the CRCFold*.inc files.
1514
TCRCFoldConstants = packed record
1615
Fold_4x128: array [0 .. 1] of UInt64; // offset 0: fold-by-4 constants (stride 512)
1716
Fold_1x128: array [0 .. 1] of UInt64; // offset 16: fold-by-1 constants (stride 128)
1817
Barrett: array [0 .. 1] of UInt64; // offset 32: Barrett reduction constants
1918
Fold_8x128: array [0 .. 1] of UInt64; // offset 48: fold-by-8 constants (stride 1024)
19+
BswapMask: array [0 .. 15] of Byte; // offset 64: byte-reverse mask for MSB-first CRCs
20+
CrcBits: UInt64; // offset 80: CRC width (8..64)
21+
BarrettShift: UInt64; // offset 88: = 64 - CrcBits (MSB psrlq alignment)
2022
end;
2123

2224
TGF2 = class sealed
@@ -214,8 +216,8 @@ class function TGF2.BitReverse(AValue: UInt64; ANumBits: Int32): UInt64;
214216
class procedure TGF2.GenerateFoldConstants(APoly: UInt64; ABits: Int32;
215217
AReflected: Boolean; out AConstants: TCRCFoldConstants);
216218
var
217-
LK, LPowOfX: Int32;
218-
LConst0, LConst1, LBarrett0, LBarrett1, LGMinusXn: UInt64;
219+
LK, LPowOfX, I: Int32;
220+
LConst0, LConst1, LBarrett0, LGMinusXn: UInt64;
219221
LDiv128: TUInt128;
220222
begin
221223
// Following the Linux kernel gen-crc-consts.py algorithm.
@@ -271,40 +273,61 @@ class procedure TGF2.GenerateFoldConstants(APoly: UInt64; ABits: Int32;
271273
end;
272274

273275
// --- Barrett reduction constants ---
274-
// barrett[0] = floor(x^(63+n) / G)
275-
LDiv128.Lo := 0;
276-
LDiv128.Hi := 0;
277-
if (63 + ABits) < 64 then
278-
LDiv128.Lo := UInt64(1) shl (63 + ABits)
279-
else if (63 + ABits) = 64 then
280-
LDiv128.Hi := 1
281-
else
282-
LDiv128.Hi := UInt64(1) shl ((63 + ABits) - 64);
283-
LBarrett0 := DivPoly(LDiv128, APoly, ABits);
284-
285-
// barrett[1] = (G - x^n) * x^(64-n-1) for n < 64
286-
// = ((G - x^n) - x^0) / x for n = 64
287276
LGMinusXn := APoly;
288-
if ABits < 64 then
289-
begin
290-
LPowOfX := 64 - ABits - 1;
291-
LBarrett1 := LGMinusXn shl LPowOfX;
292-
end
293-
else
294-
begin
295-
LBarrett1 := LGMinusXn shr 1;
296-
end;
297277

298278
if AReflected then
299279
begin
280+
// LSB-first: m = 63.
281+
// Barrett[0] = bitreflect(floor(x^(63+n) / G), 64)
282+
// Barrett[1] = bitreflect((G - x^n) * x^(64-n-1), 64) for n < 64; bitreflect(((G - x^n) - x^0) / x, 64) for n = 64
283+
LDiv128.Lo := 0;
284+
LDiv128.Hi := 0;
285+
if (63 + ABits) < 64 then
286+
LDiv128.Lo := UInt64(1) shl (63 + ABits)
287+
else if (63 + ABits) = 64 then
288+
LDiv128.Hi := 1
289+
else
290+
LDiv128.Hi := UInt64(1) shl ((63 + ABits) - 64);
291+
LBarrett0 := DivPoly(LDiv128, APoly, ABits);
292+
300293
AConstants.Barrett[0] := BitReverse(LBarrett0, 64);
301-
AConstants.Barrett[1] := BitReverse(LBarrett1, 64);
294+
if ABits < 64 then
295+
begin
296+
LPowOfX := 64 - ABits - 1;
297+
AConstants.Barrett[1] := BitReverse(LGMinusXn shl LPowOfX, 64);
298+
end
299+
else
300+
AConstants.Barrett[1] := BitReverse(LGMinusXn shr 1, 64);
302301
end
303302
else
304303
begin
305-
AConstants.Barrett[0] := LBarrett1;
306-
AConstants.Barrett[1] := LBarrett0;
304+
// MSB-first: m = 64. Two-round Barrett per Linux kernel.
305+
// Barrett[0] = floor(x^(64+n) / G) - x^64 (mu, low 64 bits)
306+
// Since floor(x^(64+n)/G) = x^64 + floor(APoly*x^64 / G),
307+
// we compute Barrett[0] = DivPoly(APoly*x^64, G) to avoid the x^64 overflow.
308+
// Barrett[1] = G (full generator polynomial; for n=64, x^64 term implicit)
309+
LDiv128.Hi := APoly;
310+
LDiv128.Lo := 0;
311+
AConstants.Barrett[0] := DivPoly(LDiv128, APoly, ABits);
312+
if ABits < 64 then
313+
AConstants.Barrett[1] := (UInt64(1) shl ABits) or APoly
314+
else
315+
AConstants.Barrett[1] := APoly;
307316
end;
317+
318+
// --- Byte-swap mask: byte indices 15..0 for MSB-first CRCs; zero-filled for reflected CRCs ---
319+
if AReflected then
320+
FillChar(AConstants.BswapMask[0], 16, 0)
321+
else
322+
for I := 0 to 15 do
323+
AConstants.BswapMask[I] := Byte(15 - I);
324+
325+
// --- Metadata ---
326+
AConstants.CrcBits := UInt64(ABits);
327+
if ABits < 64 then
328+
AConstants.BarrettShift := UInt64(64 - ABits)
329+
else
330+
AConstants.BarrettShift := 0;
308331
end;
309332

310333
end.

HashLib/src/Include/Simd/CRC/CRCFoldPclmul.inc

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// SSE2 + PCLMULQDQ (plus SSE4.1 pextrq for the final extract) CRC folding + Barrett reduction for reflected (LSB-first)
2-
// CRCs with width 8..32 (CRC-64 requires a separate path).
2+
// CRCs with width 8..64. CRC-64 uses a special Barrett path for the 65-bit G.
33
// PCLMULQDQ instructions are db-encoded for broad assembler compatibility.
44
//
55
// Function signature (included after SimdProc4Begin.inc):
@@ -14,6 +14,8 @@
1414
// [16..31] = Fold_1x128
1515
// [32..47] = Barrett
1616
// [48..63] = Fold_8x128 (unused by this path)
17+
// [64..79] = BswapMask (unused by this path)
18+
// [80..87] = CrcBits
1719
// Returns: final CRC in RAX.
1820
//
1921
// Reference: Linux kernel crc-pclmul-template.S by Eric Biggers (Google).
@@ -143,19 +145,36 @@
143145
// Step 2: Barrett reduction.
144146
// Load Barrett constants into xmm6:
145147
// xmm6.lo = bitreflect(floor(x^(63+n)/G), 64)
146-
// xmm6.hi = bitreflect((G - x^n) * x^(64-n-1), 64)
148+
// xmm6.hi:
149+
// n < 64: bitreflect((G - x^n) * x^(64-n-1), 64)
150+
// n = 64: bitreflect(((G - x^n) - x^0) / x, 64)
147151
movdqu xmm6, dqword ptr [r9 + 32]
148152

149-
// t1 = clmul(t0.lo, barrett[0])
153+
// t1 = clmul(t0.lo, barrett[0]) => q = floor(t0 / G)
150154
movdqa xmm1, xmm0
151155
db $66, $0F, $3A, $44, $CE, $00 // pclmulqdq xmm1, xmm6, $00
152156

153-
// t2 = clmul(t1.lo, barrett[1]) = G * floor(t0 / G)
157+
// Check if CRC-64: G is 65-bit, needs split multiplication
158+
cmp qword ptr [r9 + 80], 64
159+
je @barrett_64
160+
161+
// Standard Barrett (width < 64): single pclmulqdq covers full G
154162
db $66, $0F, $3A, $44, $CE, $10 // pclmulqdq xmm1, xmm6, $10
163+
pxor xmm0, xmm1
164+
jmp @extract_crc
155165

156-
// xmm0 = t0 XOR t2, CRC is in bits 64..(64+n-1)
166+
@barrett_64:
167+
// CRC-64 Barrett: G is 65-bit, split into 64-bit pclmulqdq + x^0 correction.
168+
// Barrett[1] stores ((G - x^64) - x^0) / x; the x^0 term is handled
169+
// separately by duplicating q.lo and XORing it into the result.
170+
movdqa xmm2, xmm1 // save q
171+
db $66, $0F, $3A, $44, $CE, $10 // pclmulqdq xmm1, xmm6, $10
172+
// punpcklqdq xmm2, xmm2: 66 0F 6C D2
173+
db $66, $0F, $6C, $D2 // xmm2 = {q.lo, q.lo}
157174
pxor xmm0, xmm1
175+
pxor xmm0, xmm2
158176

177+
@extract_crc:
159178
// Extract high qword (bits 64..127) into rax.
160179
// pextrq rax, xmm0, 1: 66 48 0F 3A 16 C0 01
161180
db $66, $48, $0F, $3A, $16, $C0, $01

0 commit comments

Comments
 (0)