Skip to content

Commit 6ffe7c6

Browse files
committed
reduce min simd size for crc to 16 bytes
1 parent def47d7 commit 6ffe7c6

5 files changed

Lines changed: 77 additions & 19 deletions

File tree

HashLib/src/Checksum/HlpCRC.pas

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -590,6 +590,7 @@ TCRC = class sealed(THash, IChecksum, ICRC, ITransformBlock)
590590

591591
const
592592
Delta = Int32(7);
593+
MinSimdBytes = Int32(16);
593594

594595
function GetNames: THashLibStringArray; inline;
595596
procedure SetNames(const AValue: THashLibStringArray); inline;
@@ -1516,7 +1517,7 @@ procedure TCRC.TransformBytes(const AData: THashLibByteArray;
15161517

15171518
if Width > Delta then
15181519
begin
1519-
if FHasPclmulConstants and (ALength >= 64) then
1520+
if FHasPclmulConstants and (ALength >= MinSimdBytes) then
15201521
begin
15211522
if IsInputReflected and Assigned(CRC_Fold_Lsb) then
15221523
begin

HashLib/src/Include/Simd/CRC/CRCFoldPclmul.inc

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
// AState: Pointer; AConstants: Pointer): UInt64;
88
//
99
// Register mapping (MS x64 ABI after prologue):
10-
// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants
10+
// rcx = AData, edx = ALength (>= 16), r8 = AState, r9 = AConstants
1111
// AState: [0..7] = initial CRC (reflected), [8..15] = 0.
1212
// AConstants layout (TCRCFoldConstants):
1313
// [0..15] = Fold_4x128
@@ -26,17 +26,29 @@
2626
// xmm0=000 xmm1=001 xmm2=010 xmm3=011 xmm4=100
2727
// xmm5=101 xmm6=110 xmm7=111
2828

29-
// --- Pre-load fold constants into registers ---
30-
movdqu xmm6, dqword ptr [r9] // fold_4x128
29+
// fold_1x128 is always needed (fold-by-1 loop and Barrett step 1)
3130
movdqu xmm7, dqword ptr [r9 + 16] // fold_1x128
3231

33-
// --- Load first 64 bytes ---
32+
cmp edx, 64
33+
jae @large_input
34+
35+
// --- Small input (16..63 bytes): load 1 block ---
36+
movdqu xmm0, dqword ptr [rcx]
37+
movq xmm4, qword ptr [r8]
38+
pxor xmm0, xmm4
39+
add rcx, 16
40+
sub edx, 16
41+
jmp @fold1_check
42+
43+
@large_input:
44+
// --- Large input (>= 64 bytes): load 4 blocks ---
45+
movdqu xmm6, dqword ptr [r9] // fold_4x128
46+
3447
movdqu xmm0, dqword ptr [rcx]
3548
movdqu xmm1, dqword ptr [rcx + 16]
3649
movdqu xmm2, dqword ptr [rcx + 32]
3750
movdqu xmm3, dqword ptr [rcx + 48]
3851

39-
// XOR current CRC into the low bits of xmm0
4052
movq xmm4, qword ptr [r8]
4153
pxor xmm0, xmm4
4254

@@ -110,6 +122,7 @@
110122
pxor xmm0, xmm3
111123

112124
// --- Fold remaining 16-byte blocks ---
125+
@fold1_check:
113126
cmp edx, 16
114127
jb @fold1_done
115128

HashLib/src/Include/Simd/CRC/CRCFoldPclmulMsb.inc

Lines changed: 21 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
// AState: Pointer; AConstants: Pointer): UInt64;
88
//
99
// Register mapping (MS x64 ABI after prologue):
10-
// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants
10+
// rcx = AData, edx = ALength (>= 16), r8 = AState, r9 = AConstants
1111
// AState: [0..7] = initial CRC pre-shifted left by (64 - Width), [8..15] = 0.
1212
// AConstants layout (TCRCFoldConstants):
1313
// [0..15] = Fold_4x128
@@ -29,11 +29,27 @@
2929
// pshufb xmm_dst, xmm_src (SSSE3):
3030
// 66 0F 38 00 ModRM
3131

32-
// --- Pre-load constants ---
33-
movdqu xmm6, dqword ptr [r9 + 64] // BswapMask into xmm6 (kept throughout)
32+
// BswapMask is always needed (kept in xmm6 throughout)
33+
movdqu xmm6, dqword ptr [r9 + 64] // BswapMask
34+
35+
cmp edx, 64
36+
jae @large_input
37+
38+
// --- Small input (16..63 bytes): load 1 block with byte-swap ---
39+
movdqu xmm7, dqword ptr [r9 + 16] // fold_1x128
40+
movdqu xmm0, dqword ptr [rcx]
41+
db $66, $0F, $38, $00, $C6 // pshufb xmm0, xmm6
42+
movq xmm4, qword ptr [r8]
43+
pslldq xmm4, 8
44+
pxor xmm0, xmm4
45+
add rcx, 16
46+
sub edx, 16
47+
jmp @fold1_check
48+
49+
@large_input:
50+
// --- Large input (>= 64 bytes): load 4 blocks with byte-swap ---
3451
movdqu xmm7, dqword ptr [r9] // fold_4x128
3552

36-
// --- Load first 64 bytes with byte-swap ---
3753
movdqu xmm0, dqword ptr [rcx]
3854
db $66, $0F, $38, $00, $C6 // pshufb xmm0, xmm6
3955
movdqu xmm1, dqword ptr [rcx + 16]
@@ -43,7 +59,6 @@
4359
movdqu xmm3, dqword ptr [rcx + 48]
4460
db $66, $0F, $38, $00, $DE // pshufb xmm3, xmm6
4561

46-
// XOR pre-shifted CRC into the HIGH qword of xmm0 (MSB-first placement)
4762
movq xmm4, qword ptr [r8]
4863
pslldq xmm4, 8
4964
pxor xmm0, xmm4
@@ -123,6 +138,7 @@
123138
pxor xmm0, xmm3
124139

125140
// --- Fold remaining 16-byte blocks ---
141+
@fold1_check:
126142
cmp edx, 16
127143
jb @fold1_done
128144

HashLib/src/Include/Simd/CRC/CRCFoldVpclmul.inc

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
// AState: Pointer; AConstants: Pointer): UInt64;
88
//
99
// Register mapping (MS x64 ABI after prologue):
10-
// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants
10+
// rcx = AData, edx = ALength (>= 16), r8 = AState, r9 = AConstants
1111
// AState: [0..7] = initial CRC (reflected), [8..15] = 0.
1212
// AConstants layout (TCRCFoldConstants):
1313
// [0..15] = Fold_4x128 (stride 512)
@@ -111,15 +111,27 @@
111111
jmp @fold4to1
112112

113113
@xmm_path:
114-
// ===== XMM PATH (64..127 bytes) =====
114+
// ===== XMM PATH (16..127 bytes) =====
115115

116-
// Load first 64 bytes
116+
cmp edx, 64
117+
jae @xmm_large
118+
119+
// --- Small input (16..63 bytes): load 1 block ---
120+
db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx]
121+
db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8]
122+
db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4
123+
add rcx, 16
124+
sub edx, 16
125+
db $C4, $C1, $7A, $6F, $79, $10 // vmovdqu xmm7, [r9 + 16]
126+
jmp @tail_check
127+
128+
@xmm_large:
129+
// --- Large input (64..127 bytes): load 4 blocks ---
117130
db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx]
118131
db $C5, $FA, $6F, $49, $10 // vmovdqu xmm1, [rcx + 16]
119132
db $C5, $FA, $6F, $51, $20 // vmovdqu xmm2, [rcx + 32]
120133
db $C5, $FA, $6F, $59, $30 // vmovdqu xmm3, [rcx + 48]
121134

122-
// XOR initial CRC into low 64 bits of xmm0
123135
db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8]
124136
db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4
125137

@@ -149,6 +161,7 @@
149161
db $C5, $F9, $EF, $C3 // vpxor xmm0, xmm0, xmm3
150162

151163
// ===== FOLD REMAINING 16-BYTE BLOCKS =====
164+
@tail_check:
152165
cmp edx, 16
153166
jb @tail_done
154167

HashLib/src/Include/Simd/CRC/CRCFoldVpclmulMsb.inc

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
// AState: Pointer; AConstants: Pointer): UInt64;
99
//
1010
// Register mapping (MS x64 ABI after prologue):
11-
// rcx = AData, edx = ALength (>= 64), r8 = AState, r9 = AConstants
11+
// rcx = AData, edx = ALength (>= 16), r8 = AState, r9 = AConstants
1212
// AState: [0..7] = initial CRC pre-shifted left by (64 - Width), [8..15] = 0.
1313
// AConstants layout (TCRCFoldConstants):
1414
// [0..15] = Fold_4x128 (stride 512)
@@ -122,12 +122,27 @@
122122
jmp @fold4to1
123123

124124
@xmm_path:
125-
// ===== XMM PATH (64..127 bytes) =====
125+
// ===== XMM PATH (16..127 bytes) =====
126126

127127
// Load BswapMask into xmm5
128128
db $C4, $C1, $7A, $6F, $69, $40 // vmovdqu xmm5, [r9 + 64]
129129

130-
// Load first 64 bytes with byte-swap
130+
cmp edx, 64
131+
jae @xmm_large
132+
133+
// --- Small input (16..63 bytes): load 1 block with byte-swap ---
134+
db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx]
135+
db $C4, $E2, $79, $00, $C5 // vpshufb xmm0, xmm0, xmm5
136+
db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8]
137+
db $C5, $D9, $73, $FC, $08 // vpslldq xmm4, xmm4, 8
138+
db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4
139+
add rcx, 16
140+
sub edx, 16
141+
db $C4, $C1, $7A, $6F, $79, $10 // vmovdqu xmm7, [r9 + 16]
142+
jmp @tail_check
143+
144+
@xmm_large:
145+
// --- Large input (64..127 bytes): load 4 blocks with byte-swap ---
131146
db $C5, $FA, $6F, $01 // vmovdqu xmm0, [rcx]
132147
db $C4, $E2, $79, $00, $C5 // vpshufb xmm0, xmm0, xmm5
133148
db $C5, $FA, $6F, $49, $10 // vmovdqu xmm1, [rcx + 16]
@@ -137,7 +152,6 @@
137152
db $C5, $FA, $6F, $59, $30 // vmovdqu xmm3, [rcx + 48]
138153
db $C4, $E2, $61, $00, $DD // vpshufb xmm3, xmm3, xmm5
139154

140-
// XOR pre-shifted CRC into HIGH qword of xmm0
141155
db $C4, $C1, $7A, $7E, $20 // vmovq xmm4, [r8]
142156
db $C5, $D9, $73, $FC, $08 // vpslldq xmm4, xmm4, 8
143157
db $C5, $F9, $EF, $C4 // vpxor xmm0, xmm0, xmm4
@@ -168,6 +182,7 @@
168182
db $C5, $F9, $EF, $C3 // vpxor xmm0, xmm0, xmm3
169183

170184
// ===== FOLD REMAINING 16-BYTE BLOCKS =====
185+
@tail_check:
171186
cmp edx, 16
172187
jb @tail_done
173188

0 commit comments

Comments
 (0)