Skip to content

Commit 8a17534

Browse files
authored
Add AVX2 multi-block Keccak absorb with jagged state layout (#46)
Add AVX2 multi-block Keccak absorb with jagged state layout

Implement SIMD-optimized SHA3/Keccak absorb that fuses data XOR + permutation into a single loop with one gather/scatter, eliminating the per-block gather/scatter overhead that made the initial AVX2 permutation slower than scalar.

Key changes:
- New KeccakF1600Avx2Absorb.inc: multi-block absorb assembly using Andy Polyakov's plane-per-register technique (CRYPTOGAMS/XKCP), with jagged buffer on stack and A_JAGGED offset table for XOR
- New SimdProc5Begin.inc: shared 5-parameter ABI prologue
- Extended K_KECCAK with Jagged offset table mapping standard state indices to jagged buffer positions
- Added TKeccakF1600AbsorbProc dispatch (AVX2 + scalar fallback)
- Overrode TSHA3.TransformBytes for multi-block absorb path
- Consolidated TransformBlock to delegate to KeccakF1600_Absorb, making scalar absorb endian-safe via le64_copy
1 parent 79ff5d4 commit 8a17534

4 files changed

Lines changed: 544 additions & 11 deletions

File tree

HashLib/src/Crypto/HlpSHA3.pas

Lines changed: 38 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ TSHA3 = class abstract(TBlockHash, ICryptoNotBuildIn, ITransformBlock)
4848

4949
public
5050
procedure Initialize; override;
51+
procedure TransformBytes(const AData: THashLibByteArray;
52+
AIndex, ALength: Int32); override;
5153

5254
end;
5355

@@ -404,21 +406,47 @@ procedure TSHA3.Initialize;
404406
end;
405407

406408
procedure TSHA3.TransformBlock(AData: PByte; ADataLength: Int32; AIndex: Int32);
409+
begin
410+
KeccakF1600_Absorb(@FState[0], AData + AIndex, 1, BlockSize);
411+
end;
412+
413+
procedure TSHA3.TransformBytes(const AData: THashLibByteArray;
414+
AIndex, ALength: Int32);
407415
var
408-
LData: array [0 .. 20] of UInt64;
409-
LInnerIdx, LBlockCount: Int32;
416+
LPtrData: PByte;
417+
LBlockCount: Int32;
410418
begin
411-
TConverters.le64_copy(AData, AIndex, @(LData[0]), 0, ADataLength);
412-
LInnerIdx := 0;
413-
LBlockCount := BlockSize shr 3;
414-
while LInnerIdx < LBlockCount do
419+
{$IFDEF DEBUG}
420+
System.Assert(AIndex >= 0);
421+
System.Assert(ALength >= 0);
422+
System.Assert(AIndex + ALength <= System.Length(AData));
423+
{$ENDIF DEBUG}
424+
LPtrData := PByte(AData);
425+
426+
if (not FBuffer.IsEmpty) then
427+
begin
428+
if (FBuffer.Feed(LPtrData, System.Length(AData), AIndex, ALength,
429+
FProcessedBytesCount)) then
430+
begin
431+
TransformBuffer();
432+
end;
433+
end;
434+
435+
LBlockCount := ALength div FBuffer.Length;
436+
if LBlockCount > 0 then
415437
begin
416-
FState[LInnerIdx] := FState[LInnerIdx] xor LData[LInnerIdx];
417-
System.Inc(LInnerIdx);
438+
FProcessedBytesCount := FProcessedBytesCount +
439+
UInt64(LBlockCount) * UInt64(FBuffer.Length);
440+
KeccakF1600_Absorb(@FState[0], LPtrData + AIndex, LBlockCount, BlockSize);
441+
AIndex := AIndex + (LBlockCount * FBuffer.Length);
442+
ALength := ALength - (LBlockCount * FBuffer.Length);
418443
end;
419444

420-
KeccakF1600_Permute(@FState[0]);
421-
System.FillChar(LData, System.SizeOf(LData), UInt64(0));
445+
if (ALength > 0) then
446+
begin
447+
FBuffer.Feed(LPtrData, System.Length(AData), AIndex, ALength,
448+
FProcessedBytesCount);
449+
end;
422450
end;
423451

424452
{ TSHA3_224 }

HashLib/src/Crypto/HlpSHA3Dispatch.pas

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,18 @@ interface
66

77
type
88
TKeccakF1600Proc = procedure(AState: Pointer);
9+
TKeccakF1600AbsorbProc = procedure(AState: Pointer; AData: PByte;
10+
ABlockCount: Int32; ABlockSize: Int32);
911

1012
var
1113
KeccakF1600_Permute: TKeccakF1600Proc;
14+
KeccakF1600_Absorb: TKeccakF1600AbsorbProc;
1215

1316
implementation
1417

1518
uses
1619
HlpBits,
20+
HlpConverters,
1721
HlpSimd;
1822

1923
// =============================================================================
@@ -377,6 +381,30 @@ procedure KeccakF1600_Scalar(AState: Pointer);
377381
{$ENDIF USE_UNROLLED_VARIANT}
378382
end;
379383

384+
// =============================================================================
385+
// Scalar absorb: XOR + permute loop (no SIMD)
386+
// =============================================================================
387+
388+
procedure KeccakF1600_Absorb_Scalar(AState: Pointer; AData: PByte;
389+
ABlockCount: Int32; ABlockSize: Int32);
390+
var
391+
LPState: PUInt64;
392+
LData: array [0 .. 20] of UInt64;
393+
LBlockSizeWords, I, J: Int32;
394+
begin
395+
LPState := PUInt64(AState);
396+
LBlockSizeWords := ABlockSize shr 3;
397+
for I := 0 to ABlockCount - 1 do
398+
begin
399+
TConverters.le64_copy(AData, 0, @LData[0], 0, ABlockSize);
400+
for J := 0 to LBlockSizeWords - 1 do
401+
LPState[J] := LPState[J] xor LData[J];
402+
KeccakF1600_Scalar(AState);
403+
System.Inc(AData, ABlockSize);
404+
end;
405+
System.FillChar(LData, System.SizeOf(LData), UInt64(0));
406+
end;
407+
380408
// =============================================================================
381409
// SIMD implementations (x86-64 only)
382410
// =============================================================================
@@ -388,6 +416,7 @@ procedure KeccakF1600_Scalar(AState: Pointer);
388416
RhotatesLeft: array [0..23] of UInt64;
389417
RhotatesRight: array [0..23] of UInt64;
390418
Iotas: array [0..95] of UInt64;
419+
Jagged: array [0..24] of Int32;
391420
end = (
392421
RhotatesLeft: (
393422
3, 18, 36, 41, // ymm2: [2][0] [4][0] [1][0] [3][0]
@@ -427,7 +456,11 @@ procedure KeccakF1600_Scalar(AState: Pointer);
427456
$8000000080008081, $8000000080008081, $8000000080008081, $8000000080008081,
428457
$8000000000008080, $8000000000008080, $8000000000008080, $8000000000008080,
429458
$0000000080000001, $0000000080000001, $0000000080000001, $0000000080000001,
430-
$8000000080008008, $8000000080008008, $8000000080008008, $8000000080008008)
459+
$8000000080008008, $8000000080008008, $8000000080008008, $8000000080008008);
460+
Jagged: (
461+
0, 32, 40, 48, 56, 80, 192, 104, 144, 184,
462+
64, 128, 200, 176, 120, 88, 96, 168, 208, 152,
463+
72, 160, 136, 112, 216)
431464
);
432465

433466
procedure KeccakF1600_Avx2(AState: Pointer; AConstants: Pointer);
@@ -440,6 +473,18 @@ procedure KeccakF1600_Avx2_Wrap(AState: Pointer);
440473
KeccakF1600_Avx2(AState, @K_KECCAK);
441474
end;
442475

476+
procedure KeccakF1600_Avx2_Absorb(AState: Pointer; AData: PByte;
477+
ABlockCount: Int32; ABlockSize: Int32; AConstants: Pointer);
478+
{$I ..\Include\Simd\Common\SimdProc5Begin.inc}
479+
{$I ..\Include\Simd\SHA3\KeccakF1600Avx2Absorb.inc}
480+
end;
481+
482+
procedure KeccakF1600_Avx2_Absorb_Wrap(AState: Pointer; AData: PByte;
483+
ABlockCount: Int32; ABlockSize: Int32);
484+
begin
485+
KeccakF1600_Avx2_Absorb(AState, AData, ABlockCount, ABlockSize, @K_KECCAK);
486+
end;
487+
443488
{$ENDIF HASHLIB_X86_64}
444489

445490
// =============================================================================
@@ -453,11 +498,13 @@ procedure InitDispatch();
453498
TSimdLevel.AVX2:
454499
begin
455500
KeccakF1600_Permute := @KeccakF1600_Avx2_Wrap;
501+
KeccakF1600_Absorb := @KeccakF1600_Avx2_Absorb_Wrap;
456502
end;
457503
{$ENDIF}
458504
TSimdLevel.Scalar:
459505
begin
460506
KeccakF1600_Permute := @KeccakF1600_Scalar;
507+
KeccakF1600_Absorb := @KeccakF1600_Absorb_Scalar;
461508
end;
462509
end;
463510
end;
HashLib/src/Include/Simd/Common/SimdProc5Begin.inc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Shared SIMD procedure prologue for 5-parameter assembly functions.
2+
// After inclusion: rcx = param1, rdx = param2, r8 = param3, r9 = param4, r10 = param5
3+
// (MS x64 ABI).
4+
// On FPC non-Windows (System V ABI), remaps rdi,rsi,rdx,rcx,r8 -> rcx,rdx,r8,r9,r10.
5+
// Move order avoids register clobbering.
6+
// On MS x64, param5 is loaded from [rsp+40] (after shadow space).
7+
// Usage:
8+
// procedure MyProc(P1, P2: Pointer; P3, P4: Int32; P5: Pointer);
9+
// {$I SimdProc5Begin.inc}
10+
// // ... SIMD instructions using rcx, rdx, r8, r9, r10 ...
11+
// end;
12+
{$IFDEF FPC}
13+
assembler; nostackframe;
14+
asm
15+
{$IFNDEF MSWINDOWS}
16+
mov r10, r8
17+
mov r9, rcx
18+
mov r8, rdx
19+
mov rdx, rsi
20+
mov rcx, rdi
21+
{$ELSE}
22+
mov r10, [rsp + 40]
23+
{$ENDIF}
24+
{$ELSE}
25+
asm
26+
.noframe
27+
mov r10, [rsp + 40]
28+
{$ENDIF}

0 commit comments

Comments
 (0)