Skip to content

Commit 01cd822

Browse files
authored
Blake3 Perf Enhancements (#49)
Scalar path and allocation improvements:
- Unroll the rounds in the Blake3_Compress_Scalar loop.
- Inline G.
- Reduce heap allocations by using stack-allocated arrays.

Add BLAKE3 cascade dispatch (Hash8 → Hash4 → scalar):
- Add cascade wrapper procedures (Blake3_HashMany_Avx2, Blake3_HashMany_Sse2) matching the official BLAKE3 C reference dispatch pattern: AVX2 hash8 → SSE2 hash4 → scalar hash_one.
- Update TransformBytes to pass variable chunk counts to HashMany instead of fixed ParallelDegree-sized batches, allowing the cascade to process any remainder (e.g. 5 leftover chunks → hash4 + 1 scalar).
1 parent 6062234 commit 01cd822

5 files changed

Lines changed: 4305 additions & 138 deletions

File tree

HashLib/src/Crypto/HlpBlake3.pas

Lines changed: 90 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,7 @@ TBlake3Node = record
8888
class function DefaultBlake3Node(): TBlake3Node; static;
8989

9090
public
91-
class function CreateBlake3Node(const ACV, ABlock: THashLibUInt32Array;
92-
ACounter: UInt64; ABlockLen, AFlags: UInt32): TBlake3Node; static;
93-
94-
class function ParentNode(const ALeft, ARight, AKey: THashLibUInt32Array;
91+
class function ParentNode(ALeft, ARight, AKey: PCardinal;
9592
AFlags: UInt32): TBlake3Node; static;
9693
end;
9794

@@ -122,7 +119,7 @@ TBlake3ChunkState = record
122119
class function DefaultBlake3ChunkState(): TBlake3ChunkState; static;
123120

124121
public
125-
class function CreateBlake3ChunkState(const AIV: THashLibUInt32Array;
122+
class function CreateBlake3ChunkState(AIV: PCardinal;
126123
AChunkCounter: UInt64; AFlags: UInt32): TBlake3ChunkState; static;
127124
end;
128125

@@ -150,7 +147,7 @@ TBlake3OutputReader = record
150147
function HasSubTreeAtHeight(AIdx: Int32): Boolean; inline;
151148

152149
// AddChunkChainingValue appends a chunk to the right edge of the Merkle tree.
153-
procedure AddChunkChainingValue(const ACV: THashLibUInt32Array); inline;
150+
procedure AddChunkChainingValue(ACV: PCardinal); inline;
154151

155152
class function Len64(AValue: UInt64): Int32; static;
156153

@@ -160,12 +157,12 @@ TBlake3OutputReader = record
160157
var
161158
FCS: TBlake3ChunkState;
162159
FOutputReader: TBlake3OutputReader;
163-
FKey: THashLibUInt32Array;
160+
FKey: array [0 .. 7] of UInt32;
164161
FFlags: UInt32;
165162

166163
// log(n) set of Merkle subtree roots, at most one per height.
167164
// stack [54][8]uint32
168-
FStack: THashLibMatrixUInt32Array; // 2^54 * chunkSize = 2^64
165+
FStack: array [0 .. 53, 0 .. 7] of UInt32; // 2^54 * chunkSize = 2^64
169166
// bit vector indicating which stack elems are valid; also number of chunks added
170167
FUsed: UInt64;
171168

@@ -179,7 +176,7 @@ TBlake3OutputReader = record
179176
const AKey: THashLibByteArray); overload;
180177

181178
constructor CreateInternal(AHashSize: Int32;
182-
const AKeyWords: THashLibUInt32Array; AFlags: UInt32);
179+
AKeyWords: PCardinal; AFlags: UInt32);
183180

184181
public
185182
constructor Create(AHashSize: THashSize = THashSize.hsHashSize256;
@@ -230,7 +227,7 @@ TBlake3XOF = class sealed(TBlake3, IXOF)
230227
constructor Create(AHashSize: Int32;
231228
const AKey: THashLibByteArray); overload;
232229

233-
constructor Create(AHashSize: Int32; const AKeyWords: THashLibUInt32Array;
230+
constructor Create(AHashSize: Int32; AKeyWords: PCardinal;
234231
AFlags: UInt32); overload;
235232

236233
procedure Initialize(); override;
@@ -259,19 +256,6 @@ class function TBlake3.TBlake3Node.DefaultBlake3Node: TBlake3Node;
259256
Result.Flags := 0;
260257
end;
261258

262-
class function TBlake3.TBlake3Node.CreateBlake3Node(const ACV,
263-
ABlock: THashLibUInt32Array; ACounter: UInt64; ABlockLen, AFlags: UInt32)
264-
: TBlake3Node;
265-
begin
266-
Result := DefaultBlake3Node();
267-
System.Move(ACV[0], Result.CV, System.Length(ACV) * System.SizeOf(UInt32));
268-
System.Move(ABlock[0], Result.Block, System.Length(ABlock) *
269-
System.SizeOf(UInt32));
270-
Result.Counter := ACounter;
271-
Result.BlockLen := ABlockLen;
272-
Result.Flags := AFlags;
273-
end;
274-
275259
function TBlake3.TBlake3Node.Clone: TBlake3Node;
276260
begin
277261
Result := DefaultBlake3Node();
@@ -301,14 +285,16 @@ procedure TBlake3.TBlake3Node.ChainingValue(AResult: PCardinal);
301285
System.Move(LFull, AResult[0], 8 * System.SizeOf(UInt32));
302286
end;
303287

// ParentNode builds the node that combines two child chaining values.
// Diff summary: the removed version concatenated ALeft and ARight into a
// temporary dynamic array and delegated to CreateBlake3Node; the added version
// takes raw pointers and copies straight into the result record.
// ALeft, ARight, AKey: each is read for exactly 8 UInt32 words (32 bytes) by
// the System.Move calls below, so each pointer must reference at least that
// much readable memory. No length check is performed here.
// AFlags: combined with FlagParent to form the node's Flags.
// Result: CV = the 8 key words, Block[0..7] = left words, Block[8..15] = right
// words, Counter = 0, BlockLen = BlockSizeInBytes.
304-
class function TBlake3.TBlake3Node.ParentNode(const ALeft, ARight,
305-
AKey: THashLibUInt32Array; AFlags: UInt32): TBlake3Node;
306-
var
307-
LBlockWords: THashLibUInt32Array;
288+
class function TBlake3.TBlake3Node.ParentNode(ALeft, ARight,
289+
AKey: PCardinal; AFlags: UInt32): TBlake3Node;
308290
begin
309-
LBlockWords := TArrayUtils.Concatenate(ALeft, ARight);
310-
Result := TBlake3Node.CreateBlake3Node(AKey, LBlockWords, 0, BlockSizeInBytes,
311-
AFlags or FlagParent);
291+
Result := DefaultBlake3Node();
// The key words become the chaining value of the parent node.
292+
System.Move(AKey^, Result.CV[0], 8 * System.SizeOf(UInt32));
// Left child fills the first half of the block, right child the second half.
293+
System.Move(ALeft^, Result.Block[0], 8 * System.SizeOf(UInt32));
294+
System.Move(ARight^, Result.Block[8], 8 * System.SizeOf(UInt32));
295+
Result.Counter := 0;
296+
Result.BlockLen := BlockSizeInBytes;
297+
Result.Flags := AFlags or FlagParent;
312298
end;
313299

314300
{ TBlake3.TBlake3ChunkState }
@@ -328,12 +314,11 @@ function TBlake3.TBlake3ChunkState.Complete: Boolean;
328314
end;
329315

330316
class function TBlake3.TBlake3ChunkState.CreateBlake3ChunkState
331-
(const AIV: THashLibUInt32Array; AChunkCounter: UInt64; AFlags: UInt32)
317+
(AIV: PCardinal; AChunkCounter: UInt64; AFlags: UInt32)
332318
: TBlake3ChunkState;
333319
begin
334320
Result := DefaultBlake3ChunkState;
335-
System.Move(AIV[0], Result.N.CV[0], System.Length(AIV) *
336-
System.SizeOf(UInt32));
321+
System.Move(AIV^, Result.N.CV[0], 8 * System.SizeOf(UInt32));
337322
Result.N.Counter := AChunkCounter;
338323
Result.N.BlockLen := BlockSizeInBytes;
339324
// compress the first block with the start flag set
@@ -531,30 +516,22 @@ class function TBlake3.TrailingZeros64(AValue: UInt64): Int32;
531516
end;
532517

// CreateInternal is the shared constructor body: it forwards the hash size and
// BlockSizeInBytes to the inherited constructor, then stores the key words and
// the flags.
// AKeyWords: read for exactly 8 UInt32 words (32 bytes) by the System.Move
// below; the words are copied into FKey, so the caller's buffer does not need
// to outlive this call.
// Diff summary: the removed lines allocated FStack as a 54 x 8 dynamic matrix;
// the added version performs no allocation because FStack is declared as a
// fixed-size array in the class.
// This constructor does not write FStack or FUsed; Initialize resets both.
533518
constructor TBlake3.CreateInternal(AHashSize: Int32;
534-
const AKeyWords: THashLibUInt32Array; AFlags: UInt32);
535-
var
536-
LIdx: Int32;
519+
AKeyWords: PCardinal; AFlags: UInt32);
537520
begin
538521
inherited Create(AHashSize, BlockSizeInBytes);
539-
FKey := System.Copy(AKeyWords);
522+
System.Move(AKeyWords^, FKey[0], 8 * System.SizeOf(UInt32));
540523
FFlags := AFlags;
541-
System.SetLength(FStack, 54);
542-
for LIdx := System.Low(FStack) to System.High(FStack) do
543-
begin
544-
System.SetLength(FStack[LIdx], 8);
545-
end;
546524
end;
547525

548526
constructor TBlake3.Create(AHashSize: Int32; const AKey: THashLibByteArray);
549527
var
550-
LKeyWords: THashLibUInt32Array;
528+
LKeyWords: array [0 .. 7] of UInt32;
551529
LKeyLength: Int32;
552530
begin
553-
System.SetLength(LKeyWords, 8);
554531
if AKey = nil then
555532
begin
556533
System.Move(IV, LKeyWords[0], System.SizeOf(IV));
557-
CreateInternal(AHashSize, LKeyWords, 0);
534+
CreateInternal(AHashSize, @LKeyWords[0], 0);
558535
end
559536
else
560537
begin
@@ -564,8 +541,8 @@ constructor TBlake3.Create(AHashSize: Int32; const AKey: THashLibByteArray);
564541
raise EArgumentOutOfRangeHashLibException.CreateResFmt(@SInvalidKeyLength,
565542
[KeyLengthInBytes, LKeyLength]);
566543
end;
567-
TConverters.le32_copy(PByte(AKey), 0, PCardinal(LKeyWords), 0, LKeyLength);
568-
CreateInternal(AHashSize, LKeyWords, FlagKeyedHash);
544+
TConverters.le32_copy(PByte(AKey), 0, @LKeyWords[0], 0, LKeyLength);
545+
CreateInternal(AHashSize, @LKeyWords[0], FlagKeyedHash);
569546
end;
570547
end;
571548

@@ -576,9 +553,9 @@ constructor TBlake3.Create(AHashSize: THashSize; const AKey: THashLibByteArray);
576553

// Initialize resets the hasher to its starting state: a fresh chunk state
// seeded from FKey with chunk counter 0 and FFlags, a default output reader,
// an all-zero subtree stack, and FUsed = 0.
577554
procedure TBlake3.Initialize;
578555
begin
579-
FCS := TBlake3ChunkState.CreateBlake3ChunkState(FKey, 0, FFlags);
556+
FCS := TBlake3ChunkState.CreateBlake3ChunkState(@FKey[0], 0, FFlags);
580557
FOutputReader := TBlake3OutputReader.DefaultBlake3OutputReader();
581-
TArrayUtils.ZeroFill(FStack);
// FStack is a fixed-size array in the added version, so one FillChar over
// SizeOf(FStack) bytes clears every slot.
558+
FillChar(FStack, System.SizeOf(FStack), 0);
582559
FUsed := 0;
583560
end;
584561

@@ -592,10 +569,10 @@ function TBlake3.Clone: IHash;
592569
var
593570
LHashInstance: TBlake3;
594571
begin
595-
LHashInstance := TBlake3.CreateInternal(HashSize, FKey, FFlags);
572+
LHashInstance := TBlake3.CreateInternal(HashSize, @FKey[0], FFlags);
596573
LHashInstance.FCS := FCS.Clone();
597574
LHashInstance.FOutputReader := FOutputReader.Clone();
598-
LHashInstance.FStack := TArrayUtils.Clone(FStack);
575+
System.Move(FStack, LHashInstance.FStack, System.SizeOf(FStack));
599576
LHashInstance.FUsed := FUsed;
600577
Result := LHashInstance;
601578
Result.BufferSize := BufferSize;
@@ -616,37 +593,30 @@ function TBlake3.HasSubTreeAtHeight(AIdx: Int32): Boolean;
616593
Result := (FUsed and (1 shl AIdx)) <> 0;
617594
end;
618595

// AddChunkChainingValue pushes one chunk chaining value onto the subtree
// stack, merging with existing subtrees on the way up.
// ACV: points to 8 UInt32 words. It is used as both input and scratch space:
// every merge writes the parent's chaining value back through ACV, so the
// caller's buffer is overwritten whenever at least one merge happens.
// On exit the (possibly merged) 8 words are stored in FStack[LIdx] and FUsed
// is incremented by one.
// NOTE(review): LIdx is not checked against the upper bound of FStack (53)
// here; presumably unreachable for realistic input sizes - confirm.
// NOTE(review): the loop condition relies on HasSubTreeAtHeight, which tests
// FUsed with "1 shl AIdx"; confirm that shift is evaluated in 64 bits for
// AIdx >= 32 on all supported compilers.
619-
procedure TBlake3.AddChunkChainingValue(const ACV: THashLibUInt32Array);
596+
procedure TBlake3.AddChunkChainingValue(ACV: PCardinal);
620597
var
621598
LIdx: Int32;
622-
LFlags: UInt32;
623-
LKey: THashLibUInt32Array;
624-
LPtrCV: PCardinal;
625599
begin
626-
LKey := FKey;
627-
LFlags := FFlags;
628-
LPtrCV := PCardinal(ACV);
629600
// seek to first open stack slot, merging subtrees as we go
630601
LIdx := 0;
631602
while HasSubTreeAtHeight(LIdx) do
632603
begin
633-
TBlake3Node.ParentNode(FStack[LIdx], ACV, LKey, LFlags)
634-
.ChainingValue(LPtrCV);
// ParentNode copies the words behind ACV into the returned node before
// ChainingValue writes its result back through the same pointer.
604+
TBlake3Node.ParentNode(@FStack[LIdx, 0], ACV, @FKey[0], FFlags)
605+
.ChainingValue(ACV);
635606
System.Inc(LIdx);
636607
end;
637-
FStack[LIdx] := System.Copy(ACV);
// Store the final chaining value in the first free slot.
608+
System.Move(ACV^, FStack[LIdx, 0], 8 * System.SizeOf(UInt32));
638609
System.Inc(FUsed);
639610
end;
640611

641612
function TBlake3.RootNode: TBlake3Node;
642613
var
643614
LIdx, LTrailingZeros64, LLen64: Int32;
644-
LTemp: THashLibUInt32Array;
615+
LTemp: array [0 .. 7] of UInt32;
645616
LPtrTemp: PCardinal;
646617
begin
647618
Result := FCS.Node();
648-
System.SetLength(LTemp, 8);
649-
LPtrTemp := PCardinal(LTemp);
619+
LPtrTemp := @LTemp[0];
650620

651621
LTrailingZeros64 := TrailingZeros64(FUsed);
652622
LLen64 := Len64(FUsed);
@@ -655,7 +625,8 @@ function TBlake3.RootNode: TBlake3Node;
655625
if HasSubTreeAtHeight(LIdx) then
656626
begin
657627
Result.ChainingValue(LPtrTemp);
658-
Result := TBlake3Node.ParentNode(FStack[LIdx], LTemp, FKey, FFlags);
628+
Result := TBlake3Node.ParentNode(@FStack[LIdx, 0], LPtrTemp,
629+
@FKey[0], FFlags);
659630
end;
660631
end;
661632
Result.Flags := Result.Flags or FlagRoot;
@@ -665,27 +636,64 @@ procedure TBlake3.TransformBytes(const AData: THashLibByteArray;
665636
AIndex, ADataLength: Int32);
666637
var
667638
LPtrAData: PByte;
668-
LCV: THashLibUInt32Array;
669-
LCount: Int32;
639+
LCV: array [0 .. 7] of UInt32;
640+
LCVs: array [0 .. 7, 0 .. 7] of UInt32;
641+
LCount, LParDeg, I, LBatchBytes, LNumChunks: Int32;
670642
LPtrCV: PCardinal;
671643
begin
672644
LPtrAData := PByte(AData) + AIndex;
673-
System.SetLength(LCV, 8);
674-
LPtrCV := PCardinal(LCV);
645+
LPtrCV := @LCV[0];
646+
LParDeg := Blake3_ParallelDegree;
647+
648+
// Step 1: Complete any partial chunk to reach a clean boundary
649+
if (FCS.BytesConsumed > 0) and (ADataLength > 0) then
650+
begin
651+
LCount := Min(ChunkSize - FCS.BytesConsumed, ADataLength);
652+
FCS.Update(LPtrAData, LCount);
653+
System.Inc(LPtrAData, LCount);
654+
System.Dec(ADataLength, LCount);
655+
end;
656+
657+
// Flush the completed chunk if there's more data to process
658+
if FCS.Complete() and (ADataLength > 0) then
659+
begin
660+
FCS.Node().ChainingValue(LPtrCV);
661+
AddChunkChainingValue(LPtrCV);
662+
FCS := TBlake3ChunkState.CreateBlake3ChunkState(@FKey[0],
663+
FCS.ChunkCounter() + 1, FFlags);
664+
end;
665+
666+
// Step 2: Process full chunk batches in parallel (4x SSE2, 8x AVX2)
667+
// At this point FCS.BytesConsumed = 0 whenever ADataLength > 0.
668+
// Always leave at least ChunkSize bytes for the sequential path so that
669+
// the last chunk ends up in FCS (required by RootNode's invariant).
670+
// HashMany cascades internally (e.g. AVX2: hash8 -> hash4 -> scalar).
671+
while ADataLength >= 2 * ChunkSize do
672+
begin
673+
LNumChunks := (ADataLength div ChunkSize) - 1;
674+
if LNumChunks > LParDeg then
675+
LNumChunks := LParDeg;
676+
Blake3_HashMany(LPtrAData, @FKey[0], @LCVs[0, 0],
677+
LNumChunks, FCS.ChunkCounter(), FFlags);
678+
for I := 0 to LNumChunks - 1 do
679+
AddChunkChainingValue(@LCVs[I, 0]);
680+
LBatchBytes := LNumChunks * ChunkSize;
681+
FCS := TBlake3ChunkState.CreateBlake3ChunkState(@FKey[0],
682+
FCS.ChunkCounter() + UInt64(LNumChunks), FFlags);
683+
System.Inc(LPtrAData, LBatchBytes);
684+
System.Dec(ADataLength, LBatchBytes);
685+
end;
675686

687+
// Step 3: Process remaining data sequentially
676688
while ADataLength > 0 do
677689
begin
678-
// If the current chunk is complete, finalize it and add it to the tree,
679-
// then reset the chunk state (but keep incrementing the counter across
680-
// chunks).
681690
if FCS.Complete() then
682691
begin
683692
FCS.Node().ChainingValue(LPtrCV);
684-
AddChunkChainingValue(LCV);
685-
FCS := TBlake3ChunkState.CreateBlake3ChunkState(FKey,
693+
AddChunkChainingValue(LPtrCV);
694+
FCS := TBlake3ChunkState.CreateBlake3ChunkState(@FKey[0],
686695
FCS.ChunkCounter() + 1, FFlags);
687696
end;
688-
// Compress input bytes into the current chunk state.
689697
LCount := Min(ChunkSize - FCS.BytesConsumed, ADataLength);
690698
FCS.Update(LPtrAData, LCount);
691699
System.Inc(LPtrAData, LCount);
@@ -709,20 +717,19 @@ class procedure TBlake3.DeriveKey(const ASrcKey, ACtx,
709717
const
710718
derivationIVLen = Int32(32);
711719
var
712-
LIVWords: THashLibUInt32Array;
720+
LIVWords: array [0 .. 7] of UInt32;
713721
LDerivationIV: THashLibByteArray;
714722
LXof: IXOF;
715723
begin
716-
System.SetLength(LIVWords, 8);
717724
System.Move(IV, LIVWords[0], System.SizeOf(IV));
718725
// construct the derivation Hasher and get the DerivationIV
719-
LDerivationIV := (TBlake3.CreateInternal(derivationIVLen, LIVWords,
726+
LDerivationIV := (TBlake3.CreateInternal(derivationIVLen, @LIVWords[0],
720727
FlagDeriveKeyContext) as IHash).ComputeBytes(ACtx).GetBytes();
721-
TConverters.le32_copy(PByte(LDerivationIV), 0, PCardinal(LIVWords), 0,
728+
TConverters.le32_copy(PByte(LDerivationIV), 0, @LIVWords[0], 0,
722729
KeyLengthInBytes);
723730

724731
// derive the SubKey
725-
LXof := TBlake3XOF.Create(32, LIVWords, FlagDeriveKeyMaterial);
732+
LXof := TBlake3XOF.Create(32, @LIVWords[0], FlagDeriveKeyMaterial);
726733
LXof.XOFSizeInBits := System.Length(ASubKey) * 8;
727734
LXof.Initialize;
728735
LXof.TransformBytes(ASrcKey);
@@ -758,10 +765,10 @@ function TBlake3XOF.Clone: IHash;
758765
// Internal Blake3 Cloning
759766
LHashInstance.FCS := FCS.Clone();
760767
LHashInstance.FOutputReader := FOutputReader.Clone();
761-
LHashInstance.FStack := TArrayUtils.Clone(FStack);
768+
System.Move(FStack, LHashInstance.FStack, System.SizeOf(FStack));
762769
LHashInstance.FUsed := FUsed;
763770
LHashInstance.FFlags := FFlags;
764-
LHashInstance.FKey := System.Copy(FKey);
771+
System.Move(FKey, LHashInstance.FKey, System.SizeOf(FKey));
765772

766773
Result := LHashInstance;
767774
Result.BufferSize := BufferSize;
@@ -774,7 +781,7 @@ constructor TBlake3XOF.Create(AHashSize: Int32; const AKey: THashLibByteArray);
774781
end;
775782

776783
constructor TBlake3XOF.Create(AHashSize: Int32;
777-
const AKeyWords: THashLibUInt32Array; AFlags: UInt32);
784+
AKeyWords: PCardinal; AFlags: UInt32);
778785
begin
779786
inherited CreateInternal(AHashSize, AKeyWords, AFlags);
780787
FFinalized := False;

0 commit comments

Comments
 (0)