@@ -88,10 +88,7 @@ TBlake3Node = record
8888 class function DefaultBlake3Node (): TBlake3Node; static;
8989
9090 public
91- class function CreateBlake3Node (const ACV, ABlock: THashLibUInt32Array;
92- ACounter: UInt64; ABlockLen, AFlags: UInt32): TBlake3Node; static;
93-
94- class function ParentNode (const ALeft, ARight, AKey: THashLibUInt32Array;
91+ class function ParentNode (ALeft, ARight, AKey: PCardinal;
9592 AFlags: UInt32): TBlake3Node; static;
9693 end ;
9794
@@ -122,7 +119,7 @@ TBlake3ChunkState = record
122119 class function DefaultBlake3ChunkState (): TBlake3ChunkState; static;
123120
124121 public
125- class function CreateBlake3ChunkState (const AIV: THashLibUInt32Array ;
122+ class function CreateBlake3ChunkState (AIV: PCardinal ;
126123 AChunkCounter: UInt64; AFlags: UInt32): TBlake3ChunkState; static;
127124 end ;
128125
@@ -150,7 +147,7 @@ TBlake3OutputReader = record
150147 function HasSubTreeAtHeight (AIdx: Int32): Boolean; inline;
151148
152149 // AddChunkChainingValue appends a chunk to the right edge of the Merkle tree.
153- procedure AddChunkChainingValue (const ACV: THashLibUInt32Array ); inline;
150+ procedure AddChunkChainingValue (ACV: PCardinal ); inline;
154151
155152 class function Len64 (AValue: UInt64): Int32; static;
156153
@@ -160,12 +157,12 @@ TBlake3OutputReader = record
160157 var
161158 FCS: TBlake3ChunkState;
162159 FOutputReader: TBlake3OutputReader;
163- FKey: THashLibUInt32Array ;
160+ FKey: array [ 0 .. 7 ] of UInt32 ;
164161 FFlags: UInt32;
165162
166163 // log(n) set of Merkle subtree roots, at most one per height.
167164 // stack [54][8]uint32
168- FStack: THashLibMatrixUInt32Array ; // 2^54 * chunkSize = 2^64
165+ FStack: array [ 0 .. 53 , 0 .. 7 ] of UInt32 ; // 2^54 * chunkSize = 2^64
169166 // bit vector indicating which stack elems are valid; also number of chunks added
170167 FUsed: UInt64;
171168
@@ -179,7 +176,7 @@ TBlake3OutputReader = record
179176 const AKey: THashLibByteArray); overload;
180177
181178 constructor CreateInternal(AHashSize: Int32;
182- const AKeyWords: THashLibUInt32Array ; AFlags: UInt32);
179+ AKeyWords: PCardinal ; AFlags: UInt32);
183180
184181 public
185182 constructor Create(AHashSize: THashSize = THashSize.hsHashSize256;
@@ -230,7 +227,7 @@ TBlake3XOF = class sealed(TBlake3, IXOF)
230227 constructor Create(AHashSize: Int32;
231228 const AKey: THashLibByteArray); overload;
232229
233- constructor Create(AHashSize: Int32; const AKeyWords: THashLibUInt32Array ;
230+ constructor Create(AHashSize: Int32; AKeyWords: PCardinal ;
234231 AFlags: UInt32); overload;
235232
236233 procedure Initialize (); override;
@@ -259,19 +256,6 @@ class function TBlake3.TBlake3Node.DefaultBlake3Node: TBlake3Node;
259256 Result.Flags := 0 ;
260257end ;
261258
262- class function TBlake3.TBlake3Node .CreateBlake3Node(const ACV,
263- ABlock: THashLibUInt32Array; ACounter: UInt64; ABlockLen, AFlags: UInt32)
264- : TBlake3Node;
265- begin
266- Result := DefaultBlake3Node();
267- System.Move(ACV[0 ], Result.CV, System.Length(ACV) * System.SizeOf(UInt32));
268- System.Move(ABlock[0 ], Result.Block, System.Length(ABlock) *
269- System.SizeOf(UInt32));
270- Result.Counter := ACounter;
271- Result.BlockLen := ABlockLen;
272- Result.Flags := AFlags;
273- end ;
274-
275259function TBlake3.TBlake3Node .Clone: TBlake3Node;
276260begin
277261 Result := DefaultBlake3Node();
@@ -301,14 +285,16 @@ procedure TBlake3.TBlake3Node.ChainingValue(AResult: PCardinal);
301285 System.Move(LFull, AResult[0 ], 8 * System.SizeOf(UInt32));
302286end ;
303287
// ParentNode builds a TBlake3Node from a left input, a right input and the
// key words, tagged with FlagParent.
// In this hunk the '-' lines are the previous dynamic-array implementation
// (concatenate ALeft and ARight, then delegate to CreateBlake3Node); the '+'
// lines are the pointer-based replacement that fills the node directly.
// New contract: ALeft, ARight and AKey must each point to at least 8 readable
// UInt32 words, because exactly 8 words are copied from each of them.
304- class function TBlake3.TBlake3Node .ParentNode(const ALeft, ARight,
305- AKey: THashLibUInt32Array; AFlags: UInt32): TBlake3Node;
306- var
307- LBlockWords: THashLibUInt32Array;
288+ class function TBlake3.TBlake3Node .ParentNode(ALeft, ARight,
289+ AKey: PCardinal; AFlags: UInt32): TBlake3Node;
308290begin
309- LBlockWords := TArrayUtils.Concatenate(ALeft, ARight);
310- Result := TBlake3Node.CreateBlake3Node(AKey, LBlockWords, 0 , BlockSizeInBytes,
311- AFlags or FlagParent);
// start from the default node, then overwrite every field explicitly
291+ Result := DefaultBlake3Node();
// the 8 key words become the node's CV
292+ System.Move(AKey^, Result.CV[0 ], 8 * System.SizeOf(UInt32));
// left input fills block words 0..7, right input fills block words 8..15
293+ System.Move(ALeft^, Result.Block[0 ], 8 * System.SizeOf(UInt32));
294+ System.Move(ARight^, Result.Block[8 ], 8 * System.SizeOf(UInt32));
// a parent node is always created with counter 0 and a block length of
// BlockSizeInBytes
295+ Result.Counter := 0 ;
296+ Result.BlockLen := BlockSizeInBytes;
// FlagParent is OR-ed into whatever flags the caller supplied
297+ Result.Flags := AFlags or FlagParent;
312298end ;
313299
314300{ TBlake3.TBlake3ChunkState }
@@ -328,12 +314,11 @@ function TBlake3.TBlake3ChunkState.Complete: Boolean;
328314end ;
329315
330316class function TBlake3.TBlake3ChunkState .CreateBlake3ChunkState
331- (const AIV: THashLibUInt32Array ; AChunkCounter: UInt64; AFlags: UInt32)
317+ (AIV: PCardinal ; AChunkCounter: UInt64; AFlags: UInt32)
332318 : TBlake3ChunkState;
333319begin
334320 Result := DefaultBlake3ChunkState;
335- System.Move(AIV[0 ], Result.N.CV[0 ], System.Length(AIV) *
336- System.SizeOf(UInt32));
321+ System.Move(AIV^, Result.N.CV[0 ], 8 * System.SizeOf(UInt32));
337322 Result.N.Counter := AChunkCounter;
338323 Result.N.BlockLen := BlockSizeInBytes;
339324 // compress the first block with the start flag set
@@ -531,30 +516,22 @@ class function TBlake3.TrailingZeros64(AValue: UInt64): Int32;
531516end ;
532517
// CreateInternal is the low-level constructor: it forwards AHashSize and
// BlockSizeInBytes to the inherited constructor, then stores the key words and
// the flags on the instance.
// In this hunk the '-' lines are the previous dynamic-array version; the '+'
// lines are the pointer-based replacement.
// New contract: AKeyWords must point to at least 8 readable UInt32 words;
// exactly 8 words are copied into FKey, so the caller's buffer is not retained.
533518constructor TBlake3.CreateInternal(AHashSize: Int32;
534- const AKeyWords: THashLibUInt32Array; AFlags: UInt32);
535- var
536- LIdx: Int32;
519+ AKeyWords: PCardinal; AFlags: UInt32);
537520begin
538521 inherited Create(AHashSize, BlockSizeInBytes);
539- FKey := System.Copy(AKeyWords );
// copy the 8 key words into the fixed-size FKey field
522+ System.Move(AKeyWords^, FKey[ 0 ], 8 * System.SizeOf(UInt32) );
540523 FFlags := AFlags;
// the removed lines below allocated FStack as 54 rows of 8 words; this change
// declares FStack as a fixed array [0..53, 0..7], so no allocation remains here
541- System.SetLength(FStack, 54 );
542- for LIdx := System.Low(FStack) to System.High(FStack) do
543- begin
544- System.SetLength(FStack[LIdx], 8 );
545- end ;
546524end ;
547525
548526constructor TBlake3.Create(AHashSize: Int32; const AKey: THashLibByteArray);
549527var
550- LKeyWords: THashLibUInt32Array ;
528+ LKeyWords: array [ 0 .. 7 ] of UInt32 ;
551529 LKeyLength: Int32;
552530begin
553- System.SetLength(LKeyWords, 8 );
554531 if AKey = nil then
555532 begin
556533 System.Move(IV, LKeyWords[0 ], System.SizeOf(IV));
557- CreateInternal(AHashSize, LKeyWords, 0 );
534+ CreateInternal(AHashSize, @ LKeyWords[ 0 ] , 0 );
558535 end
559536 else
560537 begin
@@ -564,8 +541,8 @@ constructor TBlake3.Create(AHashSize: Int32; const AKey: THashLibByteArray);
564541 raise EArgumentOutOfRangeHashLibException.CreateResFmt(@SInvalidKeyLength,
565542 [KeyLengthInBytes, LKeyLength]);
566543 end ;
567- TConverters.le32_copy(PByte(AKey), 0 , PCardinal( LKeyWords) , 0 , LKeyLength);
568- CreateInternal(AHashSize, LKeyWords, FlagKeyedHash);
544+ TConverters.le32_copy(PByte(AKey), 0 , @ LKeyWords[ 0 ] , 0 , LKeyLength);
545+ CreateInternal(AHashSize, @ LKeyWords[ 0 ] , FlagKeyedHash);
569546 end ;
570547end ;
571548
@@ -576,9 +553,9 @@ constructor TBlake3.Create(AHashSize: THashSize; const AKey: THashLibByteArray);
576553
// Initialize resets the hashing state held by this instance: the current
// chunk state, the output reader, the subtree stack and the FUsed counter.
// FKey and FFlags are read but not modified.
// In this hunk the '-' lines are the previous dynamic-array calls and the '+'
// lines are their fixed-array / pointer-based replacements.
577554procedure TBlake3.Initialize ;
578555begin
579- FCS := TBlake3ChunkState.CreateBlake3ChunkState(FKey, 0 , FFlags);
// fresh chunk state seeded with the key words, chunk counter 0
556+ FCS := TBlake3ChunkState.CreateBlake3ChunkState(@ FKey[ 0 ] , 0 , FFlags);
580557 FOutputReader := TBlake3OutputReader.DefaultBlake3OutputReader();
581- TArrayUtils.ZeroFill (FStack);
// FStack is a fixed-size array in the new version, so the whole field is
// zeroed in one call using its SizeOf
558+ FillChar(FStack, System.SizeOf (FStack), 0 );
// no stack slots are in use
582559 FUsed := 0 ;
583560end ;
584561
@@ -592,10 +569,10 @@ function TBlake3.Clone: IHash;
592569var
593570 LHashInstance: TBlake3;
594571begin
595- LHashInstance := TBlake3.CreateInternal(HashSize, FKey, FFlags);
572+ LHashInstance := TBlake3.CreateInternal(HashSize, @ FKey[ 0 ] , FFlags);
596573 LHashInstance.FCS := FCS.Clone();
597574 LHashInstance.FOutputReader := FOutputReader.Clone();
598- LHashInstance.FStack := TArrayUtils.Clone (FStack);
575+ System.Move(FStack, LHashInstance.FStack, System.SizeOf (FStack) );
599576 LHashInstance.FUsed := FUsed;
600577 Result := LHashInstance;
601578 Result.BufferSize := BufferSize;
@@ -616,37 +593,30 @@ function TBlake3.HasSubTreeAtHeight(AIdx: Int32): Boolean;
616593 Result := (FUsed and (1 shl AIdx)) <> 0 ;
617594end ;
618595
// AddChunkChainingValue pushes one 8-word chaining value onto the subtree
// stack. While the slot at the current height is occupied, the stored value
// and the incoming value are combined through ParentNode and the search moves
// one height up; the final value is stored in the first free slot.
// In this hunk the '-' lines are the previous dynamic-array version; the '+'
// lines are the pointer-based replacement.
// New contract: ACV must point to 8 readable AND writable UInt32 words. The
// buffer is used as scratch space: each merge writes its result back through
// ACV (ChainingValue copies 8 words to the pointer it is given), so the
// caller's buffer contents are overwritten whenever a merge happens.
619- procedure TBlake3.AddChunkChainingValue (const ACV: THashLibUInt32Array );
596+ procedure TBlake3.AddChunkChainingValue (ACV: PCardinal );
620597var
621598 LIdx: Int32;
622- LFlags: UInt32;
623- LKey: THashLibUInt32Array;
624- LPtrCV: PCardinal;
625599begin
626- LKey := FKey;
627- LFlags := FFlags;
628- LPtrCV := PCardinal(ACV);
629600 // seek to first open stack slot, merging subtrees as we go
630601 LIdx := 0 ;
// NOTE(review): HasSubTreeAtHeight tests FUsed with `1 shl AIdx`; confirm the
// shift is evaluated in 64 bits on every target compiler once AIdx >= 32.
631602 while HasSubTreeAtHeight(LIdx) do
632603 begin
633- TBlake3Node.ParentNode(FStack[LIdx], ACV, LKey, LFlags )
634- .ChainingValue(LPtrCV );
// ParentNode copies both inputs into its own block before ChainingValue
// writes to ACV, so using ACV as input and output of the same statement is safe
604+ TBlake3Node.ParentNode(@ FStack[LIdx, 0 ], ACV, @FKey[ 0 ], FFlags )
605+ .ChainingValue(ACV );
635606 System.Inc(LIdx);
636607 end ;
637- FStack[LIdx] := System.Copy(ACV );
// store the (possibly merged) value in the first free slot
608+ System.Move(ACV^, FStack[LIdx, 0 ], 8 * System.SizeOf(UInt32) );
// FUsed doubles as the occupancy bit vector and the count of chunks added
// (see the field comment), so a plain increment updates both
638609 System.Inc(FUsed);
639610end ;
640611
641612function TBlake3.RootNode : TBlake3Node;
642613var
643614 LIdx, LTrailingZeros64, LLen64: Int32;
644- LTemp: THashLibUInt32Array ;
615+ LTemp: array [ 0 .. 7 ] of UInt32 ;
645616 LPtrTemp: PCardinal;
646617begin
647618 Result := FCS.Node();
648- System.SetLength(LTemp, 8 );
649- LPtrTemp := PCardinal(LTemp);
619+ LPtrTemp := @LTemp[0 ];
650620
651621 LTrailingZeros64 := TrailingZeros64(FUsed);
652622 LLen64 := Len64(FUsed);
@@ -655,7 +625,8 @@ function TBlake3.RootNode: TBlake3Node;
655625 if HasSubTreeAtHeight(LIdx) then
656626 begin
657627 Result.ChainingValue(LPtrTemp);
658- Result := TBlake3Node.ParentNode(FStack[LIdx], LTemp, FKey, FFlags);
628+ Result := TBlake3Node.ParentNode(@FStack[LIdx, 0 ], LPtrTemp,
629+ @FKey[0 ], FFlags);
659630 end ;
660631 end ;
661632 Result.Flags := Result.Flags or FlagRoot;
@@ -665,27 +636,64 @@ procedure TBlake3.TransformBytes(const AData: THashLibByteArray;
665636 AIndex, ADataLength: Int32);
666637var
667638 LPtrAData: PByte;
668- LCV: THashLibUInt32Array;
669- LCount: Int32;
639+ LCV: array [0 .. 7 ] of UInt32;
640+ LCVs: array [0 .. 7 , 0 .. 7 ] of UInt32;
641+ LCount, LParDeg, I, LBatchBytes, LNumChunks: Int32;
670642 LPtrCV: PCardinal;
671643begin
672644 LPtrAData := PByte(AData) + AIndex;
673- System.SetLength(LCV, 8 );
674- LPtrCV := PCardinal(LCV);
645+ LPtrCV := @LCV[0 ];
646+ LParDeg := Blake3_ParallelDegree;
647+
648+ // Step 1: Complete any partial chunk to reach a clean boundary
649+ if (FCS.BytesConsumed > 0 ) and (ADataLength > 0 ) then
650+ begin
651+ LCount := Min(ChunkSize - FCS.BytesConsumed, ADataLength);
652+ FCS.Update(LPtrAData, LCount);
653+ System.Inc(LPtrAData, LCount);
654+ System.Dec(ADataLength, LCount);
655+ end ;
656+
657+ // Flush the completed chunk if there's more data to process
658+ if FCS.Complete() and (ADataLength > 0 ) then
659+ begin
660+ FCS.Node().ChainingValue(LPtrCV);
661+ AddChunkChainingValue(LPtrCV);
662+ FCS := TBlake3ChunkState.CreateBlake3ChunkState(@FKey[0 ],
663+ FCS.ChunkCounter() + 1 , FFlags);
664+ end ;
665+
666+ // Step 2: Process full chunk batches in parallel (4x SSE2, 8x AVX2)
667+ // At this point FCS.BytesConsumed = 0 whenever ADataLength > 0.
668+ // Always leave at least ChunkSize bytes for the sequential path so that
669+ // the last chunk ends up in FCS (required by RootNode's invariant).
670+ // HashMany cascades internally (e.g. AVX2: hash8 -> hash4 -> scalar).
671+ while ADataLength >= 2 * ChunkSize do
672+ begin
673+ LNumChunks := (ADataLength div ChunkSize) - 1 ;
674+ if LNumChunks > LParDeg then
675+ LNumChunks := LParDeg;
676+ Blake3_HashMany(LPtrAData, @FKey[0 ], @LCVs[0 , 0 ],
677+ LNumChunks, FCS.ChunkCounter(), FFlags);
678+ for I := 0 to LNumChunks - 1 do
679+ AddChunkChainingValue(@LCVs[I, 0 ]);
680+ LBatchBytes := LNumChunks * ChunkSize;
681+ FCS := TBlake3ChunkState.CreateBlake3ChunkState(@FKey[0 ],
682+ FCS.ChunkCounter() + UInt64(LNumChunks), FFlags);
683+ System.Inc(LPtrAData, LBatchBytes);
684+ System.Dec(ADataLength, LBatchBytes);
685+ end ;
675686
687+ // Step 3: Process remaining data sequentially
676688 while ADataLength > 0 do
677689 begin
678- // If the current chunk is complete, finalize it and add it to the tree,
679- // then reset the chunk state (but keep incrementing the counter across
680- // chunks).
681690 if FCS.Complete() then
682691 begin
683692 FCS.Node().ChainingValue(LPtrCV);
684- AddChunkChainingValue(LCV );
685- FCS := TBlake3ChunkState.CreateBlake3ChunkState(FKey,
693+ AddChunkChainingValue(LPtrCV );
694+ FCS := TBlake3ChunkState.CreateBlake3ChunkState(@ FKey[ 0 ] ,
686695 FCS.ChunkCounter() + 1 , FFlags);
687696 end ;
688- // Compress input bytes into the current chunk state.
689697 LCount := Min(ChunkSize - FCS.BytesConsumed, ADataLength);
690698 FCS.Update(LPtrAData, LCount);
691699 System.Inc(LPtrAData, LCount);
@@ -709,20 +717,19 @@ class procedure TBlake3.DeriveKey(const ASrcKey, ACtx,
709717const
710718 derivationIVLen = Int32(32 );
711719var
712- LIVWords: THashLibUInt32Array ;
720+ LIVWords: array [ 0 .. 7 ] of UInt32 ;
713721 LDerivationIV: THashLibByteArray;
714722 LXof: IXOF;
715723begin
716- System.SetLength(LIVWords, 8 );
717724 System.Move(IV, LIVWords[0 ], System.SizeOf(IV));
718725 // construct the derivation Hasher and get the DerivationIV
719- LDerivationIV := (TBlake3.CreateInternal(derivationIVLen, LIVWords,
726+ LDerivationIV := (TBlake3.CreateInternal(derivationIVLen, @ LIVWords[ 0 ] ,
720727 FlagDeriveKeyContext) as IHash).ComputeBytes(ACtx).GetBytes();
721- TConverters.le32_copy(PByte(LDerivationIV), 0 , PCardinal( LIVWords) , 0 ,
728+ TConverters.le32_copy(PByte(LDerivationIV), 0 , @ LIVWords[ 0 ] , 0 ,
722729 KeyLengthInBytes);
723730
724731 // derive the SubKey
725- LXof := TBlake3XOF.Create(32 , LIVWords, FlagDeriveKeyMaterial);
732+ LXof := TBlake3XOF.Create(32 , @ LIVWords[ 0 ] , FlagDeriveKeyMaterial);
726733 LXof.XOFSizeInBits := System.Length(ASubKey) * 8 ;
727734 LXof.Initialize;
728735 LXof.TransformBytes(ASrcKey);
@@ -758,10 +765,10 @@ function TBlake3XOF.Clone: IHash;
758765 // Internal Blake3 Cloning
759766 LHashInstance.FCS := FCS.Clone();
760767 LHashInstance.FOutputReader := FOutputReader.Clone();
761- LHashInstance.FStack := TArrayUtils.Clone (FStack);
768+ System.Move(FStack, LHashInstance.FStack, System.SizeOf (FStack) );
762769 LHashInstance.FUsed := FUsed;
763770 LHashInstance.FFlags := FFlags;
764- LHashInstance.FKey := System.Copy (FKey);
771+ System.Move(FKey, LHashInstance.FKey, System.SizeOf (FKey) );
765772
766773 Result := LHashInstance;
767774 Result.BufferSize := BufferSize;
@@ -774,7 +781,7 @@ constructor TBlake3XOF.Create(AHashSize: Int32; const AKey: THashLibByteArray);
774781end ;
775782
776783constructor TBlake3XOF.Create(AHashSize: Int32;
777- const AKeyWords: THashLibUInt32Array ; AFlags: UInt32);
784+ AKeyWords: PCardinal ; AFlags: UInt32);
778785begin
779786 inherited CreateInternal(AHashSize, AKeyWords, AFlags);
780787 FFinalized := False;
0 commit comments