@@ -158,10 +158,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
158158// Given some input larger than one chunk, return the number of bytes that
159159// should go in the left subtree. This is the largest power-of-2 number of
160160// chunks that leaves at least 1 byte for the right subtree.
161- INLINE size_t left_len (size_t content_len ) {
162- // Subtract 1 to reserve at least one byte for the right side. content_len
161+ INLINE size_t left_subtree_len (size_t input_len ) {
162+ // Subtract 1 to reserve at least one byte for the right side. input_len
163163 // should always be greater than BLAKE3_CHUNK_LEN.
164- size_t full_chunks = (content_len - 1 ) / BLAKE3_CHUNK_LEN ;
164+ size_t full_chunks = (input_len - 1 ) / BLAKE3_CHUNK_LEN ;
165165 return round_down_to_power_of_2 (full_chunks ) * BLAKE3_CHUNK_LEN ;
166166}
167167
@@ -265,11 +265,10 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
265265// Why not just have the caller split the input on the first update(), instead
266266// of implementing this special rule? Because we don't want to limit SIMD or
267267// multi-threading parallelism for that update().
268- static size_t blake3_compress_subtree_wide (const uint8_t * input ,
269- size_t input_len ,
270- const uint32_t key [8 ],
271- uint64_t chunk_counter ,
272- uint8_t flags , uint8_t * out ) {
268+ size_t blake3_compress_subtree_wide (const uint8_t * input , size_t input_len ,
269+ const uint32_t key [8 ],
270+ uint64_t chunk_counter , uint8_t flags ,
271+ uint8_t * out , bool use_tbb ) {
273272 // Note that the single chunk case does *not* bump the SIMD degree up to 2
274273 // when it is 1. If this implementation adds multi-threading in the future,
275274 // this gives us the option of multi-threading even the 2-chunk case, which
@@ -283,7 +282,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
283282 // the input into left and right subtrees. (Note that this is only optimal
284283 // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
285284 // of 3 or something, we'll need a more complicated strategy.)
286- size_t left_input_len = left_len (input_len );
285+ size_t left_input_len = left_subtree_len (input_len );
287286 size_t right_input_len = input_len - left_input_len ;
288287 const uint8_t * right_input = & input [left_input_len ];
289288 uint64_t right_chunk_counter =
@@ -303,12 +302,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
303302 }
304303 uint8_t * right_cvs = & cv_array [degree * BLAKE3_OUT_LEN ];
305304
306- // Recurse! If this implementation adds multi-threading support in the
307- // future, this is where it will go.
308- size_t left_n = blake3_compress_subtree_wide (input , left_input_len , key ,
309- chunk_counter , flags , cv_array );
310- size_t right_n = blake3_compress_subtree_wide (
311- right_input , right_input_len , key , right_chunk_counter , flags , right_cvs );
305+ // Recurse!
306+ size_t left_n = -1 ;
307+ size_t right_n = -1 ;
308+
309+ #if defined(BLAKE3_USE_TBB )
310+ blake3_compress_subtree_wide_join_tbb (
311+ key , flags , use_tbb ,
312+ // left-hand side
313+ input , left_input_len , chunk_counter , cv_array , & left_n ,
314+ // right-hand side
315+ right_input , right_input_len , right_chunk_counter , right_cvs , & right_n );
316+ #else
317+ left_n = blake3_compress_subtree_wide (
318+ input , left_input_len , key , chunk_counter , flags , cv_array , use_tbb );
319+ right_n = blake3_compress_subtree_wide (right_input , right_input_len , key ,
320+ right_chunk_counter , flags , right_cvs ,
321+ use_tbb );
322+ #endif // BLAKE3_USE_TBB
312323
313324 // The special case again. If simd_degree=1, then we'll have left_n=1 and
314325 // right_n=1. Rather than compressing them into a single output, return
@@ -334,16 +345,18 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
334345//
335346// As with compress_subtree_wide(), this function is not used on inputs of 1
336347// chunk or less. That's a different codepath.
337- INLINE void compress_subtree_to_parent_node (
338- const uint8_t * input , size_t input_len , const uint32_t key [8 ],
339- uint64_t chunk_counter , uint8_t flags , uint8_t out [2 * BLAKE3_OUT_LEN ]) {
348+ INLINE void
349+ compress_subtree_to_parent_node (const uint8_t * input , size_t input_len ,
350+ const uint32_t key [8 ], uint64_t chunk_counter ,
351+ uint8_t flags , uint8_t out [2 * BLAKE3_OUT_LEN ],
352+ bool use_tbb ) {
340353#if defined(BLAKE3_TESTING )
341354 assert (input_len > BLAKE3_CHUNK_LEN );
342355#endif
343356
344357 uint8_t cv_array [MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN ];
345358 size_t num_cvs = blake3_compress_subtree_wide (input , input_len , key ,
346- chunk_counter , flags , cv_array );
359+ chunk_counter , flags , cv_array , use_tbb );
347360 assert (num_cvs <= MAX_SIMD_DEGREE_OR_2 );
348361 // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
349362 // as we just asserted, num_cvs will always be <=2 in that case. But GCC
@@ -459,8 +472,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
459472 self -> cv_stack_len += 1 ;
460473}
461474
462- void blake3_hasher_update (blake3_hasher * self , const void * input ,
463- size_t input_len ) {
475+ INLINE void blake3_hasher_update_base (blake3_hasher * self , const void * input ,
476+ size_t input_len , bool use_tbb ) {
464477 // Explicitly checking for zero avoids causing UB by passing a null pointer
465478 // to memcpy. This comes up in practice with things like:
466479 // std::vector<uint8_t> v;
@@ -546,7 +559,7 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
546559 uint8_t cv_pair [2 * BLAKE3_OUT_LEN ];
547560 compress_subtree_to_parent_node (input_bytes , subtree_len , self -> key ,
548561 self -> chunk .chunk_counter ,
549- self -> chunk .flags , cv_pair );
562+ self -> chunk .flags , cv_pair , use_tbb );
550563 hasher_push_cv (self , cv_pair , self -> chunk .chunk_counter );
551564 hasher_push_cv (self , & cv_pair [BLAKE3_OUT_LEN ],
552565 self -> chunk .chunk_counter + (subtree_chunks / 2 ));
@@ -568,6 +581,20 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
568581 }
569582}
570583
584+ void blake3_hasher_update (blake3_hasher * self , const void * input ,
585+ size_t input_len ) {
586+ bool use_tbb = false;
587+ blake3_hasher_update_base (self , input , input_len , use_tbb );
588+ }
589+
#if defined(BLAKE3_USE_TBB)
// Counterpart of blake3_hasher_update() that is only compiled when
// BLAKE3_USE_TBB is defined: forwards its arguments unchanged to
// blake3_hasher_update_base() with use_tbb set to true.
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
                              size_t input_len) {
  blake3_hasher_update_base(self, input, input_len, /*use_tbb=*/true);
}
#endif // BLAKE3_USE_TBB
597+
571598void blake3_hasher_finalize (const blake3_hasher * self , uint8_t * out ,
572599 size_t out_len ) {
573600 blake3_hasher_finalize_seek (self , 0 , out , out_len );
0 commit comments