Skip to content

Commit 7b79e3a

Browse files
divinity76 authored and mvorisek committed
Sync upstream blake3 (latest 1.8.3 release)
Also, BLAKE3-team/BLAKE3#382 and BLAKE3-team/BLAKE3#443 have both been accepted upstream, so we can cleanly use the upstream BLAKE3 sources without PHP-specific patches, at least for now :)
1 parent f70934f commit 7b79e3a

File tree

5 files changed

+91
-34
lines changed

5 files changed

+91
-34
lines changed

ext/hash/blake3/upstream_blake3/c/blake3.c

Lines changed: 49 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -158,10 +158,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
158158
// Given some input larger than one chunk, return the number of bytes that
159159
// should go in the left subtree. This is the largest power-of-2 number of
160160
// chunks that leaves at least 1 byte for the right subtree.
161-
INLINE size_t left_len(size_t content_len) {
162-
// Subtract 1 to reserve at least one byte for the right side. content_len
161+
INLINE size_t left_subtree_len(size_t input_len) {
162+
// Subtract 1 to reserve at least one byte for the right side. input_len
163163
// should always be greater than BLAKE3_CHUNK_LEN.
164-
size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
164+
size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
165165
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
166166
}
167167

@@ -265,11 +265,10 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
265265
// Why not just have the caller split the input on the first update(), instead
266266
// of implementing this special rule? Because we don't want to limit SIMD or
267267
// multi-threading parallelism for that update().
268-
static size_t blake3_compress_subtree_wide(const uint8_t *input,
269-
size_t input_len,
270-
const uint32_t key[8],
271-
uint64_t chunk_counter,
272-
uint8_t flags, uint8_t *out) {
268+
size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
269+
const uint32_t key[8],
270+
uint64_t chunk_counter, uint8_t flags,
271+
uint8_t *out, bool use_tbb) {
273272
// Note that the single chunk case does *not* bump the SIMD degree up to 2
274273
// when it is 1. If this implementation adds multi-threading in the future,
275274
// this gives us the option of multi-threading even the 2-chunk case, which
@@ -283,7 +282,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
283282
// the input into left and right subtrees. (Note that this is only optimal
284283
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
285284
// of 3 or something, we'll need a more complicated strategy.)
286-
size_t left_input_len = left_len(input_len);
285+
size_t left_input_len = left_subtree_len(input_len);
287286
size_t right_input_len = input_len - left_input_len;
288287
const uint8_t *right_input = &input[left_input_len];
289288
uint64_t right_chunk_counter =
@@ -303,12 +302,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
303302
}
304303
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
305304

306-
// Recurse! If this implementation adds multi-threading support in the
307-
// future, this is where it will go.
308-
size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
309-
chunk_counter, flags, cv_array);
310-
size_t right_n = blake3_compress_subtree_wide(
311-
right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
305+
// Recurse!
306+
size_t left_n = -1;
307+
size_t right_n = -1;
308+
309+
#if defined(BLAKE3_USE_TBB)
310+
blake3_compress_subtree_wide_join_tbb(
311+
key, flags, use_tbb,
312+
// left-hand side
313+
input, left_input_len, chunk_counter, cv_array, &left_n,
314+
// right-hand side
315+
right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
316+
#else
317+
left_n = blake3_compress_subtree_wide(
318+
input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
319+
right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
320+
right_chunk_counter, flags, right_cvs,
321+
use_tbb);
322+
#endif // BLAKE3_USE_TBB
312323

313324
// The special case again. If simd_degree=1, then we'll have left_n=1 and
314325
// right_n=1. Rather than compressing them into a single output, return
@@ -334,16 +345,18 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
334345
//
335346
// As with compress_subtree_wide(), this function is not used on inputs of 1
336347
// chunk or less. That's a different codepath.
337-
INLINE void compress_subtree_to_parent_node(
338-
const uint8_t *input, size_t input_len, const uint32_t key[8],
339-
uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
348+
INLINE void
349+
compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
350+
const uint32_t key[8], uint64_t chunk_counter,
351+
uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
352+
bool use_tbb) {
340353
#if defined(BLAKE3_TESTING)
341354
assert(input_len > BLAKE3_CHUNK_LEN);
342355
#endif
343356

344357
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
345358
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
346-
chunk_counter, flags, cv_array);
359+
chunk_counter, flags, cv_array, use_tbb);
347360
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
348361
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
349362
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
@@ -459,8 +472,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
459472
self->cv_stack_len += 1;
460473
}
461474

462-
void blake3_hasher_update(blake3_hasher *self, const void *input,
463-
size_t input_len) {
475+
INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
476+
size_t input_len, bool use_tbb) {
464477
// Explicitly checking for zero avoids causing UB by passing a null pointer
465478
// to memcpy. This comes up in practice with things like:
466479
// std::vector<uint8_t> v;
@@ -546,7 +559,7 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
546559
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
547560
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
548561
self->chunk.chunk_counter,
549-
self->chunk.flags, cv_pair);
562+
self->chunk.flags, cv_pair, use_tbb);
550563
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
551564
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
552565
self->chunk.chunk_counter + (subtree_chunks / 2));
@@ -568,6 +581,20 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
568581
}
569582
}
570583

584+
void blake3_hasher_update(blake3_hasher *self, const void *input,
585+
size_t input_len) {
586+
bool use_tbb = false;
587+
blake3_hasher_update_base(self, input, input_len, use_tbb);
588+
}
589+
590+
#if defined(BLAKE3_USE_TBB)
591+
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
592+
size_t input_len) {
593+
bool use_tbb = true;
594+
blake3_hasher_update_base(self, input, input_len, use_tbb);
595+
}
596+
#endif // BLAKE3_USE_TBB
597+
571598
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
572599
size_t out_len) {
573600
blake3_hasher_finalize_seek(self, 0, out, out_len);

ext/hash/blake3/upstream_blake3/c/blake3.h

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,21 +30,20 @@
3030
extern "C" {
3131
#endif
3232

33-
#define BLAKE3_VERSION_STRING "1.5.5"
33+
#define BLAKE3_VERSION_STRING "1.8.3"
3434
#define BLAKE3_KEY_LEN 32
3535
#define BLAKE3_OUT_LEN 32
3636
#define BLAKE3_BLOCK_LEN 64
3737
#define BLAKE3_CHUNK_LEN 1024
3838
#define BLAKE3_MAX_DEPTH 54
3939

4040
// This struct is a private implementation detail. It has to be here because
41-
// it's part of blake3_hasher below.
41+
// it's part of the blake3_hasher structure defined below.
4242
typedef struct {
4343
uint32_t cv[8];
4444
uint64_t chunk_counter;
4545
uint8_t buf[BLAKE3_BLOCK_LEN];
4646
uint8_t buf_len;
47-
uint8_t padding_1[5];
4847
uint8_t blocks_compressed;
4948
uint8_t flags;
5049
} blake3_chunk_state;
@@ -59,7 +58,6 @@ typedef struct {
5958
// don't know whether more input is coming. This is different from how the
6059
// reference implementation does things.
6160
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
62-
uint8_t padding_2[7];
6361
} blake3_hasher;
6462

6563
BLAKE3_API const char *blake3_version(void);
@@ -71,6 +69,10 @@ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const voi
7169
size_t context_len);
7270
BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
7371
size_t input_len);
72+
#if defined(BLAKE3_USE_TBB)
73+
BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
74+
size_t input_len);
75+
#endif // BLAKE3_USE_TBB
7476
BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
7577
size_t out_len);
7678
BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,

ext/hash/blake3/upstream_blake3/c/blake3_dispatch.c

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -163,7 +163,6 @@ static
163163
#endif
164164
}
165165
}
166-
// https://github.com/BLAKE3-team/BLAKE3/pull/382
167166
#endif
168167

169168
void blake3_compress_in_place(uint32_t cv[8],
@@ -235,9 +234,8 @@ void blake3_xof_many(const uint32_t cv[8],
235234
}
236235
#if defined(IS_X86)
237236
const enum cpu_feature features = get_cpu_features();
238-
// https://github.com/BLAKE3-team/BLAKE3/pull/443
239237
MAYBE_UNUSED(features);
240-
#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
238+
#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512)
241239
if (features & AVX512VL) {
242240
blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
243241
return;

ext/hash/blake3/upstream_blake3/c/blake3_impl.h

Lines changed: 30 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,10 @@
99

1010
#include "blake3.h"
1111

12+
#ifdef __cplusplus
13+
extern "C" {
14+
#endif
15+
1216
// internal flags
1317
enum blake3_flags {
1418
CHUNK_START = 1 << 0,
@@ -28,6 +32,12 @@ enum blake3_flags {
2832
#define INLINE static inline __attribute__((always_inline))
2933
#endif
3034

35+
#ifdef __cplusplus
36+
#define NOEXCEPT noexcept
37+
#else
38+
#define NOEXCEPT
39+
#endif
40+
3141
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
3242
#define IS_X86
3343
#define IS_X86_64
@@ -210,6 +220,22 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
210220

211221
size_t blake3_simd_degree(void);
212222

223+
BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
224+
const uint32_t key[8],
225+
uint64_t chunk_counter, uint8_t flags,
226+
uint8_t *out, bool use_tbb);
227+
228+
#if defined(BLAKE3_USE_TBB)
229+
BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
230+
// shared params
231+
const uint32_t key[8], uint8_t flags, bool use_tbb,
232+
// left-hand side params
233+
const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
234+
uint8_t *l_cvs, size_t *l_n,
235+
// right-hand side params
236+
const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
237+
uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
238+
#endif
213239

214240
// Declarations for implementation-specific functions.
215241
void blake3_compress_in_place_portable(uint32_t cv[8],
@@ -283,7 +309,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
283309
uint8_t flags, uint8_t flags_start,
284310
uint8_t flags_end, uint8_t *out);
285311

286-
#if !defined(_WIN32)
312+
#if !defined(_WIN32) && !defined(__CYGWIN__)
287313
void blake3_xof_many_avx512(const uint32_t cv[8],
288314
const uint8_t block[BLAKE3_BLOCK_LEN],
289315
uint8_t block_len, uint64_t counter, uint8_t flags,
@@ -300,5 +326,8 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
300326
uint8_t flags_end, uint8_t *out);
301327
#endif
302328

329+
#ifdef __cplusplus
330+
}
331+
#endif
303332

304333
#endif /* BLAKE3_IMPL_H */

ext/hash/blake3/upstream_blake3/c/blake3_neon.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -243,10 +243,11 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
243243
counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
244244
}
245245

246-
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
247-
const uint32_t key[8], uint64_t counter,
248-
bool increment_counter, uint8_t flags,
249-
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
246+
static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
247+
const uint32_t key[8], uint64_t counter,
248+
bool increment_counter, uint8_t flags,
249+
uint8_t flags_start, uint8_t flags_end,
250+
uint8_t *out) {
250251
uint32x4_t h_vecs[8] = {
251252
set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
252253
set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),

0 commit comments

Comments
 (0)