@@ -95,24 +95,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
95
95
96
96
INLINE void output_root_bytes (const output_t * self , uint64_t seek , uint8_t * out ,
97
97
size_t out_len ) {
98
+ if (out_len == 0 ) {
99
+ return ;
100
+ }
98
101
uint64_t output_block_counter = seek / 64 ;
99
102
size_t offset_within_block = seek % 64 ;
100
103
uint8_t wide_buf [64 ];
101
- while (out_len > 0 ) {
102
- blake3_compress_xof (self -> input_cv , self -> block , self -> block_len ,
103
- output_block_counter , self -> flags | ROOT , wide_buf );
104
- size_t available_bytes = 64 - offset_within_block ;
105
- size_t memcpy_len ;
106
- if (out_len > available_bytes ) {
107
- memcpy_len = available_bytes ;
108
- } else {
109
- memcpy_len = out_len ;
110
- }
111
- memcpy (out , wide_buf + offset_within_block , memcpy_len );
112
- out += memcpy_len ;
113
- out_len -= memcpy_len ;
104
+ if (offset_within_block ) {
105
+ blake3_compress_xof (self -> input_cv , self -> block , self -> block_len , output_block_counter , self -> flags | ROOT , wide_buf );
106
+ const size_t available_bytes = 64 - offset_within_block ;
107
+ const size_t bytes = out_len > available_bytes ? available_bytes : out_len ;
108
+ memcpy (out , wide_buf + offset_within_block , bytes );
109
+ out += bytes ;
110
+ out_len -= bytes ;
114
111
output_block_counter += 1 ;
115
- offset_within_block = 0 ;
112
+ }
113
+ if (out_len / 64 ) {
114
+ blake3_xof_many (self -> input_cv , self -> block , self -> block_len , output_block_counter , self -> flags | ROOT , out , out_len / 64 );
115
+ }
116
+ output_block_counter += out_len / 64 ;
117
+ out += out_len & -64 ;
118
+ out_len -= out_len & -64 ;
119
+ if (out_len ) {
120
+ blake3_compress_xof (self -> input_cv , self -> block , self -> block_len , output_block_counter , self -> flags | ROOT , wide_buf );
121
+ memcpy (out , wide_buf , out_len );
116
122
}
117
123
}
118
124
@@ -159,10 +165,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
159
165
// Given some input larger than one chunk, return the number of bytes that
160
166
// should go in the left subtree. This is the largest power-of-2 number of
161
167
// chunks that leaves at least 1 byte for the right subtree.
162
- INLINE size_t left_len (size_t content_len ) {
163
- // Subtract 1 to reserve at least one byte for the right side. content_len
168
+ INLINE size_t left_subtree_len (size_t input_len ) {
169
+ // Subtract 1 to reserve at least one byte for the right side. input_len
164
170
// should always be greater than BLAKE3_CHUNK_LEN.
165
- size_t full_chunks = (content_len - 1 ) / BLAKE3_CHUNK_LEN ;
171
+ size_t full_chunks = (input_len - 1 ) / BLAKE3_CHUNK_LEN ;
166
172
return round_down_to_power_of_2 (full_chunks ) * BLAKE3_CHUNK_LEN ;
167
173
}
168
174
@@ -251,26 +257,25 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
251
257
252
258
// The wide helper function returns (writes out) an array of chaining values
253
259
// and returns the length of that array. The number of chaining values returned
254
- // is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
260
+ // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
255
261
// if the input is shorter than that many chunks. The reason for maintaining a
256
262
// wide array of chaining values going back up the tree, is to allow the
257
263
// implementation to hash as many parents in parallel as possible.
258
264
//
259
265
// As a special case when the SIMD degree is 1, this function will still return
260
266
// at least 2 outputs. This guarantees that this function doesn't perform the
261
267
// root compression. (If it did, it would use the wrong flags, and also we
262
- // wouldn't be able to implement exendable ouput .) Note that this function is
268
+ // wouldn't be able to implement extendable output .) Note that this function is
263
269
// not used when the whole input is only 1 chunk long; that's a different
264
270
// codepath.
265
271
//
266
272
// Why not just have the caller split the input on the first update(), instead
267
273
// of implementing this special rule? Because we don't want to limit SIMD or
268
274
// multi-threading parallelism for that update().
269
- static size_t blake3_compress_subtree_wide (const uint8_t * input ,
270
- size_t input_len ,
271
- const uint32_t key [8 ],
272
- uint64_t chunk_counter ,
273
- uint8_t flags , uint8_t * out ) {
275
+ size_t blake3_compress_subtree_wide (const uint8_t * input , size_t input_len ,
276
+ const uint32_t key [8 ],
277
+ uint64_t chunk_counter , uint8_t flags ,
278
+ uint8_t * out , bool use_tbb ) {
274
279
// Note that the single chunk case does *not* bump the SIMD degree up to 2
275
280
// when it is 1. If this implementation adds multi-threading in the future,
276
281
// this gives us the option of multi-threading even the 2-chunk case, which
@@ -284,7 +289,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
284
289
// the input into left and right subtrees. (Note that this is only optimal
285
290
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
286
291
// of 3 or something, we'll need a more complicated strategy.)
287
- size_t left_input_len = left_len (input_len );
292
+ size_t left_input_len = left_subtree_len (input_len );
288
293
size_t right_input_len = input_len - left_input_len ;
289
294
const uint8_t * right_input = & input [left_input_len ];
290
295
uint64_t right_chunk_counter =
@@ -304,12 +309,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
304
309
}
305
310
uint8_t * right_cvs = & cv_array [degree * BLAKE3_OUT_LEN ];
306
311
307
- // Recurse! If this implementation adds multi-threading support in the
308
- // future, this is where it will go.
309
- size_t left_n = blake3_compress_subtree_wide (input , left_input_len , key ,
310
- chunk_counter , flags , cv_array );
311
- size_t right_n = blake3_compress_subtree_wide (
312
- right_input , right_input_len , key , right_chunk_counter , flags , right_cvs );
312
+ // Recurse!
313
+ size_t left_n = -1 ;
314
+ size_t right_n = -1 ;
315
+
316
+ #if defined(BLAKE3_USE_TBB )
317
+ blake3_compress_subtree_wide_join_tbb (
318
+ key , flags , use_tbb ,
319
+ // left-hand side
320
+ input , left_input_len , chunk_counter , cv_array , & left_n ,
321
+ // right-hand side
322
+ right_input , right_input_len , right_chunk_counter , right_cvs , & right_n );
323
+ #else
324
+ left_n = blake3_compress_subtree_wide (
325
+ input , left_input_len , key , chunk_counter , flags , cv_array , use_tbb );
326
+ right_n = blake3_compress_subtree_wide (right_input , right_input_len , key ,
327
+ right_chunk_counter , flags , right_cvs ,
328
+ use_tbb );
329
+ #endif // BLAKE3_USE_TBB
313
330
314
331
// The special case again. If simd_degree=1, then we'll have left_n=1 and
315
332
// right_n=1. Rather than compressing them into a single output, return
@@ -335,32 +352,37 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
335
352
//
336
353
// As with compress_subtree_wide(), this function is not used on inputs of 1
337
354
// chunk or less. That's a different codepath.
338
- INLINE void compress_subtree_to_parent_node (
339
- const uint8_t * input , size_t input_len , const uint32_t key [8 ],
340
- uint64_t chunk_counter , uint8_t flags , uint8_t out [2 * BLAKE3_OUT_LEN ]) {
355
+ INLINE void
356
+ compress_subtree_to_parent_node (const uint8_t * input , size_t input_len ,
357
+ const uint32_t key [8 ], uint64_t chunk_counter ,
358
+ uint8_t flags , uint8_t out [2 * BLAKE3_OUT_LEN ],
359
+ bool use_tbb ) {
341
360
#if defined(BLAKE3_TESTING )
342
361
assert (input_len > BLAKE3_CHUNK_LEN );
343
362
#endif
344
363
345
364
uint8_t cv_array [MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN ];
346
365
size_t num_cvs = blake3_compress_subtree_wide (input , input_len , key ,
347
- chunk_counter , flags , cv_array );
366
+ chunk_counter , flags , cv_array , use_tbb );
348
367
assert (num_cvs <= MAX_SIMD_DEGREE_OR_2 );
349
-
350
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
368
+ // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
369
+ // as we just asserted, num_cvs will always be <=2 in that case. But GCC
370
+ // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
371
+ // set then it emits incorrect warnings here. We tried a few different
372
+ // hacks to silence these, but in the end our hacks just produced different
373
+ // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
374
+ // desperation, we ifdef out this entire loop when we know it's not needed.
375
+ #if MAX_SIMD_DEGREE_OR_2 > 2
376
+ // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
351
377
// compress_subtree_wide() returns more than 2 chaining values. Condense
352
378
// them into 2 by forming parent nodes repeatedly.
353
379
uint8_t out_array [MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2 ];
354
- // The second half of this loop condition is always true, and we just
355
- // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
356
- // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
357
- // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
358
- // this code, test it against that version.
359
- while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2 ) {
380
+ while (num_cvs > 2 ) {
360
381
num_cvs =
361
382
compress_parents_parallel (cv_array , num_cvs , key , flags , out_array );
362
383
memcpy (cv_array , out_array , num_cvs * BLAKE3_OUT_LEN );
363
384
}
385
+ #endif
364
386
memcpy (out , cv_array , 2 * BLAKE3_OUT_LEN );
365
387
}
366
388
@@ -432,7 +454,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
432
454
// of the whole tree, and it would need to be ROOT finalized. We can't
433
455
// compress it until we know.
434
456
// 2) This 64 KiB input might complete a larger tree, whose root node is
435
- // similarly going to be the the root of the whole tree. For example, maybe
457
+ // similarly going to be the root of the whole tree. For example, maybe
436
458
// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
437
459
// node at the root of the 256 KiB subtree until we know how to finalize it.
438
460
//
@@ -457,8 +479,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
457
479
self -> cv_stack_len += 1 ;
458
480
}
459
481
460
- void llvm_blake3_hasher_update (blake3_hasher * self , const void * input ,
461
- size_t input_len ) {
482
+ INLINE void blake3_hasher_update_base (blake3_hasher * self , const void * input ,
483
+ size_t input_len , bool use_tbb ) {
462
484
// Explicitly checking for zero avoids causing UB by passing a null pointer
463
485
// to memcpy. This comes up in practice with things like:
464
486
// std::vector<uint8_t> v;
@@ -544,7 +566,7 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
544
566
uint8_t cv_pair [2 * BLAKE3_OUT_LEN ];
545
567
compress_subtree_to_parent_node (input_bytes , subtree_len , self -> key ,
546
568
self -> chunk .chunk_counter ,
547
- self -> chunk .flags , cv_pair );
569
+ self -> chunk .flags , cv_pair , use_tbb );
548
570
hasher_push_cv (self , cv_pair , self -> chunk .chunk_counter );
549
571
hasher_push_cv (self , & cv_pair [BLAKE3_OUT_LEN ],
550
572
self -> chunk .chunk_counter + (subtree_chunks / 2 ));
@@ -566,6 +588,20 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
566
588
}
567
589
}
568
590
591
+ void llvm_blake3_hasher_update (blake3_hasher * self , const void * input ,
592
+ size_t input_len ) {
593
+ bool use_tbb = false;
594
+ blake3_hasher_update_base (self , input , input_len , use_tbb );
595
+ }
596
+
597
+ #if defined(BLAKE3_USE_TBB )
598
+ void blake3_hasher_update_tbb (blake3_hasher * self , const void * input ,
599
+ size_t input_len ) {
600
+ bool use_tbb = true;
601
+ blake3_hasher_update_base (self , input , input_len , use_tbb );
602
+ }
603
+ #endif // BLAKE3_USE_TBB
604
+
569
605
void llvm_blake3_hasher_finalize (const blake3_hasher * self , uint8_t * out ,
570
606
size_t out_len ) {
571
607
llvm_blake3_hasher_finalize_seek (self , 0 , out , out_len );
0 commit comments