Skip to content

Commit fd7cb55

Browse files
committed
[Support/BLAKE3] Updated BLAKE3 to v1.8.2
1 parent 0da6528 commit fd7cb55

15 files changed

+2623
-85
lines changed

llvm/include/llvm-c/blake3.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
extern "C" {
2626
#endif
2727

28-
#define LLVM_BLAKE3_VERSION_STRING "1.3.1"
28+
#define LLVM_BLAKE3_VERSION_STRING "1.8.2"
2929
#define LLVM_BLAKE3_KEY_LEN 32
3030
#define LLVM_BLAKE3_OUT_LEN 32
3131
#define LLVM_BLAKE3_BLOCK_LEN 64

llvm/lib/Support/BLAKE3/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
1+
Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.8.2/c
22

33
# Example
44

llvm/lib/Support/BLAKE3/blake3.c

Lines changed: 83 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -95,24 +95,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
9595

9696
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
9797
size_t out_len) {
98+
if (out_len == 0) {
99+
return;
100+
}
98101
uint64_t output_block_counter = seek / 64;
99102
size_t offset_within_block = seek % 64;
100103
uint8_t wide_buf[64];
101-
while (out_len > 0) {
102-
blake3_compress_xof(self->input_cv, self->block, self->block_len,
103-
output_block_counter, self->flags | ROOT, wide_buf);
104-
size_t available_bytes = 64 - offset_within_block;
105-
size_t memcpy_len;
106-
if (out_len > available_bytes) {
107-
memcpy_len = available_bytes;
108-
} else {
109-
memcpy_len = out_len;
110-
}
111-
memcpy(out, wide_buf + offset_within_block, memcpy_len);
112-
out += memcpy_len;
113-
out_len -= memcpy_len;
104+
if(offset_within_block) {
105+
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
106+
const size_t available_bytes = 64 - offset_within_block;
107+
const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
108+
memcpy(out, wide_buf + offset_within_block, bytes);
109+
out += bytes;
110+
out_len -= bytes;
114111
output_block_counter += 1;
115-
offset_within_block = 0;
112+
}
113+
if(out_len / 64) {
114+
blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
115+
}
116+
output_block_counter += out_len / 64;
117+
out += out_len & -64;
118+
out_len -= out_len & -64;
119+
if(out_len) {
120+
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
121+
memcpy(out, wide_buf, out_len);
116122
}
117123
}
118124

@@ -159,10 +165,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
159165
// Given some input larger than one chunk, return the number of bytes that
160166
// should go in the left subtree. This is the largest power-of-2 number of
161167
// chunks that leaves at least 1 byte for the right subtree.
162-
INLINE size_t left_len(size_t content_len) {
163-
// Subtract 1 to reserve at least one byte for the right side. content_len
168+
INLINE size_t left_subtree_len(size_t input_len) {
169+
// Subtract 1 to reserve at least one byte for the right side. input_len
164170
// should always be greater than BLAKE3_CHUNK_LEN.
165-
size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
171+
size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
166172
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
167173
}
168174

@@ -251,26 +257,25 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
251257

252258
// The wide helper function returns (writes out) an array of chaining values
253259
// and returns the length of that array. The number of chaining values returned
254-
// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
260+
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
255261
// if the input is shorter than that many chunks. The reason for maintaining a
256262
// wide array of chaining values going back up the tree, is to allow the
257263
// implementation to hash as many parents in parallel as possible.
258264
//
259265
// As a special case when the SIMD degree is 1, this function will still return
260266
// at least 2 outputs. This guarantees that this function doesn't perform the
261267
// root compression. (If it did, it would use the wrong flags, and also we
262-
// wouldn't be able to implement exendable ouput.) Note that this function is
268+
// wouldn't be able to implement extendable output.) Note that this function is
263269
// not used when the whole input is only 1 chunk long; that's a different
264270
// codepath.
265271
//
266272
// Why not just have the caller split the input on the first update(), instead
267273
// of implementing this special rule? Because we don't want to limit SIMD or
268274
// multi-threading parallelism for that update().
269-
static size_t blake3_compress_subtree_wide(const uint8_t *input,
270-
size_t input_len,
271-
const uint32_t key[8],
272-
uint64_t chunk_counter,
273-
uint8_t flags, uint8_t *out) {
275+
size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
276+
const uint32_t key[8],
277+
uint64_t chunk_counter, uint8_t flags,
278+
uint8_t *out, bool use_tbb) {
274279
// Note that the single chunk case does *not* bump the SIMD degree up to 2
275280
// when it is 1. If this implementation adds multi-threading in the future,
276281
// this gives us the option of multi-threading even the 2-chunk case, which
@@ -284,7 +289,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
284289
// the input into left and right subtrees. (Note that this is only optimal
285290
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
286291
// of 3 or something, we'll need a more complicated strategy.)
287-
size_t left_input_len = left_len(input_len);
292+
size_t left_input_len = left_subtree_len(input_len);
288293
size_t right_input_len = input_len - left_input_len;
289294
const uint8_t *right_input = &input[left_input_len];
290295
uint64_t right_chunk_counter =
@@ -304,12 +309,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
304309
}
305310
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
306311

307-
// Recurse! If this implementation adds multi-threading support in the
308-
// future, this is where it will go.
309-
size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
310-
chunk_counter, flags, cv_array);
311-
size_t right_n = blake3_compress_subtree_wide(
312-
right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
312+
// Recurse!
313+
size_t left_n = -1;
314+
size_t right_n = -1;
315+
316+
#if defined(BLAKE3_USE_TBB)
317+
blake3_compress_subtree_wide_join_tbb(
318+
key, flags, use_tbb,
319+
// left-hand side
320+
input, left_input_len, chunk_counter, cv_array, &left_n,
321+
// right-hand side
322+
right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
323+
#else
324+
left_n = blake3_compress_subtree_wide(
325+
input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
326+
right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
327+
right_chunk_counter, flags, right_cvs,
328+
use_tbb);
329+
#endif // BLAKE3_USE_TBB
313330

314331
// The special case again. If simd_degree=1, then we'll have left_n=1 and
315332
// right_n=1. Rather than compressing them into a single output, return
@@ -335,32 +352,37 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
335352
//
336353
// As with compress_subtree_wide(), this function is not used on inputs of 1
337354
// chunk or less. That's a different codepath.
338-
INLINE void compress_subtree_to_parent_node(
339-
const uint8_t *input, size_t input_len, const uint32_t key[8],
340-
uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
355+
INLINE void
356+
compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
357+
const uint32_t key[8], uint64_t chunk_counter,
358+
uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
359+
bool use_tbb) {
341360
#if defined(BLAKE3_TESTING)
342361
assert(input_len > BLAKE3_CHUNK_LEN);
343362
#endif
344363

345364
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
346365
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
347-
chunk_counter, flags, cv_array);
366+
chunk_counter, flags, cv_array, use_tbb);
348367
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
349-
350-
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
368+
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
369+
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
370+
// (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
371+
// set then it emits incorrect warnings here. We tried a few different
372+
// hacks to silence these, but in the end our hacks just produced different
373+
// warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
374+
// desperation, we ifdef out this entire loop when we know it's not needed.
375+
#if MAX_SIMD_DEGREE_OR_2 > 2
376+
// If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
351377
// compress_subtree_wide() returns more than 2 chaining values. Condense
352378
// them into 2 by forming parent nodes repeatedly.
353379
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
354-
// The second half of this loop condition is always true, and we just
355-
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
356-
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
357-
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
358-
// this code, test it against that version.
359-
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
380+
while (num_cvs > 2) {
360381
num_cvs =
361382
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
362383
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
363384
}
385+
#endif
364386
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
365387
}
366388

@@ -432,7 +454,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
432454
// of the whole tree, and it would need to be ROOT finalized. We can't
433455
// compress it until we know.
434456
// 2) This 64 KiB input might complete a larger tree, whose root node is
435-
// similarly going to be the the root of the whole tree. For example, maybe
457+
// similarly going to be the root of the whole tree. For example, maybe
436458
// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
437459
// node at the root of the 256 KiB subtree until we know how to finalize it.
438460
//
@@ -457,8 +479,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
457479
self->cv_stack_len += 1;
458480
}
459481

460-
void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
461-
size_t input_len) {
482+
INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
483+
size_t input_len, bool use_tbb) {
462484
// Explicitly checking for zero avoids causing UB by passing a null pointer
463485
// to memcpy. This comes up in practice with things like:
464486
// std::vector<uint8_t> v;
@@ -544,7 +566,7 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
544566
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
545567
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
546568
self->chunk.chunk_counter,
547-
self->chunk.flags, cv_pair);
569+
self->chunk.flags, cv_pair, use_tbb);
548570
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
549571
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
550572
self->chunk.chunk_counter + (subtree_chunks / 2));
@@ -566,6 +588,20 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
566588
}
567589
}
568590

591+
void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
592+
size_t input_len) {
593+
bool use_tbb = false;
594+
blake3_hasher_update_base(self, input, input_len, use_tbb);
595+
}
596+
597+
#if defined(BLAKE3_USE_TBB)
598+
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
599+
size_t input_len) {
600+
bool use_tbb = true;
601+
blake3_hasher_update_base(self, input, input_len, use_tbb);
602+
}
603+
#endif // BLAKE3_USE_TBB
604+
569605
void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
570606
size_t out_len) {
571607
llvm_blake3_hasher_finalize_seek(self, 0, out, out_len);

llvm/lib/Support/BLAKE3/blake3_avx2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
167167
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
168168
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
169169

170-
// Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
170+
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
171171
// 11/33.
172172
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
173173
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);

llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1786,7 +1786,7 @@ blake3_hash_many_avx2:
17861786
vmovdqu xmmword ptr [rbx+0x10], xmm1
17871787
jmp 4b
17881788

1789-
.section .rodata
1789+
.section .rdata
17901790
.p2align 6
17911791
ADD0:
17921792
.long 0, 1, 2, 3, 4, 5, 6, 7

0 commit comments

Comments
 (0)