Skip to content

[Support/BLAKE3] Make g_cpu_features thread safe #147948

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/include/llvm-c/blake3.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
extern "C" {
#endif

#define LLVM_BLAKE3_VERSION_STRING "1.3.1"
#define LLVM_BLAKE3_VERSION_STRING "1.8.2"
#define LLVM_BLAKE3_KEY_LEN 32
#define LLVM_BLAKE3_OUT_LEN 32
#define LLVM_BLAKE3_BLOCK_LEN 64
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Support/BLAKE3/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
Implementation of BLAKE3, originating from https://github.com/BLAKE3-team/BLAKE3/tree/1.8.2/c

# Example

Expand Down
130 changes: 83 additions & 47 deletions llvm/lib/Support/BLAKE3/blake3.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,24 +95,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {

INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
size_t out_len) {
if (out_len == 0) {
return;
}
uint64_t output_block_counter = seek / 64;
size_t offset_within_block = seek % 64;
uint8_t wide_buf[64];
while (out_len > 0) {
blake3_compress_xof(self->input_cv, self->block, self->block_len,
output_block_counter, self->flags | ROOT, wide_buf);
size_t available_bytes = 64 - offset_within_block;
size_t memcpy_len;
if (out_len > available_bytes) {
memcpy_len = available_bytes;
} else {
memcpy_len = out_len;
}
memcpy(out, wide_buf + offset_within_block, memcpy_len);
out += memcpy_len;
out_len -= memcpy_len;
if(offset_within_block) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
const size_t available_bytes = 64 - offset_within_block;
const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
memcpy(out, wide_buf + offset_within_block, bytes);
out += bytes;
out_len -= bytes;
output_block_counter += 1;
offset_within_block = 0;
}
if(out_len / 64) {
blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
}
output_block_counter += out_len / 64;
out += out_len & -64;
out_len -= out_len & -64;
if(out_len) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
memcpy(out, wide_buf, out_len);
}
}

Expand Down Expand Up @@ -159,10 +165,10 @@ INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
INLINE size_t left_len(size_t content_len) {
// Subtract 1 to reserve at least one byte for the right side. content_len
INLINE size_t left_subtree_len(size_t input_len) {
// Subtract 1 to reserve at least one byte for the right side. input_len
// should always be greater than BLAKE3_CHUNK_LEN.
size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}

Expand Down Expand Up @@ -251,26 +257,25 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,

// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement exendable ouput.) Note that this function is
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
static size_t blake3_compress_subtree_wide(const uint8_t *input,
size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter,
uint8_t flags, uint8_t *out) {
size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out, bool use_tbb) {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
Expand All @@ -284,7 +289,7 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
size_t left_input_len = left_len(input_len);
size_t left_input_len = left_subtree_len(input_len);
size_t right_input_len = input_len - left_input_len;
const uint8_t *right_input = &input[left_input_len];
uint64_t right_chunk_counter =
Expand All @@ -304,12 +309,24 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

// Recurse! If this implementation adds multi-threading support in the
// future, this is where it will go.
size_t left_n = blake3_compress_subtree_wide(input, left_input_len, key,
chunk_counter, flags, cv_array);
size_t right_n = blake3_compress_subtree_wide(
right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
// Recurse!
size_t left_n = -1;
size_t right_n = -1;

#if defined(BLAKE3_USE_TBB)
blake3_compress_subtree_wide_join_tbb(
key, flags, use_tbb,
// left-hand side
input, left_input_len, chunk_counter, cv_array, &left_n,
// right-hand side
right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
#else
left_n = blake3_compress_subtree_wide(
input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
right_chunk_counter, flags, right_cvs,
use_tbb);
#endif // BLAKE3_USE_TBB

// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
Expand All @@ -335,32 +352,37 @@ static size_t blake3_compress_subtree_wide(const uint8_t *input,
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
INLINE void compress_subtree_to_parent_node(
const uint8_t *input, size_t input_len, const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) {
INLINE void
compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
const uint32_t key[8], uint64_t chunk_counter,
uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
bool use_tbb) {
#if defined(BLAKE3_TESTING)
assert(input_len > BLAKE3_CHUNK_LEN);
#endif

uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
chunk_counter, flags, cv_array, use_tbb);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);

// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
// (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
// set then it emits incorrect warnings here. We tried a few different
// hacks to silence these, but in the end our hacks just produced different
// warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
// desperation, we ifdef out this entire loop when we know it's not needed.
#if MAX_SIMD_DEGREE_OR_2 > 2
// If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
// The second half of this loop condition is always true, and we just
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
// this code, test it against that version.
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}

Expand Down Expand Up @@ -432,7 +454,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
// similarly going to be the the root of the whole tree. For example, maybe
// similarly going to be the root of the whole tree. For example, maybe
// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
Expand All @@ -457,8 +479,8 @@ INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
self->cv_stack_len += 1;
}

void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len) {
INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
size_t input_len, bool use_tbb) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
Expand Down Expand Up @@ -544,7 +566,7 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
self->chunk.chunk_counter,
self->chunk.flags, cv_pair);
self->chunk.flags, cv_pair, use_tbb);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
Expand All @@ -566,6 +588,20 @@ void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
}
}

void llvm_blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len) {
bool use_tbb = false;
blake3_hasher_update_base(self, input, input_len, use_tbb);
}

#if defined(BLAKE3_USE_TBB)
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
size_t input_len) {
bool use_tbb = true;
blake3_hasher_update_base(self, input, input_len, use_tbb);
}
#endif // BLAKE3_USE_TBB

void llvm_blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len) {
llvm_blake3_hasher_finalize_seek(self, 0, out, out_len);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Support/BLAKE3/blake3_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

// Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
// 11/33.
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
Original file line number Diff line number Diff line change
Expand Up @@ -1786,7 +1786,7 @@ blake3_hash_many_avx2:
vmovdqu xmmword ptr [rbx+0x10], xmm1
jmp 4b

.section .rodata
.section .rdata
.p2align 6
ADD0:
.long 0, 1, 2, 3, 4, 5, 6, 7
Expand Down
Loading
Loading