4
4
5
5
#include "blake3_impl.h"
6
6
7
+ #if defined(_MSC_VER )
8
+ #include <Windows.h>
9
+ #endif
10
+
7
11
#if defined(IS_X86 )
8
12
#if defined(_MSC_VER )
9
13
#include <intrin.h>
10
14
#elif defined(__GNUC__ )
11
15
#include <immintrin.h>
12
16
#else
13
- #error " Unimplemented!"
17
+ #undef IS_X86 /* Unimplemented! */
14
18
#endif
15
19
#endif
16
20
17
- /* Atomic access abstraction (since MSVC does not do C11 yet) */
18
- #if defined(_MSC_VER ) && !defined(__clang__ )
19
- #if !defined(IS_X86 )
20
- #include <intrin.h>
21
- #endif
22
- #pragma warning(disable : 5105)
23
- #ifndef FORCEINLINE
24
- #define FORCEINLINE inline __forceinline
25
- #endif
26
- typedef volatile long atomic32_t ;
27
- static FORCEINLINE int32_t atomic_load32 (atomic32_t * src ) {
28
- return _InterlockedOr (src , 0 );
29
- }
30
- static FORCEINLINE void atomic_store32 (atomic32_t * dst , int32_t val ) {
31
- _InterlockedExchange (dst , val );
32
- }
21
+ #if !defined(BLAKE3_ATOMICS )
22
+ #if defined(__has_include )
23
+ #if __has_include (< stdatomic .h > ) && !defined(_MSC_VER )
24
+ #define BLAKE3_ATOMICS 1
33
25
#else
34
- #include <stdatomic.h>
35
- #ifndef FORCEINLINE
36
- #define FORCEINLINE inline __attribute__((__always_inline__))
37
- #endif
38
- typedef volatile _Atomic (int32_t ) atomic32_t ;
39
- static FORCEINLINE int32_t atomic_load32 (atomic32_t * src ) {
40
- return atomic_load_explicit (src , memory_order_relaxed );
41
- }
42
- static FORCEINLINE void atomic_store32 (atomic32_t * dst , int32_t val ) {
43
- atomic_store_explicit (dst , val , memory_order_relaxed );
44
- }
26
+ #define BLAKE3_ATOMICS 0
27
+ #endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
28
+ #else
29
+ #define BLAKE3_ATOMICS 0
30
+ #endif /* defined(__has_include) */
31
+ #endif /* BLAKE3_ATOMICS */
32
+
33
+ #if BLAKE3_ATOMICS
34
+ #define ATOMIC_INT _Atomic int
35
+ #define ATOMIC_LOAD (x ) x
36
+ #define ATOMIC_STORE (x , y ) x = y
37
+ #elif defined(_MSC_VER )
38
+ #define ATOMIC_INT LONG
39
+ #define ATOMIC_LOAD (x ) InterlockedOr(&x, 0)
40
+ #define ATOMIC_STORE (x , y ) InterlockedExchange(&x, y)
41
+ #else
42
+ #define ATOMIC_INT int
43
+ #define ATOMIC_LOAD (x ) x
44
+ #define ATOMIC_STORE (x , y ) x = y
45
45
#endif
46
46
47
47
#define MAYBE_UNUSED (x ) (void)((x))
@@ -89,7 +89,6 @@ static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
89
89
#endif
90
90
}
91
91
92
- #endif
93
92
94
93
enum cpu_feature {
95
94
SSE2 = 1 << 0 ,
@@ -106,24 +105,25 @@ enum cpu_feature {
106
105
#if !defined(BLAKE3_TESTING )
107
106
static /* Allow the variable to be controlled manually for testing */
108
107
#endif
109
- atomic32_t g_cpu_features = UNDEFINED ;
108
+ ATOMIC_INT g_cpu_features = UNDEFINED ;
110
109
111
110
LLVM_ATTRIBUTE_USED
112
111
#if !defined(BLAKE3_TESTING )
113
112
static
114
113
#endif
115
114
enum cpu_feature
116
115
get_cpu_features (void ) {
117
- enum cpu_feature _cpu_features ;
118
- _cpu_features = (enum cpu_feature )atomic_load32 (& g_cpu_features );
119
- if (_cpu_features != UNDEFINED ) {
120
- return _cpu_features ;
116
+
117
+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
118
+ enum cpu_feature features = ATOMIC_LOAD (g_cpu_features );
119
+ if (features != UNDEFINED ) {
120
+ return features ;
121
121
} else {
122
122
#if defined(IS_X86 )
123
123
uint32_t regs [4 ] = {0 };
124
124
uint32_t * eax = & regs [0 ], * ebx = & regs [1 ], * ecx = & regs [2 ], * edx = & regs [3 ];
125
125
(void )edx ;
126
- enum cpu_feature features = 0 ;
126
+ features = 0 ;
127
127
cpuid (regs , 0 );
128
128
const int max_id = * eax ;
129
129
cpuid (regs , 1 );
@@ -133,7 +133,7 @@ static
133
133
if (* edx & (1UL << 26 ))
134
134
features |= SSE2 ;
135
135
#endif
136
- if (* ecx & (1UL << 0 ))
136
+ if (* ecx & (1UL << 9 ))
137
137
features |= SSSE3 ;
138
138
if (* ecx & (1UL << 19 ))
139
139
features |= SSE41 ;
@@ -156,15 +156,15 @@ static
156
156
}
157
157
}
158
158
}
159
- atomic_store32 ( & g_cpu_features , ( int32_t ) features );
159
+ ATOMIC_STORE ( g_cpu_features , features );
160
160
return features ;
161
161
#else
162
162
/* How to detect NEON? */
163
- atomic_store32 (& g_cpu_features , 0 );
164
163
return 0 ;
165
164
#endif
166
165
}
167
166
}
167
+ #endif
168
168
169
169
void blake3_compress_in_place (uint32_t cv [8 ],
170
170
const uint8_t block [BLAKE3_BLOCK_LEN ],
0 commit comments