Skip to content

Commit 33c8d50

Browse files
authored
Add provisions for windows support for BF16 code including CMake provision for enabling AVX512_BF16 (#7258)
1 parent d359f30 commit 33c8d50

File tree

5 files changed

+38
-8
lines changed

5 files changed

+38
-8
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ option(LLAMA_AVX2 "llama: enable AVX2"
7777
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
7878
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
7979
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
# Opt-in switch for the AVX512-BF16 code paths; OFF by default, like the
# neighbouring AVX512-VBMI and AVX512-VNNI options.
80+
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
8081
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
8182
# in MSVC F16C is implied with AVX2/AVX512
8283
if (NOT MSVC)
@@ -1060,6 +1061,10 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
10601061
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
10611062
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
10621063
endif()
# When LLAMA_AVX512_BF16 is ON, define __AVX512BF16__ for both C and C++
# sources, mirroring the __AVX512VNNI__ handling directly above.
# NOTE(review): presumably this is the MSVC (/arch:...) branch and the macro is
# defined by hand because the compiler does not predefine it - confirm.
1064+
if (LLAMA_AVX512_BF16)
1065+
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
1066+
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
1067+
endif()
10631068
elseif (LLAMA_AVX2)
10641069
list(APPEND ARCH_FLAGS /arch:AVX2)
10651070
elseif (LLAMA_AVX)
@@ -1091,6 +1096,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
10911096
if (LLAMA_AVX512_VNNI)
10921097
list(APPEND ARCH_FLAGS -mavx512vnni)
10931098
endif()
# When LLAMA_AVX512_BF16 is ON, append -mavx512bf16 to ARCH_FLAGS, following
# the same pattern as the -mavx512vnni flag directly above.
1099+
if (LLAMA_AVX512_BF16)
1100+
list(APPEND ARCH_FLAGS -mavx512bf16)
1101+
endif()
10941102
endif()
10951103
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
10961104
message(STATUS "PowerPC detected")

ggml-impl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,18 @@
1717
#define MIN(a, b) ((a) < (b) ? (a) : (b))
1818
#define MAX(a, b) ((a) > (b) ? (a) : (b))
1919

/*
 * m512bh(p) / m512i(p): reinterpret a 512-bit vector value as the packed-bf16
 * vector type (__m512bh) or the integer vector type (__m512i) expected by the
 * AVX512-BF16 intrinsics.
 *
 * The choice depends on the compiler, not on the target OS:
 *  - MSVC proper cannot C-cast between its vector types, so the value is
 *    passed through unchanged.
 *    NOTE(review): relies on MSVC treating __m512bh and __m512i as
 *    interchangeable - confirm against the MSVC intrinsic headers.
 *  - Every other compiler, including MinGW GCC and clang/clang-cl on Windows
 *    (which also define _WIN32), needs the explicit vector cast.
 *
 * Both expansions are fully parenthesized so they stay a single expression
 * when followed by a postfix operator.
 */
20+
#if defined(_MSC_VER) && !defined(__clang__)
21+
22+
#define m512bh(p) (p)
23+
#define m512i(p) (p)
24+
25+
#else
26+
27+
#define m512bh(p) ((__m512bh)(p))
28+
#define m512i(p) ((__m512i)(p))
29+
30+
#endif
31+
2032
/**
2133
* Converts brain16 to float32.
2234
*

ggml.c

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -406,10 +406,10 @@ void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
406406
int i = 0;
407407
#if defined(__AVX512BF16__)
408408
for (; i + 32 <= n; i += 32) {
409-
_mm512_storeu_ps(
410-
(__m512 *)(y + i),
411-
(__m512)_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
412-
_mm512_loadu_ps(x + i)));
409+
_mm512_storeu_si512(
410+
(__m512i *)(y + i),
411+
m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
412+
_mm512_loadu_ps(x + i))));
413413
}
414414
#endif
415415
for (; i < n; i++) {
@@ -1666,10 +1666,10 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
16661666
__m512 c1 = _mm512_setzero_ps();
16671667
__m512 c2 = _mm512_setzero_ps();
16681668
for (; i + 64 <= n; i += 64) {
1669-
c1 = _mm512_dpbf16_ps(c1, (__m512bh)_mm512_loadu_ps((const float *)(x + i)),
1670-
(__m512bh)_mm512_loadu_ps((const float *)(y + i)));
1671-
c2 = _mm512_dpbf16_ps(c2, (__m512bh)_mm512_loadu_ps((const float *)(x + i + 32)),
1672-
(__m512bh)_mm512_loadu_ps((const float *)(y + i + 32)));
1669+
c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
1670+
m512bh(_mm512_loadu_si512((y + i))));
1671+
c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
1672+
m512bh(_mm512_loadu_si512((y + i + 32))));
16731673
}
16741674
sumf += (ggml_float)_mm512_reduce_add_ps(c1);
16751675
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
@@ -23137,6 +23137,14 @@ int ggml_cpu_has_avx512_vnni(void) {
2313723137
#endif
2313823138
}
2313923139

/*
 * Reports whether this build has AVX512-BF16 enabled.
 * Returns 1 when __AVX512BF16__ is defined at compile time, 0 otherwise.
 * This is a preprocessor check only; no runtime CPU detection happens here.
 */
23140+
int ggml_cpu_has_avx512_bf16(void) {
23141+
#if defined(__AVX512BF16__)
23142+
return 1;
23143+
#else
23144+
return 0;
23145+
#endif
23146+
}
23147+
2314023148
int ggml_cpu_has_fma(void) {
2314123149
#if defined(__FMA__)
2314223150
return 1;

ggml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2390,6 +2390,7 @@ extern "C" {
23902390
GGML_API int ggml_cpu_has_avx512 (void);
23912391
GGML_API int ggml_cpu_has_avx512_vbmi(void);
23922392
GGML_API int ggml_cpu_has_avx512_vnni(void);
2393+
GGML_API int ggml_cpu_has_avx512_bf16(void);
23932394
GGML_API int ggml_cpu_has_fma (void);
23942395
GGML_API int ggml_cpu_has_neon (void);
23952396
GGML_API int ggml_cpu_has_arm_fma (void);

llama.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18074,6 +18074,7 @@ const char * llama_print_system_info(void) {
1807418074
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
1807518075
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
1807618076
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
18077+
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
1807718078
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
1807818079
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
1807918080
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";

0 commit comments

Comments
 (0)