
Commit d1eb702

Introduce bfloat16 support
Many models on Hugging Face (e.g. Mistral, TinyLLaMA) use bfloat16 as
their canonical floating point format.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───┐
    0b0000000000000000  brain16

This encoding has the same number of exponent bits as float32. That
makes conversion relatively straightforward, even in the absence of
hardware support. For example, converting brain16 to binary32 means
simply shifting 16 bits to the left.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───────────────────┐
    0b00000000000000000000000000000000  IEEE binary32

The issue is that converting bf16 to fp16 can result in information
loss. Only 13% of bf16 numbers can be represented exactly in fp16; in
practice those happen to cover 99.71% of Mistral 7b v0.2's weights,
and there is currently no way other than fp32 to preserve the rest.

      ┌sign
      │
      │  ┌exponent
      │  │
      │  │    ┌mantissa
      │  │    │
      │┌─┴─┐┌─┴──────┐
    0b0000000000000000  IEEE binary16

This change fixes that by adding a bf16 data type to GGML. Support for
CPU inference has been implemented, along with optimizations for the
AVX2, AVX512, and AVX512BF16 ISAs. Perplexity on Mistral 7b 0.2
improves by somewhere around -0.0024 to -0.0046 compared to using fp16.
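To make the widening and narrowing described above concrete, here is a small self-contained C sketch (not part of this diff; the names are illustrative) that widens a brain16 bit pattern to binary32 with a 16-bit left shift and narrows back by plain truncation. The rounding, NaN, and subnormal handling that the real ggml_fp32_to_bf16 performs (see the ggml.h hunk below) are deliberately left out here.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Widen brain16 -> binary32: the 16 brain16 bits are exactly the high
       16 bits of the binary32 encoding, so a left shift is all it takes. */
    static float bf16_bits_to_f32(uint16_t bits) {
        uint32_t u = (uint32_t)bits << 16;
        float f;
        memcpy(&f, &u, sizeof f);
        return f;
    }

    /* Narrow binary32 -> brain16 by truncating the low 16 bits
       (no rounding, no NaN/subnormal care -- unlike the real converter). */
    static uint16_t f32_to_bf16_bits(float f) {
        uint32_t u;
        memcpy(&u, &f, sizeof u);
        return (uint16_t)(u >> 16);
    }

    int main(void) {
        float x = 3.14159265f;
        uint16_t b = f32_to_bf16_bits(x);
        printf("%.8f -> 0x%04x -> %.8f\n", x, (unsigned)b, bf16_bits_to_f32(b));
        return 0;
    }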
1 parent 6e472f5 commit d1eb702

File tree

8 files changed: +1734 -173 lines


examples/finetune/finetune.cpp

Lines changed: 1 addition & 1 deletion
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
 
     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
             return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
         } else if (a->type == GGML_TYPE_F32) {
             return ggml_add(ctx, a, b);

examples/quantize/quantize.cpp

Lines changed: 2 additions & 1 deletion
@@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
     { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
     { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
+    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G @ 7B", },
     // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
     { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },

ggml-impl.h

Lines changed: 3 additions & 0 deletions
@@ -518,6 +518,9 @@ size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml
 // return index, asserts if table is full
 size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
 
+#define GGML_FP32_TO_BF16(x) ggml_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_bf16_to_fp32(x)
+
 #ifdef __cplusplus
 }
 #endif

ggml.c

Lines changed: 1621 additions & 169 deletions
Large diffs are not rendered by default.

ggml.h

Lines changed: 86 additions & 0 deletions
@@ -370,6 +370,7 @@ extern "C" {
         GGML_TYPE_I64   = 27,
         GGML_TYPE_F64   = 28,
         GGML_TYPE_IQ1_M = 29,
+        GGML_TYPE_BF16  = 30,
         GGML_TYPE_COUNT,
     };

@@ -410,6 +411,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ2_S  = 21, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
+        GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
     };
 
     // available tensor operations:

@@ -2370,6 +2372,90 @@ extern "C" {
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
 
+    /**
+     * Google Brain 16-bit floating point number.
+     *
+     *       ┌sign
+     *       │
+     *       │   ┌exponent
+     *       │   │
+     *       │   │      ┌mantissa
+     *       │   │      │
+     *       │┌──┴───┐┌─┴───┐
+     *     0b0000000000000000 brain16
+     *
+     * Since bf16 has the same number of exponent bits as a 32bit float,
+     * encoding and decoding numbers becomes relatively straightforward.
+     *
+     *       ┌sign
+     *       │
+     *       │   ┌exponent
+     *       │   │
+     *       │   │      ┌mantissa
+     *       │   │      │
+     *       │┌──┴───┐┌─┴───────────────────┐
+     *     0b00000000000000000000000000000000 IEEE binary32
+     *
+     * For comparison, the standard fp16 format has fewer exponent bits.
+     *
+     *       ┌sign
+     *       │
+     *       │  ┌exponent
+     *       │  │
+     *       │  │    ┌mantissa
+     *       │  │    │
+     *       │┌─┴─┐┌─┴──────┐
+     *     0b0000000000000000 IEEE binary16
+     *
+     * So be warned that converting between them, destroys several bits.
+     *
+     * @see IEEE 754-2008
+     */
+    typedef struct {
+        uint16_t x;
+    } ggml_bf16_t;
+
+    /**
+     * Converts brain16 to float32.
+     */
+    static inline float ggml_bf16_to_fp32(ggml_bf16_t h) {
+        union {
+            float f;
+            uint32_t i;
+        } u;
+        u.i = (uint32_t)h.x << 16;
+        return u.f;
+    }
+
+    /**
+     * Converts float32 to brain16.
+     *
+     * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+     * Subnormals shall be flushed to zero, and NANs will be quiet.
+     * This code should vectorize nicely if using modern compilers.
+     */
+    static inline ggml_bf16_t ggml_fp32_to_bf16(float s) {
+        ggml_bf16_t h;
+        union {
+            float f;
+            uint32_t i;
+        } u;
+        u.f = s;
+        if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+            h.x = (u.i >> 16) | 64; /* force to quiet */
+            return h;
+        }
+        if (!(u.i & 0x7f800000)) { /* subnormal */
+            h.x = (u.i & 0x80000000) >> 16; /* flush to zero */
+            return h;
+        }
+        h.x = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+        return h;
+    }
+
+    GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int n);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
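As a quick sanity check of the API added above, here is a small usage sketch (not part of this commit) that round-trips a float through the new inline converters; the expectation of roughly two to three significant decimal digits follows from bf16's 7 explicit mantissa bits, and the flush-to-zero behavior is the one documented in the converter's comment.

    #include <math.h>
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // Round-trip a value through brain16 using the converters declared above.
        float x = 1.2345678f;
        ggml_bf16_t b = ggml_fp32_to_bf16(x);
        float y = ggml_bf16_to_fp32(b);
        printf("x=%.7f  bits=0x%04x  y=%.7f  abs err=%g\n",
               x, (unsigned)b.x, y, fabsf(x - y));

        // Per the doc comment above, subnormal inputs flush to zero.
        float tiny = 1e-40f; /* subnormal in binary32 */
        printf("tiny=%g -> %g\n", tiny, ggml_bf16_to_fp32(ggml_fp32_to_bf16(tiny)));
        return 0;
    }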

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
@@ -817,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
     I64     = 27
     F64     = 28
     IQ1_M   = 29
+    BF16    = 30
 
 
 class GGUFEndian(IntEnum):

@@ -863,6 +864,7 @@ def get_type(val: Any) -> GGUFValueType:
 GGML_QUANT_SIZES = {
     GGMLQuantizationType.F32:  (1, 4),
     GGMLQuantizationType.F16:  (1, 2),
+    GGMLQuantizationType.BF16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),

llama.cpp

Lines changed: 18 additions & 2 deletions
@@ -3156,6 +3156,7 @@ struct llama_model_loader {
         switch (type_max) {
             case GGML_TYPE_F32:  ftype = LLAMA_FTYPE_ALL_F32;     break;
             case GGML_TYPE_F16:  ftype = LLAMA_FTYPE_MOSTLY_F16;  break;
+            case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
             case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
             case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
             case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;

@@ -3647,6 +3648,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:

@@ -6054,6 +6056,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         || !(
             model.ftype == LLAMA_FTYPE_ALL_F32 ||
             model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+            model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
             model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
         )

@@ -14075,13 +14078,16 @@ static void llama_tensor_dequantize_internal(
         if (qtype.to_float == NULL) {
             throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor->type != GGML_TYPE_F16) {
+    } else if (tensor->type != GGML_TYPE_F16 &&
+               tensor->type != GGML_TYPE_BF16) {
         throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
         if (tensor->type == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (tensor->type == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
         } else if (ggml_is_quantized(tensor->type)) {
             qtype.to_float(tensor->data, f32_output, nelements);
         } else {

@@ -14090,7 +14096,14 @@
         return;
     }
 
-    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size;
+    if (tensor->type == GGML_TYPE_F16 ||
+        tensor->type == GGML_TYPE_BF16) {
+        block_size = 1;
+    } else {
+        block_size = (size_t)ggml_blck_size(tensor->type);
+    }
+
     size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);

@@ -14109,6 +14122,8 @@
     auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
         if (typ == GGML_TYPE_F16) {
             ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+        } else if (typ == GGML_TYPE_BF16) {
+            ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
         } else {
             qtype.to_float(inbuf, outbuf, nels);
         }

@@ -14469,6 +14484,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
         case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
+        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
         // K-quants

llama.h

Lines changed: 1 addition & 0 deletions
@@ -122,6 +122,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_M   = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS  = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M   = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_BF16    = 32, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
