Skip to content

Commit 5d1c5f0

Browse files
committed
Introduce Q4_0 and Q8_0 quantizations with BF16 delta values
1 parent 1d8fca7 commit 5d1c5f0

File tree

10 files changed: +1359 -190 lines changed

10 files changed: +1359 -190 lines changed

examples/quantize/quantize.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ struct quant_option {
1717

1818
static const std::vector<struct quant_option> QUANT_OPTIONS = {
1919
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
20+
{ "Q4_0_B16", LLAMA_FTYPE_MOSTLY_Q4_0_B16, " 3.56G, 5.9624 +/- 0.03348 ppl @ LLaMA-v2-7B", },
2021
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
2122
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
2223
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
@@ -46,6 +47,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
4647
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
4748
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
4849
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
50+
{ "Q8_0_B16", LLAMA_FTYPE_MOSTLY_Q8_0_B16, " 6.70G, 5.8011 +/- 0.03239 ppl @ LLaMA-v1-7B", },
4951
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
5052
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
5153
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

ggml-impl.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,13 @@
2020
#if defined(_WIN32)
2121

2222
#define m512bh(p) p
23+
#define m128bh(p) p
2324
#define m512i(p) p
2425

2526
#else
2627

2728
#define m512bh(p) (__m512bh)(p)
29+
#define m128bh(p) (__m128bh)(p)
2830
#define m512i(p) (__m512i)(p)
2931

3032
#endif

0 commit comments

Comments (0)