Skip to content

Commit a8e7e6f

Browse files
committed
add FP8 support to gguf/llama:
E5M2 & E4M3: for use with FP8 distributed models. E4M3_Q & E3M4_Q: for gguf quantized models. The E5M2 and E4M3 types are used like native FP16 / BF16. E4M3_Q and E3M4_Q are defined like Q8_0, with a block size of 256 (like QK_K).
1 parent 6687503 commit a8e7e6f

File tree

11 files changed

+480
-65
lines changed

11 files changed

+480
-65
lines changed

Makefile

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ GGML_NO_OPENMP := 1
138138
DEPRECATE_WARNING := 1
139139
endif
140140

141+
ifdef LLAMA_NO_OPENMP_SIMD
142+
GGML_NO_OPENMP_SIMD := 1
143+
endif
144+
141145
ifdef LLAMA_NO_METAL
142146
GGML_NO_METAL := 1
143147
DEPRECATE_WARNING := 1
@@ -548,6 +552,12 @@ ifndef GGML_NO_OPENMP
548552
endif # GGML_MUSA
549553
endif # GGML_NO_OPENMP
550554

555+
ifndef GGML_NO_OPENMP_SIMD
556+
MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD
557+
MK_CFLAGS += -fopenmp-simd
558+
MK_CXXFLAGS += -fopenmp-simd
559+
endif # GGML_NO_OPENMP_SIMD
560+
551561
ifdef GGML_OPENBLAS
552562
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
553563
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -918,7 +928,8 @@ OBJ_GGML += \
918928
ggml/src/ggml-alloc.o \
919929
ggml/src/ggml-backend.o \
920930
ggml/src/ggml-quants.o \
921-
ggml/src/ggml-aarch64.o
931+
ggml/src/ggml-aarch64.o \
932+
ggml/src/ggml-fp8.o
922933

923934
OBJ_LLAMA = \
924935
src/llama.o \
@@ -1074,6 +1085,12 @@ ggml/src/ggml-aarch64.o: \
10741085
ggml/src/ggml-common.h
10751086
$(CC) $(CFLAGS) -c $< -o $@
10761087

1088+
ggml/src/ggml-fp8.o: \
1089+
ggml/src/ggml-fp8.cpp \
1090+
ggml/src/ggml-fp8.h \
1091+
ggml/src/ggml-common.h
1092+
$(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@
1093+
10771094
ggml/src/ggml-blas.o: \
10781095
ggml/src/ggml-blas.cpp \
10791096
ggml/include/ggml-blas.h

examples/quantize/quantize.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
5151
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
5252
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
5353
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
54+
55+
{ "E4M3_Q", LLAMA_FTYPE_MOSTLY_E4M3_Q, "12,21G, 0.0050 kld @ Mistral-Nemo", },
56+
{ "E3M4_Q", LLAMA_FTYPE_MOSTLY_E3M4_Q, "12,21G, 0.0016 kld @ Mistral-Nemo", },
57+
5458
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
5559
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
5660
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

ggml/include/ggml.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,11 @@ extern "C" {
390390
GGML_TYPE_Q4_0_8_8 = 33,
391391
GGML_TYPE_TQ1_0 = 34,
392392
GGML_TYPE_TQ2_0 = 35,
393+
GGML_TYPE_E5M2 = 36,
394+
GGML_TYPE_E4M3 = 37,
395+
GGML_TYPE_E4M3_Q = 38,
396+
GGML_TYPE_E3M4_Q = 39,
397+
// E5M6 => 12 bits vs 16 bits for BF16 = E8M7 / FP16 = E5M10
393398
GGML_TYPE_COUNT,
394399
};
395400

@@ -434,6 +439,10 @@ extern "C" {
434439
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
435440
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
436441
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
442+
GGML_FTYPE_MOSTLY_E5M2 = 28, // except 1d tensors
443+
GGML_FTYPE_MOSTLY_E4M3 = 29, // except 1d tensors
444+
GGML_FTYPE_MOSTLY_E4M3_Q = 30, // except 1d tensors
445+
GGML_FTYPE_MOSTLY_E3M4_Q = 31, // except 1d tensors
437446
};
438447

439448
// available tensor operations:

ggml/src/ggml-common.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,24 @@ typedef struct {
418418
} block_iq4_xs;
419419
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
420420

421+
// the fp8 types.
422+
typedef uint8_t ggml_e5m2;
423+
typedef uint8_t ggml_e4m3;
424+
typedef uint8_t ggml_e3m4;
425+
426+
// fp8 with bloc delta => 8.125 bpw
427+
typedef struct {
428+
float d; // delta
429+
ggml_e4m3 qs[QK_K];
430+
} block_e4m3_q;
431+
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");
432+
433+
typedef struct {
434+
float d; // delta
435+
ggml_e3m4 qs[QK_K];
436+
} block_e3m4_q;
437+
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
438+
421439
#endif // GGML_COMMON_DECL
422440
#endif // GGML_COMMON_DECL
423441

ggml/src/ggml-fp8.cpp

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
#define GGML_COMMON_IMPL_C
2+
#include "ggml-common.h"
3+
4+
#include "ggml-fp8.h"
5+
6+
#include <cassert>
7+
8+
/*
9+
# ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
10+
./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
11+
./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
12+
# ./llama-perplexity -f ~/LLM/wikitext-2-raw/wiki.test.raw -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
13+
./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
14+
15+
*/
16+
17+
#include <iostream>
18+
#include <cstdint>
19+
#include <immintrin.h>
20+
21+
template<int N> constexpr float EXP2() {
22+
if constexpr (N==0) return 1;
23+
if constexpr (N>0) return EXP2<N-1>()*2;
24+
if constexpr (N<0) return EXP2<N+1>()/2;
25+
}
26+
27+
// 2^N avec N>0 en entier
28+
template<int N> constexpr int EXP_I2() {
29+
if constexpr (N==0) return 1;
30+
if constexpr (N>0) return EXP_I2<N-1>()*2;
31+
}
32+
33+
template<int _E> //, int M=7-E> 1.7 bits!
34+
struct FP8 {
35+
uint8_t bits;
36+
using type = FP8<_E>;
37+
static constexpr int E=_E;
38+
static constexpr int M=7-_E;
39+
static constexpr int E_BIAS=EXP2<_E-1>()-1;
40+
static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
41+
static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
42+
//=============================================
43+
44+
#pragma omp declare simd
45+
void operator=(float value) {
46+
union {
47+
float f;
48+
uint32_t bits;
49+
} in = {value};
50+
// le signe:
51+
bits = (in.bits >> 24) & 0x80;
52+
// la valeur sans la signe!
53+
in.bits &= 0x7fffffff;
54+
//GGML_ASSERT(in.bits < 0x7f800000); // +/- infini ou NAN
55+
if (in.f >= MAX()) {
56+
bits |= 0x7E;
57+
} else if (in.f<MIN()) { // => 0.
58+
// OK: S.0000000
59+
} else {
60+
in.f *= EXP2<E_BIAS-127>();
61+
in.bits += 1<<(22-M); // for rounding
62+
bits |= (in.bits >> (23-M)) & 0x7F;
63+
}
64+
}
65+
66+
#pragma omp declare simd
67+
operator float () const {
68+
union {
69+
float f;
70+
uint32_t bits;
71+
} out = {0};
72+
// le signe:
73+
out.bits = bits & 0x80;
74+
out.bits <<= 24;
75+
uint32_t _bits = bits & 0x7F;
76+
_bits <<= (23-M);
77+
out.bits |= _bits;
78+
out.f *= EXP2<127-E_BIAS>();
79+
return out.f;
80+
}
81+
};
82+
83+
// block_e4m3_q
84+
//typedef struct {
85+
// float d; // delta
86+
// ggml_e4m3 qs[QK_K];
87+
//} block_e4m3_q;
88+
89+
template<int E>
90+
static inline void conv(const FP8<E>* x, float* y, int64_t size) {
91+
#pragma omp simd
92+
for (int64_t i=0; i<size; i++) {
93+
y[i] = (float) x[i];
94+
}
95+
}
96+
97+
template<int E>
98+
static inline void conv(const float* x, FP8<E>* y, int64_t size) {
99+
#pragma omp simd
100+
for (int64_t i=0; i<size; i++) {
101+
y[i] = x[i];
102+
}
103+
}
104+
105+
template<int E>
106+
static inline float dot(const FP8<E>* x, const float* y, int64_t size) {
107+
float z = 0;
108+
#pragma omp simd reduction(+:z)
109+
for (int64_t i=0; i<size; i++) {
110+
z += ((float)x[i])*y[i];
111+
}
112+
return z;
113+
}
114+
115+
template <int E, int QK>
116+
struct bloc_fp8 {
117+
float d;
118+
FP8<E> qs[QK];
119+
};
120+
121+
template <int E, int QK>
122+
static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) {
123+
const auto qk_size = size / QK;
124+
for (int64_t q=0; q<qk_size; ++q) {
125+
#pragma omp simd
126+
for (int64_t i=0; i<QK; i++) {
127+
y[q*QK+i] = ((float) x[q].qs[i])*(x[q]).d;
128+
}
129+
}
130+
}
131+
132+
template <int E, int QK>
133+
static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
134+
const auto qk_size = size / QK;
135+
for (int64_t q=0; q<qk_size; ++q) {
136+
float m = 0;
137+
#pragma omp simd reduction(max:m)
138+
for (int64_t i=0; i<QK; i++) {
139+
m = std::max(std::abs(x[q*QK+i]),m);
140+
}
141+
const float D = FP8<E>::MAX()/m;
142+
y[q].d = m/FP8<E>::MAX();
143+
#pragma omp simd
144+
for (int64_t i=0; i<QK; i++) {
145+
y[q].qs[i] = x[q*QK+i]*D;
146+
}
147+
}
148+
}
149+
150+
template <int E, int QK>
151+
static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size) {
152+
float z = 0;
153+
const auto qk_size = size / QK;
154+
for (int64_t q=0; q<qk_size; ++q) {
155+
float z0 = 0;
156+
#pragma omp simd reduction(+:z0)
157+
for (int64_t i=0; i<QK; i++) {
158+
z0 += ((float)x[q].qs[i])*y[q*QK+i];
159+
}
160+
z += (x[q]).d * z0;
161+
}
162+
return z;
163+
}
164+
165+
// the C API.
166+
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
167+
conv(reinterpret_cast<const FP8<5>*>(x), y, k);
168+
}
169+
void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
170+
conv(x, reinterpret_cast<FP8<5>*>(y), k);
171+
}
172+
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
173+
for (int64_t i =0; i<k; ++i) {
174+
reinterpret_cast<FP8<5>*>(y)[i] = x[i];
175+
}
176+
}
177+
178+
void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
179+
conv(reinterpret_cast<const FP8<4>*>(x), y, k);
180+
}
181+
void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
182+
conv(x, reinterpret_cast<FP8<4>*>(y), k);
183+
}
184+
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
185+
for (int64_t i =0; i<k; ++i) {
186+
reinterpret_cast<FP8<4>*>(y)[i] = x[i];
187+
}
188+
}
189+
190+
void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
191+
assert(k % QK_K == 0);
192+
conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
193+
}
194+
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
195+
assert(k % QK_K == 0);
196+
conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
197+
}
198+
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
199+
assert(k % QK_K == 0);
200+
conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
201+
}
202+
203+
void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
204+
assert(k % QK_K == 0);
205+
conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
206+
}
207+
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
208+
assert(k % QK_K == 0);
209+
conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
210+
}
211+
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
212+
assert(k % QK_K == 0);
213+
conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
214+
}
215+
216+
// the dot product for FP8 weight
217+
void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
218+
assert(nrc == 1);
219+
GGML_UNUSED(nrc);
220+
GGML_UNUSED(bx);
221+
GGML_UNUSED(by);
222+
GGML_UNUSED(bs);
223+
*s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
224+
}
225+
226+
void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
227+
assert(nrc == 1);
228+
GGML_UNUSED(nrc);
229+
GGML_UNUSED(bx);
230+
GGML_UNUSED(by);
231+
GGML_UNUSED(bs);
232+
*s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
233+
}
234+
235+
void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
236+
assert(nrc == 1);
237+
GGML_UNUSED(nrc);
238+
GGML_UNUSED(bx);
239+
GGML_UNUSED(by);
240+
GGML_UNUSED(bs);
241+
*s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
242+
}
243+
244+
void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
245+
assert(nrc == 1);
246+
GGML_UNUSED(nrc);
247+
GGML_UNUSED(bx);
248+
GGML_UNUSED(by);
249+
GGML_UNUSED(bs);
250+
*s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
251+
}

0 commit comments

Comments
 (0)