|
| 1 | +#define GGML_COMMON_IMPL_C |
| 2 | +#include "ggml-common.h" |
| 3 | + |
| 4 | +#include "ggml-fp8.h" |
| 5 | + |
| 6 | +#include <cassert> |
| 7 | + |
/*
Usage examples:
# ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
# ./llama-perplexity -f ~/LLM/wikitext-2-raw/wiki.test.raw -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
*/
| 16 | + |
| 17 | +#include <iostream> |
| 18 | +#include <cstdint> |
| 19 | +#include <immintrin.h> |
| 20 | + |
// Compile-time 2^N as a float, for any integer N (positive, zero or negative).
// The original `if constexpr` chain had no terminating `else`, which makes
// compilers warn that control can reach the end of a non-void function; the
// exhaustive chain below covers every N with no fall-through path.
template<int N> constexpr float EXP2() {
    if constexpr (N == 0) {
        return 1;
    } else if constexpr (N > 0) {
        return EXP2<N-1>()*2;
    } else {
        return EXP2<N+1>()/2;
    }
}
| 26 | + |
// Compile-time 2^N as an int, defined only for N >= 0.
// The original version had no branch at all for N < 0 (control fell off the
// end of a non-void function — UB if ever instantiated); make the chain
// exhaustive and reject negative N at compile time.
template<int N> constexpr int EXP_I2() {
    static_assert(N >= 0, "EXP_I2<N> requires N >= 0");
    if constexpr (N == 0) {
        return 1;
    } else {
        return EXP_I2<N-1>()*2;
    }
}
| 32 | + |
// Generic 8-bit float: 1 sign bit, _E exponent bits, M = 7-_E mantissa bits
// (so _E=5 -> e5m2, _E=4 -> e4m3, _E=3 -> e3m4). Unlike IEEE FP8 variants,
// no bit patterns are reserved for inf/NaN here: the whole exponent range
// encodes finite values.
template<int _E> //, int M=7-E> 1.7 bits!
struct FP8 {
    uint8_t bits;                                // raw byte: S EEE..E MM..M
    using type = FP8<_E>;
    static constexpr int E=_E;                   // number of exponent bits
    static constexpr int M=7-_E;                 // number of mantissa bits
    static constexpr int E_BIAS=EXP2<_E-1>()-1;  // exponent bias = 2^(E-1) - 1
    // Saturation threshold; note that the saturation code 0x7E below decodes
    // back to exactly this value.
    static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
    // Smallest positive (subnormal) representable magnitude; anything below
    // flushes to zero.
    static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
    //=============================================

    // Encode an fp32 value (round-to-nearest, saturating at MAX()).
    #pragma omp declare simd
    void operator=(float value) {
        union {
            float f;
            uint32_t bits;
        } in = {value};
        // the sign bit:
        bits = (in.bits >> 24) & 0x80;
        // the magnitude, with the sign stripped:
        in.bits &= 0x7fffffff;
        //GGML_ASSERT(in.bits < 0x7f800000); // +/- inf or NAN
        if (in.f >= MAX()) {
            // clamp to 0x7E, the encoding of MAX()
            bits |= 0x7E;
        } else if (in.f<MIN()) { // => 0.
            // underflow: keep S.0000000 (signed zero)
        } else {
            // Rebase the fp32 exponent onto the FP8 bias, then take the top
            // sign-less 7 bits. The rounding increment may carry from the
            // mantissa into the exponent field, which is exactly the desired
            // round-to-nearest behaviour. Subnormal FP8 values come out of
            // the same path via fp32 subnormals after the scale.
            in.f *= EXP2<E_BIAS-127>();
            in.bits += 1<<(22-M); // for rounding
            bits |= (in.bits >> (23-M)) & 0x7F;
        }
    }

    // Decode to fp32: widen the bit pattern into an fp32 (possibly subnormal)
    // and undo the bias change with an exact power-of-two multiply.
    #pragma omp declare simd
    operator float () const {
        union {
            float f;
            uint32_t bits;
        } out = {0};
        // the sign bit:
        out.bits = bits & 0x80;
        out.bits <<= 24;
        uint32_t _bits = bits & 0x7F;
        _bits <<= (23-M);
        out.bits |= _bits;
        out.f *= EXP2<127-E_BIAS>();
        return out.f;
    }
};
| 82 | + |
| 83 | +// block_e4m3_q |
| 84 | +//typedef struct { |
| 85 | +// float d; // delta |
| 86 | +// ggml_e4m3 qs[QK_K]; |
| 87 | +//} block_e4m3_q; |
| 88 | + |
| 89 | +template<int E> |
| 90 | +static inline void conv(const FP8<E>* x, float* y, int64_t size) { |
| 91 | + #pragma omp simd |
| 92 | + for (int64_t i=0; i<size; i++) { |
| 93 | + y[i] = (float) x[i]; |
| 94 | + } |
| 95 | +} |
| 96 | + |
| 97 | +template<int E> |
| 98 | +static inline void conv(const float* x, FP8<E>* y, int64_t size) { |
| 99 | + #pragma omp simd |
| 100 | + for (int64_t i=0; i<size; i++) { |
| 101 | + y[i] = x[i]; |
| 102 | + } |
| 103 | +} |
| 104 | + |
| 105 | +template<int E> |
| 106 | +static inline float dot(const FP8<E>* x, const float* y, int64_t size) { |
| 107 | + float z = 0; |
| 108 | + #pragma omp simd reduction(+:z) |
| 109 | + for (int64_t i=0; i<size; i++) { |
| 110 | + z += ((float)x[i])*y[i]; |
| 111 | + } |
| 112 | + return z; |
| 113 | +} |
| 114 | + |
// One quantization block: QK FP8 values sharing a single fp32 scale.
template <int E, int QK>
struct bloc_fp8 {
    float d;       // per-block scale (delta); dequantized value = qs[i] * d
    FP8<E> qs[QK]; // the QK quantized elements
};
| 120 | + |
| 121 | +template <int E, int QK> |
| 122 | +static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) { |
| 123 | + const auto qk_size = size / QK; |
| 124 | + for (int64_t q=0; q<qk_size; ++q) { |
| 125 | + #pragma omp simd |
| 126 | + for (int64_t i=0; i<QK; i++) { |
| 127 | + y[q*QK+i] = ((float) x[q].qs[i])*(x[q]).d; |
| 128 | + } |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +template <int E, int QK> |
| 133 | +static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) { |
| 134 | + const auto qk_size = size / QK; |
| 135 | + for (int64_t q=0; q<qk_size; ++q) { |
| 136 | + float m = 0; |
| 137 | + #pragma omp simd reduction(max:m) |
| 138 | + for (int64_t i=0; i<QK; i++) { |
| 139 | + m = std::max(std::abs(x[q*QK+i]),m); |
| 140 | + } |
| 141 | + const float D = FP8<E>::MAX()/m; |
| 142 | + y[q].d = m/FP8<E>::MAX(); |
| 143 | + #pragma omp simd |
| 144 | + for (int64_t i=0; i<QK; i++) { |
| 145 | + y[q].qs[i] = x[q*QK+i]*D; |
| 146 | + } |
| 147 | + } |
| 148 | +} |
| 149 | + |
| 150 | +template <int E, int QK> |
| 151 | +static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size) { |
| 152 | + float z = 0; |
| 153 | + const auto qk_size = size / QK; |
| 154 | + for (int64_t q=0; q<qk_size; ++q) { |
| 155 | + float z0 = 0; |
| 156 | + #pragma omp simd reduction(+:z0) |
| 157 | + for (int64_t i=0; i<QK; i++) { |
| 158 | + z0 += ((float)x[q].qs[i])*y[q*QK+i]; |
| 159 | + } |
| 160 | + z += (x[q]).d * z0; |
| 161 | + } |
| 162 | + return z; |
| 163 | +} |
| 164 | + |
// the C API.

// Dequantize k e5m2 (5 exponent / 2 mantissa bits) values to fp32.
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<5>*>(x), y, k);
}
// Quantize k fp32 values to e5m2 (vectorizable path).
void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<5>*>(y), k);
}
// Reference (scalar) quantization of k fp32 values to e5m2; same results as
// the row version above, kept as the ggml *_ref ground truth.
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i =0; i<k; ++i) {
        reinterpret_cast<FP8<5>*>(y)[i] = x[i];
    }
}
| 177 | + |
// Dequantize k e4m3 (4 exponent / 3 mantissa bits) values to fp32.
void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    conv(reinterpret_cast<const FP8<4>*>(x), y, k);
}
// Quantize k fp32 values to e4m3 (vectorizable path).
void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    conv(x, reinterpret_cast<FP8<4>*>(y), k);
}
// Reference (scalar) quantization of k fp32 values to e4m3.
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
    for (int64_t i =0; i<k; ++i) {
        reinterpret_cast<FP8<4>*>(y)[i] = x[i];
    }
}
| 189 | + |
// Dequantize k block-scaled e4m3 values (blocks of QK_K sharing one scale).
void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
}
// Quantize k fp32 values into block-scaled e4m3.
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
// Reference quantization; intentionally identical to quantize_row_e4m3_q
// (ggml requires a *_ref entry point per type).
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
| 202 | + |
// Dequantize k block-scaled e3m4 values (blocks of QK_K sharing one scale).
void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
}
// Quantize k fp32 values into block-scaled e3m4.
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
// Reference quantization; intentionally identical to quantize_row_e3m4_q
// (ggml requires a *_ref entry point per type).
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
| 215 | + |
// the dot product for FP8 weight

// *s = sum over n of vx[i]*vy[i] for plain (un-blocked) e5m2 weights.
// bs/bx/by/nrc follow the generic ggml vec_dot signature; only nrc==1 is
// supported here, the strides are unused.
void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<5>*>(vx), vy, n);
}
| 225 | + |
// *s = sum over n of vx[i]*vy[i] for plain (un-blocked) e4m3 weights.
// Only nrc==1 is supported; the stride arguments are unused.
void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const FP8<4>*>(vx), vy, n);
}
| 234 | + |
// *s = dot product of n block-scaled e4m3 weights with fp32 activations.
// Only nrc==1 is supported; the stride arguments are unused.
void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
}
| 243 | + |
// *s = dot product of n block-scaled e3m4 weights with fp32 activations.
// Only nrc==1 is supported; the stride arguments are unused.
void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(nrc == 1);
    GGML_UNUSED(nrc);
    GGML_UNUSED(bx);
    GGML_UNUSED(by);
    GGML_UNUSED(bs);
    *s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
}
0 commit comments