|
| 1 | +#include <cstdio> |
| 2 | +#include <vector> |
| 3 | +#include <random> |
| 4 | +#include <chrono> |
| 5 | +#include <cstdlib> |
| 6 | +#include <cmath> |
| 7 | +#include <cassert> |
| 8 | +#include <cstring> |
| 9 | +#include <array> |
| 10 | + |
| 11 | +#include <ggml.h> |
| 12 | + |
| 13 | +constexpr int kVecSize = 1 << 18; |
| 14 | + |
| 15 | +float drawFromGaussianPdf(std::mt19937& rndm) { |
| 16 | + constexpr double kScale = 1./(1. + std::mt19937::max()); |
| 17 | + constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale; |
| 18 | + static float lastX; |
| 19 | + static bool haveX = false; |
| 20 | + if (haveX) { haveX = false; return lastX; } |
| 21 | + auto r = sqrt(-2*log(1 - kScale*rndm())); |
| 22 | + auto phi = kTwoPiTimesScale * rndm(); |
| 23 | + lastX = r*sin(phi); |
| 24 | + haveX = true; |
| 25 | + return r*cos(phi); |
| 26 | +} |
| 27 | +void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) { |
| 28 | + for (auto& v : values) v = mean + drawFromGaussianPdf(rndm); |
| 29 | +} |
| 30 | + |
| 31 | +// Copy-pasted from ggml.c |
| 32 | +#define QK4_0 32 |
| 33 | +typedef struct { |
| 34 | + float d; // delta |
| 35 | + uint8_t qs[QK4_0 / 2]; // nibbles / quants |
| 36 | +} block_q4_0; |
| 37 | +static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); |
| 38 | + |
| 39 | +#define QK4_1 32 |
| 40 | +typedef struct { |
| 41 | + float d; // delta |
| 42 | + float m; // min |
| 43 | + uint8_t qs[QK4_1 / 2]; // nibbles / quants |
| 44 | +} block_q4_1; |
| 45 | +static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding"); |
| 46 | + |
| 47 | +// Copy-pasted from ggml.c |
| 48 | +#define QK8_0 32 |
| 49 | +typedef struct { |
| 50 | + float d; // delta |
| 51 | + int8_t qs[QK8_0]; // quants |
| 52 | +} block_q8_0; |
| 53 | +static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); |
| 54 | + |
| 55 | +// "Scalar" dot product between the quantized vector x and float vector y |
| 56 | +inline double dot(int n, const block_q4_0* x, const float* y) { |
| 57 | + const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}; |
| 58 | + constexpr uint32_t kMask1 = 0x0f0f0f0f; |
| 59 | + uint32_t u1, u2; |
| 60 | + auto q1 = (const uint8_t*)&u1; |
| 61 | + auto q2 = (const uint8_t*)&u2; |
| 62 | + double sum = 0; |
| 63 | + for (int i=0; i<n; ++i) { |
| 64 | + float d = x->d; |
| 65 | + auto u = (const uint32_t*)x->qs; |
| 66 | + float s = 0; |
| 67 | + for (int k=0; k<4; ++k) { |
| 68 | + u1 = u[k] & kMask1; |
| 69 | + u2 = (u[k] >> 4) & kMask1; |
| 70 | + s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] + |
| 71 | + y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] + |
| 72 | + y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] + |
| 73 | + y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]]; |
| 74 | + y += 8; |
| 75 | + } |
| 76 | + sum += s*d; |
| 77 | + ++x; |
| 78 | + } |
| 79 | + return sum; |
| 80 | +} |
| 81 | +// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product), |
| 82 | +// but about the same on X86_64 (Ryzen 7950X CPU). |
| 83 | +inline double dot3(int n, const block_q4_0* x, const float* y) { |
| 84 | + const static std::pair<float,float> kValues[256] = { |
| 85 | + {-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f}, |
| 86 | + { 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f}, |
| 87 | + {-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f}, |
| 88 | + { 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f}, |
| 89 | + {-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f}, |
| 90 | + { 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f}, |
| 91 | + {-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f}, |
| 92 | + { 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f}, |
| 93 | + {-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f}, |
| 94 | + { 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f}, |
| 95 | + {-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f}, |
| 96 | + { 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f}, |
| 97 | + {-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f}, |
| 98 | + { 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f}, |
| 99 | + {-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f}, |
| 100 | + { 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f}, |
| 101 | + {-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f}, |
| 102 | + { 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f}, |
| 103 | + {-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f}, |
| 104 | + { 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f}, |
| 105 | + {-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f}, |
| 106 | + { 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f}, |
| 107 | + {-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f}, |
| 108 | + { 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f}, |
| 109 | + {-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f}, |
| 110 | + { 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f}, |
| 111 | + {-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f}, |
| 112 | + { 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f}, |
| 113 | + {-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f}, |
| 114 | + { 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f}, |
| 115 | + {-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f}, |
| 116 | + { 0.f, 7.f}, { 1.f, 7.f}, { 2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f} |
| 117 | + }; |
| 118 | + double sum = 0; |
| 119 | + for (int i=0; i<n; ++i) { |
| 120 | + float d = x->d; |
| 121 | + auto q = x->qs; |
| 122 | + float s = 0; |
| 123 | + for (int k=0; k<4; ++k) { |
| 124 | + s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second + |
| 125 | + y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second + |
| 126 | + y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second + |
| 127 | + y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second; |
| 128 | + y += 8; q += 4; |
| 129 | + } |
| 130 | + sum += s*d; |
| 131 | + ++x; |
| 132 | + } |
| 133 | + return sum; |
| 134 | +} |
| 135 | + |
| 136 | +inline double dot41(int n, const block_q4_1* x, const float* y) { |
| 137 | + const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f}; |
| 138 | + constexpr uint32_t kMask1 = 0x0f0f0f0f; |
| 139 | + uint32_t u1, u2; |
| 140 | + auto q1 = (const uint8_t*)&u1; |
| 141 | + auto q2 = (const uint8_t*)&u2; |
| 142 | + double sum = 0; |
| 143 | + for (int i=0; i<n; ++i) { |
| 144 | + auto u = (const uint32_t*)x->qs; |
| 145 | + float s = 0, s1 = 0; |
| 146 | + for (int k=0; k<4; ++k) { |
| 147 | + u1 = u[k] & kMask1; |
| 148 | + u2 = (u[k] >> 4) & kMask1; |
| 149 | + s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] + |
| 150 | + y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] + |
| 151 | + y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] + |
| 152 | + y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]]; |
| 153 | + s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7]; |
| 154 | + y += 8; |
| 155 | + } |
| 156 | + sum += s*x->d + s1*x->m; |
| 157 | + ++x; |
| 158 | + } |
| 159 | + return sum; |
| 160 | +} |
| 161 | + |
| 162 | +// Copy-pasted from ggml.c |
| 163 | +static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) { |
| 164 | + assert(k % QK8_0 == 0); |
| 165 | + const int nb = k / QK8_0; |
| 166 | + |
| 167 | + for (int i = 0; i < nb; i++) { |
| 168 | + float amax = 0.0f; // absolute max |
| 169 | + |
| 170 | + for (int l = 0; l < QK8_0; l++) { |
| 171 | + const float v = x[i*QK8_0 + l]; |
| 172 | + amax = std::max(amax, fabsf(v)); |
| 173 | + } |
| 174 | + |
| 175 | + const float d = amax / ((1 << 7) - 1); |
| 176 | + const float id = d ? 1.0f/d : 0.0f; |
| 177 | + |
| 178 | + y[i].d = d; |
| 179 | + |
| 180 | + for (int l = 0; l < QK8_0; ++l) { |
| 181 | + const float v = x[i*QK8_0 + l]*id; |
| 182 | + y[i].qs[l] = roundf(v); |
| 183 | + } |
| 184 | + } |
| 185 | +} |
| 186 | + |
| 187 | +// Copy-pasted from ggml.c |
| 188 | +static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) { |
| 189 | + const int nb = n / QK8_0; |
| 190 | + const block_q4_0* x = (const block_q4_0*)vx; |
| 191 | + const block_q8_0* y = (const block_q8_0*)vy; |
| 192 | + float sumf = 0; |
| 193 | + for (int i = 0; i < nb; i++) { |
| 194 | + const float d0 = x[i].d; |
| 195 | + const float d1 = y[i].d; |
| 196 | + |
| 197 | + const uint8_t * p0 = x[i].qs; |
| 198 | + const int8_t * p1 = y[i].qs; |
| 199 | + |
| 200 | + int sumi = 0; |
| 201 | + for (int j = 0; j < QK8_0/2; j++) { |
| 202 | + const uint8_t v0 = p0[j]; |
| 203 | + |
| 204 | + const int i0 = (int8_t) (v0 & 0xf) - 8; |
| 205 | + const int i1 = (int8_t) (v0 >> 4) - 8; |
| 206 | + |
| 207 | + const int i2 = p1[2*j + 0]; |
| 208 | + const int i3 = p1[2*j + 1]; |
| 209 | + |
| 210 | + sumi += i0*i2 + i1*i3; |
| 211 | + } |
| 212 | + sumf += d0*d1*sumi; |
| 213 | + } |
| 214 | + *s = sumf; |
| 215 | +} |
| 216 | + |
| 217 | +int main(int argc, char** argv) { |
| 218 | + |
| 219 | + int nloop = argc > 1 ? atoi(argv[1]) : 10; |
| 220 | + bool scalar = argc > 2 ? atoi(argv[2]) : false; |
| 221 | + bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false; |
| 222 | + |
| 223 | + if (scalar && useQ4_1) { |
| 224 | + printf("It is not possible to use Q4_1 quantization and scalar implementations\n"); |
| 225 | + return 1; |
| 226 | + } |
| 227 | + |
| 228 | + std::mt19937 rndm(1234); |
| 229 | + |
| 230 | + std::vector<float> x1(kVecSize), y1(kVecSize); |
| 231 | + int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64); |
| 232 | + int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64); |
| 233 | + |
| 234 | + auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0); |
| 235 | + |
| 236 | + std::vector<block_q4_0> q40; |
| 237 | + std::vector<block_q4_1> q41; |
| 238 | + if (useQ4_1) q41.resize(n4); |
| 239 | + else q40.resize(n4); |
| 240 | + std::vector<block_q8_0> q8(n8); |
| 241 | + std::vector<int64_t> H(16, 0); |
| 242 | + double sumt = 0, sumt2 = 0, maxt = 0; |
| 243 | + double sumqt = 0, sumqt2 = 0, maxqt = 0; |
| 244 | + double sum = 0, sumq = 0, exactSum = 0; |
| 245 | + for (int iloop=0; iloop<nloop; ++iloop) { |
| 246 | + |
| 247 | + // Fill vector x with random numbers |
| 248 | + fillRandomGaussianFloats(x1, rndm); |
| 249 | + |
| 250 | + // Fill vector y with random numbers |
| 251 | + fillRandomGaussianFloats(y1, rndm); |
| 252 | + |
| 253 | + // Compute the exact dot product |
| 254 | + for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k]; |
| 255 | + |
| 256 | + // quantize x. |
| 257 | + // Note, we do not include this in the timing as in practical application |
| 258 | + // we already have the quantized model weights. |
| 259 | + if (useQ4_1) { |
| 260 | + funcs.quantize_row_q(x1.data(), q41.data(), kVecSize); |
| 261 | + } else { |
| 262 | + funcs.quantize_row_q(x1.data(), q40.data(), kVecSize); |
| 263 | + } |
| 264 | + |
| 265 | + // Now measure time the dot product needs using the "scalar" version above |
| 266 | + auto t1 = std::chrono::high_resolution_clock::now(); |
| 267 | + if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data()); |
| 268 | + else sum += dot(kVecSize / QK4_0, q40.data(), y1.data()); |
| 269 | + auto t2 = std::chrono::high_resolution_clock::now(); |
| 270 | + auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count(); |
| 271 | + sumt += t; sumt2 += t*t; maxt = std::max(maxt, t); |
| 272 | + |
| 273 | + // And now measure the time needed to quantize y and perform the dot product with the quantized y |
| 274 | + t1 = std::chrono::high_resolution_clock::now(); |
| 275 | + float result; |
| 276 | + if (scalar) { |
| 277 | + quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize); |
| 278 | + dot_q4_q8(kVecSize, &result, q40.data(), q8.data()); |
| 279 | + } |
| 280 | + else { |
| 281 | + funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize); |
| 282 | + if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data()); |
| 283 | + else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data()); |
| 284 | + } |
| 285 | + sumq += result; |
| 286 | + t2 = std::chrono::high_resolution_clock::now(); |
| 287 | + t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count(); |
| 288 | + sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t); |
| 289 | + |
| 290 | + } |
| 291 | + |
| 292 | + // Report the time (and the average of the dot products so the compiler does not come up with the idea |
| 293 | + // of optimizing away the function calls after figuring that the result is not used). |
| 294 | + sum /= nloop; sumq /= nloop; |
| 295 | + exactSum /= nloop; |
| 296 | + printf("Exact result: <dot> = %g\n",exactSum); |
| 297 | + printf("<dot> = %g, %g\n",sum,sumq); |
| 298 | + sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt; |
| 299 | + if (sumt2 > 0) sumt2 = sqrt(sumt2); |
| 300 | + printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt); |
| 301 | + sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt; |
| 302 | + if (sumqt2 > 0) sumqt2 = sqrt(sumqt2); |
| 303 | + printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt); |
| 304 | + return 0; |
| 305 | +} |
0 commit comments