Skip to content

Commit 5ecff35

Browse files
KawrakowKawrakow
andauthored
Adding a simple program to measure speed of dot products (#1041)
On my Mac, the direct Q4_1 product is marginally slower (~69 vs ~55 us for Q4_0). The SIMD-ified ggml version is now almost 2X slower (~121 us). On a Ryzen 7950X CPU, the direct product for Q4_1 quantization is faster than the AVX2 implementation (~60 vs ~62 us). --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
1 parent 7faa746 commit 5ecff35

File tree

5 files changed

+326
-1
lines changed

5 files changed

+326
-1
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,4 +305,5 @@ endif ()
305305

306306
if (LLAMA_BUILD_EXAMPLES)
307307
add_subdirectory(examples)
308+
add_subdirectory(pocs)
308309
endif()

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ $(info I CC: $(CCV))
133133
$(info I CXX: $(CXXV))
134134
$(info )
135135

136-
default: main quantize quantize-stats perplexity embedding
136+
default: main quantize quantize-stats perplexity embedding vdot
137137

138138
#
139139
# Build library
@@ -169,6 +169,9 @@ perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
169169
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
170170
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
171171

172+
vdot: pocs/vdot/vdot.cpp ggml.o
173+
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
174+
172175
libllama.so: llama.o ggml.o
173176
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
174177

pocs/CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# dependencies
2+
3+
find_package(Threads REQUIRED)
4+
5+
# third-party
6+
7+
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
8+
9+
if (EMSCRIPTEN)
10+
else()
11+
add_subdirectory(vdot)
12+
endif()

pocs/vdot/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
set(TARGET vdot)
2+
add_executable(${TARGET} vdot.cpp)
3+
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
4+
target_compile_features(${TARGET} PRIVATE cxx_std_11)

pocs/vdot/vdot.cpp

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
#include <cstdio>
2+
#include <vector>
3+
#include <random>
4+
#include <chrono>
5+
#include <cstdlib>
6+
#include <cmath>
7+
#include <cassert>
8+
#include <cstring>
9+
#include <array>
10+
11+
#include <ggml.h>
12+
13+
constexpr int kVecSize = 1 << 18;
14+
15+
float drawFromGaussianPdf(std::mt19937& rndm) {
16+
constexpr double kScale = 1./(1. + std::mt19937::max());
17+
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
18+
static float lastX;
19+
static bool haveX = false;
20+
if (haveX) { haveX = false; return lastX; }
21+
auto r = sqrt(-2*log(1 - kScale*rndm()));
22+
auto phi = kTwoPiTimesScale * rndm();
23+
lastX = r*sin(phi);
24+
haveX = true;
25+
return r*cos(phi);
26+
}
27+
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
28+
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
29+
}
30+
31+
// Copy-pasted from ggml.c
32+
#define QK4_0 32
33+
typedef struct {
34+
float d; // delta
35+
uint8_t qs[QK4_0 / 2]; // nibbles / quants
36+
} block_q4_0;
37+
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
38+
39+
#define QK4_1 32
40+
typedef struct {
41+
float d; // delta
42+
float m; // min
43+
uint8_t qs[QK4_1 / 2]; // nibbles / quants
44+
} block_q4_1;
45+
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
46+
47+
// Copy-pasted from ggml.c
48+
#define QK8_0 32
49+
typedef struct {
50+
float d; // delta
51+
int8_t qs[QK8_0]; // quants
52+
} block_q8_0;
53+
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
54+
55+
// "Scalar" dot product between the quantized vector x and float vector y
56+
inline double dot(int n, const block_q4_0* x, const float* y) {
57+
const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
58+
constexpr uint32_t kMask1 = 0x0f0f0f0f;
59+
uint32_t u1, u2;
60+
auto q1 = (const uint8_t*)&u1;
61+
auto q2 = (const uint8_t*)&u2;
62+
double sum = 0;
63+
for (int i=0; i<n; ++i) {
64+
float d = x->d;
65+
auto u = (const uint32_t*)x->qs;
66+
float s = 0;
67+
for (int k=0; k<4; ++k) {
68+
u1 = u[k] & kMask1;
69+
u2 = (u[k] >> 4) & kMask1;
70+
s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
71+
y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
72+
y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
73+
y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
74+
y += 8;
75+
}
76+
sum += s*d;
77+
++x;
78+
}
79+
return sum;
80+
}
81+
// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
82+
// but about the same on X86_64 (Ryzen 7950X CPU).
83+
inline double dot3(int n, const block_q4_0* x, const float* y) {
84+
const static std::pair<float,float> kValues[256] = {
85+
{-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
86+
{ 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
87+
{-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
88+
{ 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
89+
{-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
90+
{ 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
91+
{-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
92+
{ 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
93+
{-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
94+
{ 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
95+
{-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
96+
{ 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
97+
{-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
98+
{ 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
99+
{-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
100+
{ 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
101+
{-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f},
102+
{ 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f},
103+
{-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f},
104+
{ 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f},
105+
{-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f},
106+
{ 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f},
107+
{-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f},
108+
{ 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f},
109+
{-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f},
110+
{ 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f},
111+
{-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f},
112+
{ 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f},
113+
{-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f},
114+
{ 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f},
115+
{-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f},
116+
{ 0.f, 7.f}, { 1.f, 7.f}, { 2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f}
117+
};
118+
double sum = 0;
119+
for (int i=0; i<n; ++i) {
120+
float d = x->d;
121+
auto q = x->qs;
122+
float s = 0;
123+
for (int k=0; k<4; ++k) {
124+
s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
125+
y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
126+
y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
127+
y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
128+
y += 8; q += 4;
129+
}
130+
sum += s*d;
131+
++x;
132+
}
133+
return sum;
134+
}
135+
136+
inline double dot41(int n, const block_q4_1* x, const float* y) {
137+
const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
138+
constexpr uint32_t kMask1 = 0x0f0f0f0f;
139+
uint32_t u1, u2;
140+
auto q1 = (const uint8_t*)&u1;
141+
auto q2 = (const uint8_t*)&u2;
142+
double sum = 0;
143+
for (int i=0; i<n; ++i) {
144+
auto u = (const uint32_t*)x->qs;
145+
float s = 0, s1 = 0;
146+
for (int k=0; k<4; ++k) {
147+
u1 = u[k] & kMask1;
148+
u2 = (u[k] >> 4) & kMask1;
149+
s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
150+
y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
151+
y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
152+
y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
153+
s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
154+
y += 8;
155+
}
156+
sum += s*x->d + s1*x->m;
157+
++x;
158+
}
159+
return sum;
160+
}
161+
162+
// Copy-pasted from ggml.c
163+
static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
164+
assert(k % QK8_0 == 0);
165+
const int nb = k / QK8_0;
166+
167+
for (int i = 0; i < nb; i++) {
168+
float amax = 0.0f; // absolute max
169+
170+
for (int l = 0; l < QK8_0; l++) {
171+
const float v = x[i*QK8_0 + l];
172+
amax = std::max(amax, fabsf(v));
173+
}
174+
175+
const float d = amax / ((1 << 7) - 1);
176+
const float id = d ? 1.0f/d : 0.0f;
177+
178+
y[i].d = d;
179+
180+
for (int l = 0; l < QK8_0; ++l) {
181+
const float v = x[i*QK8_0 + l]*id;
182+
y[i].qs[l] = roundf(v);
183+
}
184+
}
185+
}
186+
187+
// Copy-pasted from ggml.c
188+
static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
189+
const int nb = n / QK8_0;
190+
const block_q4_0* x = (const block_q4_0*)vx;
191+
const block_q8_0* y = (const block_q8_0*)vy;
192+
float sumf = 0;
193+
for (int i = 0; i < nb; i++) {
194+
const float d0 = x[i].d;
195+
const float d1 = y[i].d;
196+
197+
const uint8_t * p0 = x[i].qs;
198+
const int8_t * p1 = y[i].qs;
199+
200+
int sumi = 0;
201+
for (int j = 0; j < QK8_0/2; j++) {
202+
const uint8_t v0 = p0[j];
203+
204+
const int i0 = (int8_t) (v0 & 0xf) - 8;
205+
const int i1 = (int8_t) (v0 >> 4) - 8;
206+
207+
const int i2 = p1[2*j + 0];
208+
const int i3 = p1[2*j + 1];
209+
210+
sumi += i0*i2 + i1*i3;
211+
}
212+
sumf += d0*d1*sumi;
213+
}
214+
*s = sumf;
215+
}
216+
217+
int main(int argc, char** argv) {
218+
219+
int nloop = argc > 1 ? atoi(argv[1]) : 10;
220+
bool scalar = argc > 2 ? atoi(argv[2]) : false;
221+
bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
222+
223+
if (scalar && useQ4_1) {
224+
printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
225+
return 1;
226+
}
227+
228+
std::mt19937 rndm(1234);
229+
230+
std::vector<float> x1(kVecSize), y1(kVecSize);
231+
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
232+
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
233+
234+
auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
235+
236+
std::vector<block_q4_0> q40;
237+
std::vector<block_q4_1> q41;
238+
if (useQ4_1) q41.resize(n4);
239+
else q40.resize(n4);
240+
std::vector<block_q8_0> q8(n8);
241+
std::vector<int64_t> H(16, 0);
242+
double sumt = 0, sumt2 = 0, maxt = 0;
243+
double sumqt = 0, sumqt2 = 0, maxqt = 0;
244+
double sum = 0, sumq = 0, exactSum = 0;
245+
for (int iloop=0; iloop<nloop; ++iloop) {
246+
247+
// Fill vector x with random numbers
248+
fillRandomGaussianFloats(x1, rndm);
249+
250+
// Fill vector y with random numbers
251+
fillRandomGaussianFloats(y1, rndm);
252+
253+
// Compute the exact dot product
254+
for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
255+
256+
// quantize x.
257+
// Note, we do not include this in the timing as in practical application
258+
// we already have the quantized model weights.
259+
if (useQ4_1) {
260+
funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
261+
} else {
262+
funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
263+
}
264+
265+
// Now measure time the dot product needs using the "scalar" version above
266+
auto t1 = std::chrono::high_resolution_clock::now();
267+
if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
268+
else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
269+
auto t2 = std::chrono::high_resolution_clock::now();
270+
auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
271+
sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
272+
273+
// And now measure the time needed to quantize y and perform the dot product with the quantized y
274+
t1 = std::chrono::high_resolution_clock::now();
275+
float result;
276+
if (scalar) {
277+
quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
278+
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
279+
}
280+
else {
281+
funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
282+
if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
283+
else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
284+
}
285+
sumq += result;
286+
t2 = std::chrono::high_resolution_clock::now();
287+
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
288+
sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
289+
290+
}
291+
292+
// Report the time (and the average of the dot products so the compiler does not come up with the idea
293+
// of optimizing away the function calls after figuring that the result is not used).
294+
sum /= nloop; sumq /= nloop;
295+
exactSum /= nloop;
296+
printf("Exact result: <dot> = %g\n",exactSum);
297+
printf("<dot> = %g, %g\n",sum,sumq);
298+
sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
299+
if (sumt2 > 0) sumt2 = sqrt(sumt2);
300+
printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
301+
sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
302+
if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
303+
printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
304+
return 0;
305+
}

0 commit comments

Comments
 (0)