Skip to content

Commit 47d6184

Browse files
heshpdx authored and Neo Zhang committed
ggml : loop tiling optimizations for scalar path (ggml/898)
Apply a loop tiling technique to the generic path, which provides performance upside for ISAs with enough registers to take advantage of it. Also helps the compiler optimize this path.
1 parent eef117d commit 47d6184

File tree

1 file changed

+22
-10
lines changed

1 file changed

+22
-10
lines changed

ggml/src/ggml-quants.c

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
41904190
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
41914191
#endif
41924192
for (; ib < nb; ++ib) {
4193-
int sumi = 0;
4193+
int sumi0 = 0;
4194+
int sumi1 = 0;
41944195

41954196
for (int j = 0; j < qk/2; ++j) {
41964197
const int v0 = (x[ib].qs[j] & 0x0F) - 8;
41974198
const int v1 = (x[ib].qs[j] >> 4) - 8;
41984199

4199-
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
4200+
sumi0 += (v0 * y[ib].qs[j]);
4201+
sumi1 += (v1 * y[ib].qs[j + qk/2]);
42004202
}
42014203

4204+
int sumi = sumi0 + sumi1;
42024205
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
42034206
}
42044207

@@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
44744477
sumf = hsum_float_8(acc) + summs;
44754478
#endif
44764479
for (; ib < nb; ++ib) {
4477-
int sumi = 0;
4480+
int sumi0 = 0;
4481+
int sumi1 = 0;
44784482

44794483
for (int j = 0; j < qk/2; ++j) {
44804484
const int v0 = (x[ib].qs[j] & 0x0F);
44814485
const int v1 = (x[ib].qs[j] >> 4);
44824486

4483-
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]);
4487+
sumi0 += (v0 * y[ib].qs[j]);
4488+
sumi1 += (v1 * y[ib].qs[j + qk/2]);
44844489
}
44854490

4491+
int sumi = sumi0 + sumi1;
44864492
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
44874493
}
44884494

@@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
48234829
uint32_t qh;
48244830
memcpy(&qh, x[ib].qh, sizeof(qh));
48254831

4826-
int sumi = 0;
4832+
int sumi0 = 0;
4833+
int sumi1 = 0;
48274834

48284835
for (int j = 0; j < qk/2; ++j) {
48294836
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
48304837
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
48314838

4832-
const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16;
4833-
const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16;
4839+
const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
4840+
const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
48344841

4835-
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
4842+
sumi0 += (x0 * y[ib].qs[j]);
4843+
sumi1 += (x1 * y[ib].qs[j + qk/2]);
48364844
}
48374845

4846+
int sumi = sumi0 + sumi1;
48384847
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
48394848
}
48404849

@@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
51945203
uint32_t qh;
51955204
memcpy(&qh, x[ib].qh, sizeof(qh));
51965205

5197-
int sumi = 0;
5206+
int sumi0 = 0;
5207+
int sumi1 = 0;
51985208

51995209
for (int j = 0; j < qk/2; ++j) {
52005210
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
52035213
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
52045214
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
52055215

5206-
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]);
5216+
sumi0 += (x0 * y[ib].qs[j]);
5217+
sumi1 += (x1 * y[ib].qs[j + qk/2]);
52075218
}
52085219

5220+
int sumi = sumi0 + sumi1;
52095221
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
52105222
}
52115223

0 commit comments

Comments
 (0)