Skip to content

Commit 0fcbf8b

Browse files
committed
ggml : q5_0 more efficient ARM NEON using uint64_t masks
1 parent 4bf196e commit 0fcbf8b

File tree

1 file changed

+18
-17
lines changed

1 file changed

+18
-17
lines changed

ggml.c

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,9 @@ static ggml_fp16_t table_exp_f16[1 << 16];
328328
// precomputed f32 table for f16 (256 KB)
329329
static float table_f32_f16[1 << 16];
330330

331+
// precomputed table for expanding 8 bits to 8 bytes: bit b of the index becomes 0x10 in byte b (i.e. each bit is shifted left by 4)
332+
static uint64_t table_b2b[1 << 8];
333+
331334
// On ARM NEON, it's quicker to directly convert fp16 -> fp32 instead of calling into ggml_lookup_fp16_to_fp32,
332335
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
333336
// This is also true for POWER9.
@@ -3181,14 +3184,7 @@ static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void *
31813184

31823185
float summs = 0.0f;
31833186

3184-
uint32_t tmp[8];
3185-
3186-
static const uint32_t k_mask[16] = {
3187-
0x00000000, 0x00000010, 0x00001000, 0x00001010,
3188-
0x00100000, 0x00100010, 0x00101000, 0x00101010,
3189-
0x10000000, 0x10000010, 0x10001000, 0x10001010,
3190-
0x10100000, 0x10100010, 0x10101000, 0x10101010,
3191-
};
3187+
uint64_t tmp[4];
31923188

31933189
for (int i = 0; i < nb; ++i) {
31943190
const block_q5_0 * restrict x0 = &x[i];
@@ -3199,17 +3195,13 @@ static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void *
31993195
// extract the 5th bit
32003196
const uint32_t qh = x0->qh;
32013197

3202-
tmp[0] = k_mask[(qh >> 0) & 0x0F];
3203-
tmp[1] = k_mask[(qh >> 4) & 0x0F];
3204-
tmp[2] = k_mask[(qh >> 8) & 0x0F];
3205-
tmp[3] = k_mask[(qh >> 12) & 0x0F];
3206-
tmp[4] = k_mask[(qh >> 16) & 0x0F];
3207-
tmp[5] = k_mask[(qh >> 20) & 0x0F];
3208-
tmp[6] = k_mask[(qh >> 24) & 0x0F];
3209-
tmp[7] = k_mask[(qh >> 28)];
3198+
tmp[0] = table_b2b[(qh >> 0) & 0xFF];
3199+
tmp[1] = table_b2b[(qh >> 8) & 0xFF];
3200+
tmp[2] = table_b2b[(qh >> 16) & 0xFF];
3201+
tmp[3] = table_b2b[(qh >> 24) ];
32103202

32113203
const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
3212-
const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 4));
3204+
const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
32133205

32143206
const uint8x16_t v0 = vld1q_u8(x0->qs);
32153207

@@ -4064,6 +4056,15 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
40644056
table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
40654057
}
40664058

4059+
for (int i = 0; i < 256; ++i) {
4060+
table_b2b[i] = 0;
4061+
for (int b = 0; b < 8; ++b) {
4062+
table_b2b[i] |= ((uint64_t)(((i >> b) & 0x01) << 4)) << (8*b);
4063+
}
4064+
4065+
//printf("%3d %016llx\n", i, table_b2b[i]);
4066+
}
4067+
40674068
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
40684069

40694070
GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);

0 commit comments

Comments
 (0)