From 6fc51a8c0521be9fc5c255387c40e2acce181799 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Fri, 24 Mar 2023 17:32:35 +0100 Subject: [PATCH 1/4] Q2 and Q3 quantization --- examples/quantize/quantize.cpp | 2 + ggml.c | 418 ++++++++++++++++++++++++++++++++- ggml.h | 16 +- llama.cpp | 8 + llama.h | 6 +- 5 files changed, 433 insertions(+), 17 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 5b4812c62ba9c..0e8ce9e92d9e1 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -12,6 +12,8 @@ int main(int argc, char ** argv) { if (argc < 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]); + fprintf(stderr, " type = %d - q2_0\n", LLAMA_FTYPE_MOSTLY_Q2_0); + fprintf(stderr, " type = %d - q3_0\n", LLAMA_FTYPE_MOSTLY_Q3_0); fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0); fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1); fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2); diff --git a/ggml.c b/ggml.c index 8109b36b24432..82876e50ed657 100644 --- a/ggml.c +++ b/ggml.c @@ -624,6 +624,25 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { #endif +#define QK2_0 16 +#pragma pack(push, 1) +typedef struct { + ggml_fp16_t d; + uint32_t qs; +} block_q2_0; +#pragma pack(pop) +static_assert(sizeof(block_q2_0) == sizeof(ggml_fp16_t) + QK2_0 / 4, "wrong q2_0 size/padding"); + +#define QK3_0 16 +typedef union { + struct { + uint16_t pad[3]; + ggml_fp16_t d; + }; + uint64_t qs; +} block_q3_0; +static_assert(sizeof(block_q3_0) == sizeof(ggml_fp16_t) + QK3_0 * 3 / 8, "wrong q3_0 size/padding"); + #define QK4_0 32 typedef struct { float d; // delta @@ -663,6 +682,72 @@ static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block siz // reference implementation for deterministic creation of model files +static void quantize_row_q2_0(const float * restrict x, block_q2_0 * restrict y, int k) { + assert(k % QK2_0 == 0); + const int nb = k / QK2_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int l = 0; l < QK2_0; l++) { + const float v = x[i*QK2_0 + l]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -2; + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + uint32_t qs = 0; + + for (int l = 0; l < QK2_0; l++) { + const float v = x[i*QK2_0 + l]*id; + const uint8_t vi = MIN(3, (int8_t)roundf(v) + 2); + assert(vi < 4); + qs |= (uint32_t)vi << (l*2); + } + y[i].qs = qs; + } +} + +static void quantize_row_q3_0(const float * restrict x, block_q3_0 * restrict y, int k) { + assert(k % QK3_0 == 0); + const int nb = k / QK3_0; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int l = 0; l < QK3_0; l++) { + const float v = x[i*QK3_0 + l]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / -4; + const float id = d ? 1.0f/d : 0.0f; + + uint64_t qs = 0; + + for (int l = 0; l < QK3_0; l++) { + const float v = x[i*QK3_0 + l]*id; + const uint8_t vi = MIN(7, (int8_t)roundf(v) + 4); + assert(vi < 8); + qs |= (uint64_t)vi << (l*3); + } + + y[i].qs = qs; + y[i].d = GGML_FP32_TO_FP16(d); // overwrite unused part of uint64_t qs + } +} + static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -1432,6 +1517,45 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int #endif } +// TODO: vectorize +static void dequantize_row_q2_0(const void * restrict vx, float * restrict y, int k) { + assert(k % QK2_0 == 0); + const int nb = k / QK2_0; + + const block_q2_0 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + uint32_t qs = x[i].qs; + for (int l = 0; l < QK2_0; l++) { + const int8_t vi = qs & 3; + const float v = (vi - 2)*d; + y[i*QK2_0 + l] = v; + assert(!isnan(y[i*QK2_0 + l])); + qs >>= 2; + } + } +} + +static void dequantize_row_q3_0(const void * restrict vx, float * restrict y, int k) { + assert(k % QK3_0 == 0); + const int nb = k / QK3_0; + + const block_q3_0 * restrict x = vx; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + uint64_t qs = x[i].qs; + for (int l = 0; l < QK3_0; l++) { + const int8_t vi = qs & 7; + const float v = (vi - 4)*d; + y[i*QK3_0 + l] = v; + assert(!isnan(y[i*QK3_0 + l])); + qs >>= 3; + } + } +} + static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -1715,12 +1839,28 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in } } +static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); +static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { + [GGML_TYPE_Q2_0] = { + .dequantize_row_q = dequantize_row_q2_0, + .quantize_row_q = (quantize_row_q_t) quantize_row_q2_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_0, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q2_0_q8_0, + }, + [GGML_TYPE_Q3_0] = { + .dequantize_row_q = dequantize_row_q3_0, + .quantize_row_q = (quantize_row_q_t) quantize_row_q3_0, + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_0, + .quantize_row_q_dot = quantize_row_q8_0, + .vec_dot_q = ggml_vec_dot_q3_0_q8_0, + }, [GGML_TYPE_Q4_0] = { .dequantize_row_q = dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, @@ -2357,6 +2497,199 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t *s = sumf; } +static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK2_0 == 0); + const int nb = n / QK2_0; + + const block_q2_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + float sumf = 0.0f; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m128 acc = _mm_setzero_ps(); + + for (int i = 0; i < nb; i++) { + // Compute combined scale for the block + const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d); + + __m128i bx = _mm_set1_epi32(x[i].qs); + + // shift counts to get all bit pairs in lowest position of each byte + const __m128i shift128 = _mm_set_epi32(6, 4, 2, 0); + bx = _mm_srlv_epi32(bx, shift128); + + const __m128i shufmask = _mm_set_epi8(15,11,7,3,14,10,6,2,13,9,5,1,12,8,4,0); + bx = _mm_shuffle_epi8(bx, shufmask); + + const __m128i mask = _mm_set1_epi8(3); + bx = _mm_and_si128(mask, bx); + + const __m128i off = _mm_set1_epi8(2); + bx = _mm_sub_epi8(bx, off); + + const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK2_0)); + + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + + // Convert int16_t to int32_t by adding pairwise + const __m128i ones = _mm_set1_epi16(1); + __m128i i32 = _mm_madd_epi16(dot, ones); + + // Convert int32_t to float + const __m128 p = _mm_cvtepi32_ps(i32); + + // Apply the scale, and accumulate + acc = _mm_fmadd_ps(scale, p, acc); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + sumf = _mm_cvtss_f32(res); +#else + for (int i = 0; i < nb; i++) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = y[i/2].d; + + uint_fast32_t qs0 = x[i].qs; + const int8_t * restrict p1 = y[i/2].qs + (i%2)*QK2_0; + + int sumi = 0; + for (int j = 0; j < QK2_0; j++) { + const int8_t i0 = (int8_t)(qs0 & 3) - 2; + const int_fast16_t i1 = p1[j]; + + sumi += i0 * i1; + + qs0 >>= 2; + } + sumf += d0 * d1 * sumi; + } +#endif + + *s = sumf; +} + +static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK3_0 == 0); + const int nb = n / QK3_0; + + const block_q3_0 * restrict x = vx; + const block_q8_0 * restrict y = vy; + + float sumf = 0.0f; + +#if defined(__AVX2__) + // Initialize accumulator with zeros + __m128 acc = _mm_setzero_ps(); + for (int i = 0; i < nb; i++) { + // Compute combined scale for the block + const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d); + + const __m256i shift_l = _mm256_set_epi64x(2*3, 64, 4*3, 0); + const __m256i shift_r = _mm256_set_epi64x( 64, 2*3, 64, 64); + + __m256i bxx = _mm256_set1_epi64x(x[i].qs); + + // legend: _=zero +=one .=don't care 0-f=3bit quantized values s=fp16 scale + + // shift the copies to be able to reach all values + // 255 192 128 64 0 + // | | | | + // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210 in + // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210 shift left + // _______________________sssssfedcba98765432__________________________________________ shift right + // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210 out + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // e b 6 3 _ . f a 7 2 c 9 4 1 _ . d 8 5 0 + bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r)); + + // add to itself in masked places to shift some values left one bit + // 127 64 0 + // | | | | | | | | | | | | | | | | + // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000 in + // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask + // _____________________.999____________________.111____________________________________.ddd____________________.555_______________ masked + // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000 sum + // + // 255 192 128 + // | | | | | | | | | | | | | | | | + // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222 in + // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask + // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________ masked + // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222 sum + const __m256i doublemask = _mm256_set1_epi64x(0x078000078000); + bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx)); + + // collect 16 bytes from 256 into 128 bits + const __m256i shufmask = _mm256_set_epi8( + 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1, + -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0); + bxx = _mm256_shuffle_epi8(bxx, shufmask); + + __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1)); + + const __m128i mask = _mm_set1_epi8(7); + bx = _mm_and_si128(mask, bx); + + const __m128i off = _mm_set1_epi8(4); + bx = _mm_sub_epi8(bx, off); + + const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK3_0)); + + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); + + // Convert int16_t to int32_t by adding pairwise + const __m128i ones = _mm_set1_epi16(1); + __m128i i32 = _mm_madd_epi16(dot, ones); + + // Convert int32_t to float + const __m128 p = _mm_cvtepi32_ps(i32); + + // Apply the scale, and accumulate + acc = _mm_fmadd_ps(scale, p, acc); + } + + // Return horizontal sum of the acc vector + __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + sumf = _mm_cvtss_f32(res); +#else + for (int i = 0; i < nb; i++) { + const float d0 = GGML_FP16_TO_FP32(x[i].d); + const float d1 = y[i/2].d; + + uint64_t qs0 = x[i].qs; + const int8_t * restrict p1 = y[i/2].qs + (i%2)*QK3_0; + + int sumi = 0; + for (int j = 0; j < QK3_0; j++) { + const int8_t i0 = (int8_t)(qs0 & 7) - 4; + const int_fast16_t i1 = p1[j]; + + sumi += i0 * i1; + + qs0 >>= 3; + } + sumf += d0 * d1 * sumi; + } +#endif + + *s = sumf; +} + static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK8_0; @@ -3290,6 +3623,8 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = 1, [GGML_TYPE_F16] = 1, + [GGML_TYPE_Q2_0] = QK2_0, + [GGML_TYPE_Q3_0] = QK3_0, [GGML_TYPE_Q4_0] = QK4_0, [GGML_TYPE_Q4_1] = QK4_1, [GGML_TYPE_Q4_2] = QK4_2, @@ -3299,11 +3634,13 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_I16] = 1, [GGML_TYPE_I32] = 1, }; -static_assert(GGML_TYPE_COUNT == 10, "GGML_BLCK_SIZE is outdated"); +static_assert(GGML_TYPE_COUNT == 12, "GGML_BLCK_SIZE is outdated"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = sizeof(float), [GGML_TYPE_F16] = sizeof(ggml_fp16_t), + [GGML_TYPE_Q2_0] = sizeof(block_q2_0), + [GGML_TYPE_Q3_0] = sizeof(block_q3_0), [GGML_TYPE_Q4_0] = sizeof(block_q4_0), [GGML_TYPE_Q4_1] = sizeof(block_q4_1), [GGML_TYPE_Q4_2] = sizeof(block_q4_2), @@ -3313,12 +3650,13 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { [GGML_TYPE_I16] = sizeof(int16_t), [GGML_TYPE_I32] = sizeof(int32_t), }; -static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_SIZE is outdated"); - +static_assert(GGML_TYPE_COUNT == 12, "GGML_TYPE_SIZE is outdated"); static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = "f32", [GGML_TYPE_F16] = "f16", + [GGML_TYPE_Q2_0] = "q2_0", + [GGML_TYPE_Q3_0] = "q3_0", [GGML_TYPE_Q4_0] = "q4_0", [GGML_TYPE_Q4_1] = "q4_1", [GGML_TYPE_Q4_2] = "q4_2", @@ -3328,11 +3666,13 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { [GGML_TYPE_I16] = "i16", [GGML_TYPE_I32] = "i32", }; -static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_NAME is outdated"); +static_assert(GGML_TYPE_COUNT == 12, "GGML_TYPE_NAME is outdated"); static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = false, [GGML_TYPE_F16] = false, + [GGML_TYPE_Q2_0] = true, + [GGML_TYPE_Q3_0] = true, [GGML_TYPE_Q4_0] = true, [GGML_TYPE_Q4_1] = true, [GGML_TYPE_Q4_2] = true, @@ -3342,7 +3682,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { [GGML_TYPE_I16] = false, [GGML_TYPE_I32] = false, }; -static_assert(GGML_TYPE_COUNT == 10, "GGML_IS_QUANTIZED is outdated"); +static_assert(GGML_TYPE_COUNT == 12, "GGML_IS_QUANTIZED is outdated"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", @@ -8190,6 +8530,8 @@ static void ggml_compute_forward_mul_mat( const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { + case GGML_TYPE_Q2_0: + case GGML_TYPE_Q3_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: @@ -8419,6 +8761,8 @@ static void ggml_compute_forward_get_rows( const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { + case GGML_TYPE_Q2_0: + case GGML_TYPE_Q3_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: @@ -12092,7 +12436,51 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q2_0(const float * src, void * dst, int n, int k, int64_t hist[1<<2]) { + assert(k % QK2_0 == 0); + const int nb = k / QK2_0; + + for (int j = 0; j < n; j += k) { + block_q2_0 * restrict y = (block_q2_0 *)dst + j/QK2_0; + + quantize_row_q2_0(src + j, y, k); + + for (int i = 0; i < nb; i++) { + uint32_t qs = y[i].qs; + for (int l = 0; l < QK2_0; l++) { + const int8_t vi = qs & 3; + hist[vi]++; + qs >>= 2; + } + } + } + + return (n/QK2_0*sizeof(block_q2_0)); +} + +size_t ggml_quantize_q3_0(const float * src, void * dst, int n, int k, int64_t hist[1<<3]) { + assert(k % QK3_0 == 0); + const int nb = k / QK3_0; + + for (int j = 0; j < n; j += k) { + block_q3_0 * restrict y = (block_q3_0 *)dst + j/QK3_0; + + quantize_row_q3_0(src + j, y, k); + + for (int i = 0; i < nb; i++) { + uint64_t qs = y[i].qs; + for (int l = 0; l < QK3_0; l++) { + const int8_t vi = qs & 7; + hist[vi]++; + qs >>= 3; + } + } + } + + return (n/QK3_0*sizeof(block_q3_0)); +} + +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) { assert(k % QK4_0 == 0); const int nb = k / QK4_0; @@ -12115,7 +12503,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_0*sizeof(block_q4_0)); } -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) { assert(k % QK4_1 == 0); const int nb = k / QK4_1; @@ -12138,7 +12526,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_1*sizeof(block_q4_1)); } -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) { assert(k % QK4_2 == 0); const int nb = k / QK4_2; @@ -12162,7 +12550,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * return (n/QK4_2*sizeof(block_q4_2)); } -size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) { +size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) { assert(k % QK4_3 == 0); const int nb = k / QK4_3; @@ -12188,6 +12576,18 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { size_t result = 0; switch (type) { + case GGML_TYPE_Q2_0: + { + GGML_ASSERT(start % QK2_0 == 0); + block_q2_0 * block = (block_q2_0*)dst + start / QK2_0; + result = ggml_quantize_q2_0(src + start, block, n, n, hist); + } break; + case GGML_TYPE_Q3_0: + { + GGML_ASSERT(start % QK3_0 == 0); + block_q3_0 * block = (block_q3_0*)dst + start / QK3_0; + result = ggml_quantize_q3_0(src + start, block, n, n, hist); + } break; case GGML_TYPE_Q4_0: { GGML_ASSERT(start % QK4_0 == 0); diff --git a/ggml.h b/ggml.h index a8a7b6b4ff504..d7f99271ca1a6 100644 --- a/ggml.h +++ b/ggml.h @@ -205,8 +205,10 @@ enum ggml_type { GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, GGML_TYPE_Q4_2 = 4, - GGML_TYPE_Q4_3 = 5, - GGML_TYPE_Q8_0 = 6, + GGML_TYPE_Q2_0 = 5, + GGML_TYPE_Q3_0 = 6, + GGML_TYPE_Q4_3, + GGML_TYPE_Q8_0, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -808,10 +810,12 @@ enum ggml_opt_result ggml_opt( // quantization // -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q2_0(const float * src, void * dst, int n, int k, int64_t hist[1<<2]); +size_t ggml_quantize_q3_0(const float * src, void * dst, int n, int k, int64_t hist[1<<3]); +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t hist[1<<4]); +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t hist[1<<4]); +size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t hist[1<<4]); +size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t hist[1<<4]); size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); diff --git a/llama.cpp b/llama.cpp index e4c414c2dde8e..ebc13b47b1f0e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -479,6 +479,8 @@ struct llama_file_loader { switch (shard.type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_Q2_0: + case GGML_TYPE_Q3_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: @@ -553,6 +555,8 @@ struct llama_file_saver { switch (new_type) { case GGML_TYPE_F32: case GGML_TYPE_F16: + case GGML_TYPE_Q2_0: + case GGML_TYPE_Q3_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_2: @@ -841,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16"; + case LLAMA_FTYPE_MOSTLY_Q2_0: return "mostly Q2_0"; + case LLAMA_FTYPE_MOSTLY_Q3_0: return "mostly Q3_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: @@ -1578,6 +1584,8 @@ static llama_vocab::id llama_sample_top_p_top_k( static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) { ggml_type quantized_type; switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q2_0: quantized_type = GGML_TYPE_Q2_0; break; + case LLAMA_FTYPE_MOSTLY_Q3_0: quantized_type = GGML_TYPE_Q3_0; break; case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break; diff --git a/llama.h b/llama.h index e95ff73b8df1d..1d3ebacc8df8c 100644 --- a/llama.h +++ b/llama.h @@ -72,8 +72,10 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_0 = 5, // TODO: revert Q4_2, Q4_3 and give these different values + LLAMA_FTYPE_MOSTLY_Q3_0 = 6, + LLAMA_FTYPE_MOSTLY_Q4_2, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q4_3, // except 1d tensors }; LLAMA_API struct llama_context_params llama_context_default_params(); From c29ab90e06c0ed4a2c95d691d8a619e8a981b0d6 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Sun, 16 Apr 2023 09:55:39 +0200 Subject: [PATCH 2/4] Q2 AVX2: do two blocks at a time, by @slaren --- ggml.c | 76 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/ggml.c b/ggml.c index 82876e50ed657..bfafaaa330490 100644 --- a/ggml.c +++ b/ggml.c @@ -488,6 +488,34 @@ static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) } #if __AVX2__ || __AVX512F__ +// Unpack 32 2-bit fields into 32 bytes +// The output vector contains 32 bytes, each one in [ 0 .. 3 ] interval +static inline __m256i bytes_from_crumbs(uint32_t packed_hi, uint32_t packed_lo) { + __m128i bx_hi = _mm_set1_epi32(packed_hi); + __m128i bx_lo = _mm_set1_epi32(packed_lo); + __m256i bx = _mm256_set_m128i(bx_hi, bx_lo); + + // shift counts to get all bit pairs in lowest position of each byte + const __m256i shift256 = _mm256_set_epi32(6, 4, 2, 0, + 6, 4, 2, 0); + bx = _mm256_srlv_epi32(bx, shift256); + + const __m256i shufmask = _mm256_set_epi8(15,11, 7, 3, + 14,10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0, + 15,11, 7, 3, + 14,10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + bx = _mm256_shuffle_epi8(bx, shufmask); + + const __m256i mask = _mm256_set1_epi8(3); + bx = _mm256_and_si256(mask, bx); + + return bx; +} + // Unpack 32 4-bit fields into 32 bytes // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) @@ -2500,6 +2528,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK2_0 == 0); const int nb = n / QK2_0; + assert(nb % 2 == 0); const block_q2_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2508,49 +2537,44 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * #if defined(__AVX2__) // Initialize accumulator with zeros - __m128 acc = _mm_setzero_ps(); - - for (int i = 0; i < nb; i++) { - // Compute combined scale for the block - const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d); - - __m128i bx = _mm_set1_epi32(x[i].qs); - - // shift counts to get all bit pairs in lowest position of each byte - const __m128i shift128 = _mm_set_epi32(6, 4, 2, 0); - bx = _mm_srlv_epi32(bx, shift128); + __m256 acc = _mm256_setzero_ps(); - const __m128i shufmask = _mm_set_epi8(15,11,7,3,14,10,6,2,13,9,5,1,12,8,4,0); - bx = _mm_shuffle_epi8(bx, shufmask); + for (int i = 0; i < nb; i += 2) { + __m256i bx = bytes_from_crumbs(x[i+1].qs, x[i].qs); - const __m128i mask = _mm_set1_epi8(3); - bx = _mm_and_si128(mask, bx); + // Compute combined scale for the block + const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+0].d) * y[i/2].d); + const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+1].d) * y[i/2].d); + const __m256 scale = _mm256_set_m128(scale_hi, scale_lo); - const __m128i off = _mm_set1_epi8(2); - bx = _mm_sub_epi8(bx, off); + const __m256i off = _mm256_set1_epi8(2); + bx = _mm256_sub_epi8(bx, off); - const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK2_0)); + // Load y vector + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i/2].qs); // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); + const __m256i ax = _mm256_sign_epi8(bx, bx); // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); + const __m256i sy = _mm256_sign_epi8(by, bx); // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); + const __m256i dot = _mm256_maddubs_epi16(ax, sy); // Convert int16_t to int32_t by adding pairwise - const __m128i ones = _mm_set1_epi16(1); - __m128i i32 = _mm_madd_epi16(dot, ones); + const __m256i ones = _mm256_set1_epi16(1); + __m256i i32 = _mm256_madd_epi16(ones, dot); // Convert int32_t to float - const __m128 p = _mm_cvtepi32_ps(i32); + __m256 p = _mm256_cvtepi32_ps(i32); // Apply the scale, and accumulate - acc = _mm_fmadd_ps(scale, p, acc); + acc = _mm256_fmadd_ps(scale, p, acc); } // Return horizontal sum of the acc vector - __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc)); + __m128 res = _mm256_extractf128_ps(acc, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(acc)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); res = _mm_add_ss(res, _mm_movehdup_ps(res)); sumf = _mm_cvtss_f32(res); #else From 8c90a860cceb8dc00abeaed0cf877efb79384982 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Sun, 16 Apr 2023 15:36:36 +0200 Subject: [PATCH 3/4] More AVX2 optimizations --- ggml.c | 151 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 78 insertions(+), 73 deletions(-) diff --git a/ggml.c b/ggml.c index bfafaaa330490..7148f499714f2 100644 --- a/ggml.c +++ b/ggml.c @@ -2539,19 +2539,20 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); - for (int i = 0; i < nb; i += 2) { - __m256i bx = bytes_from_crumbs(x[i+1].qs, x[i].qs); + for (int i = 0; i < nb/2; i++) { + __m256i bx = bytes_from_crumbs(x[i*2+1].qs, x[i*2].qs); // Compute combined scale for the block - const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+0].d) * y[i/2].d); - const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+1].d) * y[i/2].d); - const __m256 scale = _mm256_set_m128(scale_hi, scale_lo); + const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+0].d)); + const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+1].d)); + __m256 scale = _mm256_set_m128(scale_hi, scale_lo); + scale = _mm256_mul_ps(scale, _mm256_broadcast_ss(&y[i].d)); const __m256i off = _mm256_set1_epi8(2); bx = _mm256_sub_epi8(bx, off); // Load y vector - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i/2].qs); + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); // Get absolute values of x vectors const __m256i ax = _mm256_sign_epi8(bx, bx); @@ -2604,6 +2605,7 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK3_0 == 0); const int nb = n / QK3_0; + assert(nb % 2 == 0); const block_q3_0 * restrict x = vx; const block_q8_0 * restrict y = vy; @@ -2613,77 +2615,80 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * #if defined(__AVX2__) // Initialize accumulator with zeros __m128 acc = _mm_setzero_ps(); - for (int i = 0; i < nb; i++) { - // Compute combined scale for the block - const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d); - - const __m256i shift_l = _mm256_set_epi64x(2*3, 64, 4*3, 0); - const __m256i shift_r = _mm256_set_epi64x( 64, 2*3, 64, 64); - - __m256i bxx = _mm256_set1_epi64x(x[i].qs); - - // legend: _=zero +=one .=don't care 0-f=3bit quantized values s=fp16 scale - - // shift the copies to be able to reach all values - // 255 192 128 64 0 - // | | | | - // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210 in - // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210 shift left - // _______________________sssssfedcba98765432__________________________________________ shift right - // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210 out - // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ - // e b 6 3 _ . f a 7 2 c 9 4 1 _ . d 8 5 0 - bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r)); - - // add to itself in masked places to shift some values left one bit - // 127 64 0 - // | | | | | | | | | | | | | | | | - // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000 in - // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask - // _____________________.999____________________.111____________________________________.ddd____________________.555_______________ masked - // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000 sum - // - // 255 192 128 - // | | | | | | | | | | | | | | | | - // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222 in - // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask - // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________ masked - // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222 sum - const __m256i doublemask = _mm256_set1_epi64x(0x078000078000); - bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx)); - - // collect 16 bytes from 256 into 128 bits - const __m256i shufmask = _mm256_set_epi8( - 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1, - -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0); - bxx = _mm256_shuffle_epi8(bxx, shufmask); + for (int i = 0; i < nb/2; i++) { + const __m128 scale_y = _mm_set1_ps(y[i].d); + for (int u = 0; u < 2; u++) { // let the compiler unroll this + // Compute combined scale for the block + const __m128 scale_x = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+u].d)); + const __m128 scale = _mm_mul_ps(scale_x, scale_y); + + __m256i bxx = _mm256_set1_epi64x(x[i*2+u].qs); + + // legend: _=zero +=one .=don't care 0-f=3bit quantized values s=fp16 scale + + // shift the copies to be able to reach all values + // 255 192 128 64 0 + // | | | | + // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210 in + // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210 shift left + // _______________________sssssfedcba98765432__________________________________________ shift right + // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210 out + // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + // e b 6 3 _ . f a 7 2 c 9 4 1 _ . d 8 5 0 + const __m256i shift_l = _mm256_set_epi64x(2*3, 64, 4*3, 0); + const __m256i shift_r = _mm256_set_epi64x( 64, 2*3, 64, 64); + bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r)); + + // add to itself in masked places to shift some values left one bit + // 127 64 0 + // | | | | | | | | | | | | | | | | + // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000 in + // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask + // _____________________.999____________________.111____________________________________.ddd____________________.555_______________ masked + // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000 sum + // + // 255 192 128 + // | | | | | | | | | | | | | | | | + // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222 in + // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask + // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________ masked + // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222 sum + const __m256i doublemask = _mm256_set1_epi64x(0x078000078000); + bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx)); + + // collect 16 bytes from 256 into 128 bits + const __m256i shufmask = _mm256_set_epi8( + 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1, + -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0); + bxx = _mm256_shuffle_epi8(bxx, shufmask); + + __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1)); + + const __m128i mask = _mm_set1_epi8(7); + bx = _mm_and_si128(mask, bx); + + const __m128i off = _mm_set1_epi8(4); + bx = _mm_sub_epi8(bx, off); + + const __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + u*QK3_0)); - __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1)); - - const __m128i mask = _mm_set1_epi8(7); - bx = _mm_and_si128(mask, bx); - - const __m128i off = _mm_set1_epi8(4); - bx = _mm_sub_epi8(bx, off); - - const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK3_0)); - - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); + // Get absolute values of x vectors + const __m128i ax = _mm_sign_epi8(bx, bx); + // Sign the values of the y vectors + const __m128i sy = _mm_sign_epi8(by, bx); + // Perform multiplication and create 16-bit values + const __m128i dot = _mm_maddubs_epi16(ax, sy); - // Convert int16_t to int32_t by adding pairwise - const __m128i ones = _mm_set1_epi16(1); - __m128i i32 = _mm_madd_epi16(dot, ones); + // Convert int16_t to int32_t by adding pairwise + const __m128i ones = _mm_set1_epi16(1); + __m128i i32 = _mm_madd_epi16(dot, ones); - // Convert int32_t to float - const __m128 p = _mm_cvtepi32_ps(i32); + // Convert int32_t to float + const __m128 p = _mm_cvtepi32_ps(i32); - // Apply the scale, and accumulate - acc = _mm_fmadd_ps(scale, p, acc); + // Apply the scale, and accumulate + acc = _mm_fmadd_ps(scale, p, acc); + } } // Return horizontal sum of the acc vector From 7aa501cd1cd67334af9f883295d9b58d232edfc1 Mon Sep 17 00:00:00 2001 From: pubby Date: Mon, 17 Apr 2023 10:38:45 -0500 Subject: [PATCH 4/4] Faster q3_0 implementation, using two planes, by @pubby --- ggml.c | 214 +++++++++++++++++++++++++++------------------------------ 1 file changed, 102 insertions(+), 112 deletions(-) diff --git a/ggml.c b/ggml.c index 7148f499714f2..8d5c4e028c6fb 100644 --- a/ggml.c +++ b/ggml.c @@ -662,12 +662,12 @@ typedef struct { static_assert(sizeof(block_q2_0) == sizeof(ggml_fp16_t) + QK2_0 / 4, "wrong q2_0 size/padding"); #define QK3_0 16 -typedef union { - struct { - uint16_t pad[3]; - ggml_fp16_t d; - }; - uint64_t qs; +typedef struct { + ggml_fp16_t d; + // Instead of representing q3_0 as a packed format "...210210210210", + // represent it as two planes: "...10101010" and "...2222" + uint16_t qhi; // The highest bit of each 3-bit number, packed together + uint32_t qlo; // The low 2-bits of each 3-bit number, packed together } block_q3_0; static_assert(sizeof(block_q3_0) == sizeof(ggml_fp16_t) + QK3_0 * 3 / 8, "wrong q3_0 size/padding"); @@ -762,17 +762,20 @@ static void quantize_row_q3_0(const float * restrict x, block_q3_0 * restrict y, const float d = max / -4; const float id = d ? 1.0f/d : 0.0f; - uint64_t qs = 0; + uint32_t lo = 0; + uint16_t hi = 0; for (int l = 0; l < QK3_0; l++) { const float v = x[i*QK3_0 + l]*id; const uint8_t vi = MIN(7, (int8_t)roundf(v) + 4); assert(vi < 8); - qs |= (uint64_t)vi << (l*3); + lo |= (vi & 3) << (l * 2); + hi |= ((vi >> 2) & 1) << l; } - y[i].qs = qs; - y[i].d = GGML_FP32_TO_FP16(d); // overwrite unused part of uint64_t qs + y[i].d = GGML_FP32_TO_FP16(d); + y[i].qlo = lo; + y[i].qhi = hi; } } @@ -1573,13 +1576,15 @@ static void dequantize_row_q3_0(const void * restrict vx, float * restrict y, in for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - uint64_t qs = x[i].qs; + uint_fast32_t lo = x[i].qlo; + uint_fast32_t hi = x[i].qhi << 2; for (int l = 0; l < QK3_0; l++) { - const int8_t vi = qs & 7; + const int8_t vi = (lo & 3) | (hi & 4); const float v = (vi - 4)*d; y[i*QK3_0 + l] = v; assert(!isnan(y[i*QK3_0 + l])); - qs >>= 3; + lo >>= 2; + hi >>= 1; } } } @@ -2525,6 +2530,39 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t *s = sumf; } +#if __AVX2__ || __AVX512F__ +// Computes the dot product of signed 8-bit integers packed into 256-bit vectors, +// converting the result to 32-bit floats packed into a 256-bit vector. +static inline __m256 dotMul(__m256i bx, __m256i by) { +# if __AVXVNNIINT8__ + // Perform multiplication and sum to 32-bit values + const __m256i i32 = _mm256_dpbssd_epi32(bx, by, _mm256_setzero_si256()); +# else + // Get absolute values of x vectors + const __m256i ax = _mm256_sign_epi8(bx, bx); + // Sign the values of the y vectors + const __m256i sy = _mm256_sign_epi8(by, bx); + // Perform multiplication and create 16-bit values + const __m256i dot = _mm256_maddubs_epi16(ax, sy); + + // Convert int16_t to int32_t by adding pairwise + const __m256i ones = _mm256_set1_epi16(1); + const __m256i i32 = _mm256_madd_epi16(ones, dot); +# endif + // Convert int32_t to float + return _mm256_cvtepi32_ps(i32); +} + +// Return horizontal sum of 32-bit floats packed into a 256-bit vector. +static inline float horizontalSum(__m256 acc) { + __m128 res = _mm256_extractf128_ps(acc, 1); + res = _mm_add_ps(res, _mm256_castps256_ps128(acc)); + res = _mm_add_ps(res, _mm_movehl_ps(res, res)); + res = _mm_add_ss(res, _mm_movehdup_ps(res)); + return _mm_cvtss_f32(res); +} +#endif + static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK2_0 == 0); const int nb = n / QK2_0; @@ -2554,30 +2592,15 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * // Load y vector const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(bx, bx); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(by, bx); - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - - // Convert int16_t to int32_t by adding pairwise - const __m256i ones = _mm256_set1_epi16(1); - __m256i i32 = _mm256_madd_epi16(ones, dot); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(i32); + // Do the product: + __m256 p = dotMul(bx, by); // Apply the scale, and accumulate acc = _mm256_fmadd_ps(scale, p, acc); } // Return horizontal sum of the acc vector - __m128 res = _mm256_extractf128_ps(acc, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(acc)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - sumf = _mm_cvtss_f32(res); + sumf = horizontalSum(acc); #else for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); @@ -2602,6 +2625,20 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * *s = sumf; } +// Lookup table used to convert q3_0 to SIMD vectors. +// Expands the bits of an 8-bit value into a 64 bit result, turning each bit into a byte. +// A zero bit turns into 0xFC, while a one bit turns into 0x00. +#define B0(n) 0x ## n +#define B1(n) B0(n ## FC), B0(n ## 00) +#define B2(n) B1(n ## FC), B1(n ## 00) +#define B3(n) B2(n ## FC), B2(n ## 00) +#define B4(n) B3(n ## FC), B3(n ## 00) +#define B5(n) B4(n ## FC), B4(n ## 00) +#define B6(n) B5(n ## FC), B5(n ## 00) +#define B7(n) B6(n ## FC), B6(n ## 00) +#define B8( ) B7( FC), B7( 00) +static const uint64_t ggml_q3_table[256] = { B8() }; + static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { assert(n % QK3_0 == 0); const int nb = n / QK3_0; @@ -2614,103 +2651,54 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * #if defined(__AVX2__) // Initialize accumulator with zeros - __m128 acc = _mm_setzero_ps(); + __m256 acc = _mm256_setzero_ps(); + for (int i = 0; i < nb/2; i++) { - const __m128 scale_y = _mm_set1_ps(y[i].d); - for (int u = 0; u < 2; u++) { // let the compiler unroll this - // Compute combined scale for the block - const __m128 scale_x = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+u].d)); - const __m128 scale = _mm_mul_ps(scale_x, scale_y); - - __m256i bxx = _mm256_set1_epi64x(x[i*2+u].qs); - - // legend: _=zero +=one .=don't care 0-f=3bit quantized values s=fp16 scale - - // shift the copies to be able to reach all values - // 255 192 128 64 0 - // | | | | - // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210 in - // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210 shift left - // _______________________sssssfedcba98765432__________________________________________ shift right - // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210 out - // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ - // e b 6 3 _ . f a 7 2 c 9 4 1 _ . d 8 5 0 - const __m256i shift_l = _mm256_set_epi64x(2*3, 64, 4*3, 0); - const __m256i shift_r = _mm256_set_epi64x( 64, 2*3, 64, 64); - bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r)); - - // add to itself in masked places to shift some values left one bit - // 127 64 0 - // | | | | | | | | | | | | | | | | - // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000 in - // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask - // _____________________.999____________________.111____________________________________.ddd____________________.555_______________ masked - // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000 sum - // - // 255 192 128 - // | | | | | | | | | | | | | | | | - // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222 in - // _____________________++++____________________++++____________________________________++++____________________++++_______________ mask - // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________ masked - // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222 sum - const __m256i doublemask = _mm256_set1_epi64x(0x078000078000); - bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx)); - - // collect 16 bytes from 256 into 128 bits - const __m256i shufmask = _mm256_set_epi8( - 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1, - -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0); - bxx = _mm256_shuffle_epi8(bxx, shufmask); - - __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1)); - - const __m128i mask = _mm_set1_epi8(7); - bx = _mm_and_si128(mask, bx); - - const __m128i off = _mm_set1_epi8(4); - bx = _mm_sub_epi8(bx, off); - - const __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + u*QK3_0)); + __m256i bx = bytes_from_crumbs(x[i*2+1].qlo, x[i*2].qlo); - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); + __m256i const bxhi = _mm256_set_epi64x( + ggml_q3_table[x[i*2+1].qhi >> 8], ggml_q3_table[x[i*2+1].qhi & 0xFF], + ggml_q3_table[x[i*2+0].qhi >> 8], ggml_q3_table[x[i*2+0].qhi & 0xFF]); - // Convert int16_t to int32_t by adding pairwise - const __m128i ones = _mm_set1_epi16(1); - __m128i i32 = _mm_madd_epi16(dot, ones); + // OR the high bits (which also handles the sign): + bx = _mm256_or_si256(bx, bxhi); - // Convert int32_t to float - const __m128 p = _mm_cvtepi32_ps(i32); + // Compute combined scale for the block + const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+0].d)); + const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+1].d)); + __m256 scale = _mm256_set_m128(scale_hi, scale_lo); + scale = _mm256_mul_ps(scale, _mm256_broadcast_ss(&y[i].d)); - // Apply the scale, and accumulate - acc = _mm_fmadd_ps(scale, p, acc); - } + // Load y vector + const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); + + // Do the product, + __m256 p = dotMul(bx, by); + + // Apply the scale, and accumulate + acc = _mm256_fmadd_ps(scale, p, acc); } // Return horizontal sum of the acc vector - __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - sumf = _mm_cvtss_f32(res); + sumf = horizontalSum(acc); #else for (int i = 0; i < nb; i++) { const float d0 = GGML_FP16_TO_FP32(x[i].d); const float d1 = y[i/2].d; - uint64_t qs0 = x[i].qs; + uint_fast32_t lo0 = x[i].qlo; + uint_fast32_t hi0 = x[i].qhi << 2; const int8_t * restrict p1 = y[i/2].qs + (i%2)*QK3_0; int sumi = 0; - for (int j = 0; j < QK3_0; j++) { - const int8_t i0 = (int8_t)(qs0 & 7) - 4; - const int_fast16_t i1 = p1[j]; + for (int l = 0; l < QK3_0; l++) { + const int8_t i0 = (int8_t)((lo0 & 3) | ((hi0 & 4) - 4)); + const int_fast16_t i1 = p1[l]; sumi += i0 * i1; - qs0 >>= 3; + lo0 >>= 2; + hi0 >>= 1; } sumf += d0 * d1 * sumi; } @@ -12497,11 +12485,13 @@ size_t ggml_quantize_q3_0(const float * src, void * dst, int n, int k, int64_t h quantize_row_q3_0(src + j, y, k); for (int i = 0; i < nb; i++) { - uint64_t qs = y[i].qs; + uint_fast32_t lo = y[i].qlo; + uint_fast32_t hi = y[i].qhi << 2; for (int l = 0; l < QK3_0; l++) { - const int8_t vi = qs & 7; + int8_t vi = (lo & 3) | (hi & 4); hist[vi]++; - qs >>= 3; + lo >>= 2; + hi >>= 1; } } }