From 6fc51a8c0521be9fc5c255387c40e2acce181799 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Fri, 24 Mar 2023 17:32:35 +0100
Subject: [PATCH 1/4] Q2 and Q3 quantization

---
 examples/quantize/quantize.cpp |   2 +
 ggml.c                         | 418 ++++++++++++++++++++++++++++++++-
 ggml.h                         |  16 +-
 llama.cpp                      |   8 +
 llama.h                        |   6 +-
 5 files changed, 433 insertions(+), 17 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 5b4812c62ba9c..0e8ce9e92d9e1 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -12,6 +12,8 @@ int main(int argc, char ** argv) {
 
     if (argc < 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
+        fprintf(stderr, "  type = %d - q2_0\n", LLAMA_FTYPE_MOSTLY_Q2_0);
+        fprintf(stderr, "  type = %d - q3_0\n", LLAMA_FTYPE_MOSTLY_Q3_0);
         fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
diff --git a/ggml.c b/ggml.c
index 8109b36b24432..82876e50ed657 100644
--- a/ggml.c
+++ b/ggml.c
@@ -624,6 +624,25 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
 #endif
 
 
+#define QK2_0 16
+#pragma pack(push, 1)
+typedef struct {
+    ggml_fp16_t d;
+    uint32_t    qs;
+} block_q2_0;
+#pragma pack(pop)
+static_assert(sizeof(block_q2_0) == sizeof(ggml_fp16_t) + QK2_0 / 4, "wrong q2_0 size/padding");
+
+#define QK3_0 16
+typedef union {
+    struct {
+        uint16_t    pad[3];
+        ggml_fp16_t d;
+    };
+    uint64_t qs;
+} block_q3_0;
+static_assert(sizeof(block_q3_0) == sizeof(ggml_fp16_t) + QK3_0 * 3 / 8, "wrong q3_0 size/padding");
+
 #define QK4_0 32
 typedef struct {
     float   d;          // delta
@@ -663,6 +682,72 @@ static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block siz
 
 
 // reference implementation for deterministic creation of model files
+static void quantize_row_q2_0(const float * restrict x, block_q2_0 * restrict y, int k) {
+    assert(k % QK2_0 == 0);
+    const int nb = k / QK2_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max = 0.0f;
+
+        for (int l = 0; l < QK2_0; l++) {
+            const float v = x[i*QK2_0 + l];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -2;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qs = 0;
+
+        for (int l = 0; l < QK2_0; l++) {
+            const float v = x[i*QK2_0 + l]*id;
+            const uint8_t vi = MIN(3, (int8_t)roundf(v) + 2);
+            assert(vi < 4);
+            qs |= (uint32_t)vi << (l*2);
+        }
+        y[i].qs = qs;
+    }
+}
+
+static void quantize_row_q3_0(const float * restrict x, block_q3_0 * restrict y, int k) {
+    assert(k % QK3_0 == 0);
+    const int nb = k / QK3_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max = 0.0f;
+
+        for (int l = 0; l < QK3_0; l++) {
+            const float v = x[i*QK3_0 + l];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -4;
+        const float id = d ? 1.0f/d : 0.0f;
+
+        uint64_t qs = 0;
+
+        for (int l = 0; l < QK3_0; l++) {
+            const float v = x[i*QK3_0 + l]*id;
+            const uint8_t vi = MIN(7, (int8_t)roundf(v) + 4);
+            assert(vi < 8);
+            qs |= (uint64_t)vi << (l*3);
+        }
+
+        y[i].qs = qs;
+        y[i].d = GGML_FP32_TO_FP16(d);  // overwrite unused part of uint64_t qs
+    }
+}
+
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -1432,6 +1517,45 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 #endif
 }
 
+// TODO: vectorize
+static void dequantize_row_q2_0(const void * restrict vx, float * restrict y, int k) {
+    assert(k % QK2_0 == 0);
+    const int nb = k / QK2_0;
+
+    const block_q2_0 * restrict x = vx;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        uint32_t qs = x[i].qs;
+        for (int l = 0; l < QK2_0; l++) {
+            const int8_t vi = qs & 3;
+            const float v = (vi - 2)*d;
+            y[i*QK2_0 + l] = v;
+            assert(!isnan(y[i*QK2_0 + l]));
+            qs >>= 2;
+        }
+    }
+}
+
+static void dequantize_row_q3_0(const void * restrict vx, float * restrict y, int k) {
+    assert(k % QK3_0 == 0);
+    const int nb = k / QK3_0;
+
+    const block_q3_0 * restrict x = vx;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        uint64_t qs = x[i].qs;
+        for (int l = 0; l < QK3_0; l++) {
+            const int8_t vi = qs & 7;
+            const float v = (vi - 4)*d;
+            y[i*QK3_0 + l] = v;
+            assert(!isnan(y[i*QK3_0 + l]));
+            qs >>= 3;
+        }
+    }
+}
+
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -1715,12 +1839,28 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in
     }
 }
 
+static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_Q2_0] = {
+        .dequantize_row_q         = dequantize_row_q2_0,
+        .quantize_row_q           = (quantize_row_q_t) quantize_row_q2_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q2_0,
+        .quantize_row_q_dot       = quantize_row_q8_0,
+        .vec_dot_q                = ggml_vec_dot_q2_0_q8_0,
+    },
+    [GGML_TYPE_Q3_0] = {
+        .dequantize_row_q         = dequantize_row_q3_0,
+        .quantize_row_q           = (quantize_row_q_t) quantize_row_q3_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q3_0,
+        .quantize_row_q_dot       = quantize_row_q8_0,
+        .vec_dot_q                = ggml_vec_dot_q3_0_q8_0,
+    },
     [GGML_TYPE_Q4_0] = {
         .dequantize_row_q         = dequantize_row_q4_0,
         .quantize_row_q           = quantize_row_q4_0,
@@ -2357,6 +2497,199 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     *s = sumf;
 }
 
+static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    assert(n % QK2_0 == 0);
+    const int nb = n / QK2_0;
+
+    const block_q2_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+    float sumf = 0.0f;
+
+#if defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m128 acc = _mm_setzero_ps();
+
+    for (int i = 0; i < nb; i++) {
+        // Compute combined scale for the block
+        const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d);
+
+        __m128i bx = _mm_set1_epi32(x[i].qs);
+
+        // shift counts to get all bit pairs in lowest position of each byte
+        const __m128i shift128 = _mm_set_epi32(6, 4, 2, 0);
+        bx = _mm_srlv_epi32(bx, shift128);
+
+        const __m128i shufmask = _mm_set_epi8(15,11,7,3,14,10,6,2,13,9,5,1,12,8,4,0);
+        bx = _mm_shuffle_epi8(bx, shufmask);
+
+        const __m128i mask = _mm_set1_epi8(3);
+        bx = _mm_and_si128(mask, bx);
+
+        const __m128i off = _mm_set1_epi8(2);
+        bx = _mm_sub_epi8(bx, off);
+
+        const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK2_0));
+
+        // Get absolute values of x vectors
+        const __m128i ax = _mm_sign_epi8(bx, bx);
+        // Sign the values of the y vectors
+        const __m128i sy = _mm_sign_epi8(by, bx);
+        // Perform multiplication and create 16-bit values
+        const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+        // Convert int16_t to int32_t by adding pairwise
+        const __m128i ones = _mm_set1_epi16(1);
+        __m128i i32 = _mm_madd_epi16(dot, ones);
+
+        // Convert int32_t to float
+        const __m128 p = _mm_cvtepi32_ps(i32);
+
+        // Apply the scale, and accumulate
+        acc = _mm_fmadd_ps(scale, p, acc);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    sumf = _mm_cvtss_f32(res);
+#else
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
+        const float d1 = y[i/2].d;
+
+        uint_fast32_t qs0 = x[i].qs;
+        const int8_t * restrict p1 = y[i/2].qs + (i%2)*QK2_0;
+
+        int sumi = 0;
+        for (int j = 0; j < QK2_0; j++) {
+            const int8_t i0 = (int8_t)(qs0 & 3) - 2;
+            const int_fast16_t i1 = p1[j];
+
+            sumi += i0 * i1;
+
+            qs0 >>= 2;
+        }
+        sumf += d0 * d1 * sumi;
+    }
+#endif
+
+    *s = sumf;
+}
+
+static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    assert(n % QK3_0 == 0);
+    const int nb = n / QK3_0;
+
+    const block_q3_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+    float sumf = 0.0f;
+
+#if defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m128 acc = _mm_setzero_ps();
+    for (int i = 0; i < nb; i++) {
+        // Compute combined scale for the block
+        const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d);
+
+        const __m256i shift_l = _mm256_set_epi64x(2*3,  64, 4*3,  0);
+        const __m256i shift_r = _mm256_set_epi64x( 64, 2*3,  64, 64);
+
+        __m256i bxx = _mm256_set1_epi64x(x[i].qs);
+
+        // legend:    _=zero    +=one    .=don't care    0-f=3bit quantized values    s=fp16 scale
+
+        // shift the copies to be able to reach all values
+        // 255               192                  128                   64                    0
+        //                     |                    |                    |                    |
+        // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210  in
+        // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210  shift left
+        // _______________________sssssfedcba98765432__________________________________________  shift right
+        // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210  out
+        //     ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^
+        //     e  b    6  3    _    .  f    a  7    2    c  9    4  1    _    .  d    8  5    0
+        bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r));
+
+        // add to itself in masked places to shift some values left one bit
+        // 127                                                           64                                                               0
+        //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
+        // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000  in
+        // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
+        // _____________________.999____________________.111____________________________________.ddd____________________.555_______________  masked
+        // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000  sum
+        //
+        // 255                                                          192                                                             128
+        //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
+        // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222  in
+        // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
+        // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________  masked
+        // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222  sum
+        const __m256i doublemask = _mm256_set1_epi64x(0x078000078000);
+        bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx));
+
+        // collect 16 bytes from 256 into 128 bits
+        const __m256i shufmask = _mm256_set_epi8(
+             5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1,
+            -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0);
+        bxx = _mm256_shuffle_epi8(bxx, shufmask);
+
+        __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1));
+
+        const __m128i mask = _mm_set1_epi8(7);
+        bx = _mm_and_si128(mask, bx);
+
+        const __m128i off = _mm_set1_epi8(4);
+        bx = _mm_sub_epi8(bx, off);
+
+        const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK3_0));
+
+        // Get absolute values of x vectors
+        const __m128i ax = _mm_sign_epi8(bx, bx);
+        // Sign the values of the y vectors
+        const __m128i sy = _mm_sign_epi8(by, bx);
+        // Perform multiplication and create 16-bit values
+        const __m128i dot = _mm_maddubs_epi16(ax, sy);
+
+        // Convert int16_t to int32_t by adding pairwise
+        const __m128i ones = _mm_set1_epi16(1);
+        __m128i i32 = _mm_madd_epi16(dot, ones);
+
+        // Convert int32_t to float
+        const __m128 p = _mm_cvtepi32_ps(i32);
+
+        // Apply the scale, and accumulate
+        acc = _mm_fmadd_ps(scale, p, acc);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    sumf = _mm_cvtss_f32(res);
+#else
+    for (int i = 0; i < nb; i++) {
+        const float d0 = GGML_FP16_TO_FP32(x[i].d);
+        const float d1 = y[i/2].d;
+
+        uint64_t qs0 = x[i].qs;
+        const int8_t * restrict p1 = y[i/2].qs + (i%2)*QK3_0;
+
+        int sumi = 0;
+        for (int j = 0; j < QK3_0; j++) {
+            const int8_t i0 = (int8_t)(qs0 & 7) - 4;
+            const int_fast16_t i1 = p1[j];
+
+            sumi += i0 * i1;
+
+            qs0 >>= 3;
+        }
+        sumf += d0 * d1 * sumi;
+    }
+#endif
+
+    *s = sumf;
+}
+
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_0;
 
@@ -3290,6 +3623,8 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
 static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = 1,
     [GGML_TYPE_F16]  = 1,
+    [GGML_TYPE_Q2_0] = QK2_0,
+    [GGML_TYPE_Q3_0] = QK3_0,
     [GGML_TYPE_Q4_0] = QK4_0,
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
@@ -3299,11 +3634,13 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 12, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = sizeof(float),
     [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
+    [GGML_TYPE_Q2_0] = sizeof(block_q2_0),
+    [GGML_TYPE_Q3_0] = sizeof(block_q3_0),
     [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
@@ -3313,12 +3650,13 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I16]  = sizeof(int16_t),
     [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_SIZE is outdated");
-
+static_assert(GGML_TYPE_COUNT == 12, "GGML_TYPE_SIZE is outdated");
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = "f32",
     [GGML_TYPE_F16]  = "f16",
+    [GGML_TYPE_Q2_0] = "q2_0",
+    [GGML_TYPE_Q3_0] = "q3_0",
     [GGML_TYPE_Q4_0] = "q4_0",
     [GGML_TYPE_Q4_1] = "q4_1",
     [GGML_TYPE_Q4_2] = "q4_2",
@@ -3328,11 +3666,13 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I16]  = "i16",
     [GGML_TYPE_I32]  = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 12, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = false,
     [GGML_TYPE_F16]  = false,
+    [GGML_TYPE_Q2_0] = true,
+    [GGML_TYPE_Q3_0] = true,
     [GGML_TYPE_Q4_0] = true,
     [GGML_TYPE_Q4_1] = true,
     [GGML_TYPE_Q4_2] = true,
@@ -3342,7 +3682,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I16]  = false,
     [GGML_TYPE_I32]  = false,
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 12, "GGML_IS_QUANTIZED is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
@@ -8190,6 +8530,8 @@ static void ggml_compute_forward_mul_mat(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     switch (src0->type) {
+        case GGML_TYPE_Q2_0:
+        case GGML_TYPE_Q3_0:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
@@ -8419,6 +8761,8 @@ static void ggml_compute_forward_get_rows(
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
     switch (src0->type) {
+        case GGML_TYPE_Q2_0:
+        case GGML_TYPE_Q3_0:
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
@@ -12092,7 +12436,51 @@ enum ggml_opt_result ggml_opt(
 
 ////////////////////////////////////////////////////////////////////////////////
 
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t ggml_quantize_q2_0(const float * src, void * dst, int n, int k, int64_t hist[1<<2]) {
+    assert(k % QK2_0 == 0);
+    const int nb = k / QK2_0;
+
+    for (int j = 0; j < n; j += k) {
+        block_q2_0 * restrict y = (block_q2_0 *)dst + j/QK2_0;
+
+        quantize_row_q2_0(src + j, y, k);
+
+        for (int i = 0; i < nb; i++) {
+            uint32_t qs = y[i].qs;
+            for (int l = 0; l < QK2_0; l++) {
+                const int8_t vi = qs & 3;
+                hist[vi]++;
+                qs >>= 2;
+            }
+        }
+    }
+
+    return (n/QK2_0*sizeof(block_q2_0));
+}
+
+size_t ggml_quantize_q3_0(const float * src, void * dst, int n, int k, int64_t hist[1<<3]) {
+    assert(k % QK3_0 == 0);
+    const int nb = k / QK3_0;
+
+    for (int j = 0; j < n; j += k) {
+        block_q3_0 * restrict y = (block_q3_0 *)dst + j/QK3_0;
+
+        quantize_row_q3_0(src + j, y, k);
+
+        for (int i = 0; i < nb; i++) {
+            uint64_t qs = y[i].qs;
+            for (int l = 0; l < QK3_0; l++) {
+                const int8_t vi = qs & 7;
+                hist[vi]++;
+                qs >>= 3;
+            }
+        }
+    }
+
+    return (n/QK3_0*sizeof(block_q3_0));
+}
+
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
 
@@ -12115,7 +12503,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_0*sizeof(block_q4_0));
 }
 
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
 
@@ -12138,7 +12526,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_1*sizeof(block_q4_1));
 }
 
-size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) {
     assert(k % QK4_2 == 0);
     const int nb = k / QK4_2;
 
@@ -12162,7 +12550,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_2*sizeof(block_q4_2));
 }
 
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t hist[1<<4]) {
     assert(k % QK4_3 == 0);
     const int nb = k / QK4_3;
 
@@ -12188,6 +12576,18 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
     size_t result = 0;
     switch (type) {
+        case GGML_TYPE_Q2_0:
+            {
+                GGML_ASSERT(start % QK2_0 == 0);
+                block_q2_0 * block = (block_q2_0*)dst + start / QK2_0;
+                result = ggml_quantize_q2_0(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q3_0:
+            {
+                GGML_ASSERT(start % QK3_0 == 0);
+                block_q3_0 * block = (block_q3_0*)dst + start / QK3_0;
+                result = ggml_quantize_q3_0(src + start, block, n, n, hist);
+            } break;
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
diff --git a/ggml.h b/ggml.h
index a8a7b6b4ff504..d7f99271ca1a6 100644
--- a/ggml.h
+++ b/ggml.h
@@ -205,8 +205,10 @@ enum ggml_type {
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
     GGML_TYPE_Q4_2 = 4,
-    GGML_TYPE_Q4_3 = 5,
-    GGML_TYPE_Q8_0 = 6,
+    GGML_TYPE_Q2_0 = 5,
+    GGML_TYPE_Q3_0 = 6,
+    GGML_TYPE_Q4_3,
+    GGML_TYPE_Q8_0,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@@ -808,10 +810,12 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //
 
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q2_0(const float * src, void * dst, int n, int k, int64_t hist[1<<2]);
+size_t ggml_quantize_q3_0(const float * src, void * dst, int n, int k, int64_t hist[1<<3]);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t hist[1<<4]);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t hist[1<<4]);
+size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t hist[1<<4]);
+size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t hist[1<<4]);
 
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
diff --git a/llama.cpp b/llama.cpp
index e4c414c2dde8e..ebc13b47b1f0e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -479,6 +479,8 @@ struct llama_file_loader {
             switch (shard.type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
+                case GGML_TYPE_Q2_0:
+                case GGML_TYPE_Q3_0:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q4_2:
@@ -553,6 +555,8 @@ struct llama_file_saver {
         switch (new_type) {
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
+            case GGML_TYPE_Q2_0:
+            case GGML_TYPE_Q3_0:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
@@ -841,6 +845,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
+        case LLAMA_FTYPE_MOSTLY_Q2_0: return "mostly Q2_0";
+        case LLAMA_FTYPE_MOSTLY_Q3_0: return "mostly Q3_0";
         case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@@ -1578,6 +1584,8 @@ static llama_vocab::id llama_sample_top_p_top_k(
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q2_0: quantized_type = GGML_TYPE_Q2_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_0: quantized_type = GGML_TYPE_Q3_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
diff --git a/llama.h b/llama.h
index e95ff73b8df1d..1d3ebacc8df8c 100644
--- a/llama.h
+++ b/llama.h
@@ -72,8 +72,10 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_0 = 5,  // TODO: revert Q4_2, Q4_3 and give these different values
+        LLAMA_FTYPE_MOSTLY_Q3_0 = 6,
+        LLAMA_FTYPE_MOSTLY_Q4_2,      // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3,      // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();

From c29ab90e06c0ed4a2c95d691d8a619e8a981b0d6 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sun, 16 Apr 2023 09:55:39 +0200
Subject: [PATCH 2/4] Q2 AVX2: do two blocks at a time, by @slaren

---
 ggml.c | 76 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/ggml.c b/ggml.c
index 82876e50ed657..bfafaaa330490 100644
--- a/ggml.c
+++ b/ggml.c
@@ -488,6 +488,34 @@ static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
 }
 
 #if __AVX2__ || __AVX512F__
+// Unpack 32 2-bit fields into 32 bytes
+// The output vector contains 32 bytes, each one in [ 0 .. 3 ] interval
+static inline __m256i bytes_from_crumbs(uint32_t packed_hi, uint32_t packed_lo) {
+    __m128i bx_hi = _mm_set1_epi32(packed_hi);
+    __m128i bx_lo = _mm_set1_epi32(packed_lo);
+    __m256i bx = _mm256_set_m128i(bx_hi, bx_lo);
+
+    // shift counts to get all bit pairs in lowest position of each byte
+    const __m256i shift256 = _mm256_set_epi32(6, 4, 2, 0,
+                                              6, 4, 2, 0);
+    bx = _mm256_srlv_epi32(bx, shift256);
+
+    const __m256i shufmask = _mm256_set_epi8(15,11, 7, 3,
+                                             14,10, 6, 2,
+                                             13, 9, 5, 1,
+                                             12, 8, 4, 0,
+                                             15,11, 7, 3,
+                                             14,10, 6, 2,
+                                             13, 9, 5, 1,
+                                             12, 8, 4, 0);
+    bx = _mm256_shuffle_epi8(bx, shufmask);
+
+    const __m256i mask = _mm256_set1_epi8(3);
+    bx = _mm256_and_si256(mask, bx);
+
+    return bx;
+}
+
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
@@ -2500,6 +2528,7 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     assert(n % QK2_0 == 0);
     const int nb = n / QK2_0;
+    assert(nb % 2 == 0);
 
     const block_q2_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -2508,49 +2537,44 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
 
 #if defined(__AVX2__)
     // Initialize accumulator with zeros
-    __m128 acc = _mm_setzero_ps();
-
-    for (int i = 0; i < nb; i++) {
-        // Compute combined scale for the block
-        const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d);
-
-        __m128i bx = _mm_set1_epi32(x[i].qs);
-
-        // shift counts to get all bit pairs in lowest position of each byte
-        const __m128i shift128 = _mm_set_epi32(6, 4, 2, 0);
-        bx = _mm_srlv_epi32(bx, shift128);
+    __m256 acc = _mm256_setzero_ps();
 
-        const __m128i shufmask = _mm_set_epi8(15,11,7,3,14,10,6,2,13,9,5,1,12,8,4,0);
-        bx = _mm_shuffle_epi8(bx, shufmask);
+    for (int i = 0; i < nb; i += 2) {
+        __m256i bx = bytes_from_crumbs(x[i+1].qs, x[i].qs);
 
-        const __m128i mask = _mm_set1_epi8(3);
-        bx = _mm_and_si128(mask, bx);
+        // Compute combined scale for the block
+        const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+0].d) * y[i/2].d);
+        const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+1].d) * y[i/2].d);
+        const __m256 scale = _mm256_set_m128(scale_hi, scale_lo);
 
-        const __m128i off = _mm_set1_epi8(2);
-        bx = _mm_sub_epi8(bx, off);
+        const __m256i off = _mm256_set1_epi8(2);
+        bx = _mm256_sub_epi8(bx, off);
 
-        const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK2_0));
+        // Load y vector
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i/2].qs);
 
         // Get absolute values of x vectors
-        const __m128i ax = _mm_sign_epi8(bx, bx);
+        const __m256i ax = _mm256_sign_epi8(bx, bx);
         // Sign the values of the y vectors
-        const __m128i sy = _mm_sign_epi8(by, bx);
+        const __m256i sy = _mm256_sign_epi8(by, bx);
         // Perform multiplication and create 16-bit values
-        const __m128i dot = _mm_maddubs_epi16(ax, sy);
+        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
 
         // Convert int16_t to int32_t by adding pairwise
-        const __m128i ones = _mm_set1_epi16(1);
-        __m128i i32 = _mm_madd_epi16(dot, ones);
+        const __m256i ones = _mm256_set1_epi16(1);
+        __m256i i32 = _mm256_madd_epi16(ones, dot);
 
         // Convert int32_t to float
-        const __m128 p = _mm_cvtepi32_ps(i32);
+        __m256 p = _mm256_cvtepi32_ps(i32);
 
         // Apply the scale, and accumulate
-        acc = _mm_fmadd_ps(scale, p, acc);
+        acc = _mm256_fmadd_ps(scale, p, acc);
     }
 
     // Return horizontal sum of the acc vector
-    __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
+    __m128 res = _mm256_extractf128_ps(acc, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(acc));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
     res = _mm_add_ss(res, _mm_movehdup_ps(res));
     sumf = _mm_cvtss_f32(res);
 #else

From 8c90a860cceb8dc00abeaed0cf877efb79384982 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sun, 16 Apr 2023 15:36:36 +0200
Subject: [PATCH 3/4] More AVX2 optimizations

---
 ggml.c | 151 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 78 insertions(+), 73 deletions(-)

diff --git a/ggml.c b/ggml.c
index bfafaaa330490..7148f499714f2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2539,19 +2539,20 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
-    for (int i = 0; i < nb; i += 2) {
-        __m256i bx = bytes_from_crumbs(x[i+1].qs, x[i].qs);
+    for (int i = 0; i < nb/2; i++) {
+        __m256i bx = bytes_from_crumbs(x[i*2+1].qs, x[i*2].qs);
 
         // Compute combined scale for the block
-        const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+0].d) * y[i/2].d);
-        const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i+1].d) * y[i/2].d);
-        const __m256 scale = _mm256_set_m128(scale_hi, scale_lo);
+        const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+0].d));
+        const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+1].d));
+        __m256 scale = _mm256_set_m128(scale_hi, scale_lo);
+        scale = _mm256_mul_ps(scale, _mm256_broadcast_ss(&y[i].d));
 
         const __m256i off = _mm256_set1_epi8(2);
         bx = _mm256_sub_epi8(bx, off);
 
         // Load y vector
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i/2].qs);
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
         // Get absolute values of x vectors
         const __m256i ax = _mm256_sign_epi8(bx, bx);
@@ -2604,6 +2605,7 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     assert(n % QK3_0 == 0);
     const int nb = n / QK3_0;
+    assert(nb % 2 == 0);
 
     const block_q3_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
@@ -2613,77 +2615,80 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void *
 #if defined(__AVX2__)
     // Initialize accumulator with zeros
     __m128 acc = _mm_setzero_ps();
-    for (int i = 0; i < nb; i++) {
-        // Compute combined scale for the block
-        const __m128 scale = _mm_set1_ps(GGML_FP16_TO_FP32(x[i].d) * y[i/2].d);
-
-        const __m256i shift_l = _mm256_set_epi64x(2*3,  64, 4*3,  0);
-        const __m256i shift_r = _mm256_set_epi64x( 64, 2*3,  64, 64);
-
-        __m256i bxx = _mm256_set1_epi64x(x[i].qs);
-
-        // legend:    _=zero    +=one    .=don't care    0-f=3bit quantized values    s=fp16 scale
-
-        // shift the copies to be able to reach all values
-        // 255               192                  128                   64                    0
-        //                     |                    |                    |                    |
-        // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210  in
-        // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210  shift left
-        // _______________________sssssfedcba98765432__________________________________________  shift right
-        // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210  out
-        //     ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^
-        //     e  b    6  3    _    .  f    a  7    2    c  9    4  1    _    .  d    8  5    0
-        bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r));
-
-        // add to itself in masked places to shift some values left one bit
-        // 127                                                           64                                                               0
-        //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
-        // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000  in
-        // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
-        // _____________________.999____________________.111____________________________________.ddd____________________.555_______________  masked
-        // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000  sum
-        //
-        // 255                                                          192                                                             128
-        //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
-        // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222  in
-        // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
-        // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________  masked
-        // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222  sum
-        const __m256i doublemask = _mm256_set1_epi64x(0x078000078000);
-        bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx));
-
-        // collect 16 bytes from 256 into 128 bits
-        const __m256i shufmask = _mm256_set_epi8(
-             5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1,
-            -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0);
-        bxx = _mm256_shuffle_epi8(bxx, shufmask);
+    for (int i = 0; i < nb/2; i++) {
+        const __m128 scale_y = _mm_set1_ps(y[i].d);
+        for (int u = 0; u < 2; u++) {   // let the compiler unroll this
+            // Compute combined scale for the block
+            const __m128 scale_x = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+u].d));
+            const __m128 scale = _mm_mul_ps(scale_x, scale_y);
+
+            __m256i bxx = _mm256_set1_epi64x(x[i*2+u].qs);
+
+            // legend:    _=zero    +=one    .=don't care    0-f=3bit quantized values    s=fp16 scale
+
+            // shift the copies to be able to reach all values
+            // 255               192                  128                   64                    0
+            //                     |                    |                    |                    |
+            // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210  in
+            // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210  shift left
+            // _______________________sssssfedcba98765432__________________________________________  shift right
+            // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210  out
+            //     ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^
+            //     e  b    6  3    _    .  f    a  7    2    c  9    4  1    _    .  d    8  5    0
+            const __m256i shift_l = _mm256_set_epi64x(2*3,  64, 4*3,  0);
+            const __m256i shift_r = _mm256_set_epi64x( 64, 2*3,  64, 64);
+            bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r));
+
+            // add to itself in masked places to shift some values left one bit
+            // 127                                                           64                                                               0
+            //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
+            // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000  in
+            // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
+            // _____________________.999____________________.111____________________________________.ddd____________________.555_______________  masked
+            // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000  sum
+            //
+            // 255                                                          192                                                             128
+            //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
+            // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222  in
+            // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
+            // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________  masked
+            // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222  sum
+            const __m256i doublemask = _mm256_set1_epi64x(0x078000078000);
+            bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx));
+
+            // collect 16 bytes from 256 into 128 bits
+            const __m256i shufmask = _mm256_set_epi8(
+                5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1,
+                -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0);
+            bxx = _mm256_shuffle_epi8(bxx, shufmask);
+
+            __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1));
+
+            const __m128i mask = _mm_set1_epi8(7);
+            bx = _mm_and_si128(mask, bx);
+
+            const __m128i off = _mm_set1_epi8(4);
+            bx = _mm_sub_epi8(bx, off);
+
+            const __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + u*QK3_0));
 
-        __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1));
-
-        const __m128i mask = _mm_set1_epi8(7);
-        bx = _mm_and_si128(mask, bx);
-
-        const __m128i off = _mm_set1_epi8(4);
-        bx = _mm_sub_epi8(bx, off);
-
-        const __m128i by = _mm_loadu_si128((const __m128i *)(y[i/2].qs + (i%2)*QK3_0));
-
-        // Get absolute values of x vectors
-        const __m128i ax = _mm_sign_epi8(bx, bx);
-        // Sign the values of the y vectors
-        const __m128i sy = _mm_sign_epi8(by, bx);
-        // Perform multiplication and create 16-bit values
-        const __m128i dot = _mm_maddubs_epi16(ax, sy);
+            // Get absolute values of x vectors
+            const __m128i ax = _mm_sign_epi8(bx, bx);
+            // Sign the values of the y vectors
+            const __m128i sy = _mm_sign_epi8(by, bx);
+            // Perform multiplication and create 16-bit values
+            const __m128i dot = _mm_maddubs_epi16(ax, sy);
 
-        // Convert int16_t to int32_t by adding pairwise
-        const __m128i ones = _mm_set1_epi16(1);
-        __m128i i32 = _mm_madd_epi16(dot, ones);
+            // Convert int16_t to int32_t by adding pairwise
+            const __m128i ones = _mm_set1_epi16(1);
+            __m128i i32 = _mm_madd_epi16(dot, ones);
 
-        // Convert int32_t to float
-        const __m128 p = _mm_cvtepi32_ps(i32);
+            // Convert int32_t to float
+            const __m128 p = _mm_cvtepi32_ps(i32);
 
-        // Apply the scale, and accumulate
-        acc = _mm_fmadd_ps(scale, p, acc);
+            // Apply the scale, and accumulate
+            acc = _mm_fmadd_ps(scale, p, acc);
+        }
     }
 
     // Return horizontal sum of the acc vector

From 7aa501cd1cd67334af9f883295d9b58d232edfc1 Mon Sep 17 00:00:00 2001
From: pubby <pubby8@gmail.com>
Date: Mon, 17 Apr 2023 10:38:45 -0500
Subject: [PATCH 4/4] Faster q3_0 implementation, using two planes, by @pubby

---
 ggml.c | 214 +++++++++++++++++++++++++++------------------------------
 1 file changed, 102 insertions(+), 112 deletions(-)

diff --git a/ggml.c b/ggml.c
index 7148f499714f2..8d5c4e028c6fb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -662,12 +662,12 @@ typedef struct {
 static_assert(sizeof(block_q2_0) == sizeof(ggml_fp16_t) + QK2_0 / 4, "wrong q2_0 size/padding");
 
 #define QK3_0 16
-typedef union {
-    struct {
-        uint16_t    pad[3];
-        ggml_fp16_t d;
-    };
-    uint64_t qs;
+typedef struct {
+    ggml_fp16_t d;
+    // Instead of representing q3_0 as a packed format "...210210210210",
+    // represent it as two planes: "...10101010" and "...2222"
+    uint16_t qhi; // The highest bit of each 3-bit number, packed together
+    uint32_t qlo; // The low 2-bits of each 3-bit number, packed together
 } block_q3_0;
 static_assert(sizeof(block_q3_0) == sizeof(ggml_fp16_t) + QK3_0 * 3 / 8, "wrong q3_0 size/padding");
 
@@ -762,17 +762,20 @@ static void quantize_row_q3_0(const float * restrict x, block_q3_0 * restrict y,
         const float d = max / -4;
         const float id = d ? 1.0f/d : 0.0f;
 
-        uint64_t qs = 0;
+        uint32_t lo = 0;
+        uint16_t hi = 0;
 
         for (int l = 0; l < QK3_0; l++) {
             const float v = x[i*QK3_0 + l]*id;
             const uint8_t vi = MIN(7, (int8_t)roundf(v) + 4);
             assert(vi < 8);
-            qs |= (uint64_t)vi << (l*3);
+            lo |= (vi & 3) << (l * 2);
+            hi |= ((vi >> 2) & 1) << l;
         }
 
-        y[i].qs = qs;
-        y[i].d = GGML_FP32_TO_FP16(d);  // overwrite unused part of uint64_t qs
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].qlo = lo;
+        y[i].qhi = hi;
     }
 }
 
@@ -1573,13 +1576,15 @@ static void dequantize_row_q3_0(const void * restrict vx, float * restrict y, in
 
     for (int i = 0; i < nb; i++) {
         const float d = GGML_FP16_TO_FP32(x[i].d);
-        uint64_t qs = x[i].qs;
+        uint_fast32_t lo = x[i].qlo;
+        uint_fast32_t hi = x[i].qhi << 2;
         for (int l = 0; l < QK3_0; l++) {
-            const int8_t vi = qs & 7;
+            const int8_t vi = (lo & 3) | (hi & 4);
             const float v = (vi - 4)*d;
             y[i*QK3_0 + l] = v;
             assert(!isnan(y[i*QK3_0 + l]));
-            qs >>= 3;
+            lo >>= 2;
+            hi >>= 1;
         }
     }
 }
@@ -2525,6 +2530,39 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     *s = sumf;
 }
 
+#if __AVX2__ || __AVX512F__
+// Computes the dot product of signed 8-bit integers packed into 256-bit vectors,
+// converting the result to 32-bit floats packed into a 256-bit vector.
+static inline __m256 dotMul(__m256i bx, __m256i by) {
+#  if __AVXVNNIINT8__
+    // Perform multiplication and sum to 32-bit values
+    const __m256i i32 = _mm256_dpbssd_epi32(bx, by, _mm256_setzero_si256());
+#  else
+    // Get absolute values of x vectors
+    const __m256i ax = _mm256_sign_epi8(bx, bx);
+    // Sign the values of the y vectors
+    const __m256i sy = _mm256_sign_epi8(by, bx);
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+
+    // Convert int16_t to int32_t by adding pairwise
+    const __m256i ones = _mm256_set1_epi16(1);
+    const __m256i i32 = _mm256_madd_epi16(ones, dot);
+#  endif
+    // Convert int32_t to float
+    return _mm256_cvtepi32_ps(i32);
+}
+
+// Return horizontal sum of 32-bit floats packed into a 256-bit vector.
+static inline float horizontalSum(__m256 acc) {
+    __m128 res = _mm256_extractf128_ps(acc, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(acc));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+#endif
+
 static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     assert(n % QK2_0 == 0);
     const int nb = n / QK2_0;
@@ -2554,30 +2592,15 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
         // Load y vector
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        // Get absolute values of x vectors
-        const __m256i ax = _mm256_sign_epi8(bx, bx);
-        // Sign the values of the y vectors
-        const __m256i sy = _mm256_sign_epi8(by, bx);
-        // Perform multiplication and create 16-bit values
-        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-
-        // Convert int16_t to int32_t by adding pairwise
-        const __m256i ones = _mm256_set1_epi16(1);
-        __m256i i32 = _mm256_madd_epi16(ones, dot);
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(i32);
+        // Do the product:
+        __m256 p = dotMul(bx, by);
 
         // Apply the scale, and accumulate
         acc = _mm256_fmadd_ps(scale, p, acc);
     }
 
     // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps(acc, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(acc));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    sumf = _mm_cvtss_f32(res);
+    sumf = horizontalSum(acc);
 #else
     for (int i = 0; i < nb; i++) {
         const float d0 = GGML_FP16_TO_FP32(x[i].d);
@@ -2602,6 +2625,20 @@ static void ggml_vec_dot_q2_0_q8_0(const int n, float * restrict s, const void *
     *s = sumf;
 }
 
+// Lookup table used to convert q3_0 to SIMD vectors.
+// Expands the bits of an 8-bit value into a 64 bit result, turning each bit into a byte.
+// A zero bit turns into 0xFC, while a one bit turns into 0x00.
+#define B0(n) 0x ## n
+#define B1(n) B0(n ## FC), B0(n ## 00)
+#define B2(n) B1(n ## FC), B1(n ## 00)
+#define B3(n) B2(n ## FC), B2(n ## 00)
+#define B4(n) B3(n ## FC), B3(n ## 00)
+#define B5(n) B4(n ## FC), B4(n ## 00)
+#define B6(n) B5(n ## FC), B5(n ## 00)
+#define B7(n) B6(n ## FC), B6(n ## 00)
+#define B8( ) B7(     FC), B7(     00)
+static const uint64_t ggml_q3_table[256] = { B8() };
+
 static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     assert(n % QK3_0 == 0);
     const int nb = n / QK3_0;
@@ -2614,103 +2651,54 @@ static void ggml_vec_dot_q3_0_q8_0(const int n, float * restrict s, const void *
 
 #if defined(__AVX2__)
     // Initialize accumulator with zeros
-    __m128 acc = _mm_setzero_ps();
+    __m256 acc = _mm256_setzero_ps();
+
     for (int i = 0; i < nb/2; i++) {
-        const __m128 scale_y = _mm_set1_ps(y[i].d);
-        for (int u = 0; u < 2; u++) {   // let the compiler unroll this
-            // Compute combined scale for the block
-            const __m128 scale_x = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+u].d));
-            const __m128 scale = _mm_mul_ps(scale_x, scale_y);
-
-            __m256i bxx = _mm256_set1_epi64x(x[i*2+u].qs);
-
-            // legend:    _=zero    +=one    .=don't care    0-f=3bit quantized values    s=fp16 scale
-
-            // shift the copies to be able to reach all values
-            // 255               192                  128                   64                    0
-            //                     |                    |                    |                    |
-            // sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210sssssfedcba9876543210  in
-            // sssfedcba9876543210_______________________sfedcba9876543210____sssssfedcba9876543210  shift left
-            // _______________________sssssfedcba98765432__________________________________________  shift right
-            // sssfedcba9876543210____sssssfedcba98765432sfedcba9876543210____sssssfedcba9876543210  out
-            //     ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^    ^  ^    ^  ^    ^
-            //     e  b    6  3    _    .  f    a  7    2    c  9    4  1    _    .  d    8  5    0
-            const __m256i shift_l = _mm256_set_epi64x(2*3,  64, 4*3,  0);
-            const __m256i shift_r = _mm256_set_epi64x( 64, 2*3,  64, 64);
-            bxx = _mm256_or_si256(_mm256_sllv_epi64(bxx, shift_l), _mm256_srlv_epi64(bxx, shift_r));
-
-            // add to itself in masked places to shift some values left one bit
-            // 127                                                           64                                                               0
-            //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
-            // ssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222111000  in
-            // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
-            // _____________________.999____________________.111____________________________________.ddd____________________.555_______________  masked
-            // .............ccc.....999.............444.....111....____________.....................ddd.............888.....555.............000  sum
-            //
-            // 255                                                          192                                                             128
-            //        |       |       |       |       |       |       |       |       |       |       |       |       |       |       |       |
-            // ssssssssssfffeeedddcccbbbaaa999888777666555444333222111000____________ssssssssssssssssfffeeedddcccbbbaaa999888777666555444333222  in
-            // _____________________++++____________________++++____________________________________++++____________________++++_______________  mask
-            // _____________________.bbb____________________.333____________________________________.fff____________________.777_______________  masked
-            // .............eee.....bbb.............666.....333..........____________...............fff.............aaa.....777.............222  sum
-            const __m256i doublemask = _mm256_set1_epi64x(0x078000078000);
-            bxx = _mm256_add_epi64(bxx, _mm256_and_si256(doublemask, bxx));
-
-            // collect 16 bytes from 256 into 128 bits
-            const __m256i shufmask = _mm256_set_epi8(
-                5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0,-1,-1,
-                -1,-1, 5,14,-1,-1,13, 3,-1,-1, 2,11,-1,-1,10, 0);
-            bxx = _mm256_shuffle_epi8(bxx, shufmask);
-
-            __m128i bx = _mm_or_si128(_mm256_castsi256_si128(bxx), _mm256_extracti128_si256(bxx, 1));
-
-            const __m128i mask = _mm_set1_epi8(7);
-            bx = _mm_and_si128(mask, bx);
-
-            const __m128i off = _mm_set1_epi8(4);
-            bx = _mm_sub_epi8(bx, off);
-
-            const __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + u*QK3_0));
+        __m256i bx = bytes_from_crumbs(x[i*2+1].qlo, x[i*2].qlo);
 
-            // Get absolute values of x vectors
-            const __m128i ax = _mm_sign_epi8(bx, bx);
-            // Sign the values of the y vectors
-            const __m128i sy = _mm_sign_epi8(by, bx);
-            // Perform multiplication and create 16-bit values
-            const __m128i dot = _mm_maddubs_epi16(ax, sy);
+        __m256i const bxhi = _mm256_set_epi64x(
+            ggml_q3_table[x[i*2+1].qhi >> 8], ggml_q3_table[x[i*2+1].qhi & 0xFF],
+            ggml_q3_table[x[i*2+0].qhi >> 8], ggml_q3_table[x[i*2+0].qhi & 0xFF]);
 
-            // Convert int16_t to int32_t by adding pairwise
-            const __m128i ones = _mm_set1_epi16(1);
-            __m128i i32 = _mm_madd_epi16(dot, ones);
+        // OR the high bits (which also handles the sign):
+        bx = _mm256_or_si256(bx, bxhi);
 
-            // Convert int32_t to float
-            const __m128 p = _mm_cvtepi32_ps(i32);
+        // Compute combined scale for the block
+        const __m128 scale_lo = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+0].d));
+        const __m128 scale_hi = _mm_set1_ps(GGML_FP16_TO_FP32(x[i*2+1].d));
+        __m256 scale = _mm256_set_m128(scale_hi, scale_lo);
+        scale = _mm256_mul_ps(scale, _mm256_broadcast_ss(&y[i].d));
 
-            // Apply the scale, and accumulate
-            acc = _mm_fmadd_ps(scale, p, acc);
-        }
+        // Load y vector
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        // Do the product,
+        __m256 p = dotMul(bx, by);
+
+        // Apply the scale, and accumulate
+        acc = _mm256_fmadd_ps(scale, p, acc);
     }
 
     // Return horizontal sum of the acc vector
-    __m128 res = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    sumf = _mm_cvtss_f32(res);
+    sumf = horizontalSum(acc);
 #else
     for (int i = 0; i < nb; i++) {
         const float d0 = GGML_FP16_TO_FP32(x[i].d);
         const float d1 = y[i/2].d;
 
-        uint64_t qs0 = x[i].qs;
+        uint_fast32_t lo0 = x[i].qlo;
+        uint_fast32_t hi0 = x[i].qhi << 2;
         const int8_t * restrict p1 = y[i/2].qs + (i%2)*QK3_0;
 
         int sumi = 0;
-        for (int j = 0; j < QK3_0; j++) {
-            const int8_t i0 = (int8_t)(qs0 & 7) - 4;
-            const int_fast16_t i1 = p1[j];
+        for (int l = 0; l < QK3_0; l++) {
+            const int8_t i0 = (int8_t)((lo0 & 3) | ((hi0 & 4) - 4));
+            const int_fast16_t i1 = p1[l];
 
             sumi += i0 * i1;
 
-            qs0 >>= 3;
+            lo0 >>= 2;
+            hi0 >>= 1;
         }
         sumf += d0 * d1 * sumi;
     }
@@ -12497,11 +12485,13 @@ size_t ggml_quantize_q3_0(const float * src, void * dst, int n, int k, int64_t h
         quantize_row_q3_0(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
-            uint64_t qs = y[i].qs;
+            uint_fast32_t lo = y[i].qlo;
+            uint_fast32_t hi = y[i].qhi << 2;
             for (int l = 0; l < QK3_0; l++) {
-                const int8_t vi = qs & 7;
+                int8_t vi = (lo & 3) | (hi & 4);
                 hist[vi]++;
-                qs >>= 3;
+                lo >>= 2;
+                hi >>= 1;
             }
         }
     }