Skip to content

Commit 2e01d68

Browse files
committed
ggml : rename Q5_0 -> Q5_1
1 parent 0fcbf8b commit 2e01d68

File tree

7 files changed

+76
-76
lines changed

7 files changed

+76
-76
lines changed

examples/quantize/quantize.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
1717
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
1818
fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
1919
fprintf(stderr, " type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
20-
fprintf(stderr, " type = %d - q5_0\n", LLAMA_FTYPE_MOSTLY_Q5_0);
20+
fprintf(stderr, " type = %d - q5_1\n", LLAMA_FTYPE_MOSTLY_Q5_1);
2121
return 1;
2222
}
2323

ggml-cuda.cu

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -37,14 +37,14 @@ typedef struct {
3737
} block_q4_3;
3838
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
3939

40-
#define QK5_0 32
40+
#define QK5_1 32
4141
typedef struct {
4242
__half d; // delta
4343
__half m; // min
4444
uint32_t qh; // 5-th bit of quants
45-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
46-
} block_q5_0;
47-
static_assert(sizeof(block_q5_0) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
45+
uint8_t qs[QK5_1 / 2]; // nibbles / quants
46+
} block_q5_1;
47+
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
4848

4949
#define QK8_0 32
5050
typedef struct {
@@ -147,8 +147,8 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
147147
}
148148
}
149149

150-
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
151-
const block_q5_0 * x = (const block_q5_0 *) vx;
150+
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
151+
const block_q5_1 * x = (const block_q5_1 *) vx;
152152

153153
const int i = blockIdx.x;
154154

@@ -159,7 +159,7 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
159159

160160
const uint32_t qh = x[i].qh;
161161

162-
for (int l = 0; l < QK5_0; l += 2) {
162+
for (int l = 0; l < QK5_1; l += 2) {
163163
const uint8_t vi = pp[l/2];
164164

165165
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
@@ -171,8 +171,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
171171
const float v0 = vi0*d + m;
172172
const float v1 = vi1*d + m;
173173

174-
y[i*QK5_0 + l + 0] = v0;
175-
y[i*QK5_0 + l + 1] = v1;
174+
y[i*QK5_1 + l + 0] = v0;
175+
y[i*QK5_1 + l + 1] = v1;
176176
}
177177
}
178178

@@ -212,9 +212,9 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
212212
dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
213213
}
214214

215-
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
216-
const int nb = k / QK5_0;
217-
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
215+
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
216+
const int nb = k / QK5_1;
217+
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
218218
}
219219

220220
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {

ggml-cuda.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
3535
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3636
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3737
void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
38-
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
38+
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
3939
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
4040

4141
#ifdef __cplusplus

ggml.c

Lines changed: 55 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -676,14 +676,14 @@ typedef struct {
676676
} block_q4_3;
677677
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
678678

679-
#define QK5_0 32
679+
#define QK5_1 32
680680
typedef struct {
681681
ggml_fp16_t d; // delta
682682
ggml_fp16_t m; // min
683683
uint32_t qh; // 5-th bit of quants
684-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
685-
} block_q5_0;
686-
static_assert(sizeof(block_q5_0) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
684+
uint8_t qs[QK5_1 / 2]; // nibbles / quants
685+
} block_q5_1;
686+
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
687687

688688
#define QK8_0 32
689689
typedef struct {
@@ -1300,16 +1300,16 @@ static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int
13001300
quantize_row_q4_3_reference(x, y, k);
13011301
}
13021302

1303-
static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
1304-
assert(k % QK5_0 == 0);
1305-
const int nb = k / QK5_0;
1303+
static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
1304+
assert(k % QK5_1 == 0);
1305+
const int nb = k / QK5_1;
13061306

13071307
for (int i = 0; i < nb; i++) {
13081308
float min = FLT_MAX;
13091309
float max = -FLT_MAX;
13101310

1311-
for (int l = 0; l < QK5_0; l++) {
1312-
const float v = x[i*QK5_0 + l];
1311+
for (int l = 0; l < QK5_1; l++) {
1312+
const float v = x[i*QK5_1 + l];
13131313
if (v < min) min = v;
13141314
if (v > max) max = v;
13151315
}
@@ -1321,9 +1321,9 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
13211321
y[i].m = GGML_FP32_TO_FP16(min);
13221322
y[i].qh = 0;
13231323

1324-
for (int l = 0; l < QK5_0; l += 2) {
1325-
const float v0 = (x[i*QK5_0 + l + 0] - min)*id;
1326-
const float v1 = (x[i*QK5_0 + l + 1] - min)*id;
1324+
for (int l = 0; l < QK5_1; l += 2) {
1325+
const float v0 = (x[i*QK5_1 + l + 0] - min)*id;
1326+
const float v1 = (x[i*QK5_1 + l + 1] - min)*id;
13271327

13281328
const uint32_t vi0 = (int) (v0 + 0.5f);
13291329
const uint32_t vi1 = (int) (v1 + 0.5f);
@@ -1337,12 +1337,12 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
13371337
}
13381338
}
13391339

1340-
static void quantize_row_q5_0(const float * restrict x, void * restrict vy, int k) {
1341-
assert(k % QK5_0 == 0);
1340+
static void quantize_row_q5_1(const float * restrict x, void * restrict vy, int k) {
1341+
assert(k % QK5_1 == 0);
13421342

1343-
block_q5_0 * restrict y = vy;
1343+
block_q5_1 * restrict y = vy;
13441344

1345-
quantize_row_q5_0_reference(x, y, k);
1345+
quantize_row_q5_1_reference(x, y, k);
13461346
}
13471347

13481348
// reference implementation for deterministic creation of model files
@@ -1861,11 +1861,11 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in
18611861
}
18621862
}
18631863

1864-
static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
1865-
assert(k % QK5_0 == 0);
1866-
const int nb = k / QK5_0;
1864+
static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, int k) {
1865+
assert(k % QK5_1 == 0);
1866+
const int nb = k / QK5_1;
18671867

1868-
const block_q5_0 * restrict x = vx;
1868+
const block_q5_1 * restrict x = vx;
18691869

18701870
for (int i = 0; i < nb; i++) {
18711871
const float d = GGML_FP16_TO_FP32(x[i].d);
@@ -1875,7 +1875,7 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
18751875

18761876
const uint32_t qh = x[i].qh;
18771877

1878-
for (int l = 0; l < QK5_0; l += 2) {
1878+
for (int l = 0; l < QK5_1; l += 2) {
18791879
const uint8_t vi = pp[l/2];
18801880

18811881
// extract the 5-th bit from qh
@@ -1888,11 +1888,11 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
18881888
const float v0 = vi0*d + m;
18891889
const float v1 = vi1*d + m;
18901890

1891-
y[i*QK5_0 + l + 0] = v0;
1892-
y[i*QK5_0 + l + 1] = v1;
1891+
y[i*QK5_1 + l + 0] = v0;
1892+
y[i*QK5_1 + l + 1] = v1;
18931893

1894-
assert(!isnan(y[i*QK5_0 + l + 0]));
1895-
assert(!isnan(y[i*QK5_0 + l + 1]));
1894+
assert(!isnan(y[i*QK5_1 + l + 0]));
1895+
assert(!isnan(y[i*QK5_1 + l + 1]));
18961896
}
18971897
}
18981898
}
@@ -1918,7 +1918,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
19181918
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
19191919
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
19201920
static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1921-
static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1921+
static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
19221922
static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
19231923

19241924
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
@@ -1954,12 +1954,12 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
19541954
.vec_dot_q = ggml_vec_dot_q4_3_q8_1,
19551955
.vec_dot_type = GGML_TYPE_Q8_1,
19561956
},
1957-
[GGML_TYPE_Q5_0] = {
1958-
.dequantize_row_q = dequantize_row_q5_0,
1959-
.quantize_row_q = quantize_row_q5_0,
1960-
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference,
1957+
[GGML_TYPE_Q5_1] = {
1958+
.dequantize_row_q = dequantize_row_q5_1,
1959+
.quantize_row_q = quantize_row_q5_1,
1960+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference,
19611961
.quantize_row_q_dot = quantize_row_q8_1,
1962-
.vec_dot_q = ggml_vec_dot_q5_0_q8_1,
1962+
.vec_dot_q = ggml_vec_dot_q5_1_q8_1,
19631963
.vec_dot_type = GGML_TYPE_Q8_1,
19641964
},
19651965
[GGML_TYPE_Q8_0] = {
@@ -3169,14 +3169,14 @@ static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void *
31693169
#endif
31703170
}
31713171

3172-
static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3172+
static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
31733173
const int nb = n / QK8_1;
31743174

31753175
assert(n % QK8_1 == 0);
31763176
assert(nb % 2 == 0);
3177-
assert(QK8_1 == QK5_0);
3177+
assert(QK8_1 == QK5_1);
31783178

3179-
const block_q5_0 * restrict x = vx;
3179+
const block_q5_1 * restrict x = vx;
31803180
const block_q8_1 * restrict y = vy;
31813181

31823182
#if defined(__ARM_NEON)
@@ -3187,7 +3187,7 @@ static void ggml_vec_dot_q5_0_q8_1(const int n, float * restrict s, const void *
31873187
uint64_t tmp[4];
31883188

31893189
for (int i = 0; i < nb; ++i) {
3190-
const block_q5_0 * restrict x0 = &x[i];
3190+
const block_q5_1 * restrict x0 = &x[i];
31913191
const block_q8_1 * restrict y0 = &y[i];
31923192

31933193
summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
@@ -3646,7 +3646,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
36463646
[GGML_TYPE_Q4_1] = QK4_1,
36473647
[GGML_TYPE_Q4_2] = QK4_2,
36483648
[GGML_TYPE_Q4_3] = QK4_3,
3649-
[GGML_TYPE_Q5_0] = QK5_0,
3649+
[GGML_TYPE_Q5_1] = QK5_1,
36503650
[GGML_TYPE_Q8_0] = QK8_0,
36513651
[GGML_TYPE_Q8_1] = QK8_1,
36523652
[GGML_TYPE_I8] = 1,
@@ -3662,7 +3662,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
36623662
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
36633663
[GGML_TYPE_Q4_2] = sizeof(block_q4_2),
36643664
[GGML_TYPE_Q4_3] = sizeof(block_q4_3),
3665-
[GGML_TYPE_Q5_0] = sizeof(block_q5_0),
3665+
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
36663666
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
36673667
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
36683668
[GGML_TYPE_I8] = sizeof(int8_t),
@@ -3679,7 +3679,7 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
36793679
[GGML_TYPE_Q4_1] = "q4_1",
36803680
[GGML_TYPE_Q4_2] = "q4_2",
36813681
[GGML_TYPE_Q4_3] = "q4_3",
3682-
[GGML_TYPE_Q5_0] = "q5_0",
3682+
[GGML_TYPE_Q5_1] = "q5_1",
36833683
[GGML_TYPE_Q8_0] = "q8_0",
36843684
[GGML_TYPE_Q8_1] = "q8_1",
36853685
[GGML_TYPE_I8] = "i8",
@@ -3695,7 +3695,7 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
36953695
[GGML_TYPE_Q4_1] = true,
36963696
[GGML_TYPE_Q4_2] = true,
36973697
[GGML_TYPE_Q4_3] = true,
3698-
[GGML_TYPE_Q5_0] = true,
3698+
[GGML_TYPE_Q5_1] = true,
36993699
[GGML_TYPE_Q8_0] = true,
37003700
[GGML_TYPE_Q8_1] = true,
37013701
[GGML_TYPE_I8] = false,
@@ -6923,7 +6923,7 @@ static void ggml_compute_forward_add(
69236923
case GGML_TYPE_Q4_1:
69246924
case GGML_TYPE_Q4_2:
69256925
case GGML_TYPE_Q4_3:
6926-
case GGML_TYPE_Q5_0:
6926+
case GGML_TYPE_Q5_1:
69276927
case GGML_TYPE_Q8_0:
69286928
{
69296929
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
@@ -8412,8 +8412,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
84128412
else if (type == GGML_TYPE_Q4_3) {
84138413
dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
84148414
}
8415-
else if (type == GGML_TYPE_Q5_0) {
8416-
dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
8415+
else if (type == GGML_TYPE_Q5_1) {
8416+
dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
84178417
}
84188418
else if (type == GGML_TYPE_Q8_0) {
84198419
dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
@@ -8573,7 +8573,7 @@ static void ggml_compute_forward_mul_mat(
85738573
case GGML_TYPE_Q4_1:
85748574
case GGML_TYPE_Q4_2:
85758575
case GGML_TYPE_Q4_3:
8576-
case GGML_TYPE_Q5_0:
8576+
case GGML_TYPE_Q5_1:
85778577
case GGML_TYPE_Q8_0:
85788578
case GGML_TYPE_Q8_1:
85798579
{
@@ -8804,7 +8804,7 @@ static void ggml_compute_forward_get_rows(
88048804
case GGML_TYPE_Q4_1:
88058805
case GGML_TYPE_Q4_2:
88068806
case GGML_TYPE_Q4_3:
8807-
case GGML_TYPE_Q5_0:
8807+
case GGML_TYPE_Q5_1:
88088808
case GGML_TYPE_Q8_0:
88098809
case GGML_TYPE_Q8_1:
88108810
{
@@ -12598,17 +12598,17 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
1259812598
return (n/QK4_3*sizeof(block_q4_3));
1259912599
}
1260012600

12601-
size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
12602-
assert(k % QK5_0 == 0);
12603-
const int nb = k / QK5_0;
12601+
size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
12602+
assert(k % QK5_1 == 0);
12603+
const int nb = k / QK5_1;
1260412604

1260512605
for (int j = 0; j < n; j += k) {
12606-
block_q5_0 * restrict y = (block_q5_0 *)dst + j/QK5_0;
12606+
block_q5_1 * restrict y = (block_q5_1 *)dst + j/QK5_1;
1260712607

12608-
quantize_row_q5_0_reference(src + j, y, k);
12608+
quantize_row_q5_1_reference(src + j, y, k);
1260912609

1261012610
for (int i = 0; i < nb; i++) {
12611-
for (int l = 0; l < QK5_0; l += 2) {
12611+
for (int l = 0; l < QK5_1; l += 2) {
1261212612
const uint8_t vh0 = ((y[i].qh & (1 << (l + 0))) >> (l + 0)) << 4;
1261312613
const uint8_t vh1 = ((y[i].qh & (1 << (l + 1))) >> (l + 1)) << 4;
1261412614

@@ -12622,7 +12622,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
1262212622
}
1262312623
}
1262412624

12625-
return (n/QK5_0*sizeof(block_q5_0));
12625+
return (n/QK5_1*sizeof(block_q5_1));
1262612626
}
1262712627

1262812628
size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
@@ -12673,11 +12673,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
1267312673
block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
1267412674
result = ggml_quantize_q4_3(src + start, block, n, n, hist);
1267512675
} break;
12676-
case GGML_TYPE_Q5_0:
12676+
case GGML_TYPE_Q5_1:
1267712677
{
12678-
GGML_ASSERT(start % QK5_0 == 0);
12679-
block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
12680-
result = ggml_quantize_q5_0(src + start, block, n, n, hist);
12678+
GGML_ASSERT(start % QK5_1 == 0);
12679+
block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
12680+
result = ggml_quantize_q5_1(src + start, block, n, n, hist);
1268112681
} break;
1268212682
case GGML_TYPE_Q8_0:
1268312683
{

ggml.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -222,7 +222,7 @@ extern "C" {
222222
GGML_TYPE_Q4_1 = 3,
223223
GGML_TYPE_Q4_2 = 4,
224224
GGML_TYPE_Q4_3 = 5,
225-
GGML_TYPE_Q5_0 = 6,
225+
GGML_TYPE_Q5_1 = 6,
226226
GGML_TYPE_Q8_0 = 7,
227227
GGML_TYPE_Q8_1 = 8,
228228
GGML_TYPE_I8,
@@ -834,7 +834,7 @@ extern "C" {
834834
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
835835
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
836836
GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
837-
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
837+
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
838838
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
839839

840840
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

0 commit comments

Comments
 (0)