@@ -19447,6 +19447,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        else if (qs.model.hparams.n_expert >= 4) {
+            // for the 4-8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (qs.model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
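Note: the moved MODEL_70B comment argues from head sharing. With grouped-query attention, 8 query heads share one V head, so attn_v.weight carries 1/8 the elements of attn_q.weight, and an extra bit per weight is cheap. Below is a minimal back-of-the-envelope sketch of that arithmetic, assuming Llama-2-70B-like shapes (n_embd = 8192, 64 query heads, 8 KV heads, 80 layers); these hparams are my assumptions, not taken from this diff.

// Back-of-the-envelope check of the "8x smaller" claim and its size cost.
// All shapes below are assumed (Llama-2-70B-like), not read from this PR.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd    = 8192;  // model dimension
    const int64_t n_head    = 64;    // query heads
    const int64_t n_head_kv = 8;     // KV heads: 8 query heads share one V head
    const int64_t n_layer   = 80;

    const int64_t attn_q_elems = n_embd * n_embd;                        // per layer
    const int64_t attn_v_elems = n_embd * (n_embd * n_head_kv / n_head); // per layer, 8x fewer

    // Q4_K -> Q5_K adds roughly 1 bit per weight (~4.5 -> ~5.5 bpw).
    const double extra_mib = (double) n_layer * attn_v_elems * 1.0 / 8.0 / (1024.0 * 1024.0);

    printf("attn_v is %lldx smaller than attn_q\n",
           (long long) (attn_q_elems / attn_v_elems));
    printf("whole-model cost of the extra bit: ~%.0f MiB\n", extra_mib);
    return 0;
}

With these shapes the Q4_K to Q5_K bump comes out near 80 MiB over the whole model, which is consistent with the comment's "nearly negligible increase in model size" for a model whose quantized weights run to tens of GiB.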
@@ -19531,18 +19543,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B))
                 new_type = GGML_TYPE_IQ6_K;
         }
-        if (qs.model.type == MODEL_70B) {
-            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-            // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-            if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ5_K;
-        }
-        if (qs.model.hparams.n_expert >= 4) {
-            // for the 4-8-expert model, bumping this to Q8_0 trades just ~128MB
-            // TODO: explore better strategies
-            new_type = GGML_TYPE_Q8_0;
-        }
        else if (qs.model.hparams.n_gqa() >= 4) {
            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
            else if (new_type == GGML_TYPE_Q2_K_R4 || new_type == GGML_TYPE_IQ3_XXS_R4) new_type = GGML_TYPE_IQ3_K_R4;
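Note: this second hunk is the other half of a move, not a pure deletion. The MODEL_70B and n_expert special cases used to run as unconditional trailing ifs after the ftype chain, where they could overwrite a user-supplied qs.params->attn_v_type; the first hunk re-adds them as else-if branches behind that override check, so an explicit attn_v type now always wins. On the "~128MB" in the moved comment, here is a sketch of the cost of pinning attn_v.weight to Q8_0, assuming Mixtral-8x7B-like shapes (n_embd = 4096, 32 query heads, 8 KV heads, 32 layers); again these are assumptions, not taken from the diff.

// Rough cost of pinning attn_v.weight to Q8_0 in a MoE model. Shapes are
// assumed (Mixtral-8x7B-like), not taken from this PR; attention weights
// are not per-expert, so attn_v appears once per layer.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd    = 4096;
    const int64_t n_head    = 32;
    const int64_t n_head_kv = 8;
    const int64_t n_layer   = 32;

    const int64_t v_elems = n_layer * n_embd * (n_embd * n_head_kv / n_head);

    const double bpw_q8_0 = 8.5;  // 34 bytes per 32-weight block
    const double bpw_q4_k = 4.5;  // typical baseline in a Q4_K_M mix

    const double delta_mib = v_elems * (bpw_q8_0 - bpw_q4_k) / 8.0 / (1024.0 * 1024.0);
    printf("Q8_0 instead of Q4_K for attn_v: ~%.0f MiB extra\n", delta_mib);
    return 0;
}

This prints ~64 MiB for a Q4_K baseline; against the 2- and 3-bit baselines this branch mostly affects, the delta grows toward the ~128MB quoted in the comment.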