
Commit e2b1a5e

Fix attn_v conditionality (ikawrakow#604)
To retain compatibility with ikawrakow#91, we need "else if" and not "if"; otherwise the MoE and 70B conditions take precedence over the quant type specified on the CLI.
1 parent b5ddec9 commit e2b1a5e

1 file changed: +12 −12 lines changed

src/llama.cpp

Lines changed: 12 additions & 12 deletions
@@ -19447,6 +19447,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.params->attn_v_type < GGML_TYPE_COUNT) new_type = qs.params->attn_v_type;
+        else if (qs.model.hparams.n_expert >= 4) {
+            // for the 4-8-expert model, bumping this to Q8_0 trades just ~128MB
+            // TODO: explore better strategies
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (qs.model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+            if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
@@ -19531,18 +19543,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.model.hparams.n_vocab >= 127999 && (qs.model.type == MODEL_8B || qs.model.type == MODEL_70B))
             new_type = GGML_TYPE_IQ6_K;
         }
-        if (qs.model.type == MODEL_70B) {
-            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-            // nearly negligible increase in model size by quantizing this tensor with more bits:
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-            if (new_type == GGML_TYPE_IQ3_K) new_type = GGML_TYPE_IQ5_K;
-        }
-        if (qs.model.hparams.n_expert >= 4) {
-            // for the 4-8-expert model, bumping this to Q8_0 trades just ~128MB
-            // TODO: explore better strategies
-            new_type = GGML_TYPE_Q8_0;
-        }
         else if (qs.model.hparams.n_gqa() >= 4) {
             if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
             else if (new_type == GGML_TYPE_Q2_K_R4 || new_type == GGML_TYPE_IQ3_XXS_R4) new_type = GGML_TYPE_IQ3_K_R4;
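The fix is purely about control flow: with a plain "if", the MoE/70B branches run even after the CLI override has already set new_type, silently clobbering it; with "else if", they only apply as fallbacks. A minimal standalone sketch of the difference (hypothetical program with stand-in enum values, not the actual llama.cpp code):

#include <cstdio>

// Stand-in quant types (hypothetical values; the real ones are ggml_type enums).
enum Type { TYPE_UNSET = 99, Q5_K = 1, Q8_0 = 2 };

// Before the fix: the n_expert branch is a plain `if`, so it runs even when
// the CLI override (qs.params->attn_v_type in the real code) already set
// new_type, and clobbers it.
Type pick_before(Type cli_type, int n_expert) {
    Type new_type = Q5_K;                             // default chosen from ftype
    if (cli_type != TYPE_UNSET) new_type = cli_type;  // CLI override
    if (n_expert >= 4) new_type = Q8_0;               // MoE rule applies regardless
    return new_type;
}

// After the fix: `else if` makes the MoE rule a fallback that only applies
// when no CLI override was given.
Type pick_after(Type cli_type, int n_expert) {
    Type new_type = Q5_K;
    if (cli_type != TYPE_UNSET) new_type = cli_type;
    else if (n_expert >= 4) new_type = Q8_0;          // only without an override
    return new_type;
}

int main() {
    // An 8-expert model with Q5_K explicitly requested on the CLI:
    printf("before: %d\n", pick_before(Q5_K, 8));  // 2 (Q8_0) -- override lost
    printf("after:  %d\n", pick_after(Q5_K, 8));   // 1 (Q5_K) -- override honored
    return 0;
}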
