
Commit be1e61d

Temporary settings for IQ3 attn_k and attn_v
1 parent 8e98fc8
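
This commit comments out the blanket n_gqa() >= 7 rule that bumped attn_v.weight to Q5_K, and instead gives the IQ2/IQ3 and IQ4_XSR ftype branches for attn_v and attn_k explicit n_gqa() >= 8 cases, upgrading those tensors to Q4_K, Q5_K, or Q6_K depending on the quant mix.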

File tree: 1 file changed

src/llama.cpp: 41 additions & 21 deletions
@@ -16621,14 +16621,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
     else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
 }
-else if (qs.model.hparams.n_gqa() >= 7) {
+// else if (qs.model.hparams.n_gqa() >= 7) {
 // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
 // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
 // nearly negligible increase in model size by quantizing this tensor with more bits.
 // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
-        new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-}
+// if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+//     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+// }
 else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
     new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
 }
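
The "nearly negligible increase in model size" claim in the comment above is easy to sanity-check. Below is a back-of-the-envelope sketch (not part of the patch), assuming Llama-2-70B shapes (n_embd = 8192, GQA 8, 80 layers) and nominal bits-per-weight of roughly 3.44 for IQ3_S and 5.5 for Q5_K:

#include <cstdio>

// Rough cost of bumping attn_v.weight from IQ3_S to Q5_K on a GQA-8 model.
// All constants are illustrative Llama-2-70B shapes, not values from the patch.
int main() {
    const double n_embd       = 8192;  // hidden size
    const double n_embd_v_gqa = 1024;  // n_embd / 8: eight query heads share each KV head
    const double n_layer      = 80;
    const double bpw_iq3_s    = 3.44;  // nominal bits per weight
    const double bpw_q5_k     = 5.5;

    const double n_weights = n_embd * n_embd_v_gqa * n_layer;  // all attn_v.weight elements
    const double extra_mib = n_weights * (bpw_q5_k - bpw_iq3_s) / 8.0 / (1024.0 * 1024.0);

    printf("extra size from the attn_v upgrade: ~%.0f MiB\n", extra_mib);  // ~165 MiB
    return 0;
}

Against a roughly 28 GiB IQ3_S quant of a 70B model, ~165 MiB is well under 1%, which is why the comment calls the upgrade nearly free.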
@@ -16650,30 +16650,43 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
          ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
     new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
     new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
 }
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+    else new_type = GGML_TYPE_IQ3_S;
+}
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
     else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
     else new_type = GGML_TYPE_Q4_K;
 }
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-         ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+    else new_type = GGML_TYPE_Q4_K;
+}
+else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
     else new_type = GGML_TYPE_Q4_K;
 }
 else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
          (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
     new_type = GGML_TYPE_Q5_K;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
         new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                    difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
     }
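
Every rewritten attn_v branch above follows the same ladder: the strongest condition (n_gqa() >= 8) is tested first, then the broader GQA-2/MoE condition, then the baseline type. A minimal standalone distillation of that shape, using hypothetical names (pick_attn_v_type, gqa, n_expert) rather than the patch's quantize_state_internal plumbing; shown here for the IQ2_XL branch:

#include <cstdint>
#include <cstdio>

enum class QType { IQ3_S, Q4_K, Q5_K };

// Hypothetical distillation of the new IQ2_XL attn_v ladder: strongest GQA
// first, then the moderate-GQA/MoE catch-all, then the MHA baseline.
static QType pick_attn_v_type(uint32_t gqa, uint32_t n_expert) {
    if (gqa >= 8)                  return QType::Q5_K;  // GQA-8: attn_v is 8x smaller, spend bits
    if (gqa >= 4 || n_expert >= 2) return QType::Q4_K;  // moderate GQA or MoE
    return QType::IQ3_S;                                // baseline
}

int main() {
    // e.g. a 64-head model with 8 KV heads -> GQA 8 -> Q5_K
    printf("%d\n", static_cast<int>(pick_attn_v_type(64 / 8, 0)));
    return 0;
}

Ordering matters here: because the branches are chained with else if, a GQA-8 model never falls through to the weaker Q4_K or IQ3_S cases.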
@@ -16846,41 +16859,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-    if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+    if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+    else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
     else new_type = GGML_TYPE_IQ3_XXS;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-    if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
     else new_type = GGML_TYPE_IQ3_S;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
     else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
     else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
     else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
         new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
     else new_type = GGML_TYPE_IQ4_XS;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-    if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+    if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+    else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
         new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
                    difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
     }
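
The difquant_*_tensors helpers gating the ternaries above are defined elsewhere in this llama.cpp fork; their names suggest each one selects roughly the named fraction of layer indices for the more expensive type. A speculative sketch of that pattern (the real selection strategy may differ):

#include <cassert>

// Speculative model of the difquant_* selectors: return true for roughly
// `eighths`/8 of the layers, so only that share of attn_k/attn_v tensors
// receives the upgraded quant type. The real helpers may distribute the
// picks differently (e.g. weighting the first and last layers).
static bool difquant_fraction_sketch(int i_layer, int n_layers, int eighths) {
    assert(n_layers > 0 && eighths >= 0 && eighths <= 8);
    return i_layer < n_layers * eighths / 8;
}

// Under this reading, difquant_half_tensors(i, n) would correspond to
// eighths = 4, difquant_five_eights_tensors(i, n) to eighths = 5, and so on.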
@@ -16992,8 +17013,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
 }
 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-         ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-         ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+         ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
     if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
