@@ -16621,14 +16621,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (qs.model.hparams.n_gqa() >= 7) {
+        // else if (qs.model.hparams.n_gqa() >= 7) {
             // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits.
             // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12).
-            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
-                new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        }
+            // if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
+            //     new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+        // }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
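The commented-out block above hinges on the GQA ratio: n_gqa() is the number of query heads per KV head, so at GQA 8 attn_v.weight holds 8x fewer elements than attn_q.weight. A standalone arithmetic sketch of that size argument (not part of the patch; the shapes below are the published Llama 2 70B hyperparameters):

// Standalone sketch (not part of the patch) of the size argument in the
// comment above: with n_head query heads sharing n_head_kv KV heads,
// attn_v.weight has n_gqa() times fewer elements than attn_q.weight.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_embd    = 8192; // embedding width (Llama 2 70B)
    const uint32_t n_head    = 64;   // query heads
    const uint32_t n_head_kv = 8;    // KV heads -> n_gqa() == 8
    const uint32_t n_gqa     = n_head / n_head_kv;

    const uint64_t q_elems = (uint64_t)n_embd * n_embd;           // attn_q.weight
    const uint64_t v_elems = (uint64_t)n_embd * (n_embd / n_gqa); // attn_v.weight

    // Up-quantizing attn_v costs n_gqa times fewer extra bits than the same
    // bump on attn_q, so high-GQA models can afford the bigger quant type.
    std::printf("attn_q: %llu elems, attn_v: %llu elems (%ux smaller)\n",
                (unsigned long long)q_elems, (unsigned long long)v_elems, n_gqa);
    return 0;
}

The diff replaces this blanket GQA >= 7 bump with per-ftype GQA tiers, visible in the hunks below.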
@@ -16650,30 +16650,43 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL ) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) &&
                  (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
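Every attn_v branch in this hunk follows the same shape: a new top tier for GQA >= 8 models, the previous GQA/MoE condition demoted to an else-if, then the base type. As a readability aid, here is the new LLAMA_FTYPE_MOSTLY_IQ2_XL branch restated as a pure function; this is a sketch, with a local enum standing in for ggml's GGML_TYPE_* values:

// Condensed restatement of the new LLAMA_FTYPE_MOSTLY_IQ2_XL attn_v branch
// from the hunk above. Illustrative only: the enum is a local stand-in.
#include <cstdint>

enum class QuantType { IQ3_S, Q4_K, Q5_K };

static QuantType attn_v_type_iq2_xl(uint32_t n_gqa, uint32_t n_expert) {
    if (n_gqa >= 8)                  return QuantType::Q5_K; // new top tier
    if (n_gqa >= 4 || n_expert >= 2) return QuantType::Q4_K; // prior GQA/MoE tier
    return QuantType::IQ3_S;                                 // base type
}

The same tiering is applied to the attn_k (attention_wk) selection below.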
@@ -16846,41 +16859,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+            if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
                 new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
                            difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
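The difquant_* helpers recur throughout these hunks but their bodies lie outside this diff. From the call pattern they take the running index and total count of a tensor class (e.g. qs.i_attention_wk, qs.n_attention_wk) and return true for roughly the named fraction of layers, which then get the higher of the two candidate types. A hypothetical sketch of that contract; the first/last-layer bias below is inferred from helper names like "first_last" and "fl_more", not taken from the repository:

// Hypothetical sketch of the difquant_* contract; the real implementations
// are not part of this diff. Each helper returns true when tensor i (of n)
// should receive the higher quant type, covering roughly the named fraction
// of layers. The first/last-layer bias is an assumption, not repository code.
static bool difquant_half_tensors_sketch(int i, int n) {
    // Select ~1/2 of the layers: the first quarter and the last quarter,
    // which are typically the most quantization-sensitive.
    return i < n/4 || i >= n - n/4;
}

static bool difquant_first_last_tensors_sketch(int i, int n) {
    // Select ~1/4 of the layers: the first eighth and the last eighth.
    return i < n/8 || i >= n - n/8;
}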
@@ -16992,8 +17013,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;