@@ -5020,9 +5020,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.80 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 3.95 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.90 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 4.10 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
@@ -16487,7 +16486,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
@@ -16545,7 +16544,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
@@ -16555,9 +16554,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
-                    difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
+                    difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
@@ -16651,16 +16651,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                    difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
+                    difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
@@ -16679,16 +16675,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S :
-                    difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
+                    difquant_more_fl_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
     } else if (name.find("ffn_down") != std::string::npos) {
@@ -16741,37 +16738,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = difquant_first_last_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS :
-                    difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
+                    difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
@@ -16786,10 +16795,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                      ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                     ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                      ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                      ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                     ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                      ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                 new_type = GGML_TYPE_Q5_K;
             }
@@ -16816,13 +16824,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
                 new_type = GGML_TYPE_IQ3_S;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                     ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL)
+                new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
                 if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                    new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
-                        difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                    new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q6_K :
+                        difquant_more_fl_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
                 }
+                else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
         }
     } else {
@@ -16852,7 +16861,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
+                    difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            }
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -16947,7 +16962,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_gate;
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
@@ -17042,7 +17061,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_up;
     } else if (name.find("attn_norm.weight") != std::string::npos) {
         if (qs.params->attn_norm_type < GGML_TYPE_COUNT) new_type = qs.params->attn_norm_type;
@@ -17201,7 +17224,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:   default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  default_type = GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
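For readers without the rest of this fork at hand: the difquant_* predicates referenced throughout select a fixed, first/last-biased fraction of the layers for the higher quant. A guessed sketch, consistent with the helper names and with the attn_v sketch above; the real bodies are defined earlier in this fork's llama.cpp and may differ:

```cpp
// Assumed shapes for illustration only: each predicate marks roughly k/8 of
// the n layers, always covering the early layers and the final eighth.
static bool difquant_first_last_tensors  (int i, int n) { return i < n/16  || i >= 15*n/16; } // ~1/8
static bool difquant_more_fl_tensors     (int i, int n) { return i < n/8   || i >= 7*n/8;   } // ~2/8
static bool difquant_three_eights_tensors(int i, int n) { return i < n/4   || i >= 7*n/8;   } // ~3/8
static bool difquant_half_tensors        (int i, int n) { return i < 3*n/8 || i >= 7*n/8;   } // ~4/8
static bool difquant_five_eights_tensors (int i, int n) { return i < n/2   || i >= 7*n/8;   } // ~5/8
static bool difquant_six_eights_tensors  (int i, int n) { return i < 5*n/8 || i >= 7*n/8;   } // ~6/8
```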