@@ -16654,7 +16654,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16654
16654
new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
16655
16655
}
16656
16656
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16657
- if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
16657
+ if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
16658
16658
else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
16659
16659
else new_type = GGML_TYPE_IQ3_S;
16660
16660
}
@@ -16859,24 +16859,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
16859
16859
else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
16860
16860
}
16861
16861
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
16862
- if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K ;
16862
+ if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS ;
16863
16863
else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
16864
16864
else new_type = GGML_TYPE_IQ3_XXS;
16865
16865
}
16866
16866
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
16867
- if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
16867
+ if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
16868
16868
else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2)
16869
16869
new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16870
16870
else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
16871
16871
}
16872
16872
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
16873
- if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
16873
+ if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K ;
16874
16874
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16875
16875
new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
16876
16876
else new_type = GGML_TYPE_IQ3_S;
16877
16877
}
16878
16878
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
16879
- if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q6_K ;
16879
+ if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q5_K ;
16880
16880
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
16881
16881
new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
16882
16882
else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
0 commit comments