
Commit d221567: Revamp IQ4_XSR, remove IQ3_XXXL

1 parent 37fb59b
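
The tensor-type selection in src/llama.cpp below leans on a family of difquant_*_tensors helpers (first_last, more_fl, three_eights, half, five_eights, six_eights). Their definitions are not part of this diff; as a minimal sketch, assuming each one simply flags a named fraction of the layers, biased toward the first and last blocks, for a one-step-higher quant type:

    // Sketch only: the real difquant_* helpers live elsewhere in this fork
    // and may weight the layer ranges differently. The pattern the diff
    // relies on is "true for a fixed fraction of n_layers, favoring both ends".
    static bool difquant_first_last_tensors(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8;   // ~2/8 of layers
    }
    static bool difquant_three_eights_tensors(int i_layer, int n_layers) {
        return i_layer < 2*n_layers/8 || i_layer >= 7*n_layers/8; // ~3/8 of layers
    }
    static bool difquant_half_tensors(int i_layer, int n_layers) {
        return i_layer < 2*n_layers/8 || i_layer >= 6*n_layers/8; // ~4/8 of layers
    }

When a helper fires for a layer, the mix promotes that layer's tensor one step (for example IQ3_S to IQ4_XS); otherwise the base type is kept.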

File tree: 4 files changed, +63 -44 lines

examples/quantize/quantize.cpp

Lines changed: 2 additions & 3 deletions

@@ -35,9 +35,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
     { "IQ3_S",   LLAMA_FTYPE_MOSTLY_IQ3_S,   " 3.44 bpw quantization", },
     { "IQ3_M",   LLAMA_FTYPE_MOSTLY_IQ3_M,   " 3.70 bpw quantization mix", },
-    { "IQ3_XL",  LLAMA_FTYPE_MOSTLY_IQ3_XL,  " 3.80 bpw quantization mix", },
-    { "IQ3_XXL", LLAMA_FTYPE_MOSTLY_IQ3_XXL, " 3.95 bpw quantization mix", },
-    { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.10 bpw quantization mix", },
+    { "IQ3_XL",  LLAMA_FTYPE_MOSTLY_IQ3_XL,  " 3.90 bpw quantization mix", },
+    { "IQ3_XXL", LLAMA_FTYPE_MOSTLY_IQ3_XXL, " 4.10 bpw quantization mix", },
     { "Q3_K",    LLAMA_FTYPE_MOSTLY_Q3_K_M,  "alias for Q3_K_M" },
     { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization", },
     { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 3.41G, +1.6321 ppl @ Llama-3-8B", },

gguf-py/gguf/constants.py

Lines changed: 0 additions & 1 deletion

@@ -1257,7 +1257,6 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ1_XL   = 42  # except 1d tensors
     MOSTLY_IQ4_XSR  = 43  # except 1d tensors
     MOSTLY_IQ3_XXL  = 44  # except 1d tensors
-    MOSTLY_IQ3_XXXL = 45  # except 1d tensors

     GUESSED = 1024  # not specified in the model file
include/llama.h

Lines changed: 0 additions & 1 deletion

@@ -173,7 +173,6 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_IQ1_XL   = 42, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ4_XSR  = 43, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ3_XXL  = 44, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 45, // except 1d tensors
     LLAMA_FTYPE_CQS             = 99, // except 1d tensors

     LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
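
Since value 45 is removed from both the Python and C enums rather than renumbered, later ftype values keep their meaning; a GGUF file already quantized as IQ3_XXXL (ftype 45) would presumably no longer map to a known file type after this commit.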

src/llama.cpp

Lines changed: 61 additions & 39 deletions
@@ -5020,9 +5020,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.70 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.80 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 3.95 bpw";
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XL:   return "IQ3_S mix - 3.90 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  return "IQ3_S mix - 4.10 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_XSR:  return "IQ4_XS mix - 4.xx bpw";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
@@ -16487,7 +16486,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
@@ -16545,7 +16544,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else new_type = GGML_TYPE_Q4_K;
         }
@@ -16555,9 +16554,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
-                    difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K :
+                    difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
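
Note on the new else branches in the IQ4_XSR paths (here and in the attn_k, attn_q, ffn_down, and attn_output hunks below): as committed they evaluate a bare ternary, e.g. "else difquant_three_eights_tensors(...) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;", without assigning the result, so they leave new_type untouched. A minimal sketch of the presumed intent, factored into a helper (the function name is hypothetical, not part of the commit):

    // Sketch only: what the attn_v.weight selection under IQ4_XSR appears
    // to aim for, with the else branch actually assigning its result.
    static ggml_type iq4_xsr_attn_v_type(int i_wv, int n_wv, bool gqa_or_moe) {
        if (gqa_or_moe) {
            return i_wv < n_wv/8                        ? GGML_TYPE_Q6_K
                 : difquant_more_fl_tensors(i_wv, n_wv) ? GGML_TYPE_Q6_K
                                                        : GGML_TYPE_Q5_K;
        }
        return difquant_three_eights_tensors(i_wv, n_wv) ? GGML_TYPE_Q6_K
                                                         : GGML_TYPE_Q5_K;
    }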
@@ -16651,16 +16651,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K :
-                    difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K;
+                new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K :
+                    difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
         ++qs.i_attention_wk;
     } else if (name.find("attn_q.weight") != std::string::npos) {
@@ -16679,16 +16675,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S;
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S :
-                    difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS :
+                    difquant_more_fl_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
+            else difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
     } else if (name.find("ffn_down") != std::string::npos) {
@@ -16741,37 +16738,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = difquant_first_last_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) {
-            new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS :
-                    difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
+                    difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
             }
+            else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
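
Distilled, the ffn_down ladder now branches on whether the model uses GQA (n_gqa() >= 2) or MoE (n_expert >= 2), promoting the named fraction of layers to the higher type:

    ftype     GQA/MoE models                other models
    IQ2_XL    six_eights   -> IQ3_XXS       always IQ3_XXS
    IQ3_XXS   more_fl      -> IQ3_S         three_eights -> IQ3_S
    IQ3_XS    five_eights  -> IQ3_S         always IQ3_S
    IQ3_S     more_fl      -> IQ4_XS        three_eights -> IQ4_XS
    IQ3_M     three_eights -> IQ4_XS        five_eights  -> IQ4_XS
    IQ3_XL    half         -> IQ4_XS        six_eights   -> IQ4_XS
    IQ3_XXL   six_eights   -> IQ4_XS        always IQ4_XS
    IQ4_XSR   first_last/three_eights -> Q5_K, else IQ4_XS

(Fraction names abbreviate the difquant_*_tensors helper applied; the non-GQA IQ4_XSR row is subject to the unassigned-ternary caveat above.)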
@@ -16786,10 +16795,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
             new_type = GGML_TYPE_Q5_K;
         }
@@ -16816,13 +16824,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL)
+            new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS :
-                    difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q6_K :
+                    difquant_more_fl_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
     }
 } else {
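
Two details stand out in this attn_output hunk: the new GQA/MoE branch tests qs.i_attention_wq / qs.n_attention_wq although it is quantizing attn_output.weight (presumably a leftover from the attn_q branch, since the difquant call next to it uses the wo counters), and its else branch repeats the unassigned-ternary pattern flagged above.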
@@ -16852,7 +16861,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
+                    difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            }
+            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -16947,7 +16962,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_gate;
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
@@ -17042,7 +17061,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_up;
     } else if (name.find("attn_norm.weight") != std::string::npos) {
         if (qs.params->attn_norm_type < GGML_TYPE_COUNT) new_type = qs.params->attn_norm_type;
@@ -17201,7 +17224,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL:   default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXL:  default_type = GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
