diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index fbb04426abe7e..bd4218d416dea 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -368,8 +368,8 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
-// 3.4375 bpw
 #define IQ3S_N_SCALE QK_K/64
+// 3.4375 bpw
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index f4b5713d7dd9a..acdab324d2dec 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -84,10 +84,22 @@ struct quantize_state_impl {
     int n_ffn_down = 0;
     int n_ffn_gate = 0;
     int n_ffn_up = 0;
+    int n_ffn_down_exps = 0;
+    int n_ffn_gate_exps = 0;
+    int n_ffn_up_exps = 0;
+    int n_ffn_down_shexp = 0;
+    int n_ffn_gate_shexp = 0;
+    int n_ffn_up_shexp = 0;
     int i_attention_wv = 0;
     int i_ffn_down = 0;
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
+    int i_ffn_down_exps = 0;
+    int i_ffn_gate_exps = 0;
+    int i_ffn_up_exps = 0;
+    int i_ffn_down_shexp = 0;
+    int i_ffn_gate_shexp = 0;
+    int i_ffn_up_shexp = 0;
 
     int n_k_quantized = 0;
     int n_fallback = 0;
@@ -175,6 +187,56 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
+
+// Returns the appropriate type for expert _exps tensors based on ftype
+static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M;
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_type new_type) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ1_M;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ1_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_S;
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    // Bump I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K;
+
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    // Squash K-quants
+    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    // Squash I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K;
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -245,21 +307,108 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            ++qs.i_attention_wv;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
+            if (qs.i_attention_wv < qs.n_attention_wv/8) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
             ++qs.i_attention_wv;
         }
-        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
+            new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) {
+            if (qs.i_ffn_down < qs.n_ffn_down/16) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_down;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) {
+            if (qs.i_ffn_gate < qs.n_ffn_gate/16) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (qs.i_ffn_gate < qs.n_ffn_gate/8) {
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_gate;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) {
+            if (qs.i_ffn_up < qs.n_ffn_up/16) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (qs.i_ffn_up < qs.n_ffn_up/8) {
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_up;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
+            if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
+            }
+            ++qs.i_ffn_down_exps;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
+            if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
+            }
+            ++qs.i_ffn_gate_exps;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
+            if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
+            }
+            ++qs.i_ffn_up_exps;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            ++qs.i_ffn_down_shexp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            ++qs.i_ffn_gate_shexp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            ++qs.i_ffn_up_shexp;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
-                new_type = GGML_TYPE_Q5_K;
+            if (qs.model.hparams.n_expert >= 8) {
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
@@ -304,7 +453,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -322,6 +471,48 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) {
+        if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_down_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) {
+        if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_gate_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) {
+        if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_up_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
+        if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) {
+            if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && qs.has_imatrix) {
+                // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+                // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+                // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+            } else {
+                new_type = get_exps_type_high_bpw_bump(ftype, new_type, qs.has_imatrix);
+            }
+        }
+        ++qs.i_ffn_down_exps;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -369,7 +560,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs.model.hparams.n_expert >= 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -409,6 +600,24 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
     }
 
     // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -700,6 +909,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
+        } else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
+            ++qs.n_ffn_gate_exps;
+        } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
+            ++qs.n_ffn_gate_shexp;
+        } else if (name.find("ffn_down_exps.weight") != std::string::npos) {
+            ++qs.n_ffn_down_exps;
+        } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
+            ++qs.n_ffn_down_shexp;
+        } else if (name.find("ffn_up_exps.weight") != std::string::npos) {
+            ++qs.n_ffn_up_exps;
+        } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
+            ++qs.n_ffn_up_shexp;
         }
     }
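
Side note on the expert-tensor banding used in the low-bpw branch above (illustrative, not part of the patch): a ffn_*_exps layer whose index falls in the first or last eighth of the stack takes the "bump" type from get_exps_type_low_bpw_bump, and every layer in between takes the "squash" type. A minimal, self-contained sketch of that selection rule, with a hypothetical helper name standing in for the functions added in this diff:

#include <cstdio>

// Hypothetical stand-in for the bump/squash decision applied to ffn_*_exps
// tensors above: the first eighth and the last eighth of the layers get the
// higher-quality ("bump") type, the middle layers get the smaller ("squash") type.
static const char * pick_exps_band(int i_layer, int n_layer) {
    if (i_layer < n_layer/8 || i_layer > 7*n_layer/8) {
        return "bump";
    }
    return "squash";
}

int main() {
    const int n_layer = 61; // illustrative MoE depth, not taken from the patch
    for (int i = 0; i < n_layer; ++i) {
        printf("ffn_down_exps layer %2d -> %s\n", i, pick_exps_band(i, n_layer));
    }
    return 0;
}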